diff options
Diffstat (limited to 'mm')
75 files changed, 6495 insertions, 2128 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 40a9bfcd5062..d16ba9249bc5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -96,9 +96,6 @@ config HAVE_FAST_GUP depends on MMU bool -config HOLES_IN_ZONE - bool - # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. @@ -742,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT lifetime of the system until these kthreads finish the initialisation. +config PAGE_IDLE_FLAG + bool + select PAGE_EXTENSION if !64BIT + help + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. + config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU - select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have not been touched during a given period of time. 
This information can @@ -889,4 +894,6 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +source "mm/damon/Kconfig" + endmenu diff --git a/mm/Makefile b/mm/Makefile index e3436741d539..fc60a40ce954 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -38,7 +38,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o ioremap.o + pgtable-generic.o rmap.o vmalloc.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -118,6 +118,7 @@ obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o @@ -128,3 +129,4 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o +obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 271f2ca862c8..4a9d4e27d0d9 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void wb_update_bandwidth_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, bw_dwork); + + wb_update_bandwidth(wb); +} + /* * Initial write bandwidth: 100 MB/s */ @@ -293,6 +301,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); + atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; @@ -302,6 +311,7 @@ static int 
wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); @@ -350,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) @@ -398,12 +409,12 @@ static void cgwb_release_workfn(struct work_struct *work) blkcg_unpin_online(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); - percpu_ref_exit(&wb->refcnt); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); + percpu_ref_exit(&wb->refcnt); wb_exit(wb); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); @@ -807,6 +818,7 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; + timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -928,6 +940,8 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { + del_timer_sync(&bdi->laptop_mode_wb_timer); + /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 5b152dba7344..f03f42f426f6 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -39,7 +39,7 @@ void put_page_bootmem(struct page *page) } #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void register_page_bootmem_info_section(unsigned long start_pfn) +static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct 
mem_section *ms; @@ -74,7 +74,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) } #else /* CONFIG_SPARSEMEM_VMEMMAP */ -static void register_page_bootmem_info_section(unsigned long start_pfn) +static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct mem_section *ms; diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..bfc93da1c2c7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -306,16 +306,14 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, * is necessary for the block to be a migration source/target. */ do { - if (pfn_valid_within(pfn)) { - if (check_source && PageLRU(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } - if (check_target && PageBuddy(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; } page += (1 << PAGE_ALLOC_COSTLY_ORDER); @@ -585,8 +583,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; nr_scanned++; - if (!pfn_valid_within(blockpfn)) - goto isolate_fail; /* * For compound pages such as THP and hugetlbfs, we can save @@ -885,8 +881,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, cond_resched(); } - if (!pfn_valid_within(low_pfn)) - goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); @@ -2398,7 +2392,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION); + MR_COMPACTION, NULL); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); @@ -2706,6 +2700,30 @@ static void compact_nodes(void) */ unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +int 
compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc, nid; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write && sysctl_compaction_proactiveness) { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->proactive_compact_trigger) + continue; + + pgdat->proactive_compact_trigger = true; + wake_up_interruptible(&pgdat->kcompactd_wait); + } + } + + return 0; +} + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2750,7 +2768,8 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); + return pgdat->kcompactd_max_order > 0 || kthread_should_stop() || + pgdat->proactive_compact_trigger; } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -2885,7 +2904,8 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - unsigned int proactive_defer = 0; + long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); + long timeout = default_timeout; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2900,25 +2920,39 @@ static int kcompactd(void *p) while (!kthread_should_stop()) { unsigned long pflags; + /* + * Avoid the unnecessary wakeup for proactive compaction + * when it is disabled. + */ + if (!sysctl_compaction_proactiveness) + timeout = MAX_SCHEDULE_TIMEOUT; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), - msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + kcompactd_work_requested(pgdat), timeout) && + !pgdat->proactive_compact_trigger) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); psi_memstall_leave(&pflags); + /* + * Reset the timeout value. 
The defer timeout from + * proactive compaction is lost here but that is fine + * as the condition of the zone changing substantionally + * then carrying on with the previous defer interval is + * not useful. + */ + timeout = default_timeout; continue; } - /* kcompactd wait timeout */ + /* + * Start the proactive work with default timeout. Based + * on the fragmentation score, this timeout is updated. + */ + timeout = default_timeout; if (should_proactive_compact_node(pgdat)) { unsigned int prev_score, score; - if (proactive_defer) { - proactive_defer--; - continue; - } prev_score = fragmentation_score_node(pgdat); proactive_compact_node(pgdat); score = fragmentation_score_node(pgdat); @@ -2926,9 +2960,12 @@ static int kcompactd(void *p) * Defer proactive compaction if the fragmentation * score did not go down i.e. no progress made. */ - proactive_defer = score < prev_score ? - 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + if (unlikely(score >= prev_score)) + timeout = + default_timeout << COMPACT_MAX_DEFER_SHIFT; } + if (unlikely(pgdat->proactive_compact_trigger)) + pgdat->proactive_compact_trigger = false; } return 0; diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 000000000000..37024798a97c --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. + +config DAMON_KUNIT_TEST + bool "Test for damon" if !KUNIT_ALL_TESTS + depends on DAMON && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. 
+ + If unsure, say N. + +config DAMON_VADDR + bool "Data access monitoring primitives for virtual address spaces" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for virtual address spaces. + +config DAMON_VADDR_KUNIT_TEST + bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + depends on DAMON_VADDR && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON virtual addresses primitives Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_DBGFS + bool "DAMON debugfs interface" + depends on DAMON_VADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + +config DAMON_DBGFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_DBGFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON debugfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 000000000000..fed4be3bace3 --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DAMON) := core.o +obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h new file mode 100644 index 000000000000..c938a9c34e6c --- /dev/null +++ b/mm/damon/core-test.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#ifdef CONFIG_DAMON_KUNIT_TEST + +#ifndef _DAMON_CORE_TEST_H +#define _DAMON_CORE_TEST_H + +#include <kunit/test.h> + +static void damon_test_regions(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + + r = damon_new_region(1, 2); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); + + damon_del_region(r, t); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_free_target(t); +} + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static void damon_test_target(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 42ul, t->id); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_add_target(c, t); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); + + damon_destroy_target(t); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_destroy_ctx(c); +} + +/* + * Test kdamond_reset_aggregated() + * + * DAMON checks access to each region and aggregates this information as the + * access frequency of each region. In detail, it increases '->nr_accesses' of + * regions that an access has confirmed. 'kdamond_reset_aggregated()' flushes + * the aggregated information ('->nr_accesses' of each regions) to the result + * buffer. As a result of the flushing, the '->nr_accesses' of regions are + * initialized to zero. 
+ */ +static void damon_test_aggregate(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long target_ids[] = {1, 2, 3}; + unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; + unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; + unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; + struct damon_target *t; + struct damon_region *r; + int it, ir; + + damon_set_targets(ctx, target_ids, 3); + + it = 0; + damon_for_each_target(t, ctx) { + for (ir = 0; ir < 3; ir++) { + r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + r->nr_accesses = accesses[it][ir]; + damon_add_region(r, t); + } + it++; + } + kdamond_reset_aggregated(ctx); + it = 0; + damon_for_each_target(t, ctx) { + ir = 0; + /* '->nr_accesses' should be zeroed */ + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + ir++; + } + /* regions should be preserved */ + KUNIT_EXPECT_EQ(test, 3, ir); + it++; + } + /* targets also should be preserved */ + KUNIT_EXPECT_EQ(test, 3, it); + + damon_destroy_ctx(ctx); +} + +static void damon_test_split_at(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + damon_add_region(r, t); + damon_split_region_at(c, t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); + + r = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_merge_two(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2, *r3; + int i; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + r->nr_accesses = 10; + damon_add_region(r, t); + r2 = damon_new_region(100, 300); + r2->nr_accesses = 20; + damon_add_region(r2, t); + + damon_merge_two_regions(t, r, r2); + 
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + + i = 0; + damon_for_each_region(r3, t) { + KUNIT_EXPECT_PTR_EQ(test, r, r3); + i++; + } + KUNIT_EXPECT_EQ(test, i, 1); + + damon_free_target(t); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +static void damon_test_merge_regions_of(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + + unsigned long saddrs[] = {0, 114, 130, 156, 170}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + int i; + + t = damon_new_target(42); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } + + damon_merge_regions_of(t, 9, 9999); + /* 0-112, 114-130, 130-156, 156-170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + for (i = 0; i < 5; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); + KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); + } + damon_free_target(t); +} + +static void damon_test_split_regions_of(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 22); + damon_add_region(r, t); + damon_split_regions_of(c, t, 2); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(0, 220); + damon_add_region(r, t); + damon_split_regions_of(c, t, 4); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u); + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct 
kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_target), + KUNIT_CASE(damon_test_regions), + KUNIT_CASE(damon_test_aggregate), + KUNIT_CASE(damon_test_split_at), + KUNIT_CASE(damon_test_merge_two), + KUNIT_CASE(damon_test_merge_regions_of), + KUNIT_CASE(damon_test_split_regions_of), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_CORE_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 000000000000..30e9211f494a --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include <linux/damon.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/random.h> +#include <linux/slab.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/damon.h> + +#ifdef CONFIG_DAMON_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; + +/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + region = kmalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(®ion->list); + + return region; +} + +/* + * Add a region between two other regions + */ +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + 
t->nr_regions++; +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; +} + +static void damon_del_region(struct damon_region *r, struct damon_target *t) +{ + list_del(&r->list); + t->nr_regions--; +} + +static void damon_free_region(struct damon_region *r) +{ + kfree(r); +} + +void damon_destroy_region(struct damon_region *r, struct damon_target *t) +{ + damon_del_region(r, t); + damon_free_region(r); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(unsigned long id) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->id = id; + t->nr_regions = 0; + INIT_LIST_HEAD(&t->regions_list); + + return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->adaptive_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + +unsigned int damon_nr_regions(struct damon_target *t) +{ + return t->nr_regions; +} + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->sample_interval = 5 * 1000; + ctx->aggr_interval = 100 * 1000; + ctx->primitive_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_primitive_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->min_nr_regions = 10; + ctx->max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets); + + return ctx; +} + +static void 
damon_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next_t; + + if (ctx->primitive.cleanup) { + ctx->primitive.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_targets(ctx); + kfree(ctx); +} + +/** + * damon_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_destroy_targets(ctx); + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + pr_err("Failed to alloc damon_target\n"); + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @sample_int: time interval between samplings + * @aggr_int: time interval between aggregations + * @primitive_upd_int: time interval between monitoring primitive updates + * @min_nr_reg: minimal number of regions + * @max_nr_reg: maximum number of regions + * + * This function should not be called while the kdamond is running. + * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg) +{ + if (min_nr_reg < 3) { + pr_err("min_nr_regions (%lu) must be at least 3\n", + min_nr_reg); + return -EINVAL; + } + if (min_nr_reg > max_nr_reg) { + pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", + min_nr_reg, max_nr_reg); + return -EINVAL; + } + + ctx->sample_interval = sample_int; + ctx->aggr_interval = aggr_int; + ctx->primitive_update_interval = primitive_upd_int; + ctx->min_nr_regions = min_nr_reg; + ctx->max_nr_regions = max_nr_reg; + + return 0; +} + +/** + * damon_nr_running_ctxs() - Return number of currently running contexts. + */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + +/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += r->ar.end - r->ar.start; + } + + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + +static bool damon_kdamond_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + + return running; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + ctx->kdamond_stop = false; + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = 0; + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If a + * group of threads that created by other 'damon_start()' call is currently + * running, this function does nothing but returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if (nr_running_ctxs) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); + while (damon_kdamond_running(ctx)) + usleep_range(ctx->sample_interval, + ctx->sample_interval * 2); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. 
+ * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + return err; + } + + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. + */ +static bool damon_check_reset_time_interval(struct timespec64 *baseline, + unsigned long interval) +{ + struct timespec64 now; + + ktime_get_coarse_ts64(&now); + if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < + interval * 1000) + return false; + *baseline = now; + return true; +} + +/* + * Check whether it is time to flush the aggregated information + */ +static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_aggregation, + ctx->aggr_interval); +} + +/* + * Reset the aggregated monitoring results ('nr_accesses' of each region). 
+ */ +static void kdamond_reset_aggregated(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + + damon_for_each_region(r, t) { + trace_damon_aggregated(t, r, damon_nr_regions(t)); + r->nr_accesses = 0; + } + } +} + +#define sz_damon_region(r) (r->ar.end - r->ar.start) + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +#define diff_of(a, b) (a > b ? a - b : b - a) + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (prev && prev->ar.end == r->ar.start && + diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. 
+ */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = r->ar.end - r->ar.start; + + for (i = 0; i < nr_subs - 1 && + sz_region > 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(ctx, t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. 
+ */
+static void kdamond_split_regions(struct damon_ctx *ctx)
+{
+	/*
+	 * Total seen by the previous call that did split.
+	 * NOTE(review): function-local static, so the value is shared by every
+	 * context that calls this function.
+	 */
+	static unsigned int last_nr_regions;
+	struct damon_target *t;
+	unsigned int total = 0;
+	int nr_pieces;
+
+	damon_for_each_target(t, ctx)
+		total += damon_nr_regions(t);
+
+	/* More than half of the allowed regions are in use; do not split */
+	if (total > ctx->max_nr_regions / 2)
+		return;
+
+	/*
+	 * If the total did not change since the last split and is below a third
+	 * of the maximum, split in three: maybe the middle of the region has
+	 * different access frequency.
+	 */
+	nr_pieces = (total == last_nr_regions &&
+			total < ctx->max_nr_regions / 3) ? 3 : 2;
+
+	damon_for_each_target(t, ctx)
+		damon_split_regions_of(ctx, t, nr_pieces);
+
+	last_nr_regions = total;
+}
+
+/*
+ * Tell whether the primitive update interval has passed
+ *
+ * Delegates to damon_check_reset_time_interval() with the context's
+ * '->last_primitive_update' and '->primitive_update_interval'.
+ *
+ * Returns true if it is time to update.
+ */
+static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
+{
+	bool passed;
+
+	passed = damon_check_reset_time_interval(&ctx->last_primitive_update,
+			ctx->primitive_update_interval);
+	return passed;
+}
+
+/*
+ * Check whether current monitoring should be stopped
+ *
+ * The monitoring is stopped when either the user requested to stop, or all
+ * monitoring targets are invalid.
+ *
+ * Returns true if need to stop current monitoring.
+ */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + struct damon_target *t; + bool stop; + + mutex_lock(&ctx->kdamond_lock); + stop = ctx->kdamond_stop; + mutex_unlock(&ctx->kdamond_lock); + if (stop) + return true; + + if (!ctx->primitive.target_valid) + return false; + + damon_for_each_target(t, ctx) { + if (ctx->primitive.target_valid(t)) + return false; + } + + return true; +} + +static void set_kdamond_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_target *t; + struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0; + + mutex_lock(&ctx->kdamond_lock); + pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + mutex_unlock(&ctx->kdamond_lock); + + if (ctx->primitive.init) + ctx->primitive.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + set_kdamond_stop(ctx); + + sz_limit = damon_region_sz_limit(ctx); + + while (!kdamond_need_stop(ctx)) { + if (ctx->primitive.prepare_access_checks) + ctx->primitive.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + set_kdamond_stop(ctx); + + usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + + if (ctx->primitive.check_accesses) + max_nr_accesses = ctx->primitive.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); + if (ctx->callback.after_aggregation && + ctx->callback.after_aggregation(ctx)) + set_kdamond_stop(ctx); + kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); + if (ctx->primitive.reset_aggregated) + ctx->primitive.reset_aggregated(ctx); + } + + if (kdamond_need_update_primitive(ctx)) { + if (ctx->primitive.update) + 
ctx->primitive.update(ctx); + sz_limit = damon_region_sz_limit(ctx); + } + } + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + if (ctx->callback.before_terminate && + ctx->callback.before_terminate(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + mutex_unlock(&damon_lock); + + do_exit(0); +} + +#include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h new file mode 100644 index 000000000000..930e83bceef0 --- /dev/null +++ b/mm/damon/dbgfs-test.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON Debugfs Interface Unit Tests + * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST + +#ifndef _DAMON_DBGFS_TEST_H +#define _DAMON_DBGFS_TEST_H + +#include <kunit/test.h> + +static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +{ + char *question; + unsigned long *answers; + unsigned long expected[] = {12, 35, 46}; + ssize_t nr_integers = 0, i; + + question = "123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "123abc"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "a123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "12 35"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, 
(ssize_t)2, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 abc 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < 2; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = ""; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "\n"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); +} + +static void damon_dbgfs_test_set_targets(struct kunit *test) +{ + struct damon_ctx *ctx = dbgfs_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + char buf[64]; + + /* Make DAMON consider target id as plain number */ + ctx->primitive.target_valid = NULL; + ctx->primitive.cleanup = NULL; + + damon_set_targets(ctx, ids, 3); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); + + damon_set_targets(ctx, (unsigned long []){2}, 1); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_destroy_ctx(ctx); +} + +static struct kunit_case damon_test_cases[] = { 
+ KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_set_targets), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-dbgfs", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_DBGFS_TEST_H */ + +#endif /* CONFIG_DAMON_DBGFS_KUNIT_TEST */ diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 000000000000..faee070977d8 --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,623 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include <linux/damon.h> +#include <linux/debugfs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/page_idle.h> +#include <linux/slab.h> + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; +static DEFINE_MUTEX(damon_dbgfs_lock); + +/* + * Returns a NUL-terminated kernel copy of the user input on success, or an + * ERR_PTR()-encoded negative error code otherwise.
+ */
+static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos)
+{
+	char *kbuf;
+	ssize_t ret;
+
+	/* We do not accept continuous write */
+	if (*ppos)
+		return ERR_PTR(-EINVAL);
+
+	/* One extra byte for the terminating NUL */
+	kbuf = kmalloc(count + 1, GFP_KERNEL);
+	if (!kbuf)
+		return ERR_PTR(-ENOMEM);
+
+	ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count);
+	if (ret != count) {
+		kfree(kbuf);
+		return ERR_PTR(-EIO);
+	}
+	kbuf[ret] = '\0';
+
+	return kbuf;
+}
+
+/*
+ * Read handler of the 'attrs' file
+ *
+ * Prints the sampling interval, aggregation interval, primitive update
+ * interval, and minimum/maximum number of regions of the context as one line.
+ */
+static ssize_t dbgfs_attrs_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	struct damon_ctx *ctx = file->private_data;
+	char kbuf[128];
+	int ret;
+
+	mutex_lock(&ctx->kdamond_lock);
+	ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
+			ctx->sample_interval, ctx->aggr_interval,
+			ctx->primitive_update_interval, ctx->min_nr_regions,
+			ctx->max_nr_regions);
+	mutex_unlock(&ctx->kdamond_lock);
+
+	return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
+}
+
+/*
+ * Write handler of the 'attrs' file
+ *
+ * Expects five unsigned integers in the order printed by dbgfs_attrs_read().
+ *
+ * Returns 'count' on success, -EINVAL if the input cannot be parsed, -EBUSY
+ * if 'ctx->kdamond' is set, or the error of damon_set_attrs().
+ */
+static ssize_t dbgfs_attrs_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct damon_ctx *ctx = file->private_data;
+	unsigned long s, a, r, minr, maxr;
+	char *kbuf;
+	ssize_t ret = count;
+	int err;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
+				&s, &a, &r, &minr, &maxr) != 5) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&ctx->kdamond_lock);
+	if (ctx->kdamond) {
+		ret = -EBUSY;
+		goto unlock_out;
+	}
+
+	err = damon_set_attrs(ctx, s, a, r, minr, maxr);
+	if (err)
+		ret = err;
+unlock_out:
+	mutex_unlock(&ctx->kdamond_lock);
+out:
+	kfree(kbuf);
+	return ret;
+}
+
+/*
+ * Target ids are treated as 'struct pid' pointers only if the context uses
+ * 'damon_va_target_valid' as its target validity callback.
+ */
+static inline bool targetid_is_pid(const struct damon_ctx *ctx)
+{
+	return ctx->primitive.target_valid == damon_va_target_valid;
+}
+
+/*
+ * Print the target ids of 'ctx' into 'buf' as a space-separated line
+ *
+ * Returns the number of written bytes, or -ENOMEM if an id did not fit.
+ */
+static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
+{
+	struct damon_target *t;
+	unsigned long id;
+	int written = 0;
+	int rc;
+
+	damon_for_each_target(t, ctx) {
+		id = t->id;
+		if (targetid_is_pid(ctx))
+			/* Show pid numbers to debugfs users */
+			id = (unsigned long)pid_vnr((struct pid *)id);
+
+		rc = scnprintf(&buf[written], len - written, "%lu ", id);
+		if (!rc)
+			return -ENOMEM;
+		written += rc;
+	}
+	/* Overwrite the trailing space of the last id with the newline */
+	if (written)
+		written -= 1;
+	written += scnprintf(&buf[written], len - written, "\n");
+	return written;
+}
+
+/* Read handler of the 'target_ids' file */
+static ssize_t dbgfs_target_ids_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	struct damon_ctx *ctx = file->private_data;
+	ssize_t len;
+	char ids_buf[320];
+
+	mutex_lock(&ctx->kdamond_lock);
+	len = sprint_target_ids(ctx, ids_buf, sizeof(ids_buf));
+	mutex_unlock(&ctx->kdamond_lock);
+	if (len < 0)
+		return len;
+
+	return simple_read_from_buffer(buf, count, ppos, ids_buf, len);
+}
+
+/*
+ * Converts a string into an array of unsigned long integers
+ *
+ * str		the string to parse
+ * len		number of bytes of 'str' to consider
+ * nr_ids	output; number of integers stored in the returned array
+ *
+ * Parsing stops at the first token that is not a number, at the end of the
+ * input, or after 32 integers.
+ *
+ * Returns an array of unsigned long integers if the conversion succeeds, or
+ * NULL if the array cannot be allocated.  The caller should kfree() it.
+ */
+static unsigned long *str_to_target_ids(const char *str, ssize_t len,
+					ssize_t *nr_ids)
+{
+	unsigned long *ids;
+	const int max_nr_ids = 32;
+	unsigned long id;
+	int pos = 0, parsed, ret;
+
+	*nr_ids = 0;
+	ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL);
+	if (!ids)
+		return NULL;
+	while (*nr_ids < max_nr_ids && pos < len) {
+		ret = sscanf(&str[pos], "%lu%n", &id, &parsed);
+		/*
+		 * '%n' is stored only after '%lu' matched, so 'parsed' must
+		 * not be read when the conversion failed.
+		 */
+		if (ret != 1)
+			break;
+		pos += parsed;
+		ids[*nr_ids] = id;
+		*nr_ids += 1;
+	}
+
+	return ids;
+}
+
+/* Drop the pid references held by the first 'nr_ids' entries of 'ids' */
+static void dbgfs_put_pids(unsigned long *ids, int nr_ids)
+{
+	int i;
+
+	for (i = 0; i < nr_ids; i++)
+		put_pid((struct pid *)ids[i]);
+}
+
+/*
+ * Write handler of the 'target_ids' file
+ *
+ * Parses the ids and, if they are pids, converts each into a referenced
+ * 'struct pid' pointer before handing the array to damon_set_targets().
+ * The pid references are dropped again on every failure path.
+ *
+ * Returns 'count' on success, negative error code otherwise.
+ */
+static ssize_t dbgfs_target_ids_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct damon_ctx *ctx = file->private_data;
+	char *kbuf, *nrs;
+	unsigned long *targets;
+	ssize_t nr_targets;
+	ssize_t ret = count;
+	int i;
+	int err;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	nrs = kbuf;
+
+	targets = str_to_target_ids(nrs, ret, &nr_targets);
+	if (!targets) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (targetid_is_pid(ctx)) {
+		for (i = 0; i < nr_targets; i++) {
+			targets[i] = (unsigned long)find_get_pid(
+					(int)targets[i]);
+			if (!targets[i]) {
+				/* Only the first 'i' entries hold a reference */
+				dbgfs_put_pids(targets, i);
+				ret = -EINVAL;
+				goto free_targets_out;
+			}
+		}
+	}
+
+	mutex_lock(&ctx->kdamond_lock);
+	if (ctx->kdamond) {
+		if (targetid_is_pid(ctx))
+			dbgfs_put_pids(targets, nr_targets);
+		ret = -EBUSY;
+		goto unlock_out;
+	}
+
+	err = damon_set_targets(ctx, targets, nr_targets);
+	if (err) {
+		if (targetid_is_pid(ctx))
+			dbgfs_put_pids(targets, nr_targets);
+		ret = err;
+	}
+
+unlock_out:
+	mutex_unlock(&ctx->kdamond_lock);
+free_targets_out:
+	kfree(targets);
+out:
+	kfree(kbuf);
+	return ret;
+}
+
+/*
+ * Read handler of the 'kdamond_pid' file
+ *
+ * Prints the pid of 'ctx->kdamond', or "none" if it is not set.
+ */
+static ssize_t dbgfs_kdamond_pid_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	struct damon_ctx *ctx = file->private_data;
+	char *kbuf;
+	ssize_t len;
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	mutex_lock(&ctx->kdamond_lock);
+	if (ctx->kdamond)
+		len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid);
+	else
+		len = scnprintf(kbuf, count, "none\n");
+	mutex_unlock(&ctx->kdamond_lock);
+	/* Nothing was printed; return 0 without touching the user buffer */
+	if (!len)
+		goto out;
+	len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+	kfree(kbuf);
+	return len;
+}
+
+/* Hand the context stored in the inode to the file handlers */
+static int damon_dbgfs_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+
+	return nonseekable_open(inode, file);
+}
+
+static const struct file_operations attrs_fops = {
+	.open = damon_dbgfs_open,
+	.read = dbgfs_attrs_read,
+	.write = dbgfs_attrs_write,
+};
+
+static const struct file_operations target_ids_fops = {
+	.open = damon_dbgfs_open,
+	.read = dbgfs_target_ids_read,
+	.write = dbgfs_target_ids_write,
+};
+
+static const struct file_operations kdamond_pid_fops = {
+	.open = damon_dbgfs_open,
+	.read = dbgfs_kdamond_pid_read,
+};
+
+/* Create the per-context files in 'dir', each carrying 'ctx' as its data */
+static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
+{
+	/* 'file_names' and 'fops' are parallel arrays */
+	const char * const file_names[] = {"attrs", "target_ids",
+		"kdamond_pid"};
+	const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops,
+		&kdamond_pid_fops};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(file_names); i++)
+		debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
+}
+
+/*
+ * 'before_terminate' callback of the contexts made by dbgfs_new_ctx()
+ *
+ * If the target ids are pids, puts each pid and destroys the target.
+ *
+ * Always returns 0.
+ */
+static int dbgfs_before_terminate(struct damon_ctx *ctx)
+{
+	struct damon_target *t, *next;
+
+	if (!targetid_is_pid(ctx))
+		return 0;
+
+	damon_for_each_target_safe(t, next, ctx) {
+		put_pid((struct pid *)t->id);
+		damon_destroy_target(t);
+	}
+	return 0;
+}
+
+/*
+ * Allocate a context that uses the virtual address primitives and
+ * dbgfs_before_terminate()
+ *
+ * Returns the new context, or NULL if damon_new_ctx() fails.
+ */
+static struct damon_ctx *dbgfs_new_ctx(void)
+{
+	struct damon_ctx *ctx;
+
+	ctx = damon_new_ctx();
+	if (!ctx)
+		return NULL;
+
+	damon_va_set_primitives(ctx);
+	ctx->callback.before_terminate = dbgfs_before_terminate;
+	return ctx;
+}
+
+/* Counterpart of dbgfs_new_ctx() */
+static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
+{
+	damon_destroy_ctx(ctx);
+}
+
+/*
+ * Make a context of @name and create a debugfs directory for it.
+ *
+ * This function should be called while holding damon_dbgfs_lock.
+ *
+ * Returns 0 on success, negative error code otherwise.
+ */
+static int dbgfs_mk_context(char *name)
+{
+	struct dentry *root, **new_dirs, *new_dir;
+	struct damon_ctx **new_ctxs, *new_ctx;
+
+	if (damon_nr_running_ctxs())
+		return -EBUSY;
+
+	/*
+	 * Grow both arrays by one slot.  The grown arrays are kept even if a
+	 * later step fails; 'dbgfs_nr_ctxs' is increased only on success.
+	 */
+	new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) *
+			(dbgfs_nr_ctxs + 1), GFP_KERNEL);
+	if (!new_ctxs)
+		return -ENOMEM;
+	dbgfs_ctxs = new_ctxs;
+
+	new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) *
+			(dbgfs_nr_ctxs + 1), GFP_KERNEL);
+	if (!new_dirs)
+		return -ENOMEM;
+	dbgfs_dirs = new_dirs;
+
+	root = dbgfs_dirs[0];
+	if (!root)
+		return -ENOENT;
+
+	new_dir = debugfs_create_dir(name, root);
+	/*
+	 * debugfs_create_dir() reports failures, e.g. an already existing
+	 * name, as an ERR_PTR().  Do not register such a value as a directory.
+	 */
+	if (IS_ERR(new_dir))
+		return PTR_ERR(new_dir);
+	dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
+
+	new_ctx = dbgfs_new_ctx();
+	if (!new_ctx) {
+		debugfs_remove(new_dir);
+		dbgfs_dirs[dbgfs_nr_ctxs] = NULL;
+		return -ENOMEM;
+	}
+
+	dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx;
+	dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs],
+			dbgfs_ctxs[dbgfs_nr_ctxs]);
+	dbgfs_nr_ctxs++;
+
+	return 0;
+}
+
+/*
+ * Write handler of the 'mk_contexts' file
+ *
+ * Takes the first white space delimited word of the input as the name of the
+ * context to make.
+ *
+ * Returns 'count' on success, negative error code otherwise.
+ */
+static ssize_t dbgfs_mk_context_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	char *kbuf;
+	char *ctx_name;
+	ssize_t ret = count;
+	int err;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+	/* Same size as 'kbuf', so the '%s' below cannot overflow it */
+	ctx_name = kmalloc(count + 1, GFP_KERNEL);
+	if (!ctx_name) {
+		kfree(kbuf);
+		return -ENOMEM;
+	}
+
+	/* Trim white space */
+	if (sscanf(kbuf, "%s", ctx_name) != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&damon_dbgfs_lock);
+	err = dbgfs_mk_context(ctx_name);
+	if (err)
+		ret = err;
+	mutex_unlock(&damon_dbgfs_lock);
+
+out:
+	kfree(kbuf);
+	kfree(ctx_name);
+	return ret;
+}
+
+/*
+ * Remove a context of @name and its debugfs directory.
+ *
+ * This function should be called while holding damon_dbgfs_lock.
+ *
+ * Return 0 on success, negative error code otherwise.
+ */
+static int dbgfs_rm_context(char *name)
+{
+	struct dentry *root, *dir, **new_dirs;
+	struct damon_ctx **new_ctxs;
+	int i, j;
+	int ret = 0;
+
+	if (damon_nr_running_ctxs())
+		return -EBUSY;
+
+	root = dbgfs_dirs[0];
+	if (!root)
+		return -ENOENT;
+
+	/* On success this takes a reference that is dropped at 'out_dput' */
+	dir = debugfs_lookup(name, root);
+	if (!dir)
+		return -ENOENT;
+
+	/*
+	 * The name may resolve to an entry that is not a context directory,
+	 * e.g. one of the files under the root.  Removing nothing while
+	 * shrinking the arrays would overflow them in the copy loop below.
+	 */
+	for (i = 0; i < dbgfs_nr_ctxs; i++) {
+		if (dbgfs_dirs[i] == dir)
+			break;
+	}
+	if (i == dbgfs_nr_ctxs) {
+		ret = -EINVAL;
+		goto out_dput;
+	}
+
+	new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
+			GFP_KERNEL);
+	if (!new_dirs) {
+		ret = -ENOMEM;
+		goto out_dput;
+	}
+
+	new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
+			GFP_KERNEL);
+	if (!new_ctxs) {
+		ret = -ENOMEM;
+		goto out_new_dirs;
+	}
+
+	/* Copy every entry except the one being removed */
+	for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
+		if (dbgfs_dirs[i] == dir) {
+			debugfs_remove(dbgfs_dirs[i]);
+			dbgfs_destroy_ctx(dbgfs_ctxs[i]);
+			continue;
+		}
+		new_dirs[j] = dbgfs_dirs[i];
+		new_ctxs[j++] = dbgfs_ctxs[i];
+	}
+
+	kfree(dbgfs_dirs);
+	kfree(dbgfs_ctxs);
+
+	dbgfs_dirs = new_dirs;
+	dbgfs_ctxs = new_ctxs;
+	dbgfs_nr_ctxs--;
+
+	goto out_dput;
+
+out_new_dirs:
+	kfree(new_dirs);
+out_dput:
+	dput(dir);
+	return ret;
+}
+
+/*
+ * Write handler of the 'rm_contexts' file
+ *
+ * Takes the first white space delimited word of the input as the name of the
+ * context to remove.
+ *
+ * Returns 'count' on success, negative error code otherwise.
+ */
+static ssize_t dbgfs_rm_context_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	char *kbuf;
+	ssize_t ret = count;
+	int err;
+	char *ctx_name;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+	/* Same size as 'kbuf', so the '%s' below cannot overflow it */
+	ctx_name = kmalloc(count + 1, GFP_KERNEL);
+	if (!ctx_name) {
+		kfree(kbuf);
+		return -ENOMEM;
+	}
+
+	/* Trim white space */
+	if (sscanf(kbuf, "%s", ctx_name) != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&damon_dbgfs_lock);
+	err = dbgfs_rm_context(ctx_name);
+	if (err)
+		ret = err;
+	mutex_unlock(&damon_dbgfs_lock);
+
+out:
+	kfree(kbuf);
+	kfree(ctx_name);
+	return ret;
+}
+
+/*
+ * Read handler of the 'monitor_on' file
+ *
+ * Prints "on" if damon_nr_running_ctxs() is not zero, "off" otherwise.
+ */
+static ssize_t dbgfs_monitor_on_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	char monitor_on_buf[5];
+	bool monitor_on = damon_nr_running_ctxs() != 0;
+	int len;
+
+	len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n");
+
+	return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len);
+}
+
+/*
+ * Write handler of the 'monitor_on' file
+ *
+ * "on" starts and "off" stops all contexts in 'dbgfs_ctxs'.
+ *
+ * Returns 'count' on success, negative error code otherwise.
+ */
+static ssize_t dbgfs_monitor_on_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	ssize_t ret = count;
+	char *kbuf;
+	int err;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	/* Remove white space */
+	if (sscanf(kbuf, "%s", kbuf) != 1) {
+		kfree(kbuf);
+		return -EINVAL;
+	}
+
+	/*
+	 * NOTE(review): at most 'count' bytes are compared, so a shorter
+	 * input that is a prefix of the keyword (e.g. "o") matches as well.
+	 */
+	if (!strncmp(kbuf, "on", count))
+		err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
+	else if (!strncmp(kbuf, "off", count))
+		err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
+	else
+		err = -EINVAL;
+
+	if (err)
+		ret = err;
+	kfree(kbuf);
+	return ret;
+}
+
+static const struct file_operations mk_contexts_fops = {
+	.write = dbgfs_mk_context_write,
+};
+
+static const struct file_operations rm_contexts_fops = {
+	.write = dbgfs_rm_context_write,
+};
+
+static const struct file_operations monitor_on_fops = {
+	.read = dbgfs_monitor_on_read,
+	.write = dbgfs_monitor_on_write,
+};
+
+/*
+ * Create the 'damon' debugfs directory and its files
+ *
+ * The directory becomes 'dbgfs_dirs[0]' and also holds the files of the first
+ * context, 'dbgfs_ctxs[0]'.
+ *
+ * Returns 0 on success, -ENOMEM if 'dbgfs_dirs' cannot be allocated.
+ */
+static int __init __damon_dbgfs_init(void)
+{
+	struct dentry *dbgfs_root;
+	/* 'file_names' and 'fops' are parallel arrays */
+	const char * const file_names[] = {"mk_contexts", "rm_contexts",
+		"monitor_on"};
+	const struct file_operations *fops[] = {&mk_contexts_fops,
+		&rm_contexts_fops, &monitor_on_fops};
+	int i;
+
+	dbgfs_root = debugfs_create_dir("damon", NULL);
+
+	for (i = 0; i < ARRAY_SIZE(file_names); i++)
+		debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL,
+				fops[i]);
+	dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
+
+	dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL);
+	if (!dbgfs_dirs) {
+		debugfs_remove(dbgfs_root);
+		return -ENOMEM;
+	}
+	dbgfs_dirs[0] = dbgfs_root;
+
+	return 0;
+}
+
+/*
+ * Functions for the initialization
+ */
+
+/*
+ * Allocate the first context and set up the debugfs files
+ *
+ * Returns 0 on success, negative error code otherwise.
+ */
+static int __init damon_dbgfs_init(void)
+{
+	int rc;
+
+	dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL);
+	if (!dbgfs_ctxs)
+		return -ENOMEM;
+	dbgfs_ctxs[0] = dbgfs_new_ctx();
+	if (!dbgfs_ctxs[0]) {
+		kfree(dbgfs_ctxs);
+		return -ENOMEM;
+	}
+	dbgfs_nr_ctxs = 1;
+
+	rc = __damon_dbgfs_init();
+	if (rc) {
+		/* Release the context with the counterpart of dbgfs_new_ctx() */
+		dbgfs_destroy_ctx(dbgfs_ctxs[0]);
+		kfree(dbgfs_ctxs);
+		pr_err("%s: dbgfs init failed\n", __func__);
+	}
+
+	return rc;
+}
+
+module_init(damon_dbgfs_init);
+
+#include "dbgfs-test.h"
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
new file mode 100644
index 000000000000..1f5c13257dba
--- /dev/null
+++ b/mm/damon/vaddr-test.h
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Access Monitor Unit Tests
+ *
+ * Copyright 2019 Amazon.com, Inc. or its affiliates.  All rights reserved.
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
+
+#ifndef _DAMON_VADDR_TEST_H
+#define _DAMON_VADDR_TEST_H
+
+#include <kunit/test.h>
+
+/*
+ * Chain an array of vmas for the tests
+ *
+ * Links the entries through '->vm_next' and '->vm_rb.rb_right' in array order
+ * and sets '->rb_subtree_gap' of each entry to the largest gap between
+ * neighboring vmas from that entry on.  The last entry gets NULL links and a
+ * gap of 0.
+ */
+static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
+{
+	int i, j;
+	unsigned long largest_gap, gap;
+
+	if (!nr_vmas)
+		return;
+
+	for (i = 0; i < nr_vmas - 1; i++) {
+		vmas[i].vm_next = &vmas[i + 1];
+
+		vmas[i].vm_rb.rb_left = NULL;
+		vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb;
+
+		largest_gap = 0;
+		for (j = i; j < nr_vmas; j++) {
+			/* The first vma has no predecessor to measure against */
+			if (j == 0)
+				continue;
+			gap = vmas[j].vm_start - vmas[j - 1].vm_end;
+			if (gap > largest_gap)
+				largest_gap = gap;
+		}
+		vmas[i].rb_subtree_gap = largest_gap;
+	}
+	/* 'i' is the index of the last vma here */
+	vmas[i].vm_next = NULL;
+	vmas[i].vm_rb.rb_right = NULL;
+	vmas[i].rb_subtree_gap = 0;
+}
+
+/*
+ * Test __damon_va_three_regions() function
+ *
+ * In case of virtual memory address spaces monitoring, DAMON converts the
+ * complex and dynamic memory mappings of each target task to three
+ * discontiguous regions which cover every mapped areas.  However, the three
+ * regions should not include the two biggest unmapped areas in the original
+ * mapping, because the two biggest areas are normally the areas between 1)
+ * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack.
+ * Because these two unmapped areas are very huge but obviously never accessed, + * covering the region is just a waste. + * + * '__damon_va_three_regions()' receives an address space of a process. It + * first identifies the start of mappings, end of mappings, and the two biggest + * unmapped areas. After that, based on the information, it constructs the + * three regions and returns. For more detail, refer to the comment of + * 'damon_init_regions_of()' function definition in 'mm/damon.c' file. + * + * For example, suppose virtual address ranges of 10-20, 20-25, 200-210, + * 210-220, 300-305, and 307-330 (Other comments represent these mappings in + * a shorter form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are + * mapped. To cover every mapping, the three regions should start with 10, + * and end with 330. The process also has three unmapped areas, 25-200, + * 220-300, and 305-307. Among those, 25-200 and 220-300 are the biggest two + * unmapped areas, and thus it should be converted to three regions of 10-25, + * 200-220, and 300-330.
+ */ +static void damon_test_three_regions_in_vmas(struct kunit *test) +{ + struct damon_addr_range regions[3] = {0,}; + /* 10-20-25, 200-210-220, 300-305, 307-330 */ + struct vm_area_struct vmas[] = { + (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, + (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, + (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, + (struct vm_area_struct) {.vm_start = 210, .vm_end = 220}, + (struct vm_area_struct) {.vm_start = 300, .vm_end = 305}, + (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, + }; + + __link_vmas(vmas, 6); + + __damon_va_three_regions(&vmas[0], regions); + + KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); + KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); + KUNIT_EXPECT_EQ(test, 200ul, regions[1].start); + KUNIT_EXPECT_EQ(test, 220ul, regions[1].end); + KUNIT_EXPECT_EQ(test, 300ul, regions[2].start); + KUNIT_EXPECT_EQ(test, 330ul, regions[2].end); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +/* + * Test 'damon_va_apply_three_regions()' + * + * test kunit object + * regions an array containing start/end addresses of current + * monitoring target regions + * nr_regions the number of the addresses in 'regions' + * three_regions The three regions that need to be applied now + * expected start/end addresses of monitoring target regions that + * 'three_regions' are applied + * nr_expected the number of addresses in 'expected' + * + * The memory mapping of the target processes changes dynamically. To follow + * the change, DAMON periodically reads the mappings, simplifies it to the + * three regions, and updates the monitoring target regions to fit in the three + * regions. The update of current target regions is the role of + * 'damon_va_apply_three_regions()'. 
+ * + * This test passes the given target regions and the new three regions that + * need to be applied to the function and check whether it updates the regions + * as expected. + */ +static void damon_do_test_apply_three_regions(struct kunit *test, + unsigned long *regions, int nr_regions, + struct damon_addr_range *three_regions, + unsigned long *expected, int nr_expected) +{ + struct damon_ctx *ctx = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(42); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + damon_add_region(r, t); + } + damon_add_target(ctx, t); + + damon_va_apply_three_regions(t, three_regions); + + for (i = 0; i < nr_expected / 2; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); + KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); + } + + damon_destroy_ctx(ctx); +} + +/* + * This function test most common case where the three big regions are only + * slightly changed. Target regions should adjust their boundary (10-20-30, + * 50-55, 70-80, 90-100) to fit with the new big regions or remove target + * regions (57-79) that now out of the three regions. + */ +static void damon_test_apply_three_regions1(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 45-55, 73-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 45, .end = 55}, + (struct damon_addr_range){.start = 73, .end = 104} }; + /* 5-20-27, 45-55, 73-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 45, 55, + 73, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test slightly bigger change. 
Similar to above, but the second big region + * now require two target regions (50-55, 57-59) to be removed. + */ +static void damon_test_apply_three_regions2(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 56-57, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 56, .end = 57}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 56-57, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 56, 57, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test a big change. The second big region has totally freed and mapped to + * different area (50-59 -> 61-63). The target regions which were in the old + * second big region (50-55-57-59) should be removed and new target region + * covering the second big region (61-63) should be created. + */ +static void damon_test_apply_three_regions3(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 61-63, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 61, .end = 63}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 61-63, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 61, 63, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test another big change. Both of the second and third big regions (50-59 + * and 70-100) has totally freed and mapped to different area (30-32 and + * 65-68). 
+ * 65-68). The target regions which were in the old second and third big + * regions should now be removed and new target regions covering the new second + * and third big regions should be created. + */ +static void damon_test_apply_three_regions4(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-7, 30-32, 65-68 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 7}, + (struct damon_addr_range){.start = 30, .end = 32}, + (struct damon_addr_range){.start = 65, .end = 68} }; + /* expect 5-7, 30-32, 65-68 */ + unsigned long expected[] = {5, 7, 30, 32, 65, 68}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + unsigned long i; + + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + t = damon_new_target(42); + r = damon_new_region(0, 100); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u); + + i = 0; + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10); + KUNIT_EXPECT_EQ(test, r->ar.end, i * 10); + } + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 59); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + + i = 0; + damon_for_each_region(r, t) { + if (i == 4) + break; + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++); + KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i); + } + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i); +
KUNIT_EXPECT_EQ(test, r->ar.end, 59ul); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 6); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); + + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, 5ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 6ul); + } + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_three_regions_in_vmas), + KUNIT_CASE(damon_test_apply_three_regions1), + KUNIT_CASE(damon_test_apply_three_regions2), + KUNIT_CASE(damon_test_apply_three_regions3), + KUNIT_CASE(damon_test_apply_three_regions4), + KUNIT_CASE(damon_test_split_evenly), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-primitives", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_VADDR_TEST_H */ + +#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 000000000000..58c1fb2aafa9 --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include <linux/damon.h> +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <linux/mmu_notifier.h> +#include <linux/highmem.h> +#include <linux/page_idle.h> +#include <linux/pagewalk.h> +#include <linux/random.h> +#include <linux/sched/mm.h> +#include <linux/slab.h> + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +/* + * 't->id' should be the pointer to the relevant 'struct pid' having reference + * count. Caller must put the returned task, unless it is NULL. 
+ */ +#define damon_get_task_struct(t) \ + (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. + * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = r->ar.end - r->ar.start; + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +static void swap_ranges(struct damon_addr_range *r1, + struct damon_addr_range *r2) +{ + struct damon_addr_range tmp; + + tmp = *r1; + *r1 = *r2; + *r2 = tmp; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address 
ranges where the results will be saved + * + * This function receives an address space and finds three regions in it which + * are separated by the two biggest unmapped regions in the space. Please refer to + * the comments of the '__damon_va_init_regions()' function below to know why this is + * necessary. + * + * Returns 0 on success, or negative error code otherwise. + */ +static int __damon_va_three_regions(struct vm_area_struct *vma, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; + struct vm_area_struct *last_vma = NULL; + unsigned long start = 0; + struct rb_root rbroot; + + /* Find two biggest gaps so that first_gap > second_gap > others */ + for (; vma; vma = vma->vm_next) { + if (!last_vma) { + start = vma->vm_start; + goto next; + } + + if (vma->rb_subtree_gap <= sz_range(&second_gap)) { + rbroot.rb_node = &vma->vm_rb; + vma = rb_entry(rb_last(&rbroot), + struct vm_area_struct, vm_rb); + goto next; + } + + gap.start = last_vma->vm_end; + gap.end = vma->vm_start; + if (sz_range(&gap) > sz_range(&second_gap)) { + swap_ranges(&gap, &second_gap); + if (sz_range(&second_gap) > sz_range(&first_gap)) + swap_ranges(&second_gap, &first_gap); + } +next: + last_vma = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap_ranges(&first_gap, &second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise.
+ */ +static int damon_va_three_regions(struct damon_target *t, + struct damon_addr_range regions[3]) +{ + struct mm_struct *mm; + int rc; + + mm = damon_get_mm(t); + if (!mm) + return -EINVAL; + + mmap_read_lock(mm); + rc = __damon_va_three_regions(mm->mmap, regions); + mmap_read_unlock(mm); + + mmput(mm); + return rc; +} + +/* + * Initialize the monitoring target regions for the given target (task) + * + * t the given target + * + * Because only a number of small portions of the entire address space + * is actually mapped to the memory and accessed, monitoring the unmapped + * regions is wasteful. That said, because we can deal with small noises, + * tracking every mapping is not strictly required but could even incur a high + * overhead if the mapping frequently changes or the number of mappings is + * high. The adaptive regions adjustment mechanism will further help to deal + * with the noise by simply identifying the unmapped areas as a region that + * has no access. Moreover, applying the real mappings that would have many + * unmapped areas inside will make the adaptive mechanism quite complex. That + * said, excessively large unmapped areas inside the monitoring target should be removed + * so that the adaptive mechanism does not waste time on them. + * + * For this reason, we convert the complex mappings to three distinct regions + * that cover every mapped area of the address space. Also the two gaps + * between the three regions are the two biggest unmapped areas in the given + * address space. In detail, this function first identifies the start and the + * end of the mappings and the two biggest unmapped areas of the address space.
+ * Then, it constructs the three regions as below: + * + * [mappings[0]->start, big_two_unmapped_areas[0]->start) + * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) + * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) + * + * As usual memory map of processes is as below, the gap between the heap and + * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed + * region and the stack will be two biggest unmapped regions. Because these + * gaps are exceptionally huge areas in usual address space, excluding these + * two biggest unmapped regions will be sufficient to make a trade-off. + * + * <heap> + * <BIG UNMAPPED REGION 1> + * <uppermost mmap()-ed region> + * (other mmap()-ed regions and small unmapped regions) + * <lowermost mmap()-ed region> + * <BIG UNMAPPED REGION 2> + * <stack> + */ +static void __damon_va_init_regions(struct damon_ctx *ctx, + struct damon_target *t) +{ + struct damon_region *r; + struct damon_addr_range regions[3]; + unsigned long sz = 0, nr_pieces; + int i; + + if (damon_va_three_regions(t, regions)) { + pr_err("Failed to get three regions of target %lu\n", t->id); + return; + } + + for (i = 0; i < 3; i++) + sz += regions[i].end - regions[i].start; + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + /* Set the initial three regions of the target */ + for (i = 0; i < 3; i++) { + r = damon_new_region(regions[i].start, regions[i].end); + if (!r) { + pr_err("%d'th init region creation failed\n", i); + return; + } + damon_add_region(r, t); + + nr_pieces = (regions[i].end - regions[i].start) / sz; + damon_va_evenly_split_region(t, r, nr_pieces); + } +} + +/* Initialize '->regions_list' of every target (task) */ +void damon_va_init(struct damon_ctx *ctx) +{ + struct damon_target *t; + + damon_for_each_target(t, ctx) { + /* the user may set the target regions as they want */ + if (!damon_nr_regions(t)) + 
__damon_va_init_regions(ctx, t); + } +} + +/* + * Functions for the dynamic monitoring target regions update + */ + +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + struct damon_region *r, *next; + unsigned int i = 0; + + /* Remove regions which are not in the three big regions now */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < 3; i++) { + if (damon_intersect(r, &bregions[i])) + break; + } + if (i == 3) + damon_destroy_region(r, t); + } + + /* Adjust intersecting regions to fit with the three big regions */ + for (i = 0; i < 3; i++) { + struct damon_region *first = NULL, *last; + struct damon_region *newr; + struct damon_addr_range *br; + + br = &bregions[i]; + /* Get the first and last regions which intersects with br */ + damon_for_each_region(r, t) { + if (damon_intersect(r, br)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= br->end) + break; + } + if (!first) { + /* no damon_region intersects with this big region */ + newr = damon_new_region( + ALIGN_DOWN(br->start, + DAMON_MIN_REGION), + ALIGN(br->end, DAMON_MIN_REGION)); + if (!newr) + continue; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + first->ar.start = ALIGN_DOWN(br->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + } + } +} + +/* + * Update regions for current memory mappings + */ +void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if 
(damon_va_three_regions(t, three_regions)) + continue; + damon_va_apply_three_regions(t, three_regions); + } +} + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. + */ +static struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, + unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, + unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_huge(*pmd)) { + damon_pmdp_mkold(pmd, 
walk->mm, addr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + damon_ptep_mkold(pte, walk->mm, addr); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, +}; + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); + mmap_read_unlock(mm); +} + +/* + * Functions for the access checking of the regions + */ + +static void damon_va_prepare_access_check(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + damon_va_prepare_access_check(ctx, mm, r); + mmput(mm); + } +} + +struct damon_young_walk_private { + unsigned long *page_sz; + bool young; +}; + +static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + struct page *page; + struct damon_young_walk_private *priv = walk->private; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_huge(*pmd)) { + spin_unlock(ptl); + goto regular_page; + } + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto huge_out; + if (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, + addr)) { + *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + priv->young = true; + } + put_page(page); +huge_out: + spin_unlock(ptl); + return 0; + } + +regular_page: 
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return -EINVAL; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + page = damon_get_page(pte_pfn(*pte)); + if (!page) + goto out; + if (pte_young(*pte) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = PAGE_SIZE; + priv->young = true; + } + put_page(page); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, +}; + +static bool damon_va_young(struct mm_struct *mm, unsigned long addr, + unsigned long *page_sz) +{ + struct damon_young_walk_private arg = { + .page_sz = page_sz, + .young = false, + }; + + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); + mmap_read_unlock(mm); + return arg.young; +} + +/* + * Check whether the region was accessed after the last preparation + * + * mm 'mm_struct' for the given virtual address space + * r the region to be checked + */ +static void damon_va_check_access(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + static struct mm_struct *last_mm; + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_mm = mm; + last_addr = r->sampling_addr; +} + +unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + 
damon_for_each_region(r, t) { + damon_va_check_access(ctx, mm, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +bool damon_va_target_valid(void *target) +{ + struct damon_target *t = target; + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +void damon_va_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = damon_va_init; + ctx->primitive.update = damon_va_update; + ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; + ctx->primitive.check_accesses = damon_va_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_va_target_valid; + ctx->primitive.cleanup = NULL; +} + +#include "vaddr-test.h" diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1c922691aa61..1403639302e4 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -29,6 +29,8 @@ #include <linux/start_kernel.h> #include <linux/sched/mm.h> #include <linux/io.h> + +#include <asm/cacheflush.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -58,10 +60,41 @@ #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) -static void __init pte_basic_tests(unsigned long pfn, int idx) +struct pgtable_debug_args { + struct mm_struct *mm; + struct vm_area_struct *vma; + + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + p4d_t *start_p4dp; + pud_t *start_pudp; + pmd_t *start_pmdp; + pgtable_t start_ptep; + + unsigned long vaddr; + pgprot_t page_prot; + pgprot_t page_prot_none; + + bool is_contiguous_page; + unsigned long pud_pfn; + unsigned long pmd_pfn; + unsigned long pte_pfn; + + unsigned long fixed_pgd_pfn; + unsigned long fixed_p4d_pfn; + unsigned long fixed_pud_pfn; + unsigned long fixed_pmd_pfn; + unsigned 
long fixed_pte_pfn; +}; + +static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, prot); unsigned long val = idx, *ptr = &val; pr_debug("Validating PTE basic (%pGv)\n", ptr); @@ -86,53 +119,63 @@ static void __init pte_basic_tests(unsigned long pfn, int idx) WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); } -static void __init pte_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pte_t *ptep, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pte_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; pte_t pte; /* * Architectures optimize set_pte_at by avoiding TLB flush. * This requires set_pte_at to be not used to update an * existing pte entry. Clear pte before we do set_pte_at + * + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. */ + page = (args->pte_pfn != ULONG_MAX) ? 
pfn_to_page(args->pte_pfn) : NULL; + if (!page) + return; pr_debug("Validating PTE advanced\n"); - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); - ptep_set_wrprotect(mm, vaddr, ptep); - pte = ptep_get(ptep); + pte = pfn_pte(args->pte_pfn, args->page_prot); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); + ptep_set_wrprotect(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(pte_write(pte)); - ptep_get_and_clear(mm, vaddr, ptep); - pte = ptep_get(ptep); + ptep_get_and_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_wrprotect(pte); pte = pte_mkclean(pte); - set_pte_at(mm, vaddr, ptep, pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); pte = pte_mkwrite(pte); pte = pte_mkdirty(pte); - ptep_set_access_flags(vma, vaddr, ptep, pte, 1); - pte = ptep_get(ptep); + ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); + pte = ptep_get(args->ptep); WARN_ON(!(pte_write(pte) && pte_dirty(pte))); - ptep_get_and_clear_full(mm, vaddr, ptep, 1); - pte = ptep_get(ptep); + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_mkyoung(pte); - set_pte_at(mm, vaddr, ptep, pte); - ptep_test_and_clear_young(vma, vaddr, ptep); - pte = ptep_get(ptep); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); + ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); } -static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, 
args->page_prot_none); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -143,7 +186,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_basic_tests(unsigned long pfn, int idx) +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; unsigned long val = idx, *ptr = &val; @@ -153,7 +196,7 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx) return; pr_debug("Validating PMD basic (%pGv)\n", ptr); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, prot); /* * This test needs to be executed after the given page table entry @@ -181,57 +224,70 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx) WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); } -static void __init pmd_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pmd_t *pmdp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot, pgtable_t pgtable) +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; pmd_t pmd; + unsigned long vaddr = args->vaddr; if (!has_transparent_hugepage()) return; + page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL; + if (!page) + return; + + /* + * flush_dcache_page() is called after set_pmd_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. 
+ */ pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ vaddr &= HPAGE_PMD_MASK; - pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep); - pmd = pfn_pmd(pfn, prot); - set_pmd_at(mm, vaddr, pmdp, pmd); - pmdp_set_wrprotect(mm, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); + pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_write(pmd)); - pmdp_huge_get_and_clear(mm, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); - set_pmd_at(mm, vaddr, pmdp, pmd); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmd = pmd_mkwrite(pmd); pmd = pmd_mkdirty(pmd); - pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1); - pmd = READ_ONCE(*pmdp); + pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); - pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1); - pmd = READ_ONCE(*pmdp); + pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); pmd = pmd_mkyoung(pmd); - set_pmd_at(mm, vaddr, pmdp, pmd); - pmdp_test_and_clear_young(vma, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); + pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_young(pmd)); /* Clear the pte entries */ - pmdp_huge_get_and_clear(mm, vaddr, pmdp); - pgtable = 
pgtable_trans_huge_withdraw(mm, pmdp); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pgtable_trans_huge_withdraw(args->mm, args->pmdp); } -static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -239,7 +295,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD leaf\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); /* * PMD based THP is a leaf entry. @@ -248,7 +304,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -259,13 +315,13 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; unsigned long val = idx, *ptr = &val; @@ -275,7 +331,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int return; pr_debug("Validating PUD basic (%pGv)\n", ptr); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, prot); /* * This test needs to be executed after the given page table entry @@ -296,7 +352,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud)))); WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud)))); 
- if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; /* @@ -306,58 +362,71 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int WARN_ON(!pud_bad(pud_mkhuge(pud))); } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; + unsigned long vaddr = args->vaddr; pud_t pud; if (!has_transparent_hugepage()) return; + page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL; + if (!page) + return; + + /* + * flush_dcache_page() is called after set_pud_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ vaddr &= HPAGE_PUD_MASK; - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); - pudp_set_wrprotect(mm, vaddr, pudp); - pud = READ_ONCE(*pudp); + pud = pfn_pud(args->pud_pfn, args->page_prot); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); + pudp_set_wrprotect(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear(mm, vaddr, pudp); - pud = READ_ONCE(*pudp); + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); - set_pud_at(mm, vaddr, pudp, pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); - 
pudp_set_access_flags(vma, vaddr, pudp, pud, 1); - pud = READ_ONCE(*pudp); + pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); + pud = READ_ONCE(*args->pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); - pud = READ_ONCE(*pudp); + pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_mkyoung(pud); - set_pud_at(mm, vaddr, pudp, pud); - pudp_test_and_clear_young(vma, vaddr, pudp); - pud = READ_ONCE(*pudp); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); + pudp_test_and_clear_young(args->vma, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_young(pud)); - pudp_huge_get_and_clear(mm, vaddr, pudp); + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -365,7 +434,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD leaf\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); /* * PUD based THP is a leaf entry. 
*/ @@ -373,41 +442,26 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pud_leaf(pud)); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) -{ -} -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_basic_tests(unsigned long pfn, int idx) { } -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } -static void __init pmd_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pmd_t *pmdp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot, pgtable_t pgtable) -{ -} -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) -{ -} -static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } +static void __init 
pud_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!arch_vmap_pmd_supported(prot)) + if (!arch_vmap_pmd_supported(args->page_prot)) return; pr_debug("Validating PMD huge\n"); @@ -415,18 +469,18 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ - WRITE_ONCE(*pmdp, __pmd(0)); - WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pmd_clear_huge(pmdp)); - pmd = READ_ONCE(*pmdp); + WRITE_ONCE(*args->pmdp, __pmd(0)); + WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); + WARN_ON(!pmd_clear_huge(args->pmdp)); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) +static void __init pud_huge_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!arch_vmap_pud_supported(prot)) + if (!arch_vmap_pud_supported(args->page_prot)) return; pr_debug("Validating PUD huge\n"); @@ -434,18 +488,18 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. 
*/ - WRITE_ONCE(*pudp, __pud(0)); - WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pud_clear_huge(pudp)); - pud = READ_ONCE(*pudp); + WRITE_ONCE(*args->pudp, __pud(0)); + WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); + WARN_ON(!pud_clear_huge(args->pudp)); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { } +static void __init pud_huge_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init p4d_basic_tests(struct pgtable_debug_args *args) { p4d_t p4d; @@ -454,7 +508,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!p4d_same(p4d, p4d)); } -static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pgd_basic_tests(struct pgtable_debug_args *args) { pgd_t pgd; @@ -464,27 +518,26 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) } #ifndef __PAGETABLE_PUD_FOLDED -static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +static void __init pud_clear_tests(struct pgtable_debug_args *args) { - pud_t pud = READ_ONCE(*pudp); + pud_t pud = READ_ONCE(*args->pudp); - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; pr_debug("Validating PUD clear\n"); pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*pudp, pud); - pud_clear(pudp); - pud = READ_ONCE(*pudp); + WRITE_ONCE(*args->pudp, pud); + pud_clear(args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); } -static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, - pmd_t *pmdp) +static void __init 
pud_populate_tests(struct pgtable_debug_args *args) { pud_t pud; - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; pr_debug("Validating PUD populate\n"); @@ -492,40 +545,36 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pud_populate(mm, pudp, pmdp); - pud = READ_ONCE(*pudp); + pud_populate(args->mm, args->pudp, args->start_pmdp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_bad(pud)); } #else /* !__PAGETABLE_PUD_FOLDED */ -static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } -static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, - pmd_t *pmdp) -{ -} +static void __init pud_clear_tests(struct pgtable_debug_args *args) { } +static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_P4D_FOLDED -static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { - p4d_t p4d = READ_ONCE(*p4dp); + p4d_t p4d = READ_ONCE(*args->p4dp); - if (mm_pud_folded(mm)) + if (mm_pud_folded(args->mm)) return; pr_debug("Validating P4D clear\n"); p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*p4dp, p4d); - p4d_clear(p4dp); - p4d = READ_ONCE(*p4dp); + WRITE_ONCE(*args->p4dp, p4d); + p4d_clear(args->p4dp); + p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); } -static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, - pud_t *pudp) +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { p4d_t p4d; - if (mm_pud_folded(mm)) + if (mm_pud_folded(args->mm)) return; pr_debug("Validating P4D populate\n"); @@ -533,34 +582,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, * This entry points to next level page table page. * Hence this must not qualify as p4d_bad(). 
*/ - pud_clear(pudp); - p4d_clear(p4dp); - p4d_populate(mm, p4dp, pudp); - p4d = READ_ONCE(*p4dp); + pud_clear(args->pudp); + p4d_clear(args->p4dp); + p4d_populate(args->mm, args->p4dp, args->start_pudp); + p4d = READ_ONCE(*args->p4dp); WARN_ON(p4d_bad(p4d)); } -static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { - pgd_t pgd = READ_ONCE(*pgdp); + pgd_t pgd = READ_ONCE(*(args->pgdp)); - if (mm_p4d_folded(mm)) + if (mm_p4d_folded(args->mm)) return; pr_debug("Validating PGD clear\n"); pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*pgdp, pgd); - pgd_clear(pgdp); - pgd = READ_ONCE(*pgdp); + WRITE_ONCE(*args->pgdp, pgd); + pgd_clear(args->pgdp); + pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); } -static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, - p4d_t *p4dp) +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { pgd_t pgd; - if (mm_p4d_folded(mm)) + if (mm_p4d_folded(args->mm)) return; pr_debug("Validating PGD populate\n"); @@ -568,56 +616,60 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, * This entry points to next level page table page. * Hence this must not qualify as pgd_bad(). 
*/ - p4d_clear(p4dp); - pgd_clear(pgdp); - pgd_populate(mm, pgdp, p4dp); - pgd = READ_ONCE(*pgdp); + p4d_clear(args->p4dp); + pgd_clear(args->pgdp); + pgd_populate(args->mm, args->pgdp, args->start_p4dp); + pgd = READ_ONCE(*args->pgdp); WARN_ON(pgd_bad(pgd)); } #else /* !__PAGETABLE_P4D_FOLDED */ -static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } -static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } -static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, - pud_t *pudp) -{ -} -static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, - p4d_t *p4dp) -{ -} +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { } +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { } +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { } +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_P4D_FOLDED */ -static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pte_clear_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + struct page *page; + pte_t pte = pfn_pte(args->pte_pfn, args->page_prot); + + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) + return; + /* + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. 
+ */ pr_debug("Validating PTE clear\n"); #ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); #endif - set_pte_at(mm, vaddr, ptep, pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); barrier(); - pte_clear(mm, vaddr, ptep); - pte = ptep_get(ptep); + pte_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } -static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +static void __init pmd_clear_tests(struct pgtable_debug_args *args) { - pmd_t pmd = READ_ONCE(*pmdp); + pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*pmdp, pmd); - pmd_clear(pmdp); - pmd = READ_ONCE(*pmdp); + WRITE_ONCE(*args->pmdp, pmd); + pmd_clear(args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); } -static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) +static void __init pmd_populate_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -626,14 +678,14 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. * Hence this must not qualify as pmd_bad(). 
*/ - pmd_populate(mm, pmdp, pgtable); - pmd = READ_ONCE(*pmdp); + pmd_populate(args->mm, args->pmdp, args->start_ptep); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_bad(pmd)); } -static void __init pte_special_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_special_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) return; @@ -642,9 +694,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pte_special(pte_mkspecial(pte))); } -static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_protnone_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -655,7 +707,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -666,25 +718,25 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD protnone\n"); - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none)); WARN_ON(!pmd_protnone(pmd)); WARN_ON(!pmd_present(pmd)); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, 
args->page_prot); pr_debug("Validating PTE devmap\n"); WARN_ON(!pte_devmap(pte_mkdevmap(pte))); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -692,12 +744,12 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD devmap\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -705,25 +757,25 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD devmap\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); WARN_ON(!pud_devmap(pud_mkdevmap(pud))); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else -static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { } +static void __init 
pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ -static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; @@ -733,9 +785,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte))); } -static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; @@ -746,7 +798,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -757,12 +809,12 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD soft dirty\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd))); WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } -static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -774,31 +826,29 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); 
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) -{ -} +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; pte_t pte; pr_debug("Validating PTE swap\n"); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); swp = __pte_to_swp_entry(pte); pte = __swp_entry_to_pte(swp); - WARN_ON(pfn != pte_pfn(pte)); + WARN_ON(args->fixed_pte_pfn != pte_pfn(pte)); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; pmd_t pmd; @@ -807,16 +857,16 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD swap\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); swp = __pmd_to_swp_entry(pmd); pmd = __swp_entry_to_pmd(swp); - WARN_ON(pfn != pmd_pfn(pmd)); + WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd)); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init swap_migration_tests(void) +static void __init swap_migration_tests(struct pgtable_debug_args *args) { struct page *page; swp_entry_t swp; @@ -824,19 +874,18 @@ static void __init swap_migration_tests(void) if 
(!IS_ENABLED(CONFIG_MIGRATION)) return; - pr_debug("Validating swap migration\n"); /* * swap_migration_tests() requires a dedicated page as it needs to * be locked before creating a migration entry from it. Locking the * page that actually maps kernel text ('start_kernel') can be real - * problematic. Lets allocate a dedicated page explicitly for this - * purpose that will be freed subsequently. + * problematic. Lets use the allocated page explicitly for this + * purpose. */ - page = alloc_page(GFP_KERNEL); - if (!page) { - pr_err("page allocation failed\n"); + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; - } + + pr_debug("Validating swap migration\n"); /* * make_migration_entry() expects given page to be @@ -855,11 +904,10 @@ static void __init swap_migration_tests(void) WARN_ON(!is_migration_entry(swp)); WARN_ON(is_writable_migration_entry(swp)); __ClearPageLocked(page); - __free_page(page); } #ifdef CONFIG_HUGETLB_PAGE -static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { struct page *page; pte_t pte; @@ -869,25 +917,25 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) * Accessing the page associated with the pfn is safe here, * as it was previously derived from a real kernel symbol. 
*/ - page = pfn_to_page(pfn); - pte = mk_huge_pte(page, prot); + page = pfn_to_page(args->fixed_pmd_pfn); + pte = mk_huge_pte(page, args->page_prot); WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pte_huge(pte_mkhuge(pte))); #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ -static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -906,7 +954,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) * needs to return true. pmd_present() should be true whenever * pmd_trans_huge() returns true. 
*/ - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd))); #ifndef __HAVE_ARCH_PMDP_INVALIDATE @@ -916,7 +964,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_thp_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -924,7 +972,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD based THP\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); WARN_ON(!pud_trans_huge(pud_mkhuge(pud))); /* @@ -936,11 +984,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) */ } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static unsigned long __init get_random_vaddr(void) @@ -955,43 +1003,179 @@ static unsigned long __init get_random_vaddr(void) return random_vaddr; } -static int __init debug_vm_pgtable(void) +static void __init destroy_args(struct pgtable_debug_args *args) { - struct vm_area_struct *vma; - struct mm_struct *mm; - pgd_t *pgdp; - p4d_t *p4dp, *saved_p4dp; - pud_t *pudp, *saved_pudp; - pmd_t *pmdp, *saved_pmdp, pmd; - pte_t *ptep; - pgtable_t saved_ptep; - pgprot_t prot, protnone; - phys_addr_t paddr; - unsigned long vaddr, pte_aligned, pmd_aligned; - unsigned long 
pud_aligned, p4d_aligned, pgd_aligned; - spinlock_t *ptl = NULL; - int idx; + struct page *page = NULL; + + /* Free (huge) page */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage() && + args->pud_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pud_pfn, + (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); + } else { + page = pfn_to_page(args->pud_pfn); + __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); + } + + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } - pr_info("Validating architecture page table helpers\n"); - prot = vm_get_page_prot(VMFLAGS); - vaddr = get_random_vaddr(); - mm = mm_alloc(); - if (!mm) { - pr_err("mm_struct allocation failed\n"); - return 1; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage() && + args->pmd_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); + } else { + page = pfn_to_page(args->pmd_pfn); + __free_pages(page, HPAGE_PMD_ORDER); + } + + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; } + if (args->pte_pfn != ULONG_MAX) { + page = pfn_to_page(args->pte_pfn); + __free_pages(page, 0); + + args->pte_pfn = ULONG_MAX; + } + + /* Free page table entries */ + if (args->start_ptep) { + pte_free(args->mm, args->start_ptep); + mm_dec_nr_ptes(args->mm); + } + + if (args->start_pmdp) { + pmd_free(args->mm, args->start_pmdp); + mm_dec_nr_pmds(args->mm); + } + + if (args->start_pudp) { + pud_free(args->mm, args->start_pudp); + mm_dec_nr_puds(args->mm); + } + + if (args->start_p4dp) + p4d_free(args->mm, args->start_p4dp); + + /* Free vma and mm struct */ + if (args->vma) + vm_area_free(args->vma); + + if (args->mm) + mmdrop(args->mm); +} + +static struct page * __init +debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) +{ + struct page *page = NULL; + +#ifdef 
CONFIG_CONTIG_ALLOC + if (order >= MAX_ORDER) { + page = alloc_contig_pages((1 << order), GFP_KERNEL, + first_online_node, NULL); + if (page) { + args->is_contiguous_page = true; + return page; + } + } +#endif + + if (order < MAX_ORDER) + page = alloc_pages(GFP_KERNEL, order); + + return page; +} + +static int __init init_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + phys_addr_t phys; + int ret = 0; + /* + * Initialize the debugging data. + * * __P000 (or even __S000) will help create page table entries with * PROT_NONE permission as required for pxx_protnone_tests(). */ - protnone = __P000; + memset(args, 0, sizeof(*args)); + args->vaddr = get_random_vaddr(); + args->page_prot = vm_get_page_prot(VMFLAGS); + args->page_prot_none = __P000; + args->is_contiguous_page = false; + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + args->fixed_pgd_pfn = ULONG_MAX; + args->fixed_p4d_pfn = ULONG_MAX; + args->fixed_pud_pfn = ULONG_MAX; + args->fixed_pmd_pfn = ULONG_MAX; + args->fixed_pte_pfn = ULONG_MAX; + + /* Allocate mm and vma */ + args->mm = mm_alloc(); + if (!args->mm) { + pr_err("Failed to allocate mm struct\n"); + ret = -ENOMEM; + goto error; + } + + args->vma = vm_area_alloc(args->mm); + if (!args->vma) { + pr_err("Failed to allocate vma\n"); + ret = -ENOMEM; + goto error; + } + + /* + * Allocate page table entries. They will be modified in the tests. + * Lets save the page table entries so that they can be released + * when the tests are completed. 
+ */ + args->pgdp = pgd_offset(args->mm, args->vaddr); + args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr); + if (!args->p4dp) { + pr_err("Failed to allocate p4d entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_p4dp = p4d_offset(args->pgdp, 0UL); + WARN_ON(!args->start_p4dp); + + args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr); + if (!args->pudp) { + pr_err("Failed to allocate pud entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pudp = pud_offset(args->p4dp, 0UL); + WARN_ON(!args->start_pudp); + + args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr); + if (!args->pmdp) { + pr_err("Failed to allocate pmd entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pmdp = pmd_offset(args->pudp, 0UL); + WARN_ON(!args->start_pmdp); - vma = vm_area_alloc(mm); - if (!vma) { - pr_err("vma allocation failed\n"); - return 1; + if (pte_alloc(args->mm, args->pmdp)) { + pr_err("Failed to allocate pte entries\n"); + ret = -ENOMEM; + goto error; } + args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); + WARN_ON(!args->start_ptep); /* * PFN for mapping at PTE level is determined from a standard kernel @@ -1000,40 +1184,65 @@ static int __init debug_vm_pgtable(void) * exist on the platform but that does not really matter as pfn_pxx() * helpers will still create appropriate entries for the test. This * helps avoid large memory block allocations to be used for mapping - * at higher page table levels. + * at higher page table levels in some of the tests. 
*/ - paddr = __pa_symbol(&start_kernel); - - pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; - pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; - pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; - pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; - WARN_ON(!pfn_valid(pte_aligned)); - - pgdp = pgd_offset(mm, vaddr); - p4dp = p4d_alloc(mm, pgdp, vaddr); - pudp = pud_alloc(mm, p4dp, vaddr); - pmdp = pmd_alloc(mm, pudp, vaddr); + phys = __pa_symbol(&start_kernel); + args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); + args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); + args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); + args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); + args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); + WARN_ON(!pfn_valid(args->fixed_pte_pfn)); + /* - * Allocate pgtable_t + * Allocate (huge) pages because some of the tests need to access + * the data in the pages. The corresponding tests will be skipped + * if we fail to allocate (huge) pages. */ - if (pte_alloc(mm, pmdp)) { - pr_err("pgtable allocation failed\n"); - return 1; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, + HPAGE_PUD_SHIFT - PAGE_SHIFT); + if (page) { + args->pud_pfn = page_to_pfn(page); + args->pmd_pfn = args->pud_pfn; + args->pte_pfn = args->pud_pfn; + return 0; + } } - /* - * Save all the page table page addresses as the page table - * entries will be used for testing with random or garbage - * values. These saved addresses will be used for freeing - * page table pages. 
- */ - pmd = READ_ONCE(*pmdp); - saved_p4dp = p4d_offset(pgdp, 0UL); - saved_pudp = pud_offset(p4dp, 0UL); - saved_pmdp = pmd_offset(pudp, 0UL); - saved_ptep = pmd_pgtable(pmd); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER); + if (page) { + args->pmd_pfn = page_to_pfn(page); + args->pte_pfn = args->pmd_pfn; + return 0; + } + } + + page = alloc_pages(GFP_KERNEL, 0); + if (page) + args->pte_pfn = page_to_pfn(page); + + return 0; + +error: + destroy_args(args); + return ret; +} + +static int __init debug_vm_pgtable(void) +{ + struct pgtable_debug_args args; + spinlock_t *ptl = NULL; + int idx, ret; + + pr_info("Validating architecture page table helpers\n"); + ret = init_args(&args); + if (ret) + return ret; /* * Iterate over the protection_map[] to make sure that all @@ -1042,9 +1251,9 @@ static int __init debug_vm_pgtable(void) * given page table entry. */ for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) { - pte_basic_tests(pte_aligned, idx); - pmd_basic_tests(pmd_aligned, idx); - pud_basic_tests(mm, pud_aligned, idx); + pte_basic_tests(&args, idx); + pmd_basic_tests(&args, idx); + pud_basic_tests(&args, idx); } /* @@ -1054,79 +1263,70 @@ static int __init debug_vm_pgtable(void) * the above iteration for now to save some test execution * time. 
*/ - p4d_basic_tests(p4d_aligned, prot); - pgd_basic_tests(pgd_aligned, prot); + p4d_basic_tests(&args); + pgd_basic_tests(&args); - pmd_leaf_tests(pmd_aligned, prot); - pud_leaf_tests(pud_aligned, prot); + pmd_leaf_tests(&args); + pud_leaf_tests(&args); - pte_savedwrite_tests(pte_aligned, protnone); - pmd_savedwrite_tests(pmd_aligned, protnone); + pte_savedwrite_tests(&args); + pmd_savedwrite_tests(&args); - pte_special_tests(pte_aligned, prot); - pte_protnone_tests(pte_aligned, protnone); - pmd_protnone_tests(pmd_aligned, protnone); + pte_special_tests(&args); + pte_protnone_tests(&args); + pmd_protnone_tests(&args); - pte_devmap_tests(pte_aligned, prot); - pmd_devmap_tests(pmd_aligned, prot); - pud_devmap_tests(pud_aligned, prot); + pte_devmap_tests(&args); + pmd_devmap_tests(&args); + pud_devmap_tests(&args); - pte_soft_dirty_tests(pte_aligned, prot); - pmd_soft_dirty_tests(pmd_aligned, prot); - pte_swap_soft_dirty_tests(pte_aligned, prot); - pmd_swap_soft_dirty_tests(pmd_aligned, prot); + pte_soft_dirty_tests(&args); + pmd_soft_dirty_tests(&args); + pte_swap_soft_dirty_tests(&args); + pmd_swap_soft_dirty_tests(&args); - pte_swap_tests(pte_aligned, prot); - pmd_swap_tests(pmd_aligned, prot); + pte_swap_tests(&args); + pmd_swap_tests(&args); - swap_migration_tests(); + swap_migration_tests(&args); - pmd_thp_tests(pmd_aligned, prot); - pud_thp_tests(pud_aligned, prot); + pmd_thp_tests(&args); + pud_thp_tests(&args); - hugetlb_basic_tests(pte_aligned, prot); + hugetlb_basic_tests(&args); /* * Page table modifying tests. They need to hold * proper page table lock. 
*/ - ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl); - pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot); - pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pte_unmap_unlock(ptep, ptl); + args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl); + pte_clear_tests(&args); + pte_advanced_tests(&args); + pte_unmap_unlock(args.ptep, ptl); - ptl = pmd_lock(mm, pmdp); - pmd_clear_tests(mm, pmdp); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); - pmd_huge_tests(pmdp, pmd_aligned, prot); - pmd_populate_tests(mm, pmdp, saved_ptep); + ptl = pmd_lock(args.mm, args.pmdp); + pmd_clear_tests(&args); + pmd_advanced_tests(&args); + pmd_huge_tests(&args); + pmd_populate_tests(&args); spin_unlock(ptl); - ptl = pud_lock(mm, pudp); - pud_clear_tests(mm, pudp); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - pud_huge_tests(pudp, pud_aligned, prot); - pud_populate_tests(mm, pudp, saved_pmdp); + ptl = pud_lock(args.mm, args.pudp); + pud_clear_tests(&args); + pud_advanced_tests(&args); + pud_huge_tests(&args); + pud_populate_tests(&args); spin_unlock(ptl); - spin_lock(&mm->page_table_lock); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); - p4d_populate_tests(mm, p4dp, saved_pudp); - pgd_populate_tests(mm, pgdp, saved_p4dp); - spin_unlock(&mm->page_table_lock); - - p4d_free(mm, saved_p4dp); - pud_free(mm, saved_pudp); - pmd_free(mm, saved_pmdp); - pte_free(mm, saved_ptep); - - vm_area_free(vma); - mm_dec_nr_puds(mm); - mm_dec_nr_pmds(mm); - mm_dec_nr_ptes(mm); - mmdrop(mm); + spin_lock(&(args.mm->page_table_lock)); + p4d_clear_tests(&args); + pgd_clear_tests(&args); + p4d_populate_tests(&args); + pgd_populate_tests(&args); + spin_unlock(&(args.mm->page_table_lock)); + + destroy_args(&args); return 0; } late_initcall(debug_vm_pgtable); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 164607c7cdf1..74984c23a87e 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -38,13 +38,8 @@ pgprot_t 
__init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, return prot; } -void __init __weak early_ioremap_shutdown(void) -{ -} - void __init early_ioremap_reset(void) { - early_ioremap_shutdown(); after_paging_init = 1; } diff --git a/mm/filemap.c b/mm/filemap.c index d1458ecf2f51..dae481293b5d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -76,8 +76,9 @@ * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * - * ->i_mutex - * ->i_mmap_rwsem (truncate->unmap_mapping_range) + * ->i_rwsem + * ->invalidate_lock (acquired by fs in truncate path) + * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem @@ -85,9 +86,10 @@ * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock - * ->lock_page (access_process_vm) + * ->invalidate_lock (filemap_fault) + * ->lock_page (filemap_fault, access_process_vm) * - * ->i_mutex (generic_perform_write) + * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -258,12 +260,11 @@ static void page_cache_free_page(struct address_space *mapping, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long flags; BUG_ON(!PageLocked(page)); - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); __delete_from_page_cache(page, NULL); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); page_cache_free_page(mapping, page); } @@ -335,19 +336,18 @@ void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec) { int i; - unsigned long flags; if (!pagevec_count(pvec)) return; - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_delete_batch(mapping, pvec); - xa_unlock_irqrestore(&mapping->i_pages, 
flags); + xa_unlock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -378,6 +378,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) } /** + * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @wbc: the writeback_control controlling the writeout + * + * Call writepages on the mapping using the provided wbc to control the + * writeout. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_wbc(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret; + + if (!mapping_can_writeback(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + wbc_attach_fdatawrite_inode(wbc, mapping->host); + ret = do_writepages(mapping, wbc); + wbc_detach_inode(wbc); + return ret; +} +EXPORT_SYMBOL(filemap_fdatawrite_wbc); + +/** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts @@ -397,7 +423,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { - int ret; struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, @@ -405,14 +430,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_can_writeback(mapping) || - !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - wbc_attach_fdatawrite_inode(&wbc, mapping->host); - ret = do_writepages(mapping, &wbc); - wbc_detach_inode(&wbc); - return ret; + return filemap_fdatawrite_wbc(mapping, &wbc); } static inline int __filemap_fdatawrite(struct address_space *mapping, @@ -821,7 +839,6 @@ void replace_page_cache_page(struct page *old, struct page *new) void 
(*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); @@ -833,7 +850,7 @@ void replace_page_cache_page(struct page *old, struct page *new) mem_cgroup_migrate(old, new); - xas_lock_irqsave(&xas, flags); + xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; @@ -846,7 +863,7 @@ void replace_page_cache_page(struct page *old, struct page *new) __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) __inc_lruvec_page_state(new, NR_SHMEM); - xas_unlock_irqrestore(&xas, flags); + xas_unlock_irq(&xas); if (freepage) freepage(old); put_page(old); @@ -1008,6 +1025,44 @@ EXPORT_SYMBOL(__page_cache_alloc); #endif /* + * filemap_invalidate_lock_two - lock invalidate_lock for two mappings + * + * Lock exclusively invalidate_lock of any passed mapping that is not NULL. + * + * @mapping1: the first mapping to lock + * @mapping2: the second mapping to lock + */ +void filemap_invalidate_lock_two(struct address_space *mapping1, + struct address_space *mapping2) +{ + if (mapping1 > mapping2) + swap(mapping1, mapping2); + if (mapping1) + down_write(&mapping1->invalidate_lock); + if (mapping2 && mapping1 != mapping2) + down_write_nested(&mapping2->invalidate_lock, 1); +} +EXPORT_SYMBOL(filemap_invalidate_lock_two); + +/* + * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings + * + * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. 
+ * + * @mapping1: the first mapping to unlock + * @mapping2: the second mapping to unlock + */ +void filemap_invalidate_unlock_two(struct address_space *mapping1, + struct address_space *mapping2) +{ + if (mapping1) + up_write(&mapping1->invalidate_lock); + if (mapping2 && mapping1 != mapping2) + up_write(&mapping2->invalidate_lock); +} +EXPORT_SYMBOL(filemap_invalidate_unlock_two); + +/* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of * waitqueues where the bucket discipline is to maintain all @@ -2368,20 +2423,30 @@ static int filemap_update_page(struct kiocb *iocb, { int error; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!filemap_invalidate_trylock_shared(mapping)) + return -EAGAIN; + } else { + filemap_invalidate_lock_shared(mapping); + } + if (!trylock_page(page)) { + error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) - return -EAGAIN; + goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { + filemap_invalidate_unlock_shared(mapping); put_and_wait_on_page_locked(page, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __lock_page_async(page, iocb->ki_waitq); if (error) - return error; + goto unlock_mapping; } + error = AOP_TRUNCATED_PAGE; if (!page->mapping) - goto truncated; + goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) @@ -2392,15 +2457,13 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = filemap_read_page(iocb->ki_filp, mapping, page); - if (error == AOP_TRUNCATED_PAGE) - put_page(page); - return error; -truncated: - unlock_page(page); - put_page(page); - return AOP_TRUNCATED_PAGE; + goto unlock_mapping; unlock: unlock_page(page); +unlock_mapping: + filemap_invalidate_unlock_shared(mapping); + if (error == AOP_TRUNCATED_PAGE) + put_page(page); return error; } @@ -2415,6 +2478,19 @@ static int filemap_create_page(struct file *file, if (!page) return -ENOMEM; + /* + * Protect against truncate / 
hole punch. Grabbing invalidate_lock here + * assures we cannot instantiate and bring uptodate new pagecache pages + * after evicting page cache during truncate and before actually + * freeing blocks. Note that we could release invalidate_lock after + * inserting the page into page cache as the locked page would then be + * enough to synchronize with hole punching. But there are code paths + * such as filemap_update_page() filling in partially uptodate pages or + * ->readpages() that need to hold invalidate_lock while mapping blocks + * for IO so let's hold the lock here as well to keep locking rules + * simple. + */ + filemap_invalidate_lock_shared(mapping); error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2426,9 +2502,11 @@ static int filemap_create_page(struct file *file, if (error) goto error; + filemap_invalidate_unlock_shared(mapping); pagevec_add(pvec, page); return 0; error: + filemap_invalidate_unlock_shared(mapping); put_page(page); return error; } @@ -2967,6 +3045,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) pgoff_t max_off; struct page *page; vm_fault_t ret = 0; + bool mapping_locked = false; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2976,25 +3055,39 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { + if (likely(page)) { /* - * We found the page, so try async readahead before - * waiting for the lock. + * We found the page, so try async readahead before waiting for + * the lock. 
*/ - fpin = do_async_mmap_readahead(vmf, page); - } else if (!page) { + if (!(vmf->flags & FAULT_FLAG_TRIED)) + fpin = do_async_mmap_readahead(vmf, page); + if (unlikely(!PageUptodate(page))) { + filemap_invalidate_lock_shared(mapping); + mapping_locked = true; + } + } else { /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: + /* + * See comment in filemap_create_page() why we need + * invalidate_lock + */ + if (!mapping_locked) { + filemap_invalidate_lock_shared(mapping); + mapping_locked = true; + } page = pagecache_get_page(mapping, offset, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (!page) { if (fpin) goto out_retry; + filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } @@ -3014,8 +3107,20 @@ retry_find: * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. */ - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + /* + * The page was in cache and uptodate and now it is not. + * Strange but possible since we didn't hold the page lock all + * the time. Let's drop everything, get the invalidate lock and + * try again. + */ + if (!mapping_locked) { + unlock_page(page); + put_page(page); + goto retry_find; + } goto page_not_uptodate; + } /* * We've made it this far and we had to drop our mmap_lock, now is the @@ -3026,6 +3131,8 @@ retry_find: unlock_page(page); goto out_retry; } + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it.
@@ -3056,6 +3163,7 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; + filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; @@ -3067,6 +3175,8 @@ out_retry: */ if (page) put_page(page); + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; @@ -3437,6 +3547,8 @@ out: * * If the page does not get brought uptodate, return -EIO. * + * The function expects mapping->invalidate_lock to be already held. + * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page(struct address_space *mapping, @@ -3460,6 +3572,8 @@ EXPORT_SYMBOL(read_cache_page); * * If the page does not get brought uptodate, return -EIO. * + * The function expects mapping->invalidate_lock to be already held. + * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, @@ -3704,12 +3818,12 @@ EXPORT_SYMBOL(generic_perform_write); * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * - * It expects i_mutex to be grabbed unless we work on a block device or similar + * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to - * avoid syncing under i_mutex. + * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes @@ -3797,7 +3911,7 @@ EXPORT_SYMBOL(__generic_file_write_iter); * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file - * and acquires i_mutex as needed. + * and acquires i_rwsem as needed. 
* Return: * * negative error code if no data has been written at all or * vfs_fsync_range() failed for a synchronous write @@ -92,10 +92,17 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) return head; } -/* +/** * try_grab_compound_head() - attempt to elevate a page's refcount, by a * flags-dependent amount. * + * Even though the name includes "compound_head", this function is still + * appropriate for callers that have a non-compound @page to get. + * + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the page's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount." * @@ -103,22 +110,26 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * - * FOLL_GET: page's refcount will be incremented by 1. - * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * FOLL_GET: page's refcount will be incremented by @refs. + * + * FOLL_PIN on compound pages that are > two pages long: page's refcount will + * be incremented by @refs, and page[2].hpage_pinned_refcount will be + * incremented by @refs * GUP_PIN_COUNTING_BIAS. + * + * FOLL_PIN on normal pages, or compound pages that are two pages long: + * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. + * * Return: head page (with refcount appropriately incremented) for success, or * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted.
*/ -__maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); else if (flags & FOLL_PIN) { - int orig_refs = refs; - /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow @@ -143,6 +154,8 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, * * However, be sure to *also* increment the normal page refcount * field at least once, so that the page really is pinned. + * That's why the refcount from the earlier + * try_get_compound_head() is left intact. */ if (hpage_pincount_available(page)) hpage_pincount_add(page, refs); @@ -150,7 +163,7 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, - orig_refs); + refs); return page; } @@ -186,10 +199,8 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) * @flags: gup flags: these are the FOLL_* flag values. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: - * - * FOLL_GET: page's refcount will be incremented by 1. - * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * time. Cases: please see the try_grab_compound_head() documentation, with + * "refs=1". * * Return: true for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). 
False for failure: FOLL_GET or @@ -197,35 +208,10 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); - - if (flags & FOLL_GET) - return try_get_page(page); - else if (flags & FOLL_PIN) { - int refs = 1; - - page = compound_head(page); - - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_add(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - /* - * Similar to try_grab_compound_head(): even if using the - * hpage_pincount_add/_sub() routines, be sure to - * *also* increment the normal page refcount field at least - * once, so that the page really is pinned. - */ - page_ref_add(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); - } + if (!(flags & (FOLL_GET | FOLL_PIN))) + return true; - return true; + return try_grab_compound_head(page, 1, flags); } /** @@ -1151,7 +1137,6 @@ static long __get_user_pages(struct mm_struct *mm, * We must stop here. 
*/ BUG_ON(gup_flags & FOLL_NOWAIT); - BUG_ON(ret != 0); goto out; } continue; @@ -1276,7 +1261,7 @@ int fixup_user_fault(struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_fault_t ret, major = 0; + vm_fault_t ret; address = untagged_addr(address); @@ -1296,7 +1281,6 @@ retry: return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); - major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); @@ -1475,8 +1459,8 @@ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); @@ -1558,9 +1542,12 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, gup_flags |= FOLL_WRITE; /* - * See check_vma_flags(): Will return -EFAULT on incompatible mappings - * or with insufficient permissions. + * We want to report -EINVAL instead of -EFAULT for any permission + * problems or incompatible mappings. 
*/ + if (check_vma_flags(vma, gup_flags)) + return -EINVAL; + return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } @@ -1772,7 +1759,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN); + MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } @@ -2241,6 +2228,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; + int ret = 1; do { struct page *page = pfn_to_page(pfn); @@ -2248,21 +2236,22 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + ret = 0; + break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + ret = 0; + break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); - if (pgmap) - put_dev_pagemap(pgmap); - return 1; + put_dev_pagemap(pgmap); + return ret; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, diff --git a/mm/highmem.c b/mm/highmem.c index 4fb51d735aa6..4212ad0e4a19 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -436,7 +436,7 @@ EXPORT_SYMBOL(zero_user_segments); static inline int kmap_local_idx_push(void) { - WARN_ON_ONCE(in_irq() && !irqs_disabled()); + WARN_ON_ONCE(in_hardirq() && !irqs_disabled()); current->kmap_ctrl.idx += KM_INCR; BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); return current->kmap_ctrl.idx - 1; @@ -295,10 +295,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, goto fault; /* + * Bypass devmap pte such as DAX page when all pfn requested + * flags(pfn_req_flags) are fulfilled. 
* Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { + if (pte_special(pte) && !pte_devmap(pte) && + !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index afff3ac87067..5e9ef0fc261e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1440,32 +1440,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) goto out; } - /* - * Since we took the NUMA fault, we must have observed the !accessible - * bit. Make sure all other CPUs agree with that, to avoid them - * modifying the page we're about to migrate. - * - * Must be done under PTL such that we'll observe the relevant - * inc_tlb_flush_pending(). - * - * We are not sure a pending tlb flush here is for a huge page - * mapping or not. Hence use the tlb range variant - */ - if (mm_tlb_flush_pending(vma->vm_mm)) { - flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); - /* - * change_huge_pmd() released the pmd lock before - * invalidating the secondary MMUs sharing the primary - * MMU pagetables (with ->invalidate_range()). The - * mmu_notifier_invalidate_range_end() (which - * internally calls ->invalidate_range()) in - * change_pmd_range() will run after us, so we can't - * rely on it here and we need an explicit invalidate. 
- */ - mmu_notifier_invalidate_range(vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); - } - pmd = pmd_modify(oldpmd, vma->vm_page_prot); page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) @@ -2454,11 +2428,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); - /* Some pages can be beyond i_size: drop them from page cache */ + /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { ClearPageDirty(head + i); __delete_from_page_cache(head + i, NULL); - if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) + if (shmem_mapping(head->mapping)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); } else if (!PageAnon(page)) { @@ -2686,6 +2660,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * head page lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (shmem_mapping(mapping)) + end = shmem_fallocend(mapping->host, end); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 924553aa8f78..95dc7b83381f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1072,6 +1072,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) int nid = page_to_nid(page); lockdep_assert_held(&hugetlb_lock); + VM_BUG_ON_PAGE(page_count(page), page); + list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1143,7 +1145,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1164,7 +1166,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + if 
(mpol_is_preferred_many(mpol)) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -1368,8 +1380,28 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]--; } + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. Before freeing the gigantic + * page destroy_compound_gigantic_page will turn the compound page + * into a simple group of pages. After this the destructor does not + * apply. + * + * This handles the case where more than one ref is held when and + * after update_and_free_page is called. + */ set_page_refcounted(page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + if (hstate_is_gigantic(h)) + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + else + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; @@ -1399,11 +1431,20 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, SetHPageVmemmapOptimized(page); /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the last reference. + * This page is about to be managed by the hugetlb allocator and + * should have no users. Drop our reference, and check for others + * just in case. */ zeroed = put_page_testzero(page); - VM_BUG_ON_PAGE(!zeroed, page); + if (!zeroed) + /* + * It is VERY unlikely someone else has taken a ref on + * the page.
In this case, we simply return as the + * hugetlb destructor (free_huge_page) will be called + * when this other ref is dropped. + */ + return; + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } @@ -1657,16 +1698,14 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order) * cache adding could take a ref on a 'to be' tail page. * We need to respect any increased ref count, and only set * the ref count to zero if count is currently 1. If count - * is not 1, we call synchronize_rcu in the hope that a rcu - * grace period will cause ref count to drop and then retry. - * If count is still inflated on retry we return an error and - * must discard the pages. + * is not 1, we return an error. An error return indicates + * the set of pages can not be converted to a gigantic page. + * The caller who allocated the pages should then discard the + * pages using the appropriate free interface. */ if (!page_ref_freeze(p, 1)) { - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - synchronize_rcu(); - if (!page_ref_freeze(p, 1)) - goto out_error; + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + goto out_error; } set_page_count(p, 0); set_compound_head(p, page); @@ -1830,7 +1869,6 @@ retry: retry = true; goto retry; } - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); return NULL; } } @@ -2020,9 +2058,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. 
*/ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) + int nid, nodemask_t *nmask, bool zero_ref) { struct page *page = NULL; + bool retry = false; if (hstate_is_gigantic(h)) return NULL; @@ -2032,6 +2071,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); +retry: page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2049,11 +2089,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; - } else { - h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; } + if (zero_ref) { + /* + * Caller requires a page with zero ref count. + * We will drop ref count here. If someone else is holding + * a ref, the page will be freed when they drop it. Abuse + * temporary page flag to accomplish this. + */ + SetHPageTemporary(page); + if (!put_page_testzero(page)) { + /* + * Unexpected inflated ref count on freshly allocated + * huge. Retry once. 
+ */ + pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); + spin_unlock_irq(&hugetlb_lock); + if (retry) + return NULL; + + retry = true; + goto retry; + } + ClearHPageTemporary(page); + } + + h->surplus_huge_pages++; + h->surplus_huge_pages_node[page_to_nid(page)]++; + out_unlock: spin_unlock_irq(&hugetlb_lock); @@ -2088,16 +2152,26 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); - mpol_cond_put(mpol); + if (mpol_is_preferred_many(mpol)) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + mpol_cond_put(mpol); return page; } @@ -2167,7 +2241,7 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL); + NUMA_NO_NODE, NULL, true); if (!page) { alloc_ok = false; break; @@ -2208,24 +2282,20 @@ retry: /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - int zeroed; - if ((--needed) < 0) break; - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - zeroed = put_page_testzero(page); - VM_BUG_ON_PAGE(!zeroed, page); + /* Add the page to the hugetlb allocator */ enqueue_huge_page(h, page); } free: spin_unlock_irq(&hugetlb_lock); - /* Free unnecessary surplus pages to the buddy allocator */ + /* + * Free unnecessary surplus pages to the buddy allocator. 
+ * Pages have no ref count, call free_huge_page directly. + */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) - put_page(page); + free_huge_page(page); spin_lock_irq(&hugetlb_lock); return ret; @@ -2476,7 +2546,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * added by alloc_huge_page. We know it was added + * not added by alloc_huge_page. We know it was added * before the alloc_huge_page call, otherwise * HPageRestoreReserve would be set on the page. * Remove the entry so that a subsequent allocation @@ -2534,6 +2604,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); + bool alloc_retry = false; struct page *new_page; int ret = 0; @@ -2544,9 +2615,30 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * the pool. This simplifies and let us do most of the processing * under the lock. */ +alloc_retry: new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; + /* + * If all goes well, this page will be directly added to the free + * list in the pool. For this the ref count needs to be zero. + * Attempt to drop now, and retry once if needed. It is VERY + * unlikely there is another ref on the page. + * + * If someone else has a reference to the page, it will be freed + * when they drop their ref. Abuse temporary page flag to accomplish + * this. Retry once if there is an inflated ref count. 
+ */ + SetHPageTemporary(new_page); + if (!put_page_testzero(new_page)) { + if (alloc_retry) + return -EBUSY; + + alloc_retry = true; + goto alloc_retry; + } + ClearHPageTemporary(new_page); + __prep_new_huge_page(h, new_page); retry: @@ -2586,11 +2678,10 @@ retry: remove_hugetlb_page(h, old_page, false); /* - * Reference count trick is needed because allocator gives us - * referenced page but the pool requires pages with 0 refcount. + * Ref count on new page is already zero as it was dropped + * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); - page_ref_dec(new_page); enqueue_huge_page(h, new_page); /* @@ -2604,6 +2695,8 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); + /* Page has a zero ref count, but needs a ref to be freed */ + set_page_refcounted(new_page); update_and_free_page(h, new_page, false); return ret; @@ -2828,8 +2921,8 @@ static void __init gather_bootmem_prealloc(void) prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* add to the hugepage allocator */ } else { + /* VERY unlikely inflated ref count on a tail page */ free_gigantic_page(page, huge_page_order(h)); - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); } /* @@ -4033,8 +4126,10 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. 
*/ - if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) @@ -4660,7 +4755,9 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: - restore_reserve_on_error(h, vma, haddr, new_page); + /* No restore in case of successful pagetable update (Break COW) */ + if (new_page != old_page) + restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -4776,7 +4873,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page = false; + bool new_page, new_pagecache_page = false; /* * Currently, we are forced to kill the process in the event the @@ -4799,6 +4896,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; retry: + new_page = false; page = find_lock_page(mapping, idx); if (!page) { /* Check for page in userfault range */ @@ -4842,6 +4940,7 @@ retry: goto retry; goto out; } + new_pagecache_page = true; } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { @@ -4926,7 +5025,9 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, haddr, page); + /* restore reserve for newly allocated pages not in page cache */ + if (new_page && !new_pagecache_page) + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -5135,6 +5236,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, int ret = -ENOMEM; struct page *page; int writable; + bool new_pagecache_page = false; if (is_continue) { ret = -EFAULT; @@ -5228,6 +5330,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = huge_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; + new_pagecache_page = true; } ptl = 
huge_pte_lockptr(h, dst_mm, dst_pte); @@ -5291,7 +5394,8 @@ out_release_unlock: if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: - restore_reserve_on_error(h, dst_vma, dst_addr, page); + if (!new_pagecache_page) + restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; } @@ -5440,8 +5544,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, continue; } - refs = min3(pages_per_huge_page(h) - pfn_offset, - (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder); + /* vaddr may not be aligned to PAGE_SIZE */ + refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, + (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); if (pages || vmas) record_subpages_vmas(mem_map_offset(page, pfn_offset), diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 1ae1ebc2b9b1..aff4d27ec235 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -30,7 +30,7 @@ static int hwpoison_inject(void *data, u64 val) if (!hwpoison_filter_enable) goto inject; - shake_page(hpage, 0); + shake_page(hpage); /* * This implies unable to support non-LRU pages. 
*/ diff --git a/mm/internal.h b/mm/internal.h index 31ff935b2547..cf3cb933eba3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); +extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -539,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } +static inline int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + return NUMA_NO_NODE; +} #endif extern int hwpoison_filter(struct page *p); diff --git a/mm/ioremap.c b/mm/ioremap.c index 8ee0136f8cb0..5fe598ecd9b7 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -8,33 +8,9 @@ */ #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/sched.h> #include <linux/io.h> #include <linux/export.h> -#include <asm/cacheflush.h> -#include "pgalloc-track.h" - -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1; - -static int __init set_nohugeiomap(char *str) -{ - iomap_max_page_shift = PAGE_SHIFT; - return 0; -} -early_param("nohugeiomap", set_nohugeiomap); -#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static const unsigned int iomap_max_page_shift = PAGE_SHIFT; -#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ - -int ioremap_page_range(unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) -{ - return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift); -} - -#ifdef CONFIG_GENERIC_IOREMAP void __iomem *ioremap_prot(phys_addr_t addr, size_t 
size, unsigned long prot) { unsigned long offset, vaddr; @@ -71,4 +47,3 @@ void iounmap(volatile void __iomem *addr) vunmap((void *)((unsigned long)addr & PAGE_MASK)); } EXPORT_SYMBOL(iounmap); -#endif /* CONFIG_GENERIC_IOREMAP */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4ea8c368b5b8..05d1e9460e2e 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -37,16 +37,9 @@ enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_ON, }; -enum kasan_arg_fault { - KASAN_ARG_FAULT_DEFAULT, - KASAN_ARG_FAULT_REPORT, - KASAN_ARG_FAULT_PANIC, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; -static enum kasan_arg_fault kasan_arg_fault __ro_after_init; /* Whether KASAN is enabled at all. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); @@ -59,9 +52,6 @@ EXPORT_SYMBOL_GPL(kasan_flag_async); /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -/* Whether to panic or print a report and disable tag checking on fault. */ -bool kasan_flag_panic __ro_after_init; - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -113,23 +103,6 @@ static int __init early_kasan_flag_stacktrace(char *arg) } early_param("kasan.stacktrace", early_kasan_flag_stacktrace); -/* kasan.fault=report/panic */ -static int __init early_kasan_fault(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "report")) - kasan_arg_fault = KASAN_ARG_FAULT_REPORT; - else if (!strcmp(arg, "panic")) - kasan_arg_fault = KASAN_ARG_FAULT_PANIC; - else - return -EINVAL; - - return 0; -} -early_param("kasan.fault", early_kasan_fault); - /* kasan_init_hw_tags_cpu() is called for each CPU. 
*/ void kasan_init_hw_tags_cpu(void) { @@ -142,8 +115,6 @@ void kasan_init_hw_tags_cpu(void) if (kasan_arg == KASAN_ARG_OFF) return; - hw_init_tags(KASAN_TAG_MAX); - /* * Enable async mode only when explicitly requested through * the command line. @@ -197,22 +168,6 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_fault) { - case KASAN_ARG_FAULT_DEFAULT: - /* - * Default to no panic on report. - * Do nothing, kasan_flag_panic keeps its default value. - */ - break; - case KASAN_ARG_FAULT_REPORT: - /* Do nothing, kasan_flag_panic keeps its default value. */ - break; - case KASAN_ARG_FAULT_PANIC: - /* Enable panic on report. */ - kasan_flag_panic = true; - break; - } - pr_info("KernelAddressSanitizer initialized\n"); } @@ -250,12 +205,6 @@ void kasan_free_pages(struct page *page, unsigned int order) #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_set_tagging_report_once(bool state) -{ - hw_set_tagging_report_once(state); -} -EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); - void kasan_enable_tagging_sync(void) { hw_enable_tagging_sync(); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 98e3059bfea4..8bf568a80eb8 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -3,12 +3,14 @@ #define __MM_KASAN_KASAN_H #include <linux/kasan.h> +#include <linux/kasan-tags.h> #include <linux/kfence.h> #include <linux/stackdepot.h> #ifdef CONFIG_KASAN_HW_TAGS #include <linux/static_key.h> +#include "../slab.h" DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); extern bool kasan_flag_async __ro_after_init; @@ -36,7 +38,6 @@ static inline bool kasan_async_mode_enabled(void) #endif -extern bool kasan_flag_panic __ro_after_init; extern bool kasan_flag_async __ro_after_init; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) @@ -50,16 +51,6 @@ extern bool kasan_flag_async __ro_after_init; #define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) -#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ -#define 
KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ -#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ - -#ifdef CONFIG_KASAN_HW_TAGS -#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ -#else -#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ -#endif - #ifdef CONFIG_KASAN_GENERIC #define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ @@ -298,12 +289,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_enable_tagging_async #define arch_enable_tagging_async() #endif -#ifndef arch_init_tags -#define arch_init_tags(max_tag) -#endif -#ifndef arch_set_tagging_report_once -#define arch_set_tagging_report_once(state) -#endif #ifndef arch_force_async_tag_fault #define arch_force_async_tag_fault() #endif @@ -319,8 +304,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging_sync() arch_enable_tagging_sync() #define hw_enable_tagging_async() arch_enable_tagging_async() -#define hw_init_tags(max_tag) arch_init_tags(max_tag) -#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) #define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) @@ -331,19 +314,16 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging_sync() #define hw_enable_tagging_async() -#define hw_set_tagging_report_once(state) #endif /* CONFIG_KASAN_HW_TAGS */ #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_set_tagging_report_once(bool state); void kasan_enable_tagging_sync(void); void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ -static inline void kasan_set_tagging_report_once(bool state) { } static inline void kasan_enable_tagging_sync(void) { } static inline void 
kasan_force_async_fault(void) { } @@ -387,6 +367,17 @@ static inline void kasan_unpoison(const void *addr, size_t size, bool init) if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) return; + /* + * Explicitly initialize the memory with the precise object size to + * avoid overwriting the SLAB redzone. This disables initialization in + * the arch code and may thus lead to performance penalty. The penalty + * is accepted since SLAB redzones aren't enabled in production builds. + */ + if (__slub_debug_enabled() && + init && ((unsigned long)size & KASAN_GRANULE_MASK)) { + init = false; + memzero_explicit((void *)addr, size); + } size = round_up(size, KASAN_GRANULE_SIZE); hw_set_mem_tag_range((void *)addr, size, tag, init); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8fff1825b22c..884a950c7026 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -39,6 +39,31 @@ static unsigned long kasan_flags; #define KASAN_BIT_REPORTED 0 #define KASAN_BIT_MULTI_SHOT 1 +enum kasan_arg_fault { + KASAN_ARG_FAULT_DEFAULT, + KASAN_ARG_FAULT_REPORT, + KASAN_ARG_FAULT_PANIC, +}; + +static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT; + +/* kasan.fault=report/panic */ +static int __init early_kasan_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "report")) + kasan_arg_fault = KASAN_ARG_FAULT_REPORT; + else if (!strcmp(arg, "panic")) + kasan_arg_fault = KASAN_ARG_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.fault", early_kasan_fault); + bool kasan_save_enable_multi_shot(void) { return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); @@ -102,10 +127,8 @@ static void end_report(unsigned long *flags, unsigned long addr) panic_on_warn = 0; panic("panic_on_warn set ...\n"); } -#ifdef CONFIG_KASAN_HW_TAGS - if (kasan_flag_panic) + if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); -#endif kasan_enable_current(); } diff --git a/mm/kfence/core.c 
b/mm/kfence/core.c index d7666ace9d2e..7a97db8bc8e7 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -20,6 +20,7 @@ #include <linux/moduleparam.h> #include <linux/random.h> #include <linux/rcupdate.h> +#include <linux/sched/clock.h> #include <linux/sched/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -196,6 +197,8 @@ static noinline void metadata_update_state(struct kfence_metadata *meta, */ track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); track->pid = task_pid_nr(current); + track->cpu = raw_smp_processor_id(); + track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ /* * Pairs with READ_ONCE() in @@ -734,6 +737,22 @@ void kfence_shutdown_cache(struct kmem_cache *s) void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { /* + * Perform size check before switching kfence_allocation_gate, so that + * we don't disable KFENCE without making an allocation. + */ + if (size > PAGE_SIZE) + return NULL; + + /* + * Skip allocations from non-default zones, including DMA. We cannot + * guarantee that pages in the KFENCE pool will have the requested + * properties (e.g. reside in DMAable memory). + */ + if ((flags & GFP_ZONEMASK) || + (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) + return NULL; + + /* * allocation_gate only needs to become non-zero, so it doesn't make * sense to continue writing to it and pay the associated contention * cost, in case we have a large number of concurrent allocations. @@ -757,9 +776,6 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) if (!READ_ONCE(kfence_enabled)) return NULL; - if (size > PAGE_SIZE) - return NULL; - return kfence_guarded_alloc(s, size, flags); } diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 24065321ff8a..c1f23c61e5f9 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -36,6 +36,8 @@ enum kfence_object_state { /* Alloc/free tracking information. 
*/ struct kfence_track { pid_t pid; + int cpu; + u64 ts_nsec; int num_stack_entries; unsigned long stack_entries[KFENCE_STACK_DEPTH]; }; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 7f24b9bcb2ec..f1690cf54199 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -23,8 +23,15 @@ #include <linux/tracepoint.h> #include <trace/events/printk.h> +#include <asm/kfence.h> + #include "kfence.h" +/* May be overridden by <asm/kfence.h>. */ +#ifndef arch_kfence_test_address +#define arch_kfence_test_address(addr) (addr) +#endif + /* Report as observed from console. */ static struct { spinlock_t lock; @@ -82,6 +89,7 @@ static const char *get_access_type(const struct expect_report *r) /* Check observed report matches information in @r. */ static bool report_matches(const struct expect_report *r) { + unsigned long addr = (unsigned long)r->addr; bool ret = false; unsigned long flags; typeof(observed.lines) expect; @@ -131,22 +139,25 @@ static bool report_matches(const struct expect_report *r) switch (r->type) { case KFENCE_ERROR_OOB: cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); break; case KFENCE_ERROR_UAF: cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); break; case KFENCE_ERROR_CORRUPTION: cur += scnprintf(cur, end - cur, "Corrupted memory at"); break; case KFENCE_ERROR_INVALID: cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); break; case KFENCE_ERROR_INVALID_FREE: cur += scnprintf(cur, end - cur, "Invalid free of"); break; } - cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr); + cur += scnprintf(cur, end - cur, " 0x%p", (void *)addr); spin_lock_irqsave(&observed.lock, flags); if (!report_available()) @@ -789,6 +800,9 @@ static int test_init(struct kunit *test) unsigned long flags; int i; + if (!__kfence_pool) + return -EINVAL; 
+ spin_lock_irqsave(&observed.lock, flags); for (i = 0; i < ARRAY_SIZE(observed.lines); i++) observed.lines[i][0] = '\0'; @@ -852,7 +866,7 @@ static void kfence_test_exit(void) tracepoint_synchronize_unregister(); } -late_initcall(kfence_test_init); +late_initcall_sync(kfence_test_init); module_exit(kfence_test_exit); MODULE_LICENSE("GPL v2"); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 2a319c21c939..f93a7b2a338b 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -5,10 +5,11 @@ * Copyright (C) 2020, Google LLC. */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/kernel.h> #include <linux/lockdep.h> +#include <linux/math.h> #include <linux/printk.h> #include <linux/sched/debug.h> #include <linux/seq_file.h> @@ -100,6 +101,13 @@ static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadat bool show_alloc) { const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; + u64 ts_sec = track->ts_nsec; + unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC); + + /* Timestamp matches printk timestamp format. */ + seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n", + show_alloc ? "allocated" : "freed", track->pid, + track->cpu, (unsigned long)ts_sec, rem_nsec / 1000); if (track->num_stack_entries) { /* Skip allocation/free internals stack. */ @@ -126,15 +134,14 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met return; } - seq_con_printf(seq, - "kfence-#%td [0x%p-0x%p" - ", size=%d, cache=%s] allocated by task %d:\n", - meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, - (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid); + seq_con_printf(seq, "kfence-#%td: 0x%p-0x%p, size=%d, cache=%s\n\n", + meta - kfence_metadata, (void *)start, (void *)(start + size - 1), + size, (cache && cache->name) ? 
cache->name : "<destroyed>"); + kfence_print_stack(seq, meta, true); if (meta->state == KFENCE_OBJECT_FREED) { - seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid); + seq_con_printf(seq, "\n"); kfence_print_stack(seq, meta, false); } } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0412be08fa2..045cc579f724 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1721,7 +1721,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 228a2fbe0657..b57383c17cf6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,8 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ + __GFP_NOLOCKDEP)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) @@ -290,7 +291,7 @@ static void hex_dump_object(struct seq_file *seq, warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII); kasan_enable_current(); } @@ -598,7 +599,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->checksum = 0; /* task information */ - if (in_irq()) { + if (in_hardirq()) { object->pid = 0; strncpy(object->comm, "hardirq", sizeof(object->comm)); } else if (in_serving_softirq()) { @@ -1171,7 +1172,7 @@ static bool update_checksum(struct kmemleak_object *object) kasan_disable_current(); kcsan_disable_current(); - object->checksum = crc32(0, (void *)object->pointer, object->size); + object->checksum = crc32(0, kasan_reset_tag((void 
*)object->pointer), object->size); kasan_enable_current(); kcsan_enable_current(); @@ -1246,7 +1247,7 @@ static void scan_block(void *_start, void *_end, break; kasan_disable_current(); - pointer = *ptr; + pointer = *(unsigned long *)kasan_reset_tag((void *)ptr); kasan_enable_current(); untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); @@ -259,7 +259,7 @@ static unsigned long ksm_stable_node_chains; static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ -static int ksm_stable_node_chains_prune_millisecs = 2000; +static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; @@ -3105,11 +3105,11 @@ stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; diff --git a/mm/madvise.c b/mm/madvise.c index 6d3d348b17f4..0734db8d53a7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -862,10 +862,12 @@ static long madvise_populate(struct vm_area_struct *vma, switch (pages) { case -EINTR: return -EINTR; - case -EFAULT: /* Incompatible mappings / permissions. */ + case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; + case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ + return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); @@ -910,7 +912,7 @@ static long madvise_remove(struct vm_area_struct *vma, + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* - * Filesystem's fallocate may need to take i_mutex. We need to + * Filesystem's fallocate may need to take i_rwsem. 
We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. @@ -1046,6 +1048,7 @@ process_madvise_behavior_valid(int behavior) switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: + case MADV_WILLNEED: return true; default: return false; diff --git a/mm/memblock.c b/mm/memblock.c index 0041ff62c584..0ab5a749bfa6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -315,7 +315,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, +static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { @@ -665,6 +665,11 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__, + &base, &end, nid, (void *)_RET_IP_); + return memblock_add_range(&memblock.memory, base, size, nid, 0); } @@ -947,7 +952,8 @@ static bool should_skip_region(struct memblock_type *type, return true; /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + if (movable_node_is_enabled() && memblock_is_hotpluggable(m) && + !(flags & MEMBLOCK_HOTPLUG)) return true; /* if we want mirror memory skip non-mirror memory regions */ @@ -1490,18 +1496,12 @@ void * __init memblock_alloc_exact_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, true); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, 
min_addr, max_addr, nid, + true); } /** @@ -1528,18 +1528,12 @@ void * __init memblock_alloc_try_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, false); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + false); } /** @@ -1679,6 +1673,11 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) if (!size) return; + if (memblock.memory.cnt <= 1) { + pr_warn("%s: No memory registered yet\n", __func__); + return; + } + ret = memblock_isolate_range(&memblock.memory, base, size, &start_rgn, &end_rgn); if (ret) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae1f5d0cb581..b762215d73eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,6 +103,14 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } +/* memcg and lruvec stats flushing */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static void flush_memcg_stats_work(struct work_struct *w); +static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work); +static DEFINE_PER_CPU(unsigned int, stats_flush_threshold); +static DEFINE_SPINLOCK(stats_flush_lock); + #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -248,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) return &memcg->vmpressure; } -struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) { - return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; + return container_of(vmpr, struct mem_cgroup, 
vmpressure); } #ifdef CONFIG_MEMCG_KMEM @@ -646,17 +654,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) } /* idx can be of type enum memcg_stat_item or node_stat_item. */ -static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - -/* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { long x = 0; @@ -671,23 +668,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -static struct mem_cgroup_per_node * -parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) -{ - struct mem_cgroup *parent; - - parent = parent_mem_cgroup(pn->memcg); - if (!parent) - return NULL; - return parent->nodeinfo[nid]; -} - void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; - long x, threshold = MEMCG_CHARGE_BATCH; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -696,21 +681,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_state(memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], val); - - if (vmstat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; - - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > threshold)) { - pg_data_t *pgdat = lruvec_pgdat(lruvec); - struct mem_cgroup_per_node *pi; - - for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) - atomic_long_add(x, &pi->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH)) + queue_work(system_unbound_wq, &stats_flush_work); } /** @@ 
-905,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task); static __always_inline struct mem_cgroup *active_memcg(void) { - if (in_interrupt()) + if (!in_task()) return this_cpu_read(int_active_memcg); else return current->active_memcg; @@ -968,7 +941,7 @@ static __always_inline bool memcg_kmem_bypass(void) return false; /* Memcg to charge can't be determined. */ - if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) return true; return false; @@ -2205,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy) unsigned long flags; /* - * The only protection from memory hotplug vs. drain_stock races is - * that we always operate on local CPU stock here with IRQ disabled + * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled */ local_irq_save(flags); @@ -2273,7 +2247,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; - if (obj_stock_flush_required(stock, root_memcg)) + else if (obj_stock_flush_required(stock, root_memcg)) flush = true; rcu_read_unlock(); @@ -2289,40 +2263,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) -{ - int nid; - - for_each_node(nid) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - unsigned long stat[NR_VM_NODE_STAT_ITEMS]; - struct batched_lruvec_stat *lstatc; - int i; - - lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - stat[i] = lstatc->count[i]; - lstatc->count[i] = 0; - } - - do { - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } -} - static int 
memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - for_each_mem_cgroup(memcg) - memcg_flush_lruvec_page_state(memcg, cpu); - return 0; } @@ -3106,13 +3053,15 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ + struct pglist_data *oldpg = stock->cached_pgdat; + if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -3574,7 +3523,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - cgroup_rstat_flush(memcg->css.cgroup); + /* mem_cgroup_threshold() calls here from irqsafe context */ + cgroup_rstat_flush_irqsafe(memcg->css.cgroup); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -4113,7 +4063,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (!mem_cgroup_is_root(memcg)) @@ -4665,7 +4615,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) atomic_read(&frn->done.cnt) == 1) { frn->at = 0; trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, WB_REASON_FOREIGN_FLUSH, &frn->done); } @@ -4889,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct 
kernfs_open_file *of, vfs_poll(efile.file, &event->pt); - spin_lock(&memcg->event_list_lock); + spin_lock_irq(&memcg->event_list_lock); list_add(&event->list, &memcg->event_list); - spin_unlock(&memcg->event_list_lock); + spin_unlock_irq(&memcg->event_list_lock); fdput(cfile); fdput(efile); @@ -5126,17 +5076,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_local) { - kfree(pn); - return 1; - } - - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_cpu) { - free_percpu(pn->lruvec_stat_local); + pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, + GFP_KERNEL_ACCOUNT); + if (!pn->lruvec_stats_percpu) { kfree(pn); return 1; } @@ -5157,8 +5099,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return; - free_percpu(pn->lruvec_stat_cpu); - free_percpu(pn->lruvec_stat_local); + free_percpu(pn->lruvec_stats_percpu); kfree(pn); } @@ -5174,15 +5115,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { - int cpu; - memcg_wb_domain_exit(memcg); - /* - * Flush percpu lruvec stats to guarantee the value - * correctness on parent's and all ancestor levels. 
- */ - for_each_online_cpu(cpu) - memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5318,6 +5251,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); + + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); return 0; } @@ -5331,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace. */ - spin_lock(&memcg->event_list_lock); + spin_lock_irq(&memcg->event_list_lock); list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { list_del_init(&event->list); schedule_work(&event->remove); } - spin_unlock(&memcg->event_list_lock); + spin_unlock_irq(&memcg->event_list_lock); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); @@ -5409,13 +5346,33 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +void mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + spin_unlock(&stats_flush_lock); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + +static void flush_memcg_stats_work(struct work_struct *w) +{ + mem_cgroup_flush_stats(); +} + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; long delta, v; - int i; + int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5463,6 +5420,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state 
*css, int cpu) if (parent) parent->vmstats.events_pending[i] += delta; } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats_percpu *lstatc; + + if (parent) + ppn = parent->nodeinfo[nid]; + + lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + delta = pn->lruvec_stats.state_pending[i]; + if (delta) + pn->lruvec_stats.state_pending[i] = 0; + + v = READ_ONCE(lstatc->state[i]); + if (v != lstatc->state_prev[i]) { + delta += v - lstatc->state_prev[i]; + lstatc->state_prev[i] = v; + } + + if (!delta) + continue; + + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } + } } #ifdef CONFIG_MMU @@ -6396,6 +6383,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -6701,8 +6690,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, - gfp_t gfp) +static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) { unsigned int nr_pages = thp_nr_pages(page); int ret; @@ -6723,7 +6711,7 @@ out: } /** - * mem_cgroup_charge - charge a newly allocated page to a cgroup + * __mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode @@ -6736,16 +6724,14 @@ out: * * Returns 0 on success. Otherwise, an error code is returned. 
*/ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { struct mem_cgroup *memcg; int ret; - if (mem_cgroup_disabled()) - return 0; - memcg = get_mem_cgroup_from_mm(mm); - ret = __mem_cgroup_charge(page, memcg, gfp_mask); + ret = charge_memcg(page, memcg, gfp_mask); css_put(&memcg->css); return ret; @@ -6780,7 +6766,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = __mem_cgroup_charge(page, memcg, gfp); + ret = charge_memcg(page, memcg, gfp); css_put(&memcg->css); return ret; @@ -6916,18 +6902,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } /** - * mem_cgroup_uncharge - uncharge a page + * __mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_charge(). + * Uncharge a page previously charged with __mem_cgroup_charge(). */ -void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct page *page) { struct uncharge_gather ug; - if (mem_cgroup_disabled()) - return; - /* Don't touch page->lru of any random page, pre-check: */ if (!page_memcg(page)) return; @@ -6938,20 +6921,17 @@ void mem_cgroup_uncharge(struct page *page) } /** - * mem_cgroup_uncharge_list - uncharge a list of page + * __mem_cgroup_uncharge_list - uncharge a list of page * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_charge(). + * __mem_cgroup_charge(). 
*/ -void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; struct page *page; - if (mem_cgroup_disabled()) - return; - uncharge_gather_clear(&ug); list_for_each_entry(page, page_list, lru) uncharge_page(page, &ug); @@ -7047,14 +7027,14 @@ void mem_cgroup_sk_free(struct sock *sk) * mem_cgroup_charge_skmem - charge socket memory * @memcg: memcg to charge * @nr_pages: number of pages to charge + * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. Returns %true if the charge fit within - * @memcg's configured limit, %false if the charge had to be forced. + * @memcg's configured limit, %false if it doesn't. */ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask) { - gfp_t gfp_mask = GFP_KERNEL; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *fail; @@ -7062,21 +7042,19 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) memcg->tcpmem_pressure = 0; return true; } - page_counter_charge(&memcg->tcpmem, nr_pages); memcg->tcpmem_pressure = 1; + if (gfp_mask & __GFP_NOFAIL) { + page_counter_charge(&memcg->tcpmem, nr_pages); + return true; + } return false; } - /* Don't block in the packet receive path */ - if (in_softirq()) - gfp_mask = GFP_NOWAIT; - - mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); - - if (try_charge(memcg, gfp_mask, nr_pages) == 0) + if (try_charge(memcg, gfp_mask, nr_pages) == 0) { + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); return true; + } - try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); return false; } @@ -7243,7 +7221,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_try_charge_swap - try charging swap space for a page + * __mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap 
entry to charge * @@ -7251,16 +7229,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * * Returns 0 on success, -ENOMEM on failure. */ -int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { unsigned int nr_pages = thp_nr_pages(page); struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; - if (mem_cgroup_disabled()) - return 0; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; @@ -7296,11 +7271,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_uncharge_swap - uncharge swap space + * __mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index eefd823deb67..54879c339024 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -68,7 +68,7 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool __page_handle_poison(struct page *page) { - bool ret; + int ret; zone_pcp_disable(page_zone(page)); ret = dissolve_free_huge_page(page); @@ -76,7 +76,7 @@ static bool __page_handle_poison(struct page *page) ret = take_page_off_buddy(page); zone_pcp_enable(page_zone(page)); - return ret; + return ret > 0; } static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) @@ -282,9 +282,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) /* * Unknown page type encountered. Try to check whether it can turn PageLRU by - * lru_add_drain_all, or a free page by reclaiming slabs when possible. + * lru_add_drain_all. 
*/ -void shake_page(struct page *p, int access) +void shake_page(struct page *p) { if (PageHuge(p)) return; @@ -296,11 +296,9 @@ void shake_page(struct page *p, int access) } /* - * Only call shrink_node_slabs here (which would also shrink - * other caches) if access is not potentially fatal. + * TODO: Could shrink slab caches here if a lightweight range-based + * shrinker will be available. */ - if (access) - drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); @@ -391,8 +389,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, /* * Kill the processes that have been collected earlier. * - * Only do anything when DOIT is set, otherwise just free the list - * (this is used for clean pages which do not need killing) + * Only do anything when FORCEKILL is set, otherwise just free the + * list (this is used for clean pages which do not need killing) * Also when FAIL is set do a force kill because something went * wrong earlier. */ @@ -632,7 +630,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, { struct hwp_walk *hwp = (struct hwp_walk *)walk->private; int ret = 0; - pte_t *ptep; + pte_t *ptep, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmdp, walk->vma); @@ -645,14 +643,15 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, if (pmd_trans_unstable(pmdp)) goto out; - ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, + addr, &ptl); for (; addr != end; ptep++, addr += PAGE_SIZE) { ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; } - pte_unmap_unlock(ptep - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); out: cond_resched(); return ret; @@ -866,7 +865,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Truncation is a bit tricky. Enable it per file system for now. * - * Open: to take i_mutex or not for this? Right now we don't. 
+ * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, pfn, mapping); out: @@ -1146,7 +1145,7 @@ static int __get_hwpoison_page(struct page *page) * unexpected races caused by taking a page refcount. */ if (!HWPoisonHandlable(head)) - return 0; + return -EBUSY; if (PageTransHuge(head)) { /* @@ -1199,9 +1198,15 @@ try_again: } goto out; } else if (ret == -EBUSY) { - /* We raced with freeing huge page to buddy, retry. */ - if (pass++ < 3) + /* + * We raced with (possibly temporary) unhandlable + * page, retry. + */ + if (pass++ < 3) { + shake_page(p); goto try_again; + } + ret = -EIO; goto out; } } @@ -1215,7 +1220,7 @@ try_again: */ if (pass++ < 3) { put_page(p); - shake_page(p, 1); + shake_page(p); count_increased = false; goto try_again; } @@ -1223,6 +1228,9 @@ try_again: ret = -EIO; } out: + if (ret == -EIO) + dump_page(p, "hwpoison: unhandlable page"); + return ret; } @@ -1264,14 +1272,13 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * the pages and send SIGBUS to the processes if the data was dirty. */ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page **hpagep) + int flags, struct page *hpage) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int kill = 1, forcekill; - struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); /* @@ -1363,7 +1370,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. 
*/ if (mlocked) - shake_page(hpage, 0); + shake_page(hpage); /* * Now that the dirty bit has been propagated to the @@ -1496,7 +1503,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - if (!hwpoison_user_mappings(p, pfn, flags, &head)) { + if (!hwpoison_user_mappings(p, pfn, flags, head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; @@ -1512,7 +1519,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { struct page *page = pfn_to_page(pfn); - const bool unmap_success = true; unsigned long size = 0; struct to_kill *tk; LIST_HEAD(tokill); @@ -1584,7 +1590,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, start = (page->index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(page->mapping, start, size, 0); } - kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); + kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags); rc = 0; unlock: dax_unlock_page(page, cookie); @@ -1718,7 +1724,7 @@ try_again: * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p, 0); + shake_page(p); lock_page(p); @@ -1777,7 +1783,7 @@ try_again: * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. 
*/ - if (!hwpoison_user_mappings(p, pfn, flags, &p)) { + if (!hwpoison_user_mappings(p, pfn, flags, p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto unlock_page; @@ -2093,7 +2099,7 @@ static int __soft_offline_page(struct page *page) if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { bool release = !huge; @@ -2202,9 +2208,6 @@ retry: try_again = false; goto retry; } - } else if (ret == -EIO) { - pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", - __func__, pfn, page->flags, &page->flags); } return ret; diff --git a/mm/memory.c b/mm/memory.c index 747a01d495f2..25fc46e87214 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4026,8 +4026,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } - if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(vma->vm_mm); + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; + } } /* See comment in handle_pte_fault() */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8cb75b26ea4f..9fd0be32a281 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,73 @@ module_param(memmap_on_memory, bool, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); #endif +enum { + ONLINE_POLICY_CONTIG_ZONES = 0, + ONLINE_POLICY_AUTO_MOVABLE, +}; + +const char *online_policy_to_str[] = { + [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", + [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", +}; + +static int set_online_policy(const char *val, const struct kernel_param *kp) +{ + int ret = sysfs_match_string(online_policy_to_str, 
val); + + if (ret < 0) + return ret; + *((int *)kp->arg) = ret; + return 0; +} + +static int get_online_policy(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); +} + +/* + * memory_hotplug.online_policy: configure online behavior when onlining without + * specifying a zone (MMOP_ONLINE) + * + * "contig-zones": keep zone contiguous + * "auto-movable": online memory to ZONE_MOVABLE if the configuration + * (auto_movable_ratio, auto_movable_numa_aware) allows for it + */ +static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; +static const struct kernel_param_ops online_policy_ops = { + .set = set_online_policy, + .get = get_online_policy, +}; +module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); +MODULE_PARM_DESC(online_policy, + "Set the online policy (\"contig-zones\", \"auto-movable\") " + "Default: \"contig-zones\""); + +/* + * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio + * + * The ratio represent an upper limit and the kernel might decide to not + * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory + * doesn't allow for more MOVABLE memory. + */ +static unsigned int auto_movable_ratio __read_mostly = 301; +module_param(auto_movable_ratio, uint, 0644); +MODULE_PARM_DESC(auto_movable_ratio, + "Set the maximum ratio of MOVABLE:KERNEL memory in the system " + "in percent for \"auto-movable\" online policy. Default: 301"); + +/* + * memory_hotplug.auto_movable_numa_aware: consider numa node stats + */ +#ifdef CONFIG_NUMA +static bool auto_movable_numa_aware __read_mostly = true; +module_param(auto_movable_numa_aware, bool, 0644); +MODULE_PARM_DESC(auto_movable_numa_aware, + "Consider numa node stats in addition to global stats in " + "\"auto-movable\" online policy. Default: true"); +#endif /* CONFIG_NUMA */ + /* * online_page_callback contains pointer to current page onlining function. 
* Initially it is generic_online_page(). If it is required it could be @@ -410,15 +477,13 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, sizeof(struct page) * cur_nr_pages); } -#ifdef CONFIG_ZONE_DEVICE /* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So * we will not try to shrink the zones - which is okay as * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. */ - if (zone_idx(zone) == ZONE_DEVICE) + if (zone_is_zone_device(zone)) return; -#endif clear_zone_contiguous(zone); @@ -663,6 +728,109 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, set_zone_contiguous(zone); } +struct auto_movable_stats { + unsigned long kernel_early_pages; + unsigned long movable_pages; +}; + +static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, + struct zone *zone) +{ + if (zone_idx(zone) == ZONE_MOVABLE) { + stats->movable_pages += zone->present_pages; + } else { + stats->kernel_early_pages += zone->present_early_pages; +#ifdef CONFIG_CMA + /* + * CMA pages (never on hotplugged memory) behave like + * ZONE_MOVABLE. + */ + stats->movable_pages += zone->cma_pages; + stats->kernel_early_pages -= zone->cma_pages; +#endif /* CONFIG_CMA */ + } +} +struct auto_movable_group_stats { + unsigned long movable_pages; + unsigned long req_kernel_early_pages; +}; + +static int auto_movable_stats_account_group(struct memory_group *group, + void *arg) +{ + const int ratio = READ_ONCE(auto_movable_ratio); + struct auto_movable_group_stats *stats = arg; + long pages; + + /* + * We don't support modifying the config while the auto-movable online + * policy is already enabled. Just avoid the division by zero below. + */ + if (!ratio) + return 0; + + /* + * Calculate how many early kernel pages this group requires to + * satisfy the configured zone ratio. 
+ */ + pages = group->present_movable_pages * 100 / ratio; + pages -= group->present_kernel_pages; + + if (pages > 0) + stats->req_kernel_early_pages += pages; + stats->movable_pages += group->present_movable_pages; + return 0; +} + +static bool auto_movable_can_online_movable(int nid, struct memory_group *group, + unsigned long nr_pages) +{ + unsigned long kernel_early_pages, movable_pages; + struct auto_movable_group_stats group_stats = {}; + struct auto_movable_stats stats = {}; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + int i; + + /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ + if (nid == NUMA_NO_NODE) { + /* TODO: cache values */ + for_each_populated_zone(zone) + auto_movable_stats_account_zone(&stats, zone); + } else { + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (populated_zone(zone)) + auto_movable_stats_account_zone(&stats, zone); + } + } + + kernel_early_pages = stats.kernel_early_pages; + movable_pages = stats.movable_pages; + + /* + * Kernel memory inside dynamic memory group allows for more MOVABLE + * memory within the same group. Remove the effect of all but the + * current group from the stats. + */ + walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, + group, &group_stats); + if (kernel_early_pages <= group_stats.req_kernel_early_pages) + return false; + kernel_early_pages -= group_stats.req_kernel_early_pages; + movable_pages -= group_stats.movable_pages; + + if (group && group->is_dynamic) + kernel_early_pages += group->present_kernel_pages; + + /* + * Test if we could online the given number of pages to ZONE_MOVABLE + * and still stay in the configured ratio. + */ + movable_pages += nr_pages; + return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; +} + /* * Returns a default kernel memory zone for the given pfn range. 
* If no kernel zone covers this pfn range it will automatically go @@ -684,6 +852,117 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn return &pgdat->node_zones[ZONE_NORMAL]; } +/* + * Determine to which zone to online memory dynamically based on user + * configuration and system stats. We care about the following ratio: + * + * MOVABLE : KERNEL + * + * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in + * one of the kernel zones. CMA pages inside one of the kernel zones really + * behaves like ZONE_MOVABLE, so we treat them accordingly. + * + * We don't allow for hotplugged memory in a KERNEL zone to increase the + * amount of MOVABLE memory we can have, so we end up with: + * + * MOVABLE : KERNEL_EARLY + * + * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze + * boot. We base our calculation on KERNEL_EARLY internally, because: + * + * a) Hotplugged memory in one of the kernel zones can sometimes still get + * hotunplugged, especially when hot(un)plugging individual memory blocks. + * There is no coordination across memory devices, therefore "automatic" + * hotunplugging, as implemented in hypervisors, could result in zone + * imbalances. + * b) Early/boot memory in one of the kernel zones can usually not get + * hotunplugged again (e.g., no firmware interface to unplug, fragmented + * with unmovable allocations). While there are corner cases where it might + * still work, it is barely relevant in practice. + * + * Exceptions are dynamic memory groups, which allow for more MOVABLE + * memory within the same memory group -- because in that case, there is + * coordination within the single memory device managed by a single driver. + * + * We rely on "present pages" instead of "managed pages", as the latter is + * highly unreliable and dynamic in virtualized environments, and does not + * consider boot time allocations. 
For example, memory ballooning adjusts the + * managed pages when inflating/deflating the balloon, and balloon compaction + * can even migrate inflated pages between zones. + * + * Using "present pages" is better but some things to keep in mind are: + * + * a) Some memblock allocations, such as for the crashkernel area, are + * effectively unused by the kernel, yet they account to "present pages". + * Fortunately, these allocations are comparatively small in relevant setups + * (e.g., fraction of system memory). + * b) Some hotplugged memory blocks in virtualized environments, esecially + * hotplugged by virtio-mem, look like they are completely present, however, + * only parts of the memory block are actually currently usable. + * "present pages" is an upper limit that can get reached at runtime. As + * we base our calculations on KERNEL_EARLY, this is not an issue. + */ +static struct zone *auto_movable_zone_for_pfn(int nid, + struct memory_group *group, + unsigned long pfn, + unsigned long nr_pages) +{ + unsigned long online_pages = 0, max_pages, end_pfn; + struct page *page; + + if (!auto_movable_ratio) + goto kernel_zone; + + if (group && !group->is_dynamic) { + max_pages = group->s.max_pages; + online_pages = group->present_movable_pages; + + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (group->present_kernel_pages) + goto kernel_zone; + } else if (!group || group->d.unit_pages == nr_pages) { + max_pages = nr_pages; + } else { + max_pages = group->d.unit_pages; + /* + * Take a look at all online sections in the current unit. + * We can safely assume that all pages within a section belong + * to the same zone, because dynamic memory groups only deal + * with hotplugged memory. + */ + pfn = ALIGN_DOWN(pfn, group->d.unit_pages); + end_pfn = pfn + group->d.unit_pages; + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + /* If anything is !MOVABLE online the rest !MOVABLE. 
*/ + if (page_zonenum(page) != ZONE_MOVABLE) + goto kernel_zone; + online_pages += PAGES_PER_SECTION; + } + } + + /* + * Online MOVABLE if we could *currently* online all remaining parts + * MOVABLE. We expect to (add+) online them immediately next, so if + * nobody interferes, all will be MOVABLE if possible. + */ + nr_pages = max_pages - online_pages; + if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) + goto kernel_zone; + +#ifdef CONFIG_NUMA + if (auto_movable_numa_aware && + !auto_movable_can_online_movable(nid, group, nr_pages)) + goto kernel_zone; +#endif /* CONFIG_NUMA */ + + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; +kernel_zone: + return default_kernel_zone_for_pfn(nid, pfn, nr_pages); +} + static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { @@ -708,7 +987,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, +struct zone *zone_for_pfn_range(int online_type, int nid, + struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) @@ -717,6 +997,9 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) + return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); + return default_zone_for_pfn(nid, start_pfn, nr_pages); } @@ -724,10 +1007,25 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. 
*/ -void adjust_present_page_count(struct zone *zone, long nr_pages) +void adjust_present_page_count(struct page *page, struct memory_group *group, + long nr_pages) { + struct zone *zone = page_zone(page); + const bool movable = zone_idx(zone) == ZONE_MOVABLE; + + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. + */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; + + if (group && movable) + group->present_movable_pages += nr_pages; + else if (group && !movable) + group->present_kernel_pages += nr_pages; } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, @@ -773,7 +1071,8 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) { unsigned long flags; int need_zonelists_rebuild = 0; @@ -826,7 +1125,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(zone, nr_pages); + adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1059,6 +1358,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; struct vmem_altmap mhp_altmap = {}; + struct memory_group *group = NULL; u64 start, size; bool new_node = false; int ret; @@ -1070,6 +1370,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) if (ret) return ret; + if (mhp_flags & MHP_NID_IS_MGID) { + 
group = memory_group_find_by_id(nid); + if (!group) + return -EINVAL; + nid = group->nid; + } + if (!node_possible(nid)) { WARN(1, "node %d was absent from the node_possible_map\n", nid); return -EINVAL; @@ -1104,9 +1411,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, mhp_altmap.alloc); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc, + group); if (ret) { - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(start, size, NULL); goto error; } @@ -1298,7 +1606,7 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long pfn, sec_end_pfn; struct zone *zone = NULL; struct page *page; - int i; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { @@ -1307,17 +1615,10 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, continue; for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && - !pfn_valid_within(pfn + i)) - i++; - if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) - continue; /* Check if we got outside of the zone */ - if (zone && !zone_spans_pfn(zone, pfn + i)) + if (zone && !zone_spans_pfn(zone, pfn)) return NULL; - page = pfn_to_page(pfn + i); + page = pfn_to_page(pfn); if (zone && page_zone(page) != zone) return NULL; zone = page_zone(page); @@ -1469,7 +1770,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { @@ -1568,7 +1869,8 @@ 
static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; @@ -1704,7 +2006,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(zone, -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); @@ -1731,6 +2033,7 @@ failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: + lru_cache_enable(); zone_pcp_enable(zone); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", @@ -1745,7 +2048,9 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); + int *nid = arg; + *nid = mem->nid; if (unlikely(ret)) { phys_addr_t beginpa, endpa; @@ -1838,12 +2143,12 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); -static int __ref try_remove_memory(int nid, u64 start, u64 size) +static int __ref try_remove_memory(u64 start, u64 size) { - int rc = 0; struct vmem_altmap mhp_altmap = {}; struct vmem_altmap *altmap = NULL; unsigned long nr_vmemmap_pages; + int rc = 0, nid = NUMA_NO_NODE; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1851,8 +2156,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * All memory blocks must be offlined before removing memory. Check * whether all memory blocks in question are offline and return error * if this is not the case. + * + * While at it, determine the nid. 
Note that if we'd have mixed nodes, + * we'd only try to offline the last determined one -- which is good + * enough for the cases we care about. */ - rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); + rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); if (rc) return rc; @@ -1892,7 +2201,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, altmap); + arch_remove_memory(start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); @@ -1901,7 +2210,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) release_mem_region_adjustable(start, size); - try_offline_node(nid); + if (nid != NUMA_NO_NODE) + try_offline_node(nid); mem_hotplug_done(); return 0; @@ -1909,7 +2219,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) /** * __remove_memory - Remove memory if every memory block is offline - * @nid: the node ID * @start: physical address of the region to remove * @size: size of the region to remove * @@ -1917,14 +2226,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * and online/offline operations before this call, as required by * try_offline_node(). 
*/ -void __remove_memory(int nid, u64 start, u64 size) +void __remove_memory(u64 start, u64 size) { /* * trigger BUG() if some memory is not offlined prior to calling this * function */ - if (try_remove_memory(nid, start, size)) + if (try_remove_memory(start, size)) BUG(); } @@ -1932,12 +2241,12 @@ void __remove_memory(int nid, u64 start, u64 size) * Remove memory if every memory block is offline, otherwise return -EBUSY is * some memory is not offline */ -int remove_memory(int nid, u64 start, u64 size) +int remove_memory(u64 start, u64 size) { int rc; lock_device_hotplug(); - rc = try_remove_memory(nid, start, size); + rc = try_remove_memory(start, size); unlock_device_hotplug(); return rc; @@ -1997,7 +2306,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg) * unplugged all memory (so it's no longer in use) and want to offline + remove * that memory. */ -int offline_and_remove_memory(int nid, u64 start, u64 size) +int offline_and_remove_memory(u64 start, u64 size) { const unsigned long mb_count = size / memory_block_size_bytes(); uint8_t *online_types, *tmp; @@ -2033,7 +2342,7 @@ int offline_and_remove_memory(int nid, u64 start, u64 size) * This cannot fail as it cannot get onlined in the meantime. */ if (!rc) { - rc = try_remove_memory(nid, start, size); + rc = try_remove_memory(start, size); if (rc) pr_err("%s: Failed to remove memory: %d", __func__, rc); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e32360e90274..1592b081c58e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -31,6 +31,9 @@ * but useful to set in a VMA when you have a non default * process policy. * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. 
@@ -189,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, nodes_onto(*ret, tmp, *rel); } -static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; @@ -207,14 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) -{ - if (nodes_empty(*nodes)) - return -EINVAL; - pol->nodes = *nodes; - return 0; -} - /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes @@ -394,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { - .create = mpol_new_interleave, + .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { @@ -402,12 +397,16 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { - .create = mpol_new_bind, + .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, + [MPOL_PREFERRED_MANY] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_preferred, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -900,6 +899,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: *nodes = p->nodes; break; case MPOL_LOCAL: @@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) 
putback_movable_pages(&pagelist); } @@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); } @@ -1362,16 +1362,33 @@ mpol_out: /* * User space interface with variable sized bitmaps for nodelists. */ +static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long nlongs = BITS_TO_LONGS(maxnode); + int ret; + + if (in_compat_syscall()) + ret = compat_get_bitmap(mask, + (const compat_ulong_t __user *)nmask, + maxnode); + else + ret = copy_from_user(mask, nmask, + nlongs * sizeof(unsigned long)); + + if (ret) + return -EFAULT; + + if (maxnode % BITS_PER_LONG) + mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; + + return 0; +} /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { - unsigned long k; - unsigned long t; - unsigned long nlongs; - unsigned long endmask; - --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) @@ -1379,49 +1396,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* * When the user specified more nodes than supported just check - * if the non supported part is all zero. - * - * If maxnode have more longs than MAX_NUMNODES, check - * the bits in that area first. And then go through to - * check the rest bits which equal or bigger than MAX_NUMNODES. - * Otherwise, just check bits [MAX_NUMNODES, maxnode). 
+ * if the non supported part is all zero, one word at a time, + * starting at the end. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { - unsigned long valid_mask = endmask; + while (maxnode > MAX_NUMNODES) { + unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); + unsigned long t; - valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); - if (get_user(t, nmask + nlongs - 1)) + if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits)) return -EFAULT; - if (t & valid_mask) + + if (maxnode - bits >= MAX_NUMNODES) { + maxnode -= bits; + } else { + maxnode = MAX_NUMNODES; + t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + } + if (t) return -EINVAL; } - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - return 0; + return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ @@ -1430,6 +1427,10 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); + bool compat = in_compat_syscall(); + + if (compat) + nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1437,7 +1438,13 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; + maxnode = nr_node_ids; } + + if (compat) + return compat_put_bitmap((compat_ulong_t __user *)mask, + nodes_addr(*nodes), maxnode); + 
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } @@ -1446,7 +1453,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX) + + if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1641,116 +1649,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, - compat_ulong_t, addr, compat_ulong_t, flags) -{ - long err; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) - nm = compat_alloc_user_space(alloc_size); - - err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); - - if (!err && nmask) { - unsigned long copy_size; - copy_size = min_t(unsigned long, sizeof(bm), alloc_size); - err = copy_from_user(bm, nm, copy_size); - /* ensure entire bitmap is zeroed */ - err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); - err |= compat_put_bitmap(nmask, bm, nr_bits); - } - - return err; -} - -COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode) -{ - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(bm, nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, bm, alloc_size)) - return -EFAULT; - } - - return kernel_set_mempolicy(mode, nm, 
nr_bits+1); -} - -COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, - compat_ulong_t, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, compat_ulong_t, flags) -{ - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - nodemask_t bm; - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, nodes_addr(bm), alloc_size)) - return -EFAULT; - } - - return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return kernel_migrate_pages(pid, nr_bits + 1, old, new); -} - -#endif /* CONFIG_COMPAT */ - bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) @@ -1875,16 +1773,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { + int mode = policy->mode; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + if (unlikely(mode == MPOL_BIND) && + apply_policy_zone(policy, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + return &policy->nodes; + + if (mode == MPOL_PREFERRED_MANY) return &policy->nodes; return NULL; } -/* Return the node id preferred by the given mempolicy, or the given id */ +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy. 
+ */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED) { @@ -1922,7 +1831,7 @@ unsigned int mempolicy_slab_node(void) struct mempolicy *policy; int node = numa_mem_id(); - if (in_interrupt()) + if (!in_task()) return node; policy = current->mempolicy; @@ -1936,7 +1845,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { struct zoneref *z; /* @@ -1965,17 +1876,26 @@ unsigned int mempolicy_slab_node(void) */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { - unsigned nnodes = nodes_weight(pol->nodes); - unsigned target; + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; int i; int nid; + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); target = (unsigned int)n % nnodes; - nid = first_node(pol->nodes); + nid = first_node(nodemask); for (i = 0; i < target; i++) - nid = next_node(nid, pol->nodes); + nid = next_node(nid, nodemask); return nid; } @@ -2008,12 +1928,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy - * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. 
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ @@ -2021,16 +1941,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; + int mode; *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + *nodemask = NULL; + mode = (*mpol)->mode; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) + if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) *nodemask = &(*mpol)->nodes; } return nid; @@ -2063,6 +1985,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; @@ -2128,6 +2051,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, struct mempolicy *pol) +{ + struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. 
+ */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + if (!page) + page = __alloc_pages(gfp, order, numa_node_id(), NULL); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. * @gfp: GFP flags. @@ -2163,6 +2107,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (pol->mode == MPOL_PREFERRED_MANY) { + page = alloc_pages_preferred_many(gfp, order, node, pol); + mpol_cond_put(pol); + goto out; + } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; @@ -2173,7 +2123,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * - * If the policy is interleave, or does not allow the current + * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode == MPOL_PREFERRED) @@ -2240,6 +2190,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else if (pol->mode == MPOL_PREFERRED_MANY) + page = alloc_pages_preferred_many(gfp, order, + numa_node_id(), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), @@ -2311,6 +2264,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2425,8 +2379,8 @@ static void sp_free(struct sp_node *n) * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. 
* - * Return: -1 if the page is in a node that is valid for this policy, or a - * suitable node ID to allocate a replacement page from. + * Return: NUMA_NO_NODE if the page is in a node that is valid for this + * policy, or a suitable node ID to allocate a replacement page from. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { @@ -2437,7 +2391,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; - int ret = -1; + int ret = NUMA_NO_NODE; pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) @@ -2451,6 +2405,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: + if (node_isset(curnid, pol->nodes)) + goto out; polnid = first_node(pol->nodes); break; @@ -2465,9 +2421,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; goto out; } + fallthrough; + case MPOL_PREFERRED_MANY: /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. 
@@ -2829,6 +2786,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2907,6 +2865,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -2993,6 +2952,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_LOCAL: break; case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; @@ -3021,3 +2981,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + numa_demotion_enabled = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + numa_demotion_enabled = false; + else + return -EINVAL; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + 
if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif diff --git a/mm/memremap.c b/mm/memremap.c index 15a074ffb8d7..ed593bf87109 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -140,14 +140,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; struct page *first_page; - int nid; /* make sure to access a memmap that was actually initialized */ first_page = pfn_to_page(pfn_first(pgmap, range_id)); /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(first_page); - mem_hotplug_begin(); remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start), PHYS_PFN(range_len(range))); @@ -155,7 +152,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) __remove_pages(PHYS_PFN(range->start), PHYS_PFN(range_len(range)), NULL); } else { - arch_remove_memory(nid, range->start, range_len(range), + arch_remove_memory(range->start, range_len(range), pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(range->start), range_len(range)); } diff --git a/mm/migrate.c b/mm/migrate.c index 23cbd9de030b..a6a7743ee98f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -49,6 +49,7 @@ #include <linux/sched/mm.h> #include <linux/ptrace.h> #include <linux/oom.h> +#include <linux/memory.h> #include <asm/tlbflush.h> @@ -537,54 +538,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, } /* - * Gigantic pages are so large that we do not guarantee that page++ pointer - * arithmetic will work across the entire page. We need something more - * specialized. 
- */ -static void __copy_gigantic_page(struct page *dst, struct page *src, - int nr_pages) -{ - int i; - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < nr_pages; ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - int nr_pages; - - if (PageHuge(src)) { - /* hugetlbfs page */ - struct hstate *h = page_hstate(src); - nr_pages = pages_per_huge_page(h); - - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { - __copy_gigantic_page(dst, src, nr_pages); - return; - } - } else { - /* thp page */ - BUG_ON(!PageTransHuge(src)); - nr_pages = thp_nr_pages(src); - } - - for (i = 0; i < nr_pages; i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } -} - -/* * Copy the page to its new location */ void migrate_page_states(struct page *newpage, struct page *page) @@ -1007,7 +960,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { int rc = -EAGAIN; - int page_was_mapped = 0; + bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(page); @@ -1055,7 +1008,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } /* - * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, + * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. 
File cache pages are no problem because of page_lock() @@ -1110,7 +1063,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); try_to_migrate(page, 0); - page_was_mapped = 1; + page_was_mapped = true; } if (!page_mapped(page)) @@ -1147,6 +1100,80 @@ out: return rc; } + +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { 1, // Node 0 migrates to 1 + * 2, // Node 1 migrates to 2 + * -1, // Node 2 does not migrate + * 4, // Node 3 migrates to 4 + * 5, // Node 4 migrates to 5 + * -1} // Node 5 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +static int node_demotion[MAX_NUMNODES] __read_mostly = + {[0 ... 
MAX_NUMNODES - 1] = NUMA_NO_NODE}; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + int target; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target = READ_ONCE(node_demotion[node]); + rcu_read_unlock(); + + return target; +} + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -1402,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. + * @ret_succeeded: Set to the number of pages migrated successfully if + * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. 
@@ -1412,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; int thp_retry = 1; @@ -1567,6 +1596,9 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + if (ret_succeeded) + *ret_succeeded = nr_succeeded; + return rc; } @@ -1636,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; @@ -1868,6 +1900,23 @@ set_status: mmap_read_unlock(mm); } +static int get_compat_pages_array(const void __user *chunk_pages[], + const void __user * __user *pages, + unsigned long chunk_nr) +{ + compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; + compat_uptr_t p; + int i; + + for (i = 0; i < chunk_nr; i++) { + if (get_user(p, pages32 + i)) + return -EFAULT; + chunk_pages[i] = compat_ptr(p); + } + + return 0; +} + /* * Determine the nodes of a user array of pages and store it in * a user array of status. 
@@ -1887,8 +1936,15 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) chunk_nr = DO_PAGES_STAT_CHUNK_NR; - if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) - break; + if (in_compat_syscall()) { + if (get_compat_pages_array(chunk_pages, pages, + chunk_nr)) + break; + } else { + if (copy_from_user(chunk_pages, pages, + chunk_nr * sizeof(*chunk_pages))) + break; + } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); @@ -1991,28 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); -} -#endif /* CONFIG_COMPAT */ - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA @@ -2075,6 +2109,7 @@ out: static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; + int nr_pages = thp_nr_pages(page); VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); @@ -2083,7 +2118,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) return 0; if (isolate_lru_page(page)) @@ -2091,7 +2126,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) page_lru = page_is_file_lru(page); 
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, - thp_nr_pages(page)); + nr_pages); /* * Isolating the page has taken another reference, so the @@ -2116,7 +2151,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, LIST_HEAD(migratepages); new_page_t *new; bool compound; - unsigned int nr_pages = thp_nr_pages(page); + int nr_pages = thp_nr_pages(page); /* * PTE mapped THP or HugeTLB page can't reach here so the page could @@ -2151,7 +2186,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); @@ -3030,3 +3065,232 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ + +#if defined(CONFIG_MEMORY_HOTPLUG) +/* Disable reclaim-based migration. */ +static void __disable_all_migrate_targets(void) +{ + int node; + + for_each_online_node(node) + node_demotion[node] = NUMA_NO_NODE; +} + +static void disable_all_migrate_targets(void) +{ + __disable_all_migrate_targets(); + + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + * + * The before+after state together might have cycles and + * could cause readers to do things like loop until this + * function finishes. This ensures they can only see a + * single "bad" read and would, for instance, only loop + * once. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for 'node'. + * Failing here is OK. It might just indicate + * being at the end of a chain. 
+ */ +static int establish_migrate_target(int node, nodemask_t *used) +{ + int migration_target; + + /* + * Can not set a migration target on a + * node with it already set. + * + * No need for READ_ONCE() here since this + * is in the write path for node_demotion[]. + * This should be the only thread writing. + */ + if (node_demotion[node] != NUMA_NO_NODE) + return NUMA_NO_NODE; + + migration_target = find_next_best_node(node, used); + if (migration_target == NUMA_NO_NODE) + return NUMA_NO_NODE; + + node_demotion[node] = migration_target; + + return migration_target; +} + +/* + * When memory fills up on a node, memory contents can be + * automatically migrated to another node instead of + * discarded at reclaim. + * + * Establish a "migration path" which will start at nodes + * with CPUs and will follow the priorities used to build the + * page allocator zonelists. + * + * The difference here is that cycles must be avoided. If + * node0 migrates to node1, then neither node1, nor anything + * node1 migrates to can migrate to node0. + * + * This function can run simultaneously with readers of + * node_demotion[]. However, it can not run simultaneously + * with itself. Exclusion is provided by memory hotplug events + * being single-threaded. + */ +static void __set_migration_target_nodes(void) +{ + nodemask_t next_pass = NODE_MASK_NONE; + nodemask_t this_pass = NODE_MASK_NONE; + nodemask_t used_targets = NODE_MASK_NONE; + int node; + + /* + * Avoid any oddities like cycles that could occur + * from changes in the topology. This will leave + * a momentary gap when migration is disabled. + */ + disable_all_migrate_targets(); + + /* + * Allocations go close to CPUs, first. Assume that + * the migration path starts at the nodes with CPUs.
+ */ + next_pass = node_states[N_CPU]; +again: + this_pass = next_pass; + next_pass = NODE_MASK_NONE; + /* + * To avoid cycles in the migration "graph", ensure + * that migration sources are not future targets by + * setting them in 'used_targets'. Do this only + * once per pass so that multiple source nodes can + * share a target node. + * + * 'used_targets' will become unavailable in future + * passes. This limits some opportunities for + * multiple source nodes to share a destination. + */ + nodes_or(used_targets, used_targets, this_pass); + for_each_node_mask(node, this_pass) { + int target_node = establish_migrate_target(node, &used_targets); + + if (target_node == NUMA_NO_NODE) + continue; + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } + /* + * 'next_pass' contains nodes which became migration + * targets in this pass. Make additional passes until + * no more migration targets are available. + */ + if (!nodes_empty(next_pass)) + goto again; +} + +/* + * For callers that do not hold get_online_mems() already. + */ +static void set_migration_target_nodes(void) +{ + get_online_mems(); + __set_migration_target_nodes(); + put_online_mems(); +} + +/* + * React to hotplug events that might affect the migration targets + * like events that online or offline NUMA nodes. + * + * The ordering is also currently dependent on which nodes have + * CPUs. That means we need CPU on/offline notification too. + */ +static int migration_online_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +static int migration_offline_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +/* + * This leaves migrate-on-reclaim transiently disabled between + * the MEM_GOING_OFFLINE and MEM_OFFLINE events.
This runs + * whether reclaim-based migration is enabled or not, which + * ensures that the user can turn reclaim-based migration on at + * any time without needing to recalculate migration targets. + * + * These callbacks already hold get_online_mems(). That is why + * __set_migration_target_nodes() can be used as opposed to + * set_migration_target_nodes(). + */ +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Make sure there are not transient states where + * an offline node is a migration target. This + * will leave migration disabled until the offline + * completes and the MEM_OFFLINE case below runs. + */ + disable_all_migrate_targets(); + break; + case MEM_OFFLINE: + case MEM_ONLINE: + /* + * Recalculate the target nodes once the node + * reaches its final state (online or offline). + */ + __set_migration_target_nodes(); + break; + case MEM_CANCEL_OFFLINE: + /* + * MEM_GOING_OFFLINE disabled all the migration + * targets. Reenable them. + */ + __set_migration_target_nodes(); + break; + case MEM_GOING_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + return notifier_from_errno(0); +} + +static int __init migrate_on_reclaim_init(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim", + migration_online_cpu, + migration_offline_cpu); + /* + * In the unlikely case that this fails, the automatic + * migration targets may become suboptimal for nodes + * where N_CPU changes. With such a small impact in a + * rare case, do not bother trying to do anything special.
+ */ + WARN_ON(ret < 0); + + hotplug_memory_notifier(migrate_on_reclaim_callback, 100); + return 0; +} +late_initcall(migrate_on_reclaim_init); +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/mmap.c b/mm/mmap.c index ca54d36d203a..88dcc5c25225 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -148,8 +148,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_DENYWRITE) - allow_write_access(file); if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); @@ -534,6 +532,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, { struct rb_node **__rb_link, *__rb_parent, *rb_prev; + mmap_assert_locked(mm); __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; @@ -666,8 +665,6 @@ static void __vma_link_file(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - if (vma->vm_flags & VM_DENYWRITE) - put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); @@ -1517,12 +1514,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; - /* - * Make sure there are no mandatory locks on the file. 
- */ - if (locks_verify_locked(file)) - return -EAGAIN; - vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); @@ -1630,8 +1621,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, return PTR_ERR(file); } - flags &= ~MAP_DENYWRITE; - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) @@ -1788,22 +1777,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_DENYWRITE) { - error = deny_write_access(file); - if (error) - goto free_vma; - } if (vm_flags & VM_SHARED) { error = mapping_map_writable(file->f_mapping); if (error) - goto allow_write_and_free_vma; + goto free_vma; } - /* ->mmap() can change vma->vm_file, but must guarantee that - * vma_link() below can deny write-access if VM_DENYWRITE is set - * and map writably if VM_SHARED is set. This usually means the - * new file must not have been exposed to user-space, yet. 
- */ vma->vm_file = get_file(file); error = call_mmap(file, vma); if (error) @@ -1860,13 +1839,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (file) { unmap_writable: - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - } + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); file = vma->vm_file; out: perf_event_mmap(vma); @@ -1906,9 +1881,6 @@ unmap_and_free_vma: charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); -allow_write_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); free_vma: vm_area_free(vma); unacct_error: @@ -2303,6 +2275,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct rb_node *rb_node; struct vm_area_struct *vma; + mmap_assert_locked(mm); /* Check the cache first. */ vma = vmacache_find(mm, addr); if (likely(vma)) @@ -2992,14 +2965,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_vma(mm, start); + vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (start < vma->vm_start) - goto out; - if (start + size > vma->vm_end) { struct vm_area_struct *next; diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f5852a058ce0..1854850b4b89 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -156,14 +156,14 @@ static inline void put_memcg_path_buf(void) #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ const char *memcg_path; \ - preempt_disable(); \ + local_lock(&memcg_paths.lock); \ memcg_path = get_mm_memcg_path(mm); \ trace_mmap_lock_##type(mm, \ memcg_path != NULL ? 
memcg_path : "", \ ##__VA_ARGS__); \ if (likely(memcg_path != NULL)) \ put_memcg_path_buf(); \ - preempt_enable(); \ + local_unlock(&memcg_paths.lock); \ } while (0) #else /* !CONFIG_MEMCG */ diff --git a/mm/mremap.c b/mm/mremap.c index 5989d3990020..badfe17ade1f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -686,7 +686,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) - vm_acct_memory(new_len >> PAGE_SHIFT); + vm_acct_memory(old_len >> PAGE_SHIFT); excess = 0; } diff --git a/mm/nommu.c b/mm/nommu.c index 3a93d4054810..02d2427b8f9e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -826,9 +826,6 @@ static int validate_mmap_request(struct file *file, (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file)) - return -EAGAIN; - if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; @@ -1296,8 +1293,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out; } - flags &= ~MAP_DENYWRITE; - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c729a4c4a1ac..831340e7ad8b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -28,6 +28,7 @@ #include <linux/sched/task.h> #include <linux/sched/debug.h> #include <linux/swap.h> +#include <linux/syscalls.h> #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/cpuset.h> @@ -1141,3 +1142,72 @@ void pagefault_out_of_memory(void) out_of_memory(&oc); mutex_unlock(&oom_lock); } + +SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) +{ +#ifdef CONFIG_MMU + struct mm_struct *mm = NULL; + struct task_struct *task; + struct task_struct *p; + unsigned int f_flags; + bool reap = true; + struct pid *pid; + long ret = 0; + + if (flags) + return -EINVAL; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); 
+ + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + /* + * Make sure to choose a thread which still has a reference to mm + * during the group exit + */ + p = find_lock_task_mm(task); + if (!p) { + ret = -ESRCH; + goto put_task; + } + + mm = p->mm; + mmgrab(mm); + + /* If the work has been done already, just exit with success */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) + reap = false; + else if (!task_will_free_mem(p)) { + reap = false; + ret = -EINVAL; + } + task_unlock(p); + + if (!reap) + goto drop_mm; + + if (mmap_read_lock_killable(mm)) { + ret = -EINTR; + goto drop_mm; + } + if (!__oom_reap_task_mm(mm)) + ret = -EAGAIN; + mmap_read_unlock(mm); + +drop_mm: + mmdrop(mm); +put_task: + put_task_struct(task); +put_pid: + put_pid(pid); + return ret; +#else + return -ENOSYS; +#endif /* CONFIG_MMU */ +} diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..4812a17b288c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { - unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; @@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; - unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; @@ -1115,7 +1115,7 @@ out: 
&wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; - wb->avg_write_bandwidth = avg; + WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) @@ -1147,8 +1147,8 @@ update: dom->dirty_limit = limit; } -static void domain_update_bandwidth(struct dirty_throttle_control *dtc, - unsigned long now) +static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, + unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); @@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, else dirty_ratelimit -= step; - wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); @@ -1332,35 +1332,28 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, - unsigned long start_time, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - wb->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; - lockdep_assert_held(&wb->list_lock); + spin_lock(&wb->list_lock); /* - * rate-limit, only update once every 200ms. + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. */ - if (elapsed < BANDWIDTH_INTERVAL) - return; - + elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); - /* - * Skip quiet periods when disk bandwidth is under-utilized. 
- * (at least 1s idle time between two flusher runs) - */ - if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) - goto snapshot; - if (update_ratelimit) { - domain_update_bandwidth(gdtc, now); + domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* @@ -1368,23 +1361,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { - domain_update_bandwidth(mdtc, now); + domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); -snapshot: wb->dirtied_stamp = dirtied; wb->written_stamp = written; - wb->bw_time_stamp = now; + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); } -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - __wb_update_bandwidth(&gdtc, NULL, start_time, false); + __wb_update_bandwidth(&gdtc, NULL, false); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) + +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); + } } /* @@ -1407,7 +1418,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty, static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { - unsigned long bw = wb->avg_write_bandwidth; + unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* 
@@ -1429,8 +1440,8 @@ static long wb_min_pause(struct bdi_writeback *wb, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { - long hi = ilog2(wb->avg_write_bandwidth); - long lo = ilog2(wb->dirty_ratelimit); + long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); + long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1710,15 +1721,12 @@ free_running: if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; - if (time_is_before_jiffies(wb->bw_time_stamp + - BANDWIDTH_INTERVAL)) { - spin_lock(&wb->list_lock); - __wb_update_bandwidth(gdtc, mdtc, start_time, true); - spin_unlock(&wb->list_lock); - } + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ - dirty_ratelimit = wb->dirty_ratelimit; + dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); @@ -2010,7 +2018,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = @@ -2045,7 +2052,6 @@ void laptop_sync_completion(void) rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload @@ -2347,9 +2353,12 @@ EXPORT_SYMBOL(generic_writepages); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; + struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); @@ -2360,6 +2369,14 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 
cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } @@ -2731,6 +2748,24 @@ int clear_page_dirty_for_io(struct page *page) } EXPORT_SYMBOL(clear_page_dirty_for_io); +static void wb_inode_writeback_start(struct bdi_writeback *wb) +{ + atomic_inc(&wb->writeback_inodes); +} + +static void wb_inode_writeback_end(struct bdi_writeback *wb) +{ + atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. 
+ */ + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); +} + int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2752,6 +2787,9 @@ int test_clear_page_writeback(struct page *page) dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); + if (!mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK)) + wb_inode_writeback_end(wb); } } @@ -2794,8 +2832,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) - inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); + + inc_wb_stat(wb, WB_WRITEBACK); + if (!on_wblist) + wb_inode_writeback_start(wb); + } /* * We can come through here when swapping anonymous diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b97e17806be..b37435c274cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -594,8 +594,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { - if (!pfn_valid_within(page_to_pfn(page))) - return 0; if (zone != page_zone(page)) return 0; @@ -840,21 +838,24 @@ void init_mem_debugging_and_hardening(void) } #endif - if (_init_on_alloc_enabled_early) { - if (page_poisoning_requested) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc\n"); - else - static_branch_enable(&init_on_alloc); - } - if (_init_on_free_enabled_early) { - if (page_poisoning_requested) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_free\n"); - else - static_branch_enable(&init_on_free); + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && + page_poisoning_requested) { + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over 
init_on_alloc and init_on_free\n"); + _init_on_alloc_enabled_early = false; + _init_on_free_enabled_early = false; } + if (_init_on_alloc_enabled_early) + static_branch_enable(&init_on_alloc); + else + static_branch_disable(&init_on_alloc); + + if (_init_on_free_enabled_early) + static_branch_enable(&init_on_free); + else + static_branch_disable(&init_on_free); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -1022,16 +1023,12 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, if (order >= MAX_ORDER - 2) return false; - if (!pfn_valid_within(buddy_pfn)) - return false; - combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - return pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1); + return page_is_buddy(higher_page, higher_buddy, order + 1); } /* @@ -1092,8 +1089,6 @@ continue_merging: buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (!pfn_valid_within(buddy_pfn)) - goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -1751,9 +1746,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. + * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. 
* @@ -1869,8 +1862,6 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; return true; @@ -2517,11 +2508,6 @@ static int move_freepages(struct zone *zone, int pages_moved = 0; for (pfn = start_pfn; pfn <= end_pfn;) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } - page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* @@ -3442,27 +3428,20 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) + if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); + continue; + } /* * Free isolated pages directly to the allocator, see * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(page); - if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { - if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, - migratetype, FPI_NONE); - continue; - } - - /* - * Non-isolated types over MIGRATE_PCPTYPES get added - * to the MIGRATE_MOVABLE pcp list. - */ - set_pcppage_migratetype(page, MIGRATE_MOVABLE); + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + continue; } set_page_private(page, pfn); @@ -3472,7 +3451,15 @@ void free_unref_page_list(struct list_head *list) list_for_each_entry_safe(page, next, list, lru) { pfn = page_private(page); set_page_private(page, 0); + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. 
+ */ migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) + migratetype = MIGRATE_MOVABLE; + trace_mm_page_free_batched(page); free_unref_page_commit(page, pfn, migratetype, 0); @@ -3820,7 +3807,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ -static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); } @@ -4209,7 +4196,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; show_mem(filter, nodemask); @@ -4547,14 +4534,14 @@ static bool __need_reclaim(gfp_t gfp_mask) return true; } -void __fs_reclaim_acquire(void) +void __fs_reclaim_acquire(unsigned long ip) { - lock_map_acquire(&__fs_reclaim_map); + lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); } -void __fs_reclaim_release(void) +void __fs_reclaim_release(unsigned long ip) { - lock_map_release(&__fs_reclaim_map); + lock_release(&__fs_reclaim_map, ip); } void fs_reclaim_acquire(gfp_t gfp_mask) @@ -4563,7 +4550,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_RET_IP_); #ifdef CONFIG_MMU_NOTIFIER lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); @@ -4580,7 +4567,7 @@ void fs_reclaim_release(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_release(); + __fs_reclaim_release(_RET_IP_); } } EXPORT_SYMBOL_GPL(fs_reclaim_release); @@ -4695,7 +4682,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * comment for 
__cpuset_node_allowed(). */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(current)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_HARDER; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5155,7 +5142,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. */ - if (!in_interrupt() && !ac->nodemask) + if (in_task() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; else *alloc_flags |= ALLOC_CPUSET; @@ -5221,9 +5208,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, unsigned int alloc_flags = ALLOC_WMARK_LOW; int nr_populated = 0, nr_account = 0; - if (unlikely(nr_pages <= 0)) - return 0; - /* * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. @@ -5231,19 +5215,35 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, while (page_array && nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; + /* No pages requested? */ + if (unlikely(nr_pages <= 0)) + goto out; + /* Already populated array? */ if (unlikely(page_array && nr_pages - nr_populated == 0)) - return nr_populated; + goto out; /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) goto failed; +#ifdef CONFIG_PAGE_OWNER + /* + * PAGE_OWNER may recurse into the allocator to allocate space to + * save the stack with pagesets.lock held. Releasing/reacquiring + * removes much of the performance benefit of bulk allocation so + * force the caller to allocate one page at a time as it'll have + * similar performance to added complexity to the bulk allocator. + */ + if (static_branch_unlikely(&page_owner_inited)) + goto failed; +#endif + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. 
*/ gfp &= gfp_allowed_mask; alloc_gfp = gfp; if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) - return 0; + goto out; gfp = alloc_gfp; /* Find an allowed local zone that meets the low watermark. */ @@ -5311,6 +5311,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); +out: return nr_populated; failed_irq: @@ -5326,7 +5327,7 @@ failed: nr_populated++; } - return nr_populated; + goto out; } EXPORT_SYMBOL_GPL(__alloc_pages_bulk); @@ -5887,6 +5888,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), @@ -5903,6 +5905,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); @@ -6139,7 +6142,7 @@ static int node_load[MAX_NUMNODES]; * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 
*/ -static int find_next_best_node(int node, nodemask_t *used_node_mask) +int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -6624,7 +6627,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -6669,13 +6671,6 @@ static void __init init_unavailable_range(unsigned long spfn, pr_info("On node %d, zone %s: %lld pages in unavailable ranges", node, zone_names[zone], pgcnt); } -#else -static inline void init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) -{ -} -#endif static void __init memmap_init_zone_range(struct zone *zone, unsigned long start_pfn, @@ -6705,7 +6700,7 @@ static void __init memmap_init(void) { unsigned long start_pfn, end_pfn; unsigned long hole_pfn = 0; - int i, j, zone_id, nid; + int i, j, zone_id = 0, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { struct pglist_data *node = NODE_DATA(nid); @@ -6738,6 +6733,26 @@ static void __init memmap_init(void) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } +void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU @@ -7241,6 +7256,9 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; +#if defined(CONFIG_MEMORY_HOTPLUG) + zone->present_early_pages = real_size; 
+#endif totalpages += size; realtotalpages += real_size; @@ -7485,7 +7503,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } #ifdef CONFIG_FLATMEM -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -7509,8 +7527,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node(size, SMP_CACHE_BYTES, - pgdat->node_id); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); @@ -7531,7 +7549,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif } #else -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -8798,9 +8816,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, } for (; iter < pageblock_nr_pages - offset; iter++) { - if (!pfn_valid_within(pfn + iter)) - continue; - page = pfn_to_page(pfn + iter); /* @@ -8960,7 +8975,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migration_target, - NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless diff --git a/mm/page_ext.c b/mm/page_ext.c index 293b2685fc48..dfb91653d359 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -58,11 +58,21 @@ * can utilize this callback to initialize the state of it correctly. 
*/ +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif }; diff --git a/mm/page_idle.c b/mm/page_idle.c index 64e5344a992c..edead6a8a5f9 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -207,16 +207,6 @@ static const struct attribute_group page_idle_attr_group = { .name = "page_idle", }; -#ifndef CONFIG_64BIT -static bool need_page_idle(void) -{ - return true; -} -struct page_ext_operations page_idle_ops = { - .need = need_page_idle, -}; -#endif - static int __init page_idle_init(void) { int err; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bddf788f45bf..a95c2c6562d0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -93,8 +93,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(buddy_pfn) && - !is_migrate_isolate_page(buddy)) { + if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; } @@ -250,10 +249,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, struct page *page; while (pfn < end_pfn) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); if (PageBuddy(page)) /* @@ -287,6 +282,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn, flags; struct page *page; struct zone *zone; + int ret; /* * Note: pageblock_nr_pages != MAX_ORDER. 
Then, chunks of free pages @@ -299,15 +295,21 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); - if ((pfn < end_pfn) || !page) - return -EBUSY; + if ((pfn < end_pfn) || !page) { + ret = -EBUSY; + goto out; + } + /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); spin_unlock_irqrestore(&zone->lock, flags); + ret = pfn < end_pfn ? -EBUSY : 0; + +out: trace_test_pages_isolated(start_pfn, end_pfn, pfn); - return pfn < end_pfn ? -EBUSY : 0; + return ret; } diff --git a/mm/page_owner.c b/mm/page_owner.c index f51a57e92aa3..62402d22539b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -276,9 +276,6 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - /* The pageblock is online, no need to recheck. 
*/ page = pfn_to_page(pfn); @@ -479,10 +476,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - /* Check for holes within a MAX_ORDER area */ - if (!pfn_valid_within(pfn)) - continue; - page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); @@ -560,14 +553,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { - struct page *page; + struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) continue; diff --git a/mm/percpu.c b/mm/percpu.c index 7f2e0151c4e2..e0a986818903 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -146,7 +146,6 @@ static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; -EXPORT_SYMBOL_GPL(pcpu_base_addr); static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ @@ -1520,9 +1519,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. - * - * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it - * is to serve an allocation in that area. 
*/ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) diff --git a/mm/readahead.c b/mm/readahead.c index d589f147f4c2..41b75d76d36e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -192,6 +192,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ unsigned int nofs = memalloc_nofs_save(); + filemap_invalidate_lock_shared(mapping); /* * Preallocate as many pages as we will need. */ @@ -236,6 +237,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * will then handle the error. */ read_pages(ractl, &page_pool, false); + filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); diff --git a/mm/rmap.c b/mm/rmap.c index 795f9d5f8386..6aebd1747251 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -20,28 +20,29 @@ /* * Lock ordering in mm: * - * inode->i_mutex (while writing or truncating, not reading or faulting) + * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock - * page->flags PG_locked (lock_page) * (see huegtlbfs below) - * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) - * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) - * anon_vma->rwsem - * mm->page_table_lock or pte_lock - * swap_lock (in swap_duplicate, swap_info_get) - * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * lock_page_memcg move_lock (in __set_page_dirty_buffers) - * i_pages lock (widely used) - * lruvec->lru_lock (in lock_page_lruvec_irq) - * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) - * sb_lock (within inode_lock in fs/fs-writeback.c) - * i_pages lock (widely used, in set_page_dirty, - * in arch-dependent flush_dcache_mmap_lock, - * within bdi.wb->list_lock in __sync_single_inode) + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) * 
(see hugetlbfs below) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * mapping->i_mmap_rwsem + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * anon_vma->rwsem + * mm->page_table_lock or pte_lock + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * lock_page_memcg move_lock (in __set_page_dirty_buffers) + * i_pages lock (widely used) + * lruvec->lru_lock (in lock_page_lruvec_irq) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * i_pages lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * @@ -1230,11 +1231,13 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { + struct page *head = compound_head(page); + VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + SetPageDoubleMap(head); if (PageMlocked(page)) - clear_page_mlock(compound_head(page)); + clear_page_mlock(head); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1440,21 +1443,20 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * If the page is mlock()d, we cannot swap it out. 
*/ - if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - /* PTE-mapped THP are never marked as mlocked */ - if (!PageTransCompound(page) || - (PageHead(page) && !PageDoubleMap(page))) { - /* - * Holding pte lock, we do *not* need - * mmap_lock here - */ - mlock_vma_page(page); - } - ret = false; - page_vma_mapped_walk_done(&pvmw); - break; - } + if (!(flags & TTU_IGNORE_MLOCK) && + (vma->vm_flags & VM_LOCKED)) { + /* + * PTE-mapped THP are never marked as mlocked: so do + * not set it on a DoubleMap THP, nor on an Anon THP + * (which may still be PTE-mapped after DoubleMap was + * cleared). But stop unmapping even in those cases. + */ + if (!PageTransCompound(page) || (PageHead(page) && + !PageDoubleMap(page) && !PageAnon(page))) + mlock_vma_page(page); + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; } /* Unexpected PMD-mapped THP? */ @@ -1986,8 +1988,10 @@ static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, */ if (vma->vm_flags & VM_LOCKED) { /* - * PTE-mapped THP are never marked as mlocked, but - * this function is never called when PageDoubleMap(). + * PTE-mapped THP are never marked as mlocked; but + * this function is never called on a DoubleMap THP, + * nor on an Anon THP (which may still be PTE-mapped + * after DoubleMap was cleared). 
*/ mlock_vma_page(page); /* @@ -2022,6 +2026,10 @@ void page_mlock(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + /* Anon THP are only marked as mlocked when singly mapped */ + if (PageTransCompound(page) && PageAnon(page)) + return; + rmap_walk(page, &rwc); } diff --git a/mm/secretmem.c b/mm/secretmem.c index f77d25467a14..1fea68b8d5a6 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -18,6 +18,7 @@ #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> +#include <linux/refcount.h> #include <uapi/linux/magic.h> @@ -40,11 +41,11 @@ module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); -static atomic_t secretmem_users; +static refcount_t secretmem_users; bool secretmem_active(void) { - return !!atomic_read(&secretmem_users); + return !!refcount_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) @@ -103,7 +104,7 @@ static const struct vm_operations_struct secretmem_vm_ops = { static int secretmem_release(struct inode *inode, struct file *file) { - atomic_dec(&secretmem_users); + refcount_dec(&secretmem_users); return 0; } @@ -152,6 +153,7 @@ static void secretmem_freepage(struct page *page) } const struct address_space_operations secretmem_aops = { + .set_page_dirty = __set_page_dirty_no_writeback, .freepage = secretmem_freepage, .migratepage = secretmem_migratepage, .isolate_page = secretmem_isolate_page, @@ -216,7 +218,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) file->f_flags |= O_LARGEFILE; fd_install(fd, file); - atomic_inc(&secretmem_users); + refcount_inc(&secretmem_users); return fd; err_put_fd: diff --git a/mm/shmem.c b/mm/shmem.c index 70d9ce294bb4..88742953532c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,8 +38,7 @@ #include <linux/hugetlb.h> #include <linux/frontswap.h> #include 
<linux/fs_parser.h> - -#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ +#include <linux/swapfile.h> static struct vfsmount *shm_mnt; @@ -96,7 +95,7 @@ static struct vfsmount *shm_mnt; /* * shmem_fallocate communicates with shmem_fault or shmem_writepage via - * inode->i_private (with i_mutex making sure that it has only one user at + * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { @@ -137,9 +136,6 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static bool shmem_should_replace_page(struct page *page, gfp_t gfp); -static int shmem_replace_page(struct page **pagep, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index); static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, @@ -278,10 +274,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_inodes--; @@ -304,7 +300,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) } *inop = ino; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it @@ -319,13 +315,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) * to worry about things like glibc compatibility. 
*/ ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } @@ -341,9 +338,9 @@ static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } } @@ -474,7 +471,38 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly; +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; + +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + loff_t i_size; + + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + return false; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + + switch (SHMEM_SB(inode->i_sb)->huge) { + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + index = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index) + return true; + fallthrough; + case SHMEM_HUGE_ADVISE: + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; + default: + return false; + } +} #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) @@ -645,6 +673,12 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode 
*inode, pgoff_t index) +{ + return false; +} + static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { @@ -652,15 +686,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) -{ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && - shmem_huge != SHMEM_HUGE_DENY) - return true; - return false; -} - /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -774,7 +799,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or the i_pages lock thanks to RCU, + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -806,7 +831,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or the i_pages lock thanks to RCU, + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. 
*/ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -905,6 +930,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; + pagevec_init(&pvec); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, @@ -1038,7 +1066,6 @@ static int shmem_getattr(struct user_namespace *mnt_userns, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -1047,7 +1074,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, } generic_fillattr(&init_user_ns, inode, stat); - if (is_huge_enabled(sb_info)) + if (shmem_is_huge(NULL, inode, 0)) stat->blksize = HPAGE_PMD_SIZE; return 0; @@ -1058,7 +1085,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns, { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; error = setattr_prepare(&init_user_ns, dentry, attr); @@ -1069,7 +1095,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns, loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; - /* protected by i_mutex */ + /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; @@ -1094,24 +1120,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - - /* - * Part of the huge page can be beyond i_size: subject - * to shrink under memory pressure. 
- */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } } } @@ -1156,8 +1164,6 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -extern struct swap_info_struct *swap_info[]; - static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, @@ -1338,7 +1344,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swp_entry_t swap; pgoff_t index; - VM_BUG_ON_PAGE(PageCompound(page), page); + /* + * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or + * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, + * and its shmem_writeback() needs them to be split when swapping. + */ + if (PageTransCompound(page)) { + /* Ensure the subpages are still dirty */ + SetPageDirty(page); + if (split_huge_page(page) < 0) + goto redirty; + ClearPageDirty(page); + } + BUG_ON(!PageLocked(page)); mapping = page->mapping; index = page->index; @@ -1453,10 +1471,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } @@ -1696,8 +1714,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? 
vma->vm_mm : NULL; - struct swap_info_struct *si; - struct page *page = NULL; + struct page *page; swp_entry_t swap; int error; @@ -1705,12 +1722,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*pagep); *pagep = NULL; - /* Prevent swapoff from happening to us. */ - si = get_swap_device(swap); - if (!si) { - error = EINVAL; - goto failed; - } /* Look it up and read it in.. */ page = lookup_swap_cache(swap, NULL, 0); if (!page) { @@ -1772,8 +1783,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, swap_free(swap); *pagep = page; - if (si) - put_swap_device(si); return 0; failed: if (!shmem_confirm_swap(mapping, index, swap)) @@ -1784,9 +1793,6 @@ unlock: put_page(page); } - if (si) - put_swap_device(si); - return error; } @@ -1810,7 +1816,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct page *page; - enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; gfp_t huge_gfp; int error; @@ -1819,8 +1824,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; - if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) - sgp = SGP_CACHE; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { @@ -1852,26 +1855,31 @@ repeat: return error; } - if (page) + if (page) { hindex = page->index; - if (page && sgp == SGP_WRITE) - mark_page_accessed(page); - - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (sgp == SGP_WRITE) + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + /* fallocated page */ if (sgp != SGP_READ) goto clear; unlock_page(page); put_page(page); - page = NULL; - hindex = index; } - if (page || sgp == SGP_READ) - goto out; /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * SGP_READ: succeed on hole, with NULL page, letting caller zero. 
+ * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + */ + *pagep = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; + + /* + * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { @@ -1879,36 +1887,12 @@ repeat: return 0; } - /* shmem_symlink() */ - if (!shmem_mapping(mapping)) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) - goto alloc_huge; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: + /* Never use a huge page for shmem_symlink() */ + if (S_ISLNK(inode->i_mode)) goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: { - loff_t i_size; - pgoff_t off; - - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - - fallthrough; - } - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ + if (!shmem_is_huge(vma, inode, index)) goto alloc_nohuge; - } -alloc_huge: huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); @@ -2064,14 +2048,13 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - enum sgp_type sgp; int err; vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_mutex. So refrain from + * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. 
Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, @@ -2082,7 +2065,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_mutex in fault, + * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ if (unlikely(inode->i_private)) { @@ -2127,15 +2110,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - sgp = SGP_CACHE; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - sgp = SGP_NOHUGE; - else if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); @@ -2482,7 +2457,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; - /* i_mutex is held by caller */ + /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) @@ -2582,7 +2557,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) /* * We must evaluate after, since reads (unlike writes) - * are called without i_mutex protection against truncate + * are called without i_rwsem protection against truncate */ nr = PAGE_SIZE; i_size = i_size_read(inode); @@ -2652,7 +2627,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return -ENXIO; inode_lock(inode); - /* We're holding i_mutex so we can access i_size directly */ + /* We're holding i_rwsem so 
we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); @@ -2667,7 +2642,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; - pgoff_t start, index, end; + pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2681,7 +2656,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); - /* protected by i_mutex */ + /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; @@ -2736,7 +2711,16 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); - for (index = start; index < end; index++) { + /* + * info->fallocend is only relevant when huge pages might be + * involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 
+ */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + + for (index = start; index < end; ) { struct page *page; /* @@ -2750,6 +2734,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { + info->fallocend = undo_fallocend; /* Remove the !PageUptodate pages we added */ if (index > start) { shmem_undo_range(inode, @@ -2759,13 +2744,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } + index++; + /* + * Here is a more important optimization than it appears: + * a second SGP_FALLOC on the same huge page will clear it, + * making it PageUptodate and un-undoable if we fail later. + */ + if (PageTransCompound(page)) { + index = round_up(index, HPAGE_PMD_NR); + /* Beware 32-bit wraparound */ + if (!index) + index--; + } + /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - shmem_falloc.next++; if (!PageUptodate(page)) - shmem_falloc.nr_falloced++; + shmem_falloc.nr_falloced += index - shmem_falloc.next; + shmem_falloc.next = index; /* * If !PageUptodate, leave it that way so that freeable pages @@ -3500,9 +3498,10 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; + struct mempolicy *mpol = NULL; const char *err; - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3547,14 +3546,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. 
*/ if (ctx->mpol) { - mpol_put(sbinfo->mpol); + mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); return 0; out: - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } @@ -3625,7 +3625,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; - int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), @@ -3671,7 +3670,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); + raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); @@ -3703,7 +3702,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return err; + return -ENOMEM; } static int shmem_get_tree(struct fs_context *fc) @@ -3919,7 +3918,7 @@ int __init shmem_init(void) if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else - shmem_huge = 0; /* just in case it was patched */ + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif return 0; @@ -3988,42 +3987,6 @@ struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - loff_t i_size; - pgoff_t off; - - if (!transhuge_vma_enabled(vma, 
vma->vm_flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - return false; - case SHMEM_HUGE_ALWAYS: - return true; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - return true; - fallthrough; - case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); - default: - VM_BUG_ON(1); - return false; - } -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #else /* !CONFIG_SHMEM */ /* diff --git a/mm/slab.h b/mm/slab.h index 67e06637ff2e..58c01a34e5b8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -216,10 +216,18 @@ DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); #endif extern void print_tracking(struct kmem_cache *s, void *object); long validate_slab_cache(struct kmem_cache *s); +static inline bool __slub_debug_enabled(void) +{ + return static_branch_unlikely(&slub_debug_enabled); +} #else static inline void print_tracking(struct kmem_cache *s, void *object) { } +static inline bool __slub_debug_enabled(void) +{ + return false; +} #endif /* @@ -229,11 +237,10 @@ static inline void print_tracking(struct kmem_cache *s, void *object) */ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) { -#ifdef CONFIG_SLUB_DEBUG - VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); - if (static_branch_unlikely(&slub_debug_enabled)) + if (IS_ENABLED(CONFIG_SLUB_DEBUG)) + VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); + if (__slub_debug_enabled()) return s->flags & flags; -#endif return false; } @@ -339,7 +346,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, continue; page = virt_to_head_page(p[i]); - objcgs = page_objcgs(page); + objcgs = page_objcgs_check(page); if (!objcgs) continue; diff --git a/mm/slab_common.c b/mm/slab_common.c 
index 1c673c323baf..ec2bb0beed75 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; + cpus_read_lock(); mutex_lock(&slab_mutex); s->refcount--; @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cache *s) } out_unlock: mutex_unlock(&slab_mutex); + cpus_read_unlock(); } EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/mm/slub.c b/mm/slub.c index dc863c1ea324..3d2025f7163b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -26,7 +26,6 @@ #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> -#include <linux/stackdepot.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/kfence.h> @@ -47,13 +46,21 @@ /* * Lock order: * 1. slab_mutex (Global Mutex) - * 2. node->list_lock - * 3. slab_lock(page) (Only on some arches and for debugging) + * 2. node->list_lock (Spinlock) + * 3. kmem_cache->cpu_slab->lock (Local lock) + * 4. slab_lock(page) (Only on some arches or for debugging) + * 5. object_map_lock (Only for debugging) * * slab_mutex * * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. + * Also synchronizes memory hotplug callbacks. + * + * slab_lock + * + * The slab_lock is a wrapper around the page lock, thus it is a bit + * spinlock. * * The slab_lock is only used for debugging and on arches that do not * have the ability to do a cmpxchg_double. It only protects: @@ -62,6 +69,8 @@ * C. page->objects -> Number of objects in page * D. page->frozen -> frozen state * + * Frozen slabs + * * If a slab is frozen then it is exempt from list management. It is not * on any list except per cpu partial list. The processor that froze the * slab is the one who can perform list operations on the page. Other @@ -69,6 +78,8 @@ * froze the slab is the only one that can retrieve the objects from the * page's freelist. 
* + * list_lock + * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or * removed from the lists nor make the number of partial slabs be modified. @@ -80,10 +91,36 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * Interrupts are disabled during allocation and deallocation in order to - * make the slab allocator safe to use in the context of an irq. In addition - * interrupts are disabled to ensure that the processor does not change - * while handling per_cpu slabs, due to kernel preemption. + * + * cpu_slab->lock local lock + * + * This locks protect slowpath manipulation of all kmem_cache_cpu fields + * except the stat counters. This is a percpu structure manipulated only by + * the local cpu, so the lock protects against being preempted or interrupted + * by an irq. Fast path operations rely on lockless operations instead. + * On PREEMPT_RT, the local lock does not actually disable irqs (and thus + * prevent the lockless operations), so fastpath operations also need to take + * the lock and are no longer lockless. + * + * lockless fastpaths + * + * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) + * are fully lockless when satisfied from the percpu slab (and when + * cmpxchg_double is possible to use, otherwise slab_lock is taken). + * They also don't disable preemption or migration or irqs. They rely on + * the transaction id (tid) field to detect being preempted or moved to + * another cpu. + * + * irq, preemption, migration considerations + * + * Interrupts are disabled as part of list_lock or local_lock operations, or + * around the slab_lock operation, in order to make the slab allocator safe + * to use in the context of an irq. 
+ * + * In addition, preemption (or migration on PREEMPT_RT) is disabled in the + * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the + * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer + * doesn't have to be revalidated in each section protected by the local lock. * * SLUB assigns one slab for allocation to each processor. * Allocations only occur from these slabs called cpu slabs. @@ -119,26 +156,32 @@ * the fast path and disables lockless freelists. */ -#ifdef CONFIG_SLUB_DEBUG +/* + * We could simply use migrate_disable()/enable() but as long as it's a + * function call even on !PREEMPT_RT, use inline preempt_disable() there. + */ +#ifndef CONFIG_PREEMPT_RT +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#else +#define slub_get_cpu_ptr(var) \ +({ \ + migrate_disable(); \ + this_cpu_ptr(var); \ +}) +#define slub_put_cpu_ptr(var) \ +do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) +#endif +#ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); #else DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif - -static inline bool __slub_debug_enabled(void) -{ - return static_branch_unlikely(&slub_debug_enabled); -} - -#else /* CONFIG_SLUB_DEBUG */ - -static inline bool __slub_debug_enabled(void) -{ - return false; -} - #endif /* CONFIG_SLUB_DEBUG */ static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -221,8 +264,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKDEPOT - depot_stack_handle_t handle; +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -374,25 +417,44 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) /* * Per slab locking using the 
pagelock */ -static __always_inline void slab_lock(struct page *page) +static __always_inline void __slab_lock(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void slab_unlock(struct page *page) +static __always_inline void __slab_unlock(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } -/* Interrupts must be disabled (for the fallback code to work right) */ +static __always_inline void slab_lock(struct page *page, unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); + __slab_lock(page); +} + +static __always_inline void slab_unlock(struct page *page, unsigned long *flags) +{ + __slab_unlock(page); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +/* + * Interrupts must be disabled (for the fallback code to work right), typically + * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different + * so we disable interrupts as part of slab_[un]lock(). 
+ */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { - VM_BUG_ON(!irqs_disabled()); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { @@ -403,15 +465,18 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page } else #endif { - slab_lock(page); + /* init to 0 to prevent spurious warnings */ + unsigned long flags = 0; + + slab_lock(page, &flags); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; - slab_unlock(page); + slab_unlock(page, &flags); return true; } - slab_unlock(page); + slab_unlock(page, &flags); } cpu_relax(); @@ -442,16 +507,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, unsigned long flags; local_irq_save(flags); - slab_lock(page); + __slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; - slab_unlock(page); + __slab_unlock(page); local_irq_restore(flags); return true; } - slab_unlock(page); + __slab_unlock(page); local_irq_restore(flags); } @@ -467,7 +532,19 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_SPINLOCK(object_map_lock); +static DEFINE_RAW_SPINLOCK(object_map_lock); + +static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, + struct page *page) +{ + void *addr = page_address(page); + void *p; + + bitmap_zero(obj_map, page->objects); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(__obj_to_index(s, addr, p), obj_map); +} #if 
IS_ENABLED(CONFIG_KUNIT) static bool slab_add_kunit_errors(void) @@ -498,17 +575,11 @@ static inline bool slab_add_kunit_errors(void) { return false; } static unsigned long *get_map(struct kmem_cache *s, struct page *page) __acquires(&object_map_lock) { - void *p; - void *addr = page_address(page); - VM_BUG_ON(!irqs_disabled()); - spin_lock(&object_map_lock); + raw_spin_lock(&object_map_lock); - bitmap_zero(object_map, page->objects); - - for (p = page->freelist; p; p = get_freepointer(s, p)) - set_bit(__obj_to_index(s, addr, p), object_map); + __fill_map(object_map, s, page); return object_map; } @@ -516,7 +587,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); - spin_unlock(&object_map_lock); + raw_spin_unlock(&object_map_lock); } static inline unsigned int size_from_object(struct kmem_cache *s) @@ -591,8 +662,8 @@ static void print_section(char *level, char *text, u8 *addr, unsigned int length) { metadata_access_enable(); - print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS, - 16, 1, addr, length, 1); + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + 16, 1, kasan_reset_tag((void *)addr), length, 1); metadata_access_disable(); } @@ -626,27 +697,22 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } -#ifdef CONFIG_STACKDEPOT -static depot_stack_handle_t save_stack_depot_trace(gfp_t flags) -{ - unsigned long entries[TRACK_ADDRS_COUNT]; - depot_stack_handle_t handle; - unsigned int nr_entries; - - nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 4); - handle = stack_depot_save(entries, nr_entries, flags); - return handle; -} -#endif - static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { struct track *p = get_track(s, object, alloc); if (addr) { -#ifdef CONFIG_STACKDEPOT - p->handle = 
save_stack_depot_trace(GFP_NOWAIT); +#ifdef CONFIG_STACKTRACE + unsigned int nr_entries; + + metadata_access_enable(); + nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), + TRACK_ADDRS_COUNT, 3); + metadata_access_disable(); + + if (nr_entries < TRACK_ADDRS_COUNT) + p->addrs[nr_entries] = 0; #endif p->addr = addr; p->cpu = smp_processor_id(); @@ -673,19 +739,14 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKDEPOT +#ifdef CONFIG_STACKTRACE { - depot_stack_handle_t handle; - unsigned long *entries; - unsigned int nr_entries; - - handle = READ_ONCE(t->handle); - if (!handle) { - pr_err("object allocation/free stack trace missing\n"); - } else { - nr_entries = stack_depot_fetch(handle, &entries); - stack_trace_print(entries, nr_entries, 0); - } + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + pr_err("\t%pS\n", (void *)t->addrs[i]); + else + break; } #endif } @@ -1028,8 +1089,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) { int maxobj; - VM_BUG_ON(!irqs_disabled()); - if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; @@ -1290,11 +1349,11 @@ static noinline int free_debug_processing( struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long flags; + unsigned long flags, flags2; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); - slab_lock(page); + slab_lock(page, &flags2); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, page)) @@ -1327,7 +1386,7 @@ out: slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); - slab_unlock(page); + slab_unlock(page, &flags2); spin_unlock_irqrestore(&n->list_lock, flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); @@ -1425,12 +1484,13 @@ check_slabs: static int __init setup_slub_debug(char *str) { 
slab_flags_t flags; + slab_flags_t global_flags; char *saved_str; char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; - slub_debug = DEBUG_DEFAULT_FLAGS; + global_flags = DEBUG_DEFAULT_FLAGS; if (*str++ != '=' || !*str) /* * No options specified. Switch on full debugging. @@ -1442,7 +1502,7 @@ static int __init setup_slub_debug(char *str) str = parse_slub_debug_flags(str, &flags, &slab_list, true); if (!slab_list) { - slub_debug = flags; + global_flags = flags; global_slub_debug_changed = true; } else { slab_list_specified = true; @@ -1451,16 +1511,18 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of - * slabs means debugging is only enabled for those slabs, so the global - * slub_debug should be 0. We can extended that to multiple lists as + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ if (slab_list_specified) { if (!global_slub_debug_changed) - slub_debug = 0; + global_flags = slub_debug; slub_debug_string = saved_str; } out: + slub_debug = global_flags; if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else @@ -1607,20 +1669,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, { kmemleak_free_recursive(x, s->flags); - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. 
- */ -#ifdef CONFIG_LOCKDEP - { - unsigned long flags; + debug_check_no_locks_freed(x, s->object_size); - local_irq_save(flags); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); @@ -1837,9 +1887,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (gfpflags_allow_blocking(flags)) - local_irq_enable(); - flags |= s->allocflags; /* @@ -1898,8 +1945,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (gfpflags_allow_blocking(flags)) - local_irq_disable(); if (!page) return NULL; @@ -1913,6 +1958,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } @@ -2036,18 +2083,24 @@ static inline void *acquire_slab(struct kmem_cache *s, return freelist; } +#ifdef CONFIG_SLUB_CPU_PARTIAL static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +#else +static inline void put_cpu_partial(struct kmem_cache *s, struct page *page, + int drain) { } +#endif static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. 
*/ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct kmem_cache_cpu *c, gfp_t flags) + struct page **ret_page, gfp_t gfpflags) { struct page *page, *page2; void *object = NULL; unsigned int available = 0; + unsigned long flags; int objects; /* @@ -2059,11 +2112,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(page, flags)) + if (!pfmemalloc_match(page, gfpflags)) continue; t = acquire_slab(s, n, page, object == NULL, &objects); @@ -2072,7 +2125,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, available += objects; if (!object) { - c->page = page; + *ret_page = page; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { @@ -2084,7 +2137,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); return object; } @@ -2092,7 +2145,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, * Get a page from somewhere. Search in increasing NUMA distances. */ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct kmem_cache_cpu *c) + struct page **ret_page) { #ifdef CONFIG_NUMA struct zonelist *zonelist; @@ -2134,7 +2187,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed(zone, flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c, flags); + object = get_partial_node(s, n, ret_page, flags); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2156,7 +2209,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * Get a partial page, lock it and return it. 
*/ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct kmem_cache_cpu *c) + struct page **ret_page) { void *object; int searchnode = node; @@ -2164,11 +2217,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), c, flags); + object = get_partial_node(s, get_node(s, searchnode), ret_page, flags); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, c); + return get_any_partial(s, flags, ret_page); } #ifdef CONFIG_PREEMPTION @@ -2235,16 +2288,23 @@ static inline void note_cmpxchg_failure(const char *n, static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; + struct kmem_cache_cpu *c; - for_each_possible_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(s->cpu_slab, cpu); + local_lock_init(&c->lock); + c->tid = init_tid(cpu); + } } /* - * Remove the cpu slab + * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist, + * unfreezes the slabs and puts it on the proper list. + * Assumes the slab has been already safely taken away from kmem_cache_cpu + * by the caller. 
*/ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist, struct kmem_cache_cpu *c) + void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2252,6 +2312,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, enum slab_modes l = M_NONE, m = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; + unsigned long flags = 0; struct page new; struct page old; @@ -2327,7 +2388,7 @@ redo: * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } } else { m = M_FULL; @@ -2338,7 +2399,7 @@ redo: * slabs from diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } } @@ -2355,14 +2416,14 @@ redo: } l = m; - if (!__cmpxchg_double_slab(s, page, + if (!cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) goto redo; if (lock) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); if (m == M_PARTIAL) stat(s, tail); @@ -2373,38 +2434,29 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } - - c->page = NULL; - c->freelist = NULL; } -/* - * Unfreeze all the cpu partial slabs. - * - * This function must be called with interrupts disabled - * for the cpu using c (or some other guarantee must be there - * to guarantee no concurrent accesses). 
- */ -static void unfreeze_partials(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ #ifdef CONFIG_SLUB_CPU_PARTIAL +static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) +{ struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; + unsigned long flags = 0; - while ((page = slub_percpu_partial(c))) { + while (partial_page) { struct page new; struct page old; - slub_set_percpu_partial(c, page); + page = partial_page; + partial_page = page->next; n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); n = n2; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } do { @@ -2433,7 +2485,7 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); while (discard_page) { page = discard_page; @@ -2443,7 +2495,35 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } -#endif /* CONFIG_SLUB_CPU_PARTIAL */ +} + +/* + * Unfreeze all the cpu partial slabs. 
+ */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct page *partial_page; + unsigned long flags; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + partial_page = this_cpu_read(s->cpu_slab->partial); + this_cpu_write(s->cpu_slab->partial, NULL); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (partial_page) + __unfreeze_partials(s, partial_page); +} + +static void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + struct page *partial_page; + + partial_page = slub_percpu_partial(c); + c->partial = NULL; + + if (partial_page) + __unfreeze_partials(s, partial_page); } /* @@ -2455,97 +2535,170 @@ static void unfreeze_partials(struct kmem_cache *s, */ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { -#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *oldpage; - int pages; - int pobjects; + struct page *page_to_unfreeze = NULL; + unsigned long flags; + int pages = 0; + int pobjects = 0; - preempt_disable(); - do { - pages = 0; - pobjects = 0; - oldpage = this_cpu_read(s->cpu_slab->partial); + local_lock_irqsave(&s->cpu_slab->lock, flags); + + oldpage = this_cpu_read(s->cpu_slab->partial); - if (oldpage) { + if (oldpage) { + if (drain && oldpage->pobjects > slub_cpu_partial(s)) { + /* + * Partial array is full. Move the existing set to the + * per node partial list. Postpone the actual unfreezing + * outside of the critical section. + */ + page_to_unfreeze = oldpage; + oldpage = NULL; + } else { pobjects = oldpage->pobjects; pages = oldpage->pages; - if (drain && pobjects > slub_cpu_partial(s)) { - unsigned long flags; - /* - * partial array is full. Move the existing - * set to the per node partial list. 
- */ - local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - local_irq_restore(flags); - oldpage = NULL; - pobjects = 0; - pages = 0; - stat(s, CPU_PARTIAL_DRAIN); - } } + } - pages++; - pobjects += page->objects - page->inuse; + pages++; + pobjects += page->objects - page->inuse; - page->pages = pages; - page->pobjects = pobjects; - page->next = oldpage; + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) - != oldpage); - if (unlikely(!slub_cpu_partial(s))) { - unsigned long flags; + this_cpu_write(s->cpu_slab->partial, page); - local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (page_to_unfreeze) { + __unfreeze_partials(s, page_to_unfreeze); + stat(s, CPU_PARTIAL_DRAIN); } - preempt_enable(); -#endif /* CONFIG_SLUB_CPU_PARTIAL */ } +#else /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void unfreeze_partials(struct kmem_cache *s) { } +static inline void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } + +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist, c); + unsigned long flags; + struct page *page; + void *freelist; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + + page = c->page; + freelist = c->freelist; + c->page = NULL; + c->freelist = NULL; c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (page) { + deactivate_slab(s, page, freelist); + stat(s, CPUSLAB_FLUSH); + } +} + +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + void *freelist = c->freelist; + struct page *page = c->page; + + c->page = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + 
+ if (page) { + deactivate_slab(s, page, freelist); + stat(s, CPUSLAB_FLUSH); + } + + unfreeze_partials_cpu(s, c); } +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + /* * Flush cpu slab. * - * Called from IPI handler with interrupts disabled. + * Called from CPU work handler with migration disabled. */ -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +static void flush_cpu_slab(struct work_struct *w) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct kmem_cache *s; + struct kmem_cache_cpu *c; + struct slub_flush_work *sfw; + + sfw = container_of(w, struct slub_flush_work, work); + + s = sfw->s; + c = this_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); - unfreeze_partials(s, c); + unfreeze_partials(s); } -static void flush_cpu_slab(void *d) +static bool has_cpu_slab(int cpu, struct kmem_cache *s) { - struct kmem_cache *s = d; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - __flush_cpu_slab(s, smp_processor_id()); + return c->page || slub_percpu_partial(c); } -static bool has_cpu_slab(int cpu, void *info) +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); + +static void flush_all_cpus_locked(struct kmem_cache *s) { - struct kmem_cache *s = info; - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct slub_flush_work *sfw; + unsigned int cpu; - return c->page || slub_percpu_partial(c); + lockdep_assert_cpus_held(); + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (!has_cpu_slab(cpu, s)) { + sfw->skip = true; + continue; + } + INIT_WORK(&sfw->work, flush_cpu_slab); + sfw->skip = false; + sfw->s = s; + schedule_work_on(cpu, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (sfw->skip) + continue; + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); } static void flush_all(struct kmem_cache *s) { - 
on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); + cpus_read_lock(); + flush_all_cpus_locked(s); + cpus_read_unlock(); } /* @@ -2555,14 +2708,10 @@ static void flush_all(struct kmem_cache *s) static int slub_cpu_dead(unsigned int cpu) { struct kmem_cache *s; - unsigned long flags; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - local_irq_save(flags); + list_for_each_entry(s, &slab_caches, list) __flush_cpu_slab(s, cpu); - local_irq_restore(flags); - } mutex_unlock(&slab_mutex); return 0; } @@ -2645,44 +2794,22 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) #endif } -static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, - int node, struct kmem_cache_cpu **pc) +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) { - void *freelist; - struct kmem_cache_cpu *c = *pc; - struct page *page; - - WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); - - freelist = get_partial(s, flags, node, c); - - if (freelist) - return freelist; - - page = new_slab(s, flags, node); - if (page) { - c = raw_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); - - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - freelist = page->freelist; - page->freelist = NULL; - - stat(s, ALLOC_SLAB); - c->page = page; - *pc = c; - } + if (unlikely(PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); - return freelist; + return true; } -static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) +/* + * A variant of pfmemalloc_match() that tests page flags without asserting + * PageSlab. Intended for opportunistic checks before taking a lock and + * rechecking that nobody else freed the page under us. 
+ */ +static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) { - if (unlikely(PageSlabPfmemalloc(page))) + if (unlikely(__PageSlabPfmemalloc(page))) return gfp_pfmemalloc_allowed(gfpflags); return true; @@ -2695,8 +2822,6 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) * The page is still frozen if the return value is not NULL. * * If this function returns NULL then the page has been unfrozen. - * - * This function must be called with interrupt disabled. */ static inline void *get_freelist(struct kmem_cache *s, struct page *page) { @@ -2704,6 +2829,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) unsigned long counters; void *freelist; + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + do { freelist = page->freelist; counters = page->counters; @@ -2738,7 +2865,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. * - * Version of __slab_alloc to use when we know that interrupts are + * Version of __slab_alloc to use when we know that preemption is * already disabled (which is the case for bulk allocation). 
*/ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, @@ -2746,10 +2873,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void *freelist; struct page *page; + unsigned long flags; stat(s, ALLOC_SLOWPATH); - page = c->page; +reread_page: + + page = READ_ONCE(c->page); if (!page) { /* * if the node is not online or has no normal memory, just @@ -2772,8 +2902,7 @@ redo: goto redo; } else { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist, c); - goto new_slab; + goto deactivate_slab; } } @@ -2782,12 +2911,15 @@ redo: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist, c); - goto new_slab; + if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) + goto deactivate_slab; + + /* must check again c->page in case we got preempted and it changed */ + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(page != c->page)) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_page; } - - /* must check again c->freelist in case of cpu migration or IRQ */ freelist = c->freelist; if (freelist) goto load_freelist; @@ -2796,6 +2928,7 @@ redo: if (!freelist) { c->page = NULL; + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2803,6 +2936,9 @@ redo: stat(s, ALLOC_REFILL); load_freelist: + + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + /* * freelist is pointing to the list of objects to be used. * page is pointing to the page from which the objects are obtained. 
@@ -2811,59 +2947,141 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); return freelist; +deactivate_slab: + + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (page != c->page) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_page; + } + freelist = c->freelist; + c->page = NULL; + c->freelist = NULL; + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + deactivate_slab(s, page, freelist); + new_slab: if (slub_percpu_partial(c)) { + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->page)) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_page; + } + if (unlikely(!slub_percpu_partial(c))) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + /* we were preempted and partial list got empty */ + goto new_objects; + } + page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; } - freelist = new_slab_objects(s, gfpflags, node, &c); +new_objects: + + freelist = get_partial(s, gfpflags, node, &page); + if (freelist) + goto check_new_page; + + slub_put_cpu_ptr(s->cpu_slab); + page = new_slab(s, gfpflags, node); + c = slub_get_cpu_ptr(s->cpu_slab); - if (unlikely(!freelist)) { + if (unlikely(!page)) { slab_out_of_memory(s, gfpflags, node); return NULL; } - page = c->page; - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) - goto load_freelist; + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + freelist = page->freelist; + page->freelist = NULL; - /* Only entered in the debug case */ - if (kmem_cache_debug(s) && - !alloc_debug_processing(s, page, freelist, addr)) - goto new_slab; /* Slab failed checks. 
Next slab needed */ + stat(s, ALLOC_SLAB); + +check_new_page: - deactivate_slab(s, page, get_freepointer(s, freelist), c); + if (kmem_cache_debug(s)) { + if (!alloc_debug_processing(s, page, freelist, addr)) { + /* Slab failed checks. Next slab needed */ + goto new_slab; + } else { + /* + * For debug case, we don't load freelist so that all + * allocations go through alloc_debug_processing() + */ + goto return_single; + } + } + + if (unlikely(!pfmemalloc_match(page, gfpflags))) + /* + * For !pfmemalloc_match() case we don't load freelist so that + * we don't make further mismatched allocations easier. + */ + goto return_single; + +retry_load_page: + + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->page)) { + void *flush_freelist = c->freelist; + struct page *flush_page = c->page; + + c->page = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + deactivate_slab(s, flush_page, flush_freelist); + + stat(s, CPUSLAB_FLUSH); + + goto retry_load_page; + } + c->page = page; + + goto load_freelist; + +return_single: + + deactivate_slab(s, page, get_freepointer(s, freelist)); return freelist; } /* - * Another one that disabled interrupt and compensates for possible - * cpu changes by refetching the per cpu area pointer. + * A wrapper for ___slab_alloc() for contexts where preemption is not yet + * disabled. Compensates for possible cpu changes by refetching the per cpu area + * pointer. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *p; - unsigned long flags; - local_irq_save(flags); -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_COUNT /* * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area + * cpu before disabling preemption. Need to reload cpu area * pointer. 
*/ - c = this_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); #endif p = ___slab_alloc(s, gfpflags, node, addr, c); - local_irq_restore(flags); +#ifdef CONFIG_PREEMPT_COUNT + slub_put_cpu_ptr(s->cpu_slab); +#endif return p; } @@ -2914,15 +3132,14 @@ redo: * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * We should guarantee that tid and kmem_cache are retrieved on - * the same cpu. It could be different if CONFIG_PREEMPTION so we need - * to check if it is matched or not. + * We must guarantee that tid and kmem_cache_cpu are retrieved on the + * same cpu. We read first the kmem_cache_cpu pointer and use it to read + * the tid. If we are preempted and switched to another cpu between the + * two reads, it's OK as the two are still associated with the same cpu + * and cmpxchg later will validate the cpu. */ - do { - tid = this_cpu_read(s->cpu_slab->tid); - c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPTION) && - unlikely(tid != READ_ONCE(c->tid))); + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); /* * Irqless object alloc/free algorithm used here depends on sequence @@ -2943,7 +3160,15 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !page || !node_match(page, node))) { + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if a + * slowpath has taken the local_lock_irqsave(), it is not protected + * against a fast path operation in an irq handler. So we need to take + * the slow path which uses local_lock. It is still relatively fast if + * there is a suitable cpu freelist. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || + unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); @@ -3196,16 +3421,14 @@ redo: * data is retrieved via this pointer. 
If we are on the same cpu * during the cmpxchg then the free will succeed. */ - do { - tid = this_cpu_read(s->cpu_slab->tid); - c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPTION) && - unlikely(tid != READ_ONCE(c->tid))); + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); /* Same with comment on barrier() in slab_alloc_node() */ barrier(); if (likely(page == c->page)) { +#ifndef CONFIG_PREEMPT_RT void **freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3218,6 +3441,31 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } +#else /* CONFIG_PREEMPT_RT */ + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if + * a slowpath has taken the local_lock_irqsave(), it is not + * protected against a fast path operation in an irq handler. So + * we need to take the local_lock. We shouldn't simply defer to + * __slab_free() as that wouldn't use the cpu freelist at all. + */ + void **freelist; + + local_lock(&s->cpu_slab->lock); + c = this_cpu_ptr(s->cpu_slab); + if (unlikely(page != c->page)) { + local_unlock(&s->cpu_slab->lock); + goto redo; + } + tid = c->tid; + freelist = c->freelist; + + set_freepointer(s, tail_obj, freelist); + c->freelist = head; + c->tid = next_tid(tid); + + local_unlock(&s->cpu_slab->lock); +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, head, tail_obj, cnt, addr); @@ -3261,6 +3509,16 @@ struct detached_freelist { struct kmem_cache *s; }; +static inline void free_nonslab_page(struct page *page, void *object) +{ + unsigned int order = compound_order(page); + + VM_BUG_ON_PAGE(!PageCompound(page), page); + kfree_hook(object); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __free_pages(page, order); +} + /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same @@ -3297,9 +3555,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (!s) 
{ /* Handle kalloc'ed objects */ if (unlikely(!PageSlab(page))) { - BUG_ON(!PageCompound(page)); - kfree_hook(object); - __free_pages(page, compound_order(page)); + free_nonslab_page(page, object); p[size] = NULL; /* mark object processed */ return size; } @@ -3387,8 +3643,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * IRQs, which protects against PREEMPT and interrupts * handlers invoking normal fastpath. */ - local_irq_disable(); - c = this_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); + local_lock_irq(&s->cpu_slab->lock); for (i = 0; i < size; i++) { void *object = kfence_alloc(s, s->object_size, flags); @@ -3409,6 +3665,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, */ c->tid = next_tid(c->tid); + local_unlock_irq(&s->cpu_slab->lock); + /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist @@ -3421,6 +3679,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); + local_lock_irq(&s->cpu_slab->lock); + continue; /* goto for-loop */ } c->freelist = get_freepointer(s, object); @@ -3428,7 +3688,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); - local_irq_enable(); + local_unlock_irq(&s->cpu_slab->lock); + slub_put_cpu_ptr(s->cpu_slab); /* * memcg and kmem_cache debug support and memory initialization. 
@@ -3438,7 +3699,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s)); return i; error: - local_irq_enable(); + slub_put_cpu_ptr(s->cpu_slab); slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; @@ -3952,11 +4213,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long flags; unsigned long *map; void *p; slab_err(s, page, text, s->name); - slab_lock(page); + slab_lock(page, &flags); map = get_map(s, page); for_each_object(p, s, addr, page->objects) { @@ -3967,7 +4229,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } put_map(map); - slab_unlock(page); + slab_unlock(page, &flags); #endif } @@ -4017,7 +4279,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) int node; struct kmem_cache_node *n; - flush_all(s); + flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { free_partial(s, n); @@ -4059,26 +4321,18 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; -#ifdef CONFIG_STACKDEPOT - { - depot_stack_handle_t handle; - unsigned long *entries; - unsigned int nr_entries; - - handle = READ_ONCE(trackp->handle); - if (handle) { - nr_entries = stack_depot_fetch(handle, &entries); - for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) - kpp->kp_stack[i] = (void *)entries[i]; - } +#ifdef CONFIG_STACKTRACE + for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { + kpp->kp_stack[i] = (void *)trackp->addrs[i]; + if (!kpp->kp_stack[i]) + break; + } - trackp = get_track(s, objp, TRACK_FREE); - handle = READ_ONCE(trackp->handle); - if (handle) { - nr_entries = stack_depot_fetch(handle, &entries); - for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) - 
kpp->kp_free_stack[i] = (void *)entries[i]; - } + trackp = get_track(s, objp, TRACK_FREE); + for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { + kpp->kp_free_stack[i] = (void *)trackp->addrs[i]; + if (!kpp->kp_free_stack[i]) + break; } #endif #endif @@ -4283,13 +4537,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { - unsigned int order = compound_order(page); - - BUG_ON(!PageCompound(page)); - kfree_hook(object); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(page, order); + free_nonslab_page(page, object); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); @@ -4307,7 +4555,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +static int __kmem_cache_do_shrink(struct kmem_cache *s) { int node; int i; @@ -4319,7 +4567,6 @@ int __kmem_cache_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) @@ -4369,13 +4616,21 @@ int __kmem_cache_shrink(struct kmem_cache *s) return ret; } +int __kmem_cache_shrink(struct kmem_cache *s) +{ + flush_all(s); + return __kmem_cache_do_shrink(s); +} + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + list_for_each_entry(s, &slab_caches, list) { + flush_all_cpus_locked(s); + __kmem_cache_do_shrink(s); + } mutex_unlock(&slab_mutex); return 0; @@ -4701,33 +4956,33 @@ static int count_total(struct page *page) #endif #ifdef CONFIG_SLUB_DEBUG -static void validate_slab(struct kmem_cache *s, struct page *page) +static void validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *obj_map) { void *p; void *addr = page_address(page); - 
unsigned long *map; + unsigned long flags; - slab_lock(page); + slab_lock(page, &flags); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) goto unlock; /* Now we know that a valid freelist exists */ - map = get_map(s, page); + __fill_map(obj_map, s, page); for_each_object(p, s, addr, page->objects) { - u8 val = test_bit(__obj_to_index(s, addr, p), map) ? + u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ? SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; if (!check_object(s, page, p, val)) break; } - put_map(map); unlock: - slab_unlock(page); + slab_unlock(page, &flags); } static int validate_slab_node(struct kmem_cache *s, - struct kmem_cache_node *n) + struct kmem_cache_node *n, unsigned long *obj_map) { unsigned long count = 0; struct page *page; @@ -4736,7 +4991,7 @@ static int validate_slab_node(struct kmem_cache *s, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) { - validate_slab(s, page); + validate_slab(s, page, obj_map); count++; } if (count != n->nr_partial) { @@ -4749,7 +5004,7 @@ static int validate_slab_node(struct kmem_cache *s, goto out; list_for_each_entry(page, &n->full, slab_list) { - validate_slab(s, page); + validate_slab(s, page, obj_map); count++; } if (count != atomic_long_read(&n->nr_slabs)) { @@ -4768,10 +5023,17 @@ long validate_slab_cache(struct kmem_cache *s) int node; unsigned long count = 0; struct kmem_cache_node *n; + unsigned long *obj_map; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) + return -ENOMEM; flush_all(s); for_each_kmem_cache_node(s, node, n) - count += validate_slab_node(s, n); + count += validate_slab_node(s, n, obj_map); + + bitmap_free(obj_map); return count; } @@ -4907,17 +5169,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc) + struct page *page, enum track_item alloc, + unsigned long *obj_map) { void *addr = 
page_address(page); void *p; - unsigned long *map; - map = get_map(s, page); + __fill_map(obj_map, s, page); + for_each_object(p, s, addr, page->objects) - if (!test_bit(__obj_to_index(s, addr, p), map)) + if (!test_bit(__obj_to_index(s, addr, p), obj_map)) add_location(t, s, get_track(s, p, alloc)); - put_map(map); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ @@ -5844,17 +6106,21 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, sizeof(struct loc_track)); struct kmem_cache *s = file_inode(filep)->i_private; + unsigned long *obj_map; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) + return -ENOMEM; if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) alloc = TRACK_ALLOC; else alloc = TRACK_FREE; - if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) + if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { + bitmap_free(obj_map); return -ENOMEM; - - /* Push back cpu slabs */ - flush_all(s); + } for_each_kmem_cache_node(s, node, n) { unsigned long flags; @@ -5865,12 +6131,13 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) - process_slab(t, s, page, alloc); + process_slab(t, s, page, alloc, obj_map); list_for_each_entry(page, &n->full, slab_list) - process_slab(t, s, page, alloc); + process_slab(t, s, page, alloc, obj_map); spin_unlock_irqrestore(&n->list_lock, flags); } + bitmap_free(obj_map); return 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 6326cdf36c4f..120bc8ea5293 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -#ifdef CONFIG_SPARSEMEM_EXTREME -unsigned long __section_nr(struct mem_section *ms) -{ - unsigned long root_nr; - struct mem_section *root = 
NULL; - - for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { - root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); - if (!root) - continue; - - if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) - break; - } - - VM_BUG_ON(!root); - - return (root_nr * SECTIONS_PER_ROOT) + (ms - root); -} -#else -unsigned long __section_nr(struct mem_section *ms) -{ - return (unsigned long)(ms - mem_section[0]); -} -#endif - /* * During early boot, before section_mem_map is used for an actual * mem_map, we use section_mem_map to store the section's NUMA @@ -143,7 +117,7 @@ unsigned long __section_nr(struct mem_section *ms) */ static inline unsigned long sparse_encode_early_nid(int nid) { - return (nid << SECTION_NID_SHIFT); + return ((unsigned long)nid << SECTION_NID_SHIFT); } static inline int sparse_early_nid(struct mem_section *section) @@ -187,10 +161,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, * those loops early. */ unsigned long __highest_present_section_nr; -static void section_mark_present(struct mem_section *ms) +static void __section_mark_present(struct mem_section *ms, + unsigned long section_nr) { - unsigned long section_nr = __section_nr(ms); - if (section_nr > __highest_present_section_nr) __highest_present_section_nr = section_nr; @@ -280,7 +253,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (!ms->section_mem_map) { ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; - section_mark_present(ms); + __section_mark_present(ms, section); } } } @@ -348,7 +321,8 @@ size_t mem_section_usage_size(void) static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { #ifndef CONFIG_NUMA - return __pa_symbol(pgdat); + VM_BUG_ON(pgdat != &contig_page_data); + return __pa_symbol(&contig_page_data); #else return __pa(pgdat); #endif @@ -462,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = 
memblock_alloc_try_nid_raw(size, size, addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", __func__, size, PAGE_SIZE, nid, &addr); @@ -490,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), - addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; } @@ -934,7 +906,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); - section_mark_present(ms); + __section_mark_present(ms, section_nr); /* Align memmap to section boundary in the subsection case */ if (section_nr_to_pfn(section_nr) != start_pfn) diff --git a/mm/swap.c b/mm/swap.c index 19600430e536..897200d27dd0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,28 +179,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, } EXPORT_SYMBOL_GPL(get_kernel_pages); -/* - * get_kernel_page() - pin a kernel page in memory - * @start: starting kernel address - * @write: pinning for read/write, currently ignored - * @pages: array that receives pointer to the page pinned. - * Must be at least nr_segs long. - * - * Returns 1 if page is pinned. If the page was not pinned, returns - * -errno. The page returned must be released with a put_page() call - * when it is finished with. 
- */ -int get_kernel_page(unsigned long start, int write, struct page **pages) -{ - const struct kvec kiov = { - .iov_base = (void *)start, - .iov_len = PAGE_SIZE - }; - - return get_kernel_pages(&kiov, 1, write, pages); -} -EXPORT_SYMBOL_GPL(get_kernel_page); - static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { diff --git a/mm/swap_slots.c b/mm/swap_slots.c index a66f3e0ec973..16f706c55d92 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -70,9 +70,9 @@ void disable_swap_slots_cache_lock(void) swap_slot_cache_enabled = false; if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ - get_online_cpus(); + cpus_read_lock(); __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); - put_online_cpus(); + cpus_read_unlock(); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index c56aa9ac050d..bc7cee6b2ec5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -628,13 +628,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; - /* Test swap type to make sure the dereference is safe */ - if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { - struct inode *inode = si->swap_file->f_mapping->host; - if (inode_read_congested(inode)) - goto skip; - } - do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. 
*/ start_offset = offset & ~mask; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e07d1c776f2..22d10f713848 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3130,6 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; + struct dentry *dentry; int prio; int error; union swap_header *swap_header; @@ -3173,6 +3174,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; + dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(p, inode); @@ -3180,6 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; inode_lock(inode); + if (d_unlinked(dentry) || cant_mount(dentry)) { + error = -ENOENT; + goto bad_swap_unlock_inode; + } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; @@ -3773,7 +3779,7 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; int nid = page_to_nid(page); diff --git a/mm/truncate.c b/mm/truncate.c index 234ddd879caa..714eaf19821d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -412,7 +412,8 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @mapping: mapping to truncate * @lstart: offset from which to truncate * - * Called under (and serialised by) inode->i_mutex. + * Called under (and serialised by) inode->i_rwsem and + * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of * deletion (inside __delete_from_page_cache()) in the specified range. 
Thus @@ -429,7 +430,7 @@ EXPORT_SYMBOL(truncate_inode_pages); * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * - * Called under (and serialized by) inode->i_mutex. + * Called under (and serialized by) inode->i_rwsem. * * Filesystems have to use this in the .evict_inode path to inform the * VM that this is the final truncate and the inode is going away. @@ -483,8 +484,9 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, index = indices[i]; if (xa_is_value(page)) { - invalidate_exceptional_entry(mapping, index, - page); + count += invalidate_exceptional_entry(mapping, + index, + page); continue; } index += thp_nr_pages(page) - 1; @@ -512,19 +514,18 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, } /** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate + * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode + * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. + * This function removes pages that are clean, unmapped and unlocked, + * as well as shadow entries. It will not block on IO activity. * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. + * If you want to remove all the pages of one inode, regardless of + * their use and writeback state, use truncate_inode_pages(). 
* - * Return: the number of the pages that were invalidated + * Return: the number of the cache entries that were invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -560,21 +561,19 @@ void invalidate_mapping_pagevec(struct address_space *mapping, static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - unsigned long flags; - if (page->mapping != mapping) return 0; if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -582,7 +581,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); return 0; } @@ -748,7 +747,7 @@ EXPORT_SYMBOL(truncate_pagecache); * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally - * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * i_rwsem but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) @@ -777,7 +776,7 @@ EXPORT_SYMBOL(truncate_setsize); * * The function must be called after i_size is updated so that page fault * coming after we unlock the page will already see the new i_size. 
- * The function must be called while we still hold i_mutex - this not only + * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0e2132834bc7..7a9008415534 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -483,7 +483,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mcopy_mode, - bool *mmap_changing, + atomic_t *mmap_changing, __u64 mode) { struct vm_area_struct *dst_vma; @@ -517,7 +517,7 @@ retry: * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; /* @@ -650,28 +650,29 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode) + atomic_t *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, mmap_changing, 0); } ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, bool *mmap_changing) + unsigned long len, bool enable_wp, + atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; pgprot_t newprot; @@ -694,7 +695,7 @@ int 
mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; err = -ENOENT; diff --git a/mm/util.c b/mm/util.c index 99c6cc77de9e..499b6b5767ed 100644 --- a/mm/util.c +++ b/mm/util.c @@ -593,6 +593,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; + /* Don't even allow crazy sizes */ + if (WARN_ON_ONCE(size > INT_MAX)) + return NULL; + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } @@ -635,6 +639,21 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); +void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +{ + void *newp; + + if (oldsize >= newsize) + return (void *)p; + newp = kvmalloc(newsize, flags); + if (!newp) + return NULL; + memcpy(newp, p, oldsize); + kvfree(p); + return newp; +} +EXPORT_SYMBOL(kvrealloc); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; @@ -731,6 +750,16 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +void copy_huge_page(struct page *dst, struct page *src) +{ + unsigned i, nr = compound_nr(src); + + for (i = 0; i < nr; i++) { + cond_resched(); + copy_highpage(nth_page(dst, i), nth_page(src, i)); + } +} + int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; unsigned long sysctl_overcommit_kbytes __read_mostly; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d5cd52805149..d77830ff604c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -44,6 +44,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; + +static int __init set_nohugeiomap(char *str) +{ + ioremap_max_page_shift = PAGE_SHIFT; + return 
0; +} +early_param("nohugeiomap", set_nohugeiomap); +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; @@ -298,15 +311,14 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, return err; } -int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) +int ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { int err; - err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), + ioremap_max_page_shift); flush_cache_vmap(addr, end); - return err; } @@ -787,6 +799,28 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +{ + struct vmap_area *va = NULL; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *tmp; + + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end > addr) { + va = tmp; + if (tmp->va_start <= addr) + break; + + n = n->rb_left; + } else + n = n->rb_right; + } + + return va; +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -1479,6 +1513,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; + unsigned long freed; unsigned long addr; int purged = 0; int ret; @@ -1542,13 +1577,12 @@ overflow: goto retry; } - if (gfpflags_allow_blocking(gfp_mask)) { - unsigned long freed = 0; - blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); - if (freed > 0) { - purged = 0; - goto retry; - } + freed = 0; + blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); + + if (freed > 0) { + purged = 0; + goto 
retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) @@ -2779,7 +2813,7 @@ EXPORT_SYMBOL_GPL(vmap_pfn); static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, - unsigned int order, unsigned long nr_pages, struct page **pages) + unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; @@ -2789,10 +2823,32 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * to fails, fallback to a single page allocator that is * more permissive. */ - if (!order) - nr_allocated = alloc_pages_bulk_array_node( - gfp, nid, nr_pages, pages); - else + if (!order) { + while (nr_allocated < nr_pages) { + unsigned int nr, nr_pages_request; + + /* + * A maximum allowed request is hard-coded and is 100 + * pages per call. That is done in order to prevent a + * long preemption off scenario in the bulk-allocator + * so the range is [1:100]. + */ + nr_pages_request = min(100U, nr_pages - nr_allocated); + + nr = alloc_pages_bulk_array_node(gfp, nid, + nr_pages_request, pages + nr_allocated); + + nr_allocated += nr; + cond_resched(); + + /* + * If zero or pages were obtained partly, + * fallback to a single page allocator. + */ + if (nr != nr_pages_request) + break; + } + } else /* * Compound pages required for remap_vmalloc_page if * high-order pages. 
@@ -2816,9 +2872,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; - if (gfpflags_allow_blocking(gfp)) - cond_resched(); - + cond_resched(); nr_allocated += 1U << order; } @@ -3267,9 +3321,14 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr); + va = find_vmap_area_exceed_addr((unsigned long)addr); if (!va) goto finished; + + /* no intersects with alive vmap_area */ + if ((unsigned long)addr + count <= va->va_start) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d69019fc3789..76518e4166dc 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) @@ -240,7 +239,12 @@ static void vmpressure_work_fn(struct work_struct *work) void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { - struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + struct vmpressure *vmpr; + + if (mem_cgroup_disabled()) + return; + + vmpr = memcg_to_vmpressure(memcg); /* * Here we only want to account pressure that userland is able to diff --git a/mm/vmscan.c b/mm/vmscan.c index 4620df62f0ff..74296c2d1fed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> +#include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/oom.h> @@ -100,9 +101,12 @@ struct scan_control { unsigned 
int may_swap:1; /* - * Cgroups are not reclaimed below their configured memory.low, - * unless we threaten to OOM. If any cgroups are skipped due to - * memory.low and nothing was reclaimed, go back for memory.low. + * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. */ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; @@ -118,6 +122,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + /* Allocation order */ s8 order; @@ -515,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } +static bool can_demote(int nid, struct scan_control *sc) +{ + if (!numa_demotion_enabled) + return false; + if (sc) { + if (sc->no_demotion) + return false; + /* It is pointless to do demotion in memcg reclaim */ + if (cgroup_reclaim(sc)) + return false; + } + if (next_demotion_node(nid) == NUMA_NO_NODE) + return false; + + return true; +} + +static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, + int nid, + struct scan_control *sc) +{ + if (memcg == NULL) { + /* + * For non-memcg reclaim, is there + * space in any swap device? + */ + if (get_nr_swap_pages() > 0) + return true; + } else { + /* Is the memcg below its swap limit? */ + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) + return true; + } + + /* + * The page can not be swapped. + * + * Can it be reclaimed from this node via demotion? + */ + return can_demote(nid, sc); +} + /* * This misses isolated pages which are not accounted for to save counters. 
* As the data only determines if reclaim or compaction continues, it is @@ -526,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); - if (get_nr_swap_pages() > 0) + if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); @@ -890,6 +939,7 @@ out: void drop_slab_node(int nid) { unsigned long freed; + int shift = 0; do { struct mem_cgroup *memcg = NULL; @@ -902,7 +952,7 @@ void drop_slab_node(int nid) do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - } while (freed > 10); + } while ((freed >> shift++) > 1); } void drop_slab(void) @@ -1049,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed, struct mem_cgroup *target_memcg) { - unsigned long flags; int refcount; void *shadow = NULL; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy page. 
* @@ -1097,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -1123,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); if (freepage != NULL) freepage(page); @@ -1132,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); return 0; } @@ -1261,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +static struct page *alloc_demote_page(struct page *page, unsigned long node) +{ + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_THISNODE | __GFP_NOWARN | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = node + }; + + return alloc_migration_target(page, (unsigned long)&mtc); +} + +/* + * Take pages on @demote_list and attempt to demote them to + * another node. Pages which are not demoted are left on + * @demote_pages. 
+ */ +static unsigned int demote_page_list(struct list_head *demote_pages, + struct pglist_data *pgdat) +{ + int target_nid = next_demotion_node(pgdat->node_id); + unsigned int nr_succeeded; + int err; + + if (list_empty(demote_pages)) + return 0; + + if (target_nid == NUMA_NO_NODE) + return 0; + + /* Demotion ignores all cpuset and mempolicy settings */ + err = migrate_pages(demote_pages, alloc_demote_page, NULL, + target_nid, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); + + if (current_is_kswapd()) + __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); + else + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + + return nr_succeeded; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1272,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); + LIST_HEAD(demote_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; + bool do_demote_pass; memset(stat, 0, sizeof(*stat)); cond_resched(); + do_demote_pass = can_demote(pgdat->node_id, sc); +retry: while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1427,6 +1528,17 @@ static unsigned int shrink_page_list(struct list_head *page_list, } /* + * Before reclaiming the page, try to relocate + * its contents to another node. + */ + if (do_demote_pass && + (thp_migration_supported() || !PageTransHuge(page))) { + list_add(&page->lru, &demote_pages); + unlock_page(page); + continue; + } + + /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree page could be freed directly @@ -1621,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list, /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; - if (PageDirty(page)) { - page_ref_unfreeze(page, 1); - goto keep_locked; - } - + /* + * The page has only one reference left, which is + * from the isolation. 
After the caller puts the + * page back on lru and drops the reference, the + * page will be freed anyway. It doesn't matter + * which lru it goes. So we don't bother checking + * PageDirty here. + */ count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true, @@ -1677,6 +1792,17 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + /* 'page_list' is always empty here */ + + /* Migrate pages selected for demotion */ + nr_reclaimed += demote_page_list(&demote_pages, pgdat); + /* Pages that could not be demoted are still in @demote_pages */ + if (!list_empty(&demote_pages)) { + /* Pages which failed to demoted go back on @page_list for retry: */ + list_splice_init(&demote_pages, page_list); + do_demote_pass = false; + goto retry; + } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; @@ -1695,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, { struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_unmap = 1, }; struct reclaim_stat stat; @@ -2320,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list) unsigned int noreclaim_flag; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, + .no_demotion = 1, }; noreclaim_flag = memalloc_noreclaim_save(); @@ -2449,6 +2574,7 @@ enum scan_balance { static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); @@ -2459,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru; /* If we have no swap space, do not bother scanning anon pages. 
*/ - if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { + if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } @@ -2537,15 +2663,14 @@ out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long lruvec_size; + unsigned long low, min; unsigned long scan; - unsigned long protection; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(sc->target_mem_cgroup, - memcg, - sc->memcg_low_reclaim); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, + &min, &low); - if (protection) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2576,12 +2701,21 @@ out: * hard protection. */ unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } /* Avoid TOCTOU with earlier protection check */ cgroup_size = max(cgroup_size, protection); scan = lruvec_size - lruvec_size * protection / - cgroup_size; + (cgroup_size + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep @@ -2634,6 +2768,21 @@ out: } } +/* + * Anonymous LRU management is a waste if there is + * ultimately no way to reclaim the memory. 
+ */ +static bool can_age_anon_pages(struct pglist_data *pgdat, + struct scan_control *sc) +{ + /* Aging the anon LRU is valuable if swap is present: */ + if (total_swap_pages > 0) + return true; + + /* Also valuable if anon pages can be demoted: */ + return can_demote(pgdat->node_id, sc); +} + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; @@ -2743,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2813,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; @@ -2887,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; @@ -3423,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. 
*/ - if (!(gfp_mask & __GFP_FS)) { + if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); + else + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + allow_direct_reclaim(pgdat)); - goto check_pending; - } - - /* Throttle until kswapd wakes the process */ - wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); - -check_pending: if (fatal_signal_pending(current)) return true; @@ -3572,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat, struct mem_cgroup *memcg; struct lruvec *lruvec; - if (!total_swap_pages) + if (!can_age_anon_pages(pgdat, sc)) return; lruvec = mem_cgroup_lruvec(NULL, pgdat); @@ -3801,7 +3953,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); @@ -3927,9 +4079,9 @@ restart: wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - __fs_reclaim_release(); + __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_THIS_IP_); if (ret || kthread_should_stop()) break; @@ -3981,7 +4133,7 @@ out: } snapshot_refaults(NULL, pgdat); - __fs_reclaim_release(); + __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); @@ -4279,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
*/ -int kswapd_run(int nid) +void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - int ret = 0; if (pgdat->kswapd) - return 0; + return; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); - ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; } - return ret; } /* @@ -4413,11 +4562,13 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; + unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask); cond_resched(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP @@ -4442,6 +4593,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); diff --git a/mm/vmstat.c b/mm/vmstat.c index b0534e068166..8ce2620344b2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -129,9 +129,9 @@ static void sum_vm_events(unsigned long *ret) */ void all_vm_events(unsigned long *ret) { - get_online_cpus(); + cpus_read_lock(); sum_vm_events(ret); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(all_vm_events); @@ -204,7 +204,7 @@ int calculate_normal_threshold(struct zone *zone) * * Some sample thresholds: * - * Threshold Processors (fls) Zonesize fls(mem+1) + * Threshold Processors (fls) Zonesize fls(mem)+1 * ------------------------------------------------------------------ * 8 1 1 0.9-1 GB 4 * 16 2 2 0.9-1 GB 4 @@ -319,6 +319,16 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + /* + * Accurate vmstat updates require a RMW. 
On !PREEMPT_RT kernels, + * atomicity is provided by IRQs being disabled -- either explicitly + * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables + * CPU migrations and preemption potentially corrupts a counter so + * disable preemption. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -328,6 +338,9 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -350,6 +363,10 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, delta >>= PAGE_SHIFT; } + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -359,6 +376,9 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -391,6 +411,10 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -399,6 +423,9 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -409,6 +436,10 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + 
preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -417,6 +448,9 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -437,6 +471,10 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -445,6 +483,9 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -455,6 +496,10 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -463,6 +508,9 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) @@ -1217,6 +1265,8 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgdemote_kswapd", + "pgdemote_direct", "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", @@ -1452,7 +1502,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, } 
/* Print out the free pages at each order for each migatetype */ -static int pagetypeinfo_showfree(struct seq_file *m, void *arg) +static void pagetypeinfo_showfree(struct seq_file *m, void *arg) { int order; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1464,8 +1514,6 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); - - return 0; } static void pagetypeinfo_showblockcount_print(struct seq_file *m, @@ -1501,7 +1549,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, } /* Print out the number of pageblocks for each migratetype */ -static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1512,8 +1560,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showblockcount_print); - - return 0; } /* @@ -1874,11 +1920,6 @@ static void vmstat_update(struct work_struct *w) } /* - * Switch off vmstat processing and then fold all the remaining differentials - * until the diffs stay at zero. The function is used by NOHZ and can only be - * invoked when tick processing is not active. - */ -/* * Check if the diffs for a certain cpu indicate that * an update is needed. */ @@ -1894,17 +1935,15 @@ static bool need_update(int cpu) /* * The fast way of checking if there are any vmstat diffs. 
*/ - if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * - sizeof(pzstats->vm_stat_diff[0]))) + if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff))) return true; if (last_pgdat == zone->zone_pgdat) continue; last_pgdat = zone->zone_pgdat; n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); - if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS * - sizeof(n->vm_node_stat_diff[0]))) - return true; + if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff))) + return true; } return false; } @@ -1948,7 +1987,7 @@ static void vmstat_shepherd(struct work_struct *w) { int cpu; - get_online_cpus(); + cpus_read_lock(); /* Check processors whose vmstat worker threads have been disabled */ for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); @@ -1958,7 +1997,7 @@ static void vmstat_shepherd(struct work_struct *w) cond_resched(); } - put_online_cpus(); + cpus_read_unlock(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); @@ -2037,9 +2076,9 @@ void __init init_mm_internals(void) if (ret < 0) pr_err("vmstat: failed to register 'online' hotplug state\n"); - get_online_cpus(); + cpus_read_lock(); init_cpu_node_state(); - put_online_cpus(); + cpus_read_unlock(); start_shepherd_timer(); #endif diff --git a/mm/workingset.c b/mm/workingset.c index 5ba3e42446fa..d4268d8e9a82 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -249,7 +249,7 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) * @target_memcg: the cgroup that is causing the reclaim * @page: the page being evicted * - * Returns a shadow entry to be stored in @page->mapping->i_pages in place + * Return: a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) |