-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt |  10
-rw-r--r-- | include/linux/list.h                             |  17
-rw-r--r-- | include/linux/mmzone.h                           |   1
-rw-r--r-- | init/Kconfig                                     |  24
-rw-r--r-- | mm/Makefile                                      |   7
-rw-r--r-- | mm/memory_hotplug.c                              |   3
-rw-r--r-- | mm/page_alloc.c                                  |   6
-rw-r--r-- | mm/shuffle.c                                     | 184
-rw-r--r-- | mm/shuffle.h                                     |  52
9 files changed, 302 insertions, 2 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 43176340c73d..5be4d3ff5e70 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3174,6 +3174,16 @@
 			This will also cause panics on machine check exceptions.
 			Useful together with panic=30 to trigger a reboot.
 
+	page_alloc.shuffle=
+			[KNL] Boolean flag to control whether the page allocator
+			should randomize its free lists. The randomization may
+			be automatically enabled if the kernel detects it is
+			running on a platform with a direct-mapped memory-side
+			cache, and this parameter can be used to
+			override/disable that behavior. The state of the flag
+			can be read from sysfs at:
+			/sys/module/page_alloc/parameters/shuffle.
+
 	page_owner=	[KNL] Boot-time page_owner enabling option.
 			Storage of the information about who allocated
 			each page is disabled in default. With this switch,
diff --git a/include/linux/list.h b/include/linux/list.h
index 9e9a6403dbe4..d3b4db895340 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -151,6 +151,23 @@ static inline void list_replace_init(struct list_head *old,
 }
 
 /**
+ * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
+ * @entry1: the location to place entry2
+ * @entry2: the location to place entry1
+ */
+static inline void list_swap(struct list_head *entry1,
+			     struct list_head *entry2)
+{
+	struct list_head *pos = entry2->prev;
+
+	list_del(entry2);
+	list_replace(entry1, entry2);
+	if (pos == entry1)
+		pos = entry2;
+	list_add(entry1, pos);
+}
+
+/**
  * list_del_init - deletes entry from list and reinitialize it.
  * @entry: the element to delete from the list.
  */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5a4aedc160bd..1fb5a04530aa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1271,6 +1271,7 @@ void sparse_init(void);
 #else
 #define sparse_init()	do {} while (0)
 #define sparse_index_init(_sec, _nid)	do {} while (0)
+#define pfn_present pfn_valid
 #endif /* CONFIG_SPARSEMEM */
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index 82b84e5ee30d..8b9ffe236e4f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1752,6 +1752,30 @@ config SLAB_FREELIST_HARDENED
 	  sacrifies to harden the kernel slab allocator against common
 	  freelist exploit methods.
 
+config SHUFFLE_PAGE_ALLOCATOR
+	bool "Page allocator randomization"
+	default SLAB_FREELIST_RANDOM && ACPI_NUMA
+	help
+	  Randomization of the page allocator improves the average
+	  utilization of a direct-mapped memory-side-cache. See section
+	  5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI
+	  6.2a specification for an example of how a platform advertises
+	  the presence of a memory-side-cache. There are also incidental
+	  security benefits as it reduces the predictability of page
+	  allocations to compliment SLAB_FREELIST_RANDOM, but the
+	  default granularity of shuffling on the "MAX_ORDER - 1" i.e,
+	  10th order of pages is selected based on cache utilization
+	  benefits on x86.
+
+	  While the randomization improves cache utilization it may
+	  negatively impact workloads on platforms without a cache. For
+	  this reason, by default, the randomization is enabled only
+	  after runtime detection of a direct-mapped memory-side-cache.
+	  Otherwise, the randomization may be force enabled with the
+	  'page_alloc.shuffle' kernel command line parameter.
+
+	  Say Y if unsure.
+
 config SLUB_CPU_PARTIAL
 	default y
 	depends on SLUB && SMP
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU)	+= process_vm_access.o
 endif
 
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
-			   maccess.o page_alloc.o page-writeback.o \
+			   maccess.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
@@ -41,6 +41,11 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   interval_tree.o list_lru.o workingset.o \
 			   debug.o $(mmu-y)
 
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
 obj-y += init-mm.o
 obj-y += memblock.o
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6c0c4f48638e..328878b6799d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -39,6 +39,7 @@
 #include <asm/tlbflush.h>
 
 #include "internal.h"
+#include "shuffle.h"
 
 /*
  * online_page_callback contains pointer to current page onlining function.
@@ -891,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	zone->zone_pgdat->node_present_pages += onlined_pages;
 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 
+	shuffle_zone(zone);
+
 	if (onlined_pages) {
 		node_states_set_node(nid, &arg);
 		if (need_zonelists_rebuild)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4b2d5f50431d..548f8f5d3295 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,6 +72,7 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
+#include "shuffle.h"
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -1874,9 +1875,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
 void __init page_alloc_init_late(void)
 {
 	struct zone *zone;
+	int nid;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-	int nid;
 
 	/* There will be num_node_state(N_MEMORY) threads */
 	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1900,6 +1901,9 @@ void __init page_alloc_init_late(void)
 	/* Discard memblock private memory */
 	memblock_discard();
 
+	for_each_node_state(nid, N_MEMORY)
+		shuffle_free_memory(NODE_DATA(nid));
+
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
 }
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000000..bc0419a61fbe
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+static unsigned long shuffle_state __ro_after_init;
+
+/*
+ * Depending on the architecture, module parameter parsing may run
+ * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
+ * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
+ * attempts to turn on the implementation, but aborts if it finds
+ * SHUFFLE_FORCE_DISABLE already set.
+ */
+__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+	if (ctl == SHUFFLE_FORCE_DISABLE)
+		set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
+
+	if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
+		if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
+			static_branch_disable(&page_alloc_shuffle_key);
+	} else if (ctl == SHUFFLE_ENABLE
+			&& !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
+		static_branch_enable(&page_alloc_shuffle_key);
+}
+
+static bool shuffle_param;
+extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
+			? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+		const struct kernel_param *kp)
+{
+	int rc = param_set_bool(val, kp);
+
+	if (rc < 0)
+		return rc;
+	if (shuffle_param)
+		page_alloc_shuffle(SHUFFLE_ENABLE);
+	else
+		page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+	return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+{
+	struct page *page;
+
+	/*
+	 * Given we're dealing with randomly selected pfns in a zone we
+	 * need to ask questions like...
+	 */
+
+	/* ...is the pfn even in the memmap? */
+	if (!pfn_valid_within(pfn))
+		return NULL;
+
+	/* ...is the pfn in a present section or a hole? */
+	if (!pfn_present(pfn))
+		return NULL;
+
+	/* ...is the page free and currently on a free_area list? */
+	page = pfn_to_page(pfn);
+	if (!PageBuddy(page))
+		return NULL;
+
+	/*
+	 * ...is the page on the same list as the page we will
+	 * shuffle it with?
+	 */
+	if (page_order(page) != order)
+		return NULL;
+
+	return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+	unsigned long i, flags;
+	unsigned long start_pfn = z->zone_start_pfn;
+	unsigned long end_pfn = zone_end_pfn(z);
+	const int order = SHUFFLE_ORDER;
+	const int order_pages = 1 << order;
+
+	spin_lock_irqsave(&z->lock, flags);
+	start_pfn = ALIGN(start_pfn, order_pages);
+	for (i = start_pfn; i < end_pfn; i += order_pages) {
+		unsigned long j;
+		int migratetype, retry;
+		struct page *page_i, *page_j;
+
+		/*
+		 * We expect page_i, in the sub-range of a zone being added
+		 * (@start_pfn to @end_pfn), to more likely be valid compared to
+		 * page_j randomly selected in the span @zone_start_pfn to
+		 * @spanned_pages.
+		 */
+		page_i = shuffle_valid_page(i, order);
+		if (!page_i)
+			continue;
+
+		for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+			/*
+			 * Pick a random order aligned page in the zone span as
+			 * a swap target. If the selected pfn is a hole, retry
+			 * up to SHUFFLE_RETRY attempts find a random valid pfn
+			 * in the zone.
+			 */
+			j = z->zone_start_pfn +
+				ALIGN_DOWN(get_random_long() % z->spanned_pages,
+						order_pages);
+			page_j = shuffle_valid_page(j, order);
+			if (page_j && page_j != page_i)
+				break;
+		}
+		if (retry >= SHUFFLE_RETRY) {
+			pr_debug("%s: failed to swap %#lx\n", __func__, i);
+			continue;
+		}
+
+		/*
+		 * Each migratetype corresponds to its own list, make sure the
+		 * types match otherwise we're moving pages to lists where they
+		 * do not belong.
+		 */
+		migratetype = get_pageblock_migratetype(page_i);
+		if (get_pageblock_migratetype(page_j) != migratetype) {
+			pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+			continue;
+		}
+
+		list_swap(&page_i->lru, &page_j->lru);
+
+		pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+		/* take it easy on the zone lock */
+		if ((i % (100 * order_pages)) == 0) {
+			spin_unlock_irqrestore(&z->lock, flags);
+			cond_resched();
+			spin_lock_irqsave(&z->lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/**
+ * shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+	struct zone *z;
+
+	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+		shuffle_zone(z);
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000000..644c8ee97b9e
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+/*
+ * SHUFFLE_ENABLE is called from the command line enabling path, or by
+ * platform-firmware enabling that indicates the presence of a
+ * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
+ * the command line path and overrides any previous or future
+ * SHUFFLE_ENABLE.
+ */
+enum mm_shuffle_ctl {
+	SHUFFLE_ENABLE,
+	SHUFFLE_FORCE_DISABLE,
+};
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+		return;
+	__shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+		return;
+	__shuffle_zone(z);
+}
+#else
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
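
The list_swap() helper added to include/linux/list.h above is the one piece of generic infrastructure in this patch: entry2 takes over entry1's slot and entry1 is re-added where entry2 used to be, with a fixup for the case where the two entries are adjacent. The sketch below is not part of the patch; it is a standalone userspace illustration that re-implements just enough of the kernel's list primitives (list_add, list_del, list_replace) to run the same list_swap() body, and the struct item type and its ids are hypothetical, with the lru member placed first so a plain cast can stand in for container_of()/list_entry().

#include <stdio.h>

/* Minimal userspace stand-ins for the kernel's circular doubly-linked list */
struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

/* Insert 'entry' right after 'head' */
static void list_add(struct list_head *entry, struct list_head *head)
{
	entry->next = head->next;
	entry->prev = head;
	head->next->prev = entry;
	head->next = entry;
}

/* Unlink 'entry'; its neighbors are repaired, 'entry' itself is left dangling */
static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

/* Put 'new_entry' into the list position currently occupied by 'old' */
static void list_replace(struct list_head *old, struct list_head *new_entry)
{
	new_entry->next = old->next;
	new_entry->next->prev = new_entry;
	new_entry->prev = old->prev;
	new_entry->prev->next = new_entry;
}

/* Same body as the list_swap() added to include/linux/list.h in the patch */
static void list_swap(struct list_head *entry1, struct list_head *entry2)
{
	struct list_head *pos = entry2->prev;

	list_del(entry2);
	list_replace(entry1, entry2);
	if (pos == entry1)
		pos = entry2;
	list_add(entry1, pos);
}

/* Hypothetical element type; 'lru' is first so a cast can replace container_of() */
struct item {
	struct list_head lru;
	int id;
};

int main(void)
{
	struct list_head head, *pos;
	struct item items[4];
	int i;

	INIT_LIST_HEAD(&head);
	for (i = 3; i >= 0; i--) {	/* builds the list 0 1 2 3 */
		items[i].id = i;
		list_add(&items[i].lru, &head);
	}

	list_swap(&items[1].lru, &items[3].lru);	/* non-adjacent: 0 3 2 1 */
	list_swap(&items[0].lru, &items[3].lru);	/* adjacent:     3 0 2 1 */

	for (pos = head.next; pos != &head; pos = pos->next)
		printf("%d ", ((struct item *)pos)->id);
	printf("\n");	/* prints: 3 0 2 1 */
	return 0;
}

Tracing the second call shows why the "pos == entry1" check matters: when the entries are adjacent, entry2's old predecessor is entry1 itself, which list_replace() has just unlinked, so entry1 must be re-added after entry2 instead. This is exactly the situation __shuffle_zone() can hit when two randomly chosen MAX_ORDER-1 pages sit next to each other on the same free list.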