author    Dmitry Torokhov <dmitry.torokhov@gmail.com>    2021-09-05 18:58:05 -0700
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>    2021-09-05 18:58:05 -0700
commit    8be98d2f2a0a262f8bf8a0bc1fdf522b3c7aab17 (patch)
tree      a226b265d692d1933c0541802527d8aeb0d469ab /mm
parent    818b26588994d9d95743fca0a427f08ec6c1c41d (diff)
parent    3e204d6b76b29274cc8e57f8bd8d9873f04a7f48 (diff)
Merge branch 'next' into for-linus
Prepare input updates for 5.15 merge window.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 41
-rw-r--r--  mm/Makefile | 7
-rw-r--r--  mm/balloon_compaction.c | 4
-rw-r--r--  mm/cma.c | 62
-rw-r--r--  mm/cma.h | 25
-rw-r--r--  mm/cma_debug.c | 8
-rw-r--r--  mm/cma_sysfs.c | 112
-rw-r--r--  mm/compaction.c | 111
-rw-r--r--  mm/debug_vm_pgtable.c | 8
-rw-r--r--  mm/dmapool.c | 2
-rw-r--r--  mm/filemap.c | 152
-rw-r--r--  mm/frontswap.c | 12
-rw-r--r--  mm/gup.c | 325
-rw-r--r--  mm/gup_test.c | 29
-rw-r--r--  mm/gup_test.h | 3
-rw-r--r--  mm/highmem.c | 13
-rw-r--r--  mm/huge_memory.c | 386
-rw-r--r--  mm/hugetlb.c | 940
-rw-r--r--  mm/hugetlb_cgroup.c | 9
-rw-r--r--  mm/internal.h | 117
-rw-r--r--  mm/interval_tree.c | 2
-rw-r--r--  mm/io-mapping.c | 29
-rw-r--r--  mm/ioremap.c | 225
-rw-r--r--  mm/kasan/common.c | 45
-rw-r--r--  mm/kasan/generic.c | 12
-rw-r--r--  mm/kasan/hw_tags.c | 66
-rw-r--r--  mm/kasan/init.c | 4
-rw-r--r--  mm/kasan/kasan.h | 72
-rw-r--r--  mm/kasan/quarantine.c | 4
-rw-r--r--  mm/kasan/report.c | 22
-rw-r--r--  mm/kasan/report_generic.c | 2
-rw-r--r--  mm/kasan/shadow.c | 14
-rw-r--r--  mm/kasan/sw_tags.c | 12
-rw-r--r--  mm/kfence/core.c | 59
-rw-r--r--  mm/kfence/report.c | 2
-rw-r--r--  mm/khugepaged.c | 65
-rw-r--r--  mm/kmemleak.c | 2
-rw-r--r--  mm/ksm.c | 20
-rw-r--r--  mm/list_lru.c | 6
-rw-r--r--  mm/madvise.c | 4
-rw-r--r--  mm/memcontrol.c | 811
-rw-r--r--  mm/memory-failure.c | 123
-rw-r--r--  mm/memory.c | 254
-rw-r--r--  mm/memory_hotplug.c | 216
-rw-r--r--  mm/mempolicy.c | 94
-rw-r--r--  mm/mempool.c | 6
-rw-r--r--  mm/memremap.c | 2
-rw-r--r--  mm/migrate.c | 100
-rw-r--r--  mm/mlock.c | 4
-rw-r--r--  mm/mm_init.c | 4
-rw-r--r--  mm/mmap.c | 34
-rw-r--r--  mm/mprotect.c | 2
-rw-r--r--  mm/mremap.c | 8
-rw-r--r--  mm/msync.c | 6
-rw-r--r--  mm/nommu.c | 10
-rw-r--r--  mm/oom_kill.c | 4
-rw-r--r--  mm/page-writeback.c | 13
-rw-r--r--  mm/page_alloc.c | 469
-rw-r--r--  mm/page_counter.c | 8
-rw-r--r--  mm/page_owner.c | 70
-rw-r--r--  mm/page_poison.c | 6
-rw-r--r--  mm/page_vma_mapped.c | 162
-rw-r--r--  mm/percpu-internal.h | 2
-rw-r--r--  mm/percpu-vm.c | 7
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgalloc-track.h | 6
-rw-r--r--  mm/pgtable-generic.c | 5
-rw-r--r--  mm/process_vm_access.c | 1
-rw-r--r--  mm/readahead.c | 101
-rw-r--r--  mm/rmap.c | 41
-rw-r--r--  mm/shmem.c | 39
-rw-r--r--  mm/shuffle.h | 4
-rw-r--r--  mm/slab.c | 53
-rw-r--r--  mm/slab.h | 25
-rw-r--r--  mm/slab_common.c | 25
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 133
-rw-r--r--  mm/sparse.c | 18
-rw-r--r--  mm/swap.c | 73
-rw-r--r--  mm/swap_slots.c | 2
-rw-r--r--  mm/swap_state.c | 19
-rw-r--r--  mm/swapfile.c | 6
-rw-r--r--  mm/truncate.c | 62
-rw-r--r--  mm/userfaultfd.c | 67
-rw-r--r--  mm/util.c | 39
-rw-r--r--  mm/vmalloc.c | 815
-rw-r--r--  mm/vmscan.c | 411
-rw-r--r--  mm/vmstat.c | 37
-rw-r--r--  mm/workingset.c | 1
-rw-r--r--  mm/z3fold.c | 2
-rw-r--r--  mm/zpool.c | 2
-rw-r--r--  mm/zsmalloc.c | 12
-rw-r--r--  mm/zswap.c | 2
93 files changed, 4671 insertions, 2782 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 24c045b24b95..02d44e3420f5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -9,7 +9,6 @@ config SELECT_MEMORY_MODEL
choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
- default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
help
@@ -149,6 +148,9 @@ config MEMORY_ISOLATION
config HAVE_BOOTMEM_INFO_NODE
def_bool n
+config ARCH_ENABLE_MEMORY_HOTPLUG
+ bool
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -177,12 +179,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
Say N here if you want the default policy to keep all hot-plugged
memory blocks in 'offline' state.
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ bool
+
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
+config MHP_MEMMAP_ON_MEMORY
+ def_bool y
+ depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+ depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -274,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config ARCH_ENABLE_THP_MIGRATION
bool
+config HUGETLB_PAGE_SIZE_VARIABLE
+ def_bool n
+ help
+ Allows the pageblock_order value to be dynamic instead of just standard
+ HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
+ on a platform.
+
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
@@ -283,12 +300,11 @@ config PHYS_ADDR_T_64BIT
config BOUNCE
bool "Enable bounce buffers"
default y
- depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+ depends on BLOCK && MMU && HIGHMEM
help
- Enable bounce buffers for devices that cannot access
- the full range of memory available to the CPU. Enabled
- by default when ZONE_DMA or HIGHMEM is selected, but you
- may say n to override this.
+ Enable bounce buffers for devices that cannot access the full range of
+ memory available to the CPU. Enabled by default when HIGHMEM is
+ selected, but you may say n to override this.
config VIRT_TO_BUS
bool
@@ -513,6 +529,13 @@ config CMA_DEBUGFS
help
Turns on the DebugFS interface for CMA.
+config CMA_SYSFS
+ bool "CMA information through sysfs interface"
+ depends on CMA && SYSFS
+ help
+ This option exposes some sysfs attributes to get information
+ from CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
@@ -760,6 +783,9 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
+config ARCH_HAS_CACHE_LINE_SIZE
+ bool
+
config ARCH_HAS_PTE_DEVMAP
bool
@@ -872,4 +898,7 @@ config MAPPING_DIRTY_HELPERS
config KMAP_LOCAL
bool
+# struct io_mapping based helper. Selected by drivers that need them
+config IO_MAPPING
+ bool
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 72227b24a616..bf71e295e9f6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
page-alloc-y := page_alloc.o
page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+# Give 'memory_hotplug' its own module-parameter namespace
+memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+
obj-y += page-alloc.o
obj-y += init-mm.o
obj-y += memblock.o
+obj-y += $(memory-hotplug-y)
ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
@@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_FAILSLAB) += failslab.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
@@ -109,6 +112,7 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
@@ -120,3 +124,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_IO_MAPPING) += io-mapping.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 26de020aae7b..907fefde2572 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
/**
* balloon_page_list_dequeue() - removes pages from balloon's page list and
* returns a list of the pages.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
* @pages: pointer to the list of pages that would be returned to the caller.
* @n_req_pages: number of requested pages.
*
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
* its address to allow the driver to release the page.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
*
* Driver must call this function to properly dequeue a previously enqueued page
* before definitively releasing it back to the guest system.
diff --git a/mm/cma.c b/mm/cma.c
index 54eee2119822..995e15480937 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -24,7 +24,6 @@
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/mm.h>
-#include <linux/mutex.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/log2.h>
@@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
}
static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
- unsigned int count)
+ unsigned long count)
{
unsigned long bitmap_no, bitmap_count;
+ unsigned long flags;
bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
- mutex_lock(&cma->lock);
+ spin_lock_irqsave(&cma->lock, flags);
bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
- mutex_unlock(&cma->lock);
+ spin_unlock_irqrestore(&cma->lock, flags);
}
static void __init cma_activate_area(struct cma *cma)
@@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma)
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
- mutex_init(&cma->lock);
+ spin_lock_init(&cma->lock);
#ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
@@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma)
unsigned long nr_part, nr_total = 0;
unsigned long nbits = cma_bitmap_maxno(cma);
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
@@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma)
start = next_zero_bit + nr_zero;
}
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
}
#else
static inline void cma_debug_show_areas(struct cma *cma) { }
@@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- bool no_warn)
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, bool no_warn)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
- size_t i;
+ unsigned long i;
struct page *page = NULL;
int ret = -ENOMEM;
if (!cma || !cma->count || !cma->bitmap)
- return NULL;
+ goto out;
- pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+ pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
count, align);
if (!count)
- return NULL;
+ goto out;
+
+ trace_cma_alloc_start(cma->name, count, align);
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
@@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- return NULL;
+ goto out;
for (;;) {
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
break;
}
bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
@@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* our exclusive use. If the migration fails we will take the
* lock again and unmark it.
*/
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
@@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
pr_debug("%s(): memory range at %p is busy, retrying\n",
__func__, pfn_to_page(pfn));
+
+ trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
+ count, align);
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
- trace_cma_alloc(pfn, page, count, align);
+ trace_cma_alloc_finish(cma->name, pfn, page, count, align);
/*
* CMA can allocate multiple page blocks, which results in different
@@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
}
if (ret && !no_warn) {
- pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
- __func__, cma->name, count, ret);
+ pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
+ __func__, cma->name, count, ret);
cma_debug_show_areas(cma);
}
pr_debug("%s(): returned %p\n", __func__, page);
+out:
+ if (page) {
+ count_vm_event(CMA_ALLOC_SUCCESS);
+ cma_sysfs_account_success_pages(cma, count);
+ } else {
+ count_vm_event(CMA_ALLOC_FAIL);
+ if (cma)
+ cma_sysfs_account_fail_pages(cma, count);
+ }
+
return page;
}
@@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
-bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+bool cma_release(struct cma *cma, const struct page *pages,
+ unsigned long count)
{
unsigned long pfn;
if (!cma || !pages)
return false;
- pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
pfn = page_to_pfn(pages);
@@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
- trace_cma_release(pfn, pages, count);
+ trace_cma_release(cma->name, pfn, pages, count);
return true;
}
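
For orientation, a minimal driver-style sketch (not part of the patch) of how the widened cma_alloc()/cma_release() prototypes above are used once the count parameter becomes unsigned long. The helper names and the warning text are illustrative only.

#include <linux/cma.h>

static struct page *grab_cma_buffer(struct cma *cma, unsigned long nr_pages)
{
	/* count is now unsigned long; align is an order, 0 == page aligned */
	return cma_alloc(cma, nr_pages, 0, false);
}

static void drop_cma_buffer(struct cma *cma, struct page *pages,
			    unsigned long nr_pages)
{
	/* cma_release() likewise takes an unsigned long count */
	if (!cma_release(cma, pages, nr_pages))
		pr_warn("pages did not belong to this CMA area\n");
}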
diff --git a/mm/cma.h b/mm/cma.h
index 42ae082cb067..2c775877eae2 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -3,19 +3,33 @@
#define __MM_CMA_H__
#include <linux/debugfs.h>
+#include <linux/kobject.h>
+
+struct cma_kobject {
+ struct kobject kobj;
+ struct cma *cma;
+};
struct cma {
unsigned long base_pfn;
unsigned long count;
unsigned long *bitmap;
unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
+ spinlock_t lock;
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
struct debugfs_u32_array dfs_bitmap;
#endif
char name[CMA_MAX_NAME];
+#ifdef CONFIG_CMA_SYSFS
+ /* the number of CMA page successful allocations */
+ atomic64_t nr_pages_succeeded;
+ /* the number of CMA page allocation failures */
+ atomic64_t nr_pages_failed;
+ /* kobject requires dynamic object */
+ struct cma_kobject *cma_kobj;
+#endif
};
extern struct cma cma_areas[MAX_CMA_AREAS];
@@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
return cma->count >> cma->order_per_bit;
}
+#ifdef CONFIG_CMA_SYSFS
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+#else
+static inline void cma_sysfs_account_success_pages(struct cma *cma,
+ unsigned long nr_pages) {};
+static inline void cma_sysfs_account_fail_pages(struct cma *cma,
+ unsigned long nr_pages) {};
+#endif
#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index d5bf8aa34fdc..2e7704955f4f 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val)
struct cma *cma = data;
unsigned long used;
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
/* pages counter is smaller than sizeof(int) */
used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
*val = (u64)used << cma->order_per_bit;
return 0;
@@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
unsigned long start, end = 0;
unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
if (start >= bitmap_maxno)
@@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
}
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
return 0;
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
new file mode 100644
index 000000000000..eb2f39caff59
--- /dev/null
+++ b/mm/cma_sysfs.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA SysFS Interface
+ *
+ * Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
+ */
+
+#include <linux/cma.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "cma.h"
+
+#define CMA_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_succeeded);
+}
+
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_failed);
+}
+
+static inline struct cma *cma_from_kobj(struct kobject *kobj)
+{
+ return container_of(kobj, struct cma_kobject, kobj)->cma;
+}
+
+static ssize_t alloc_pages_success_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n",
+ atomic64_read(&cma->nr_pages_succeeded));
+}
+CMA_ATTR_RO(alloc_pages_success);
+
+static ssize_t alloc_pages_fail_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed));
+}
+CMA_ATTR_RO(alloc_pages_fail);
+
+static void cma_kobj_release(struct kobject *kobj)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+ struct cma_kobject *cma_kobj = cma->cma_kobj;
+
+ kfree(cma_kobj);
+ cma->cma_kobj = NULL;
+}
+
+static struct attribute *cma_attrs[] = {
+ &alloc_pages_success_attr.attr,
+ &alloc_pages_fail_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(cma);
+
+static struct kobj_type cma_ktype = {
+ .release = cma_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = cma_groups,
+};
+
+static int __init cma_sysfs_init(void)
+{
+ struct kobject *cma_kobj_root;
+ struct cma_kobject *cma_kobj;
+ struct cma *cma;
+ int i, err;
+
+ cma_kobj_root = kobject_create_and_add("cma", mm_kobj);
+ if (!cma_kobj_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++) {
+ cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL);
+ if (!cma_kobj) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cma = &cma_areas[i];
+ cma->cma_kobj = cma_kobj;
+ cma_kobj->cma = cma;
+ err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype,
+ cma_kobj_root, "%s", cma->name);
+ if (err) {
+ kobject_put(&cma_kobj->kobj);
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ while (--i >= 0) {
+ cma = &cma_areas[i];
+ kobject_put(&cma->cma_kobj->kobj);
+ }
+ kobject_put(cma_kobj_root);
+
+ return err;
+}
+subsys_initcall(cma_sysfs_init);
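
A small userspace sketch (not part of the patch) of reading the counters created by cma_sysfs_init(): the "cma" kobject is registered under mm_kobj, so the attributes appear under /sys/kernel/mm/cma/<area-name>/. The area name "reserved" below is only an example.

#include <stdio.h>

int main(void)
{
	unsigned long long ok = 0, fail = 0;
	FILE *f;

	f = fopen("/sys/kernel/mm/cma/reserved/alloc_pages_success", "r");
	if (f && fscanf(f, "%llu", &ok) != 1)
		ok = 0;
	if (f)
		fclose(f);

	f = fopen("/sys/kernel/mm/cma/reserved/alloc_pages_fail", "r");
	if (f && fscanf(f, "%llu", &fail) != 1)
		fail = 0;
	if (f)
		fclose(f);

	printf("CMA pages: %llu succeeded, %llu failed\n", ok, fail);
	return 0;
}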
diff --git a/mm/compaction.c b/mm/compaction.c
index e04f4476e68e..84fde270ae74 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat)
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
- * Returns zero if there is a fatal signal pending, otherwise PFN of the
- * first page that was not scanned (which may be both less, equal to or more
- * than end_pfn).
+ * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
+ * -ENOMEM in case we could not allocate a page, or 0.
+ * cc->migrate_pfn will contain the next pfn to scan.
*
* The pages are isolated on cc->migratepages list (not required to be empty),
- * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
- * is neither read nor updated.
+ * and cc->nr_migratepages is updated accordingly.
*/
-static unsigned long
+static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
@@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
+ int ret = 0;
+
+ cc->migrate_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
while (unlikely(too_many_isolated(pgdat))) {
/* stop isolation if there are still pages not migrated */
if (cc->nr_migratepages)
- return 0;
+ return -EAGAIN;
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
- return 0;
+ return -EAGAIN;
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
- return 0;
+ return -EINTR;
}
cond_resched();
@@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (fatal_signal_pending(current)) {
cc->contended = true;
+ ret = -EINTR;
- low_pfn = 0;
goto fatal_pending;
}
@@ -904,6 +906,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
valid_page = page;
}
+ if (PageHuge(page) && cc->alloc_contig) {
+ ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+
+ /*
+ * Fail isolation in case isolate_or_dissolve_huge_page()
+ * reports an error. In case of -ENOMEM, abort right away.
+ */
+ if (ret < 0) {
+ /* Do not report -EBUSY down the chain */
+ if (ret == -EBUSY)
+ ret = 0;
+ low_pfn += (1UL << compound_order(page)) - 1;
+ goto isolate_fail;
+ }
+
+ if (PageHuge(page)) {
+ /*
+ * Hugepage was successfully isolated and placed
+ * on the cc->migratepages list.
+ */
+ low_pfn += compound_nr(page) - 1;
+ goto isolate_success_no_list;
+ }
+
+ /*
+ * Ok, the hugepage was dissolved. Now these pages are
+ * Buddy and cannot be re-allocated because they are
+ * isolated. Fall-through as the check below handles
+ * Buddy pages.
+ */
+ }
+
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
@@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
isolate_success:
list_add(&page->lru, &cc->migratepages);
+isolate_success_no_list:
cc->nr_migratepages += compound_nr(page);
nr_isolated += compound_nr(page);
@@ -1063,7 +1098,7 @@ isolate_fail_put:
put_page(page);
isolate_fail:
- if (!skip_on_failure)
+ if (!skip_on_failure && ret != -ENOMEM)
continue;
/*
@@ -1089,6 +1124,9 @@ isolate_fail:
*/
next_skip_pfn += 1UL << cc->order;
}
+
+ if (ret == -ENOMEM)
+ break;
}
/*
@@ -1130,7 +1168,9 @@ fatal_pending:
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
- return low_pfn;
+ cc->migrate_pfn = low_pfn;
+
+ return ret;
}
/**
@@ -1139,15 +1179,15 @@ fatal_pending:
* @start_pfn: The first PFN to start isolating.
* @end_pfn: The one-past-last PFN.
*
- * Returns zero if isolation fails fatally due to e.g. pending signal.
- * Otherwise, function returns one-past-the-last PFN of isolated page
- * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
*/
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, block_start_pfn, block_end_pfn;
+ int ret = 0;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
@@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
block_end_pfn, cc->zone))
continue;
- pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
- ISOLATE_UNEVICTABLE);
+ ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
- if (!pfn)
+ if (ret)
break;
if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
break;
}
- return pfn;
+ return ret;
}
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
@@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
*/
for (; block_end_pfn <= cc->free_pfn;
fast_find_block = false,
- low_pfn = block_end_pfn,
+ cc->migrate_pfn = low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
}
/* Perform the isolation */
- low_pfn = isolate_migratepages_block(cc, low_pfn,
- block_end_pfn, isolate_mode);
-
- if (!low_pfn)
+ if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+ isolate_mode))
return ISOLATE_ABORT;
/*
@@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
break;
}
- /* Record where migration scanner will be restarted. */
- cc->migrate_pfn = low_pfn;
-
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1977,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
unsigned int wmark_low;
/*
- * Cap the low watermak to avoid excessive compaction
- * activity in case a user sets the proactivess tunable
+ * Cap the low watermark to avoid excessive compaction
+ * activity in case a user sets the proactiveness tunable
* close to 100 (maximum).
*/
wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
@@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
- migrate_prep_local();
+ /* lru_add_drain_all could be expensive with involving other CPUs */
+ lru_add_drain();
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
@@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
*/
WRITE_ONCE(current->capture_control, NULL);
*capture = READ_ONCE(capc.page);
+ /*
+ * Technically, it is also possible that compaction is skipped but
+ * the page is still captured out of luck(IRQ came and freed the page).
+ * Returning COMPACT_SUCCESS in such cases helps in properly accounting
+ * the COMPACT[STALL|FAIL] when compaction is skipped.
+ */
+ if (*capture)
+ ret = COMPACT_SUCCESS;
return ret;
}
@@ -2657,9 +2701,6 @@ static void compact_nodes(void)
compact_node(nid);
}
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
-
/*
* Tunable for proactive compaction. It determines how
* aggressively the kernel should compact memory in the
@@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
*/
static int kcompactd(void *p)
{
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
unsigned int proactive_defer = 0;
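
To make the new return convention concrete, a hedged caller sketch (illustrative, not taken from the tree): isolate_migratepages_range() now returns 0 or -errno and leaves the resume position in cc->migrate_pfn. struct compact_control is the mm-internal structure from mm/internal.h; the error policy here is simplified.

static int isolate_range_example(struct compact_control *cc,
				 unsigned long start_pfn, unsigned long end_pfn)
{
	int ret = isolate_migratepages_range(cc, start_pfn, end_pfn);

	switch (ret) {
	case -EINTR:	/* fatal signal pending: abort */
	case -ENOMEM:	/* could not dissolve/allocate a page: abort */
		return ret;
	case -EAGAIN:	/* too many pages already isolated: back off, retry */
		return 0;
	default:
		/* success: the scanner position is kept in cc->migrate_pfn */
		pr_debug("next pfn to scan: %lu\n", cc->migrate_pfn);
		return 0;
	}
}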
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a9bd6ce1ba02..297d1b349c19 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -192,7 +192,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pr_debug("Validating PMD advanced\n");
/* Align the address wrt HPAGE_PMD_SIZE */
- vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ vaddr &= HPAGE_PMD_MASK;
pgtable_trans_huge_deposit(mm, pmdp, pgtable);
@@ -247,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!arch_ioremap_pmd_supported())
+ if (!arch_vmap_pmd_supported(prot))
return;
pr_debug("Validating PMD huge\n");
@@ -330,7 +330,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pr_debug("Validating PUD advanced\n");
/* Align the address wrt HPAGE_PUD_SIZE */
- vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+ vaddr &= HPAGE_PUD_MASK;
set_pud_at(mm, vaddr, pudp, pud);
pudp_set_wrprotect(mm, vaddr, pudp);
@@ -385,7 +385,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!arch_ioremap_pud_supported())
+ if (!arch_vmap_pud_supported(prot))
return;
pr_debug("Validating PUD huge\n");
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f3791532fef2..16483f86360e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -157,7 +157,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
if (!retval)
return retval;
- strlcpy(retval->name, name, sizeof(retval->name));
+ strscpy(retval->name, name, sizeof(retval->name));
retval->dev = dev;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6ce832dc59e7..66f7e9fdfbc4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping,
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
-
- if (shadow) {
- mapping->nrexceptional += nr;
- /*
- * Make sure the nrexceptional update is committed before
- * the nrpages update so that final truncate racing
- * with reclaim does not see both counters 0 at the
- * same time and miss a shadow entry.
- */
- smp_wmb();
- }
mapping->nrpages -= nr;
}
@@ -629,13 +618,53 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- if (dax_mapping(mapping))
- return mapping->nrexceptional;
-
return mapping->nrpages;
}
/**
+ * filemap_range_needs_writeback - check if range potentially needs writeback
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback. Used by O_DIRECT
+ * read/write with IOCB_NOWAIT, to see if the caller needs to do
+ * filemap_write_and_wait_range() before proceeding.
+ *
+ * Return: %true if the caller should do filemap_write_and_wait_range() before
+ * doing O_DIRECT to a page in this range, %false otherwise.
+ */
+bool filemap_range_needs_writeback(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
+ struct page *page;
+
+ if (!mapping_needs_writeback(mapping))
+ return false;
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+ return false;
+ if (end_byte < start_byte)
+ return false;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, max) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
+ continue;
+ if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+ break;
+ }
+ rcu_read_unlock();
+ return page != NULL;
+}
+EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
+
+/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
* @lstart: offset in bytes where the range starts
@@ -882,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page,
if (xas_error(&xas))
goto unlock;
- if (old)
- mapping->nrexceptional--;
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -1433,6 +1460,67 @@ void unlock_page(struct page *page)
EXPORT_SYMBOL(unlock_page);
/**
+ * end_page_private_2 - Clear PG_private_2 and release any waiters
+ * @page: The page
+ *
+ * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
+ * this. The page ref held for PG_private_2 being set is released.
+ *
+ * This is, for example, used when a netfs page is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+void end_page_private_2(struct page *page)
+{
+ page = compound_head(page);
+ VM_BUG_ON_PAGE(!PagePrivate2(page), page);
+ clear_bit_unlock(PG_private_2, &page->flags);
+ wake_up_page_bit(page, PG_private_2);
+ put_page(page);
+}
+EXPORT_SYMBOL(end_page_private_2);
+
+/**
+ * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ */
+void wait_on_page_private_2(struct page *page)
+{
+ page = compound_head(page);
+ while (PagePrivate2(page))
+ wait_on_page_bit(page, PG_private_2);
+}
+EXPORT_SYMBOL(wait_on_page_private_2);
+
+/**
+ * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * fatal signal is received by the calling task.
+ *
+ * Return:
+ * - 0 if successful.
+ * - -EINTR if a fatal signal was encountered.
+ */
+int wait_on_page_private_2_killable(struct page *page)
+{
+ int ret = 0;
+
+ page = compound_head(page);
+ while (PagePrivate2(page)) {
+ ret = wait_on_page_bit_killable(page, PG_private_2);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_on_page_private_2_killable);
+
+/**
* end_page_writeback - end writeback against a page
* @page: the page
*/
@@ -1663,7 +1751,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* @mapping: the address_space to search
* @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
+ * Looks up the page cache slot at @mapping & @index. If there is a
* page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
@@ -2244,8 +2332,6 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
return error;
if (PageUptodate(page))
return 0;
- if (!page->mapping) /* page truncated */
- return AOP_TRUNCATED_PAGE;
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
@@ -2577,8 +2663,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
+ if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
@@ -2669,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page)
* entirely memory-based such as tmpfs, and filesystems which support
* unwritten extents.
*
- * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
* SEEK_DATA and there is no data after @start. There is an implicit hole
* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
* and @end contain data.
@@ -2778,7 +2864,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
- DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
+ DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
unsigned int mmap_miss;
@@ -2790,7 +2876,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_ra(&ractl, ra, ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra->ra_pages);
return fpin;
}
@@ -2876,7 +2962,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
- struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
@@ -2963,14 +3048,8 @@ page_not_uptodate:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- ClearPageError(page);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- error = -EIO;
- }
+ error = filemap_read_page(file, mapping, page);
if (fpin)
goto out_retry;
put_page(page);
@@ -2978,7 +3057,6 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
- shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
@@ -3189,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
/* This is used for a general mmap of a disk file */
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
@@ -3214,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
}
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
@@ -3646,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write);
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t written = 0;
ssize_t err;
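
As a reading aid, a condensed sketch (not part of the patch) of the IOCB_NOWAIT pattern generic_file_read_iter() now follows with filemap_range_needs_writeback(): only dirty or writeback pages in the range force -EAGAIN, clean cached pages no longer do. The function name below is made up for illustration.

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t nowait_dio_read_example(struct kiocb *iocb, struct iov_iter *iter)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t count = iov_iter_count(iter);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* cheap check: bail out only if writeback would be triggered */
		if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
						  iocb->ki_pos + count - 1))
			return -EAGAIN;
	} else {
		ssize_t ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
						iocb->ki_pos + count - 1);
		if (ret)
			return ret;
	}

	return mapping->a_ops->direct_IO(iocb, iter);
}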
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 2183a56c7874..130e301c5ac0 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -60,16 +60,20 @@ static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;
-static inline void inc_frontswap_loads(void) {
+static inline void inc_frontswap_loads(void)
+{
data_race(frontswap_loads++);
}
-static inline void inc_frontswap_succ_stores(void) {
+static inline void inc_frontswap_succ_stores(void)
+{
data_race(frontswap_succ_stores++);
}
-static inline void inc_frontswap_failed_stores(void) {
+static inline void inc_frontswap_failed_stores(void)
+{
data_race(frontswap_failed_stores++);
}
-static inline void inc_frontswap_invalidates(void) {
+static inline void inc_frontswap_invalidates(void)
+{
data_race(frontswap_invalidates++);
}
#else
diff --git a/mm/gup.c b/mm/gup.c
index ef7d2da9f03f..3ded6a5f26b2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
int orig_refs = refs;
/*
- * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
- * path, so fail and let the caller fall back to the slow path.
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
*/
- if (unlikely(flags & FOLL_LONGTERM) &&
- is_migrate_cma_page(page))
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !is_pinnable_page(page)))
return NULL;
/*
@@ -213,6 +214,58 @@ void unpin_user_page(struct page *page)
}
EXPORT_SYMBOL(unpin_user_page);
+static inline void compound_range_next(unsigned long i, unsigned long npages,
+ struct page **list, struct page **head,
+ unsigned int *ntails)
+{
+ struct page *next, *page;
+ unsigned int nr = 1;
+
+ if (i >= npages)
+ return;
+
+ next = *list + i;
+ page = compound_head(next);
+ if (PageCompound(page) && compound_order(page) >= 1)
+ nr = min_t(unsigned int,
+ page + compound_nr(page) - next, npages - i);
+
+ *head = page;
+ *ntails = nr;
+}
+
+#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
+ for (__i = 0, \
+ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
+ __i < __npages; __i += __ntails, \
+ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
+
+static inline void compound_next(unsigned long i, unsigned long npages,
+ struct page **list, struct page **head,
+ unsigned int *ntails)
+{
+ struct page *page;
+ unsigned int nr;
+
+ if (i >= npages)
+ return;
+
+ page = compound_head(list[i]);
+ for (nr = i + 1; nr < npages; nr++) {
+ if (compound_head(list[nr]) != page)
+ break;
+ }
+
+ *head = page;
+ *ntails = nr - i;
+}
+
+#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
+ for (__i = 0, \
+ compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
+ __i < __npages; __i += __ntails, \
+ compound_next(__i, __npages, __list, &(__head), &(__ntails)))
+
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
@@ -239,20 +292,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
unsigned long index;
-
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
+ struct page *head;
+ unsigned int ntails;
if (!make_dirty) {
unpin_user_pages(pages, npages);
return;
}
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
+ for_each_compound_head(index, pages, npages, head, ntails) {
/*
* Checking PageDirty at this point may race with
* clear_page_dirty_for_io(), but that's OK. Two key
@@ -273,14 +321,50 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
* written back, so it gets written back again in the
* next writeback cycle. This is harmless.
*/
- if (!PageDirty(page))
- set_page_dirty_lock(page);
- unpin_user_page(page);
+ if (!PageDirty(head))
+ set_page_dirty_lock(head);
+ put_compound_head(head, ntails, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
+ * unpin_user_page_range_dirty_lock() - release and optionally dirty
+ * gup-pinned page range
+ *
+ * @page: the starting page of a range maybe marked dirty, and definitely released.
+ * @npages: number of consecutive pages to release.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page range" refers to a range of pages that has had one of the
+ * pin_user_pages() variants called on that page.
+ *
+ * For the page ranges defined by [page .. page+npages], make that range (or
+ * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
+ * page range was previously listed as clean.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), unpin_user_page().
+ *
+ */
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty)
+{
+ unsigned long index;
+ struct page *head;
+ unsigned int ntails;
+
+ for_each_compound_range(index, &page, npages, head, ntails) {
+ if (make_dirty && !PageDirty(head))
+ set_page_dirty_lock(head);
+ put_compound_head(head, ntails, FOLL_PIN);
+ }
+}
+EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
+
+/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
@@ -292,6 +376,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
+ struct page *head;
+ unsigned int ntails;
/*
* If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -300,13 +386,9 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
*/
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
- for (index = 0; index < npages; index++)
- unpin_user_page(pages[index]);
+
+ for_each_compound_head(index, pages, npages, head, ntails)
+ put_compound_head(head, ntails, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_pages);
@@ -435,18 +517,6 @@ retry:
}
}
- if (flags & FOLL_SPLIT && PageTransCompound(page)) {
- get_page(page);
- pte_unmap_unlock(ptep, ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return ERR_PTR(ret);
- goto retry;
- }
-
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
@@ -591,7 +661,7 @@ retry_locked:
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
+ if (flags & FOLL_SPLIT_PMD) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -600,19 +670,7 @@ retry_locked:
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else if (flags & FOLL_SPLIT) {
- if (unlikely(!try_get_page(page))) {
- spin_unlock(ptl);
- return ERR_PTR(-ENOMEM);
- }
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- } else { /* flags & FOLL_SPLIT_PMD */
+ } else {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
@@ -1470,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
{
struct vm_area_struct *vma;
unsigned long vm_flags;
- int i;
+ long i;
/* calculate required read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
@@ -1517,7 +1575,7 @@ finish_or_fault:
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
+ * allowing a hole to be left in the corefile to save disk space.
*
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
@@ -1535,120 +1593,96 @@ struct page *get_dump_page(unsigned long addr)
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
if (locked)
mmap_read_unlock(mm);
-
- if (ret == 1 && is_page_poisoned(page))
- return NULL;
-
return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int gup_flags)
+#ifdef CONFIG_MIGRATION
+/*
+ * Check whether all pages are pinnable, if so return number of pages. If some
+ * pages are not pinnable, migrate them, and unpin all pages. Return zero if
+ * pages were migrated, or if some pages were not successfully isolated.
+ * Return negative error if migration fails.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages,
+ unsigned int gup_flags)
{
unsigned long i;
- unsigned long step;
+ unsigned long isolation_error_count = 0;
bool drain_allow = true;
- bool migrate_allow = true;
- LIST_HEAD(cma_page_list);
- long ret = nr_pages;
+ LIST_HEAD(movable_page_list);
+ long ret = 0;
+ struct page *prev_head = NULL;
+ struct page *head;
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+ .gfp_mask = GFP_USER | __GFP_NOWARN,
};
-check_again:
- for (i = 0; i < nr_pages;) {
-
- struct page *head = compound_head(pages[i]);
-
- /*
- * gup may start from a tail page. Advance step by the left
- * part.
- */
- step = compound_nr(head) - (pages[i] - head);
+ for (i = 0; i < nr_pages; i++) {
+ head = compound_head(pages[i]);
+ if (head == prev_head)
+ continue;
+ prev_head = head;
/*
- * If we get a page from the CMA zone, since we are going to
- * be pinning these entries, we might as well move them out
- * of the CMA zone if possible.
+ * If we get a movable page, since we are going to be pinning
+ * these entries, try to move them out if possible.
*/
- if (is_migrate_cma_page(head)) {
- if (PageHuge(head))
- isolate_huge_page(head, &cma_page_list);
- else {
+ if (!is_pinnable_page(head)) {
+ if (PageHuge(head)) {
+ if (!isolate_huge_page(head, &movable_page_list))
+ isolation_error_count++;
+ } else {
if (!PageLRU(head) && drain_allow) {
lru_add_drain_all();
drain_allow = false;
}
- if (!isolate_lru_page(head)) {
- list_add_tail(&head->lru, &cma_page_list);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON +
- page_is_file_lru(head),
- thp_nr_pages(head));
+ if (isolate_lru_page(head)) {
+ isolation_error_count++;
+ continue;
}
+ list_add_tail(&head->lru, &movable_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON +
+ page_is_file_lru(head),
+ thp_nr_pages(head));
}
}
-
- i += step;
}
- if (!list_empty(&cma_page_list)) {
- /*
- * drop the above get_user_pages reference.
- */
- if (gup_flags & FOLL_PIN)
- unpin_user_pages(pages, nr_pages);
- else
- for (i = 0; i < nr_pages; i++)
- put_page(pages[i]);
-
- if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
- /*
- * some of the pages failed migration. Do get_user_pages
- * without migration.
- */
- migrate_allow = false;
+ /*
+ * If list is empty, and no isolation errors, means that all pages are
+ * in the correct zone.
+ */
+ if (list_empty(&movable_page_list) && !isolation_error_count)
+ return nr_pages;
- if (!list_empty(&cma_page_list))
- putback_movable_pages(&cma_page_list);
- }
- /*
- * We did migrate all the pages, Try to get the page references
- * again migrating any new CMA pages which we failed to isolate
- * earlier.
- */
- ret = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags);
-
- if ((ret > 0) && migrate_allow) {
- nr_pages = ret;
- drain_allow = true;
- goto check_again;
- }
+ if (gup_flags & FOLL_PIN) {
+ unpin_user_pages(pages, nr_pages);
+ } else {
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+ }
+ if (!list_empty(&movable_page_list)) {
+ ret = migrate_pages(&movable_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_LONGTERM_PIN);
+ if (ret && !list_empty(&movable_page_list))
+ putback_movable_pages(&movable_page_list);
}
- return ret;
+ return ret > 0 ? -ENOMEM : ret;
}
#else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int gup_flags)
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages,
+ unsigned int gup_flags)
{
return nr_pages;
}
-#endif /* CONFIG_CMA */
+#endif /* CONFIG_MIGRATION */
/*
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1661,21 +1695,22 @@ static long __gup_longterm_locked(struct mm_struct *mm,
struct vm_area_struct **vmas,
unsigned int gup_flags)
{
- unsigned long flags = 0;
+ unsigned int flags;
long rc;
- if (gup_flags & FOLL_LONGTERM)
- flags = memalloc_nocma_save();
-
- rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
- gup_flags);
+ if (!(gup_flags & FOLL_LONGTERM))
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ NULL, gup_flags);
+ flags = memalloc_pin_save();
+ do {
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ NULL, gup_flags);
+ if (rc <= 0)
+ break;
+ rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
+ } while (!rc);
+ memalloc_pin_restore(flags);
- if (gup_flags & FOLL_LONGTERM) {
- if (rc > 0)
- rc = check_and_migrate_cma_pages(mm, start, rc, pages,
- vmas, gup_flags);
- memalloc_nocma_restore(flags);
- }
return rc;
}
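
For context, a driver-style sketch (not part of the patch) of the pin/unpin pattern these changes optimize: unpin_user_pages_dirty_lock() now batches work per compound head, so releasing a THP-backed buffer dirties and unpins each head page only once, and FOLL_LONGTERM pins migrate non-pinnable pages up front. Function and variable names are illustrative.

#include <linux/mm.h>
#include <linux/slab.h>

static int pin_and_release_example(unsigned long uaddr, int nr_pages)
{
	struct page **pages;
	int pinned;

	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* FOLL_LONGTERM pins now migrate any non-pinnable (e.g. CMA) pages */
	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0) {
		kvfree(pages);
		return pinned;
	}

	/* ... DMA into the pinned buffer here ... */

	/* mark pages dirty and drop the pins, batched per compound head */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	kvfree(pages);
	return 0;
}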
diff --git a/mm/gup_test.c b/mm/gup_test.c
index e3cf78e5873e..d974dec19e1c 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
dump_page(page, "gup_test failure");
break;
+ } else if (cmd == PIN_LONGTERM_BENCHMARK &&
+ WARN(!is_pinnable_page(page),
+ "pages[%lu] is NOT pinnable but pinned\n",
+ i)) {
+ dump_page(page, "gup_test failure");
+ break;
}
}
break;
@@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd,
{
ktime_t start_time, end_time;
unsigned long i, nr_pages, addr, next;
- int nr;
+ long nr;
struct page **pages;
int ret = 0;
bool needs_mmap_lock =
@@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
- /* Filter out most gup flags: only allow a tiny subset here: */
- gup->flags &= FOLL_WRITE;
-
switch (cmd) {
case GUP_FAST_BENCHMARK:
- nr = get_user_pages_fast(addr, nr, gup->flags,
+ nr = get_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case GUP_BASIC_TEST:
- nr = get_user_pages(addr, nr, gup->flags, pages + i,
+ nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
NULL);
break;
case PIN_FAST_BENCHMARK:
- nr = pin_user_pages_fast(addr, nr, gup->flags,
+ nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case PIN_BASIC_TEST:
- nr = pin_user_pages(addr, nr, gup->flags, pages + i,
+ nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
NULL);
break;
case PIN_LONGTERM_BENCHMARK:
nr = pin_user_pages(addr, nr,
- gup->flags | FOLL_LONGTERM,
+ gup->gup_flags | FOLL_LONGTERM,
pages + i, NULL);
break;
case DUMP_USER_PAGES_TEST:
- if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
- nr = pin_user_pages(addr, nr, gup->flags,
+ if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+ nr = pin_user_pages(addr, nr, gup->gup_flags,
pages + i, NULL);
else
- nr = get_user_pages(addr, nr, gup->flags,
+ nr = get_user_pages(addr, nr, gup->gup_flags,
pages + i, NULL);
break;
default:
@@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd,
start_time = ktime_get();
- put_back_pages(cmd, pages, nr_pages, gup->flags);
+ put_back_pages(cmd, pages, nr_pages, gup->test_flags);
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
diff --git a/mm/gup_test.h b/mm/gup_test.h
index 90a6713d50eb..887ac1d5f5bc 100644
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -21,7 +21,8 @@ struct gup_test {
__u64 addr;
__u64 size;
__u32 nr_pages_per_call;
- __u32 flags;
+ __u32 gup_flags;
+ __u32 test_flags;
/*
* Each non-zero entry is the number of the page (1-based: first page is
* page 1, so that zero entries mean "do nothing") from the .addr base.
diff --git a/mm/highmem.c b/mm/highmem.c
index 6ef8f5e05e7e..4fb51d735aa6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
atomic_long_t _totalhigh_pages __read_mostly;
EXPORT_SYMBOL(_totalhigh_pages);
-unsigned int __nr_free_highpages (void)
+unsigned int __nr_free_highpages(void)
{
struct zone *zone;
unsigned int pages = 0;
@@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void)
static int pkmap_count[LAST_PKMAP];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
-pte_t * pkmap_page_table;
+pte_t *pkmap_page_table;
/*
* Most architectures have no use for kmap_high_get(), so let's abstract
@@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr)
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
int i = PKMAP_NR(addr);
+
return pte_page(pkmap_page_table[i]);
}
@@ -278,9 +279,8 @@ void *kmap_high(struct page *page)
pkmap_count[PKMAP_NR(vaddr)]++;
BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
unlock_kmap();
- return (void*) vaddr;
+ return (void *) vaddr;
}
-
EXPORT_SYMBOL(kmap_high);
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
@@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page)
pkmap_count[PKMAP_NR(vaddr)]++;
}
unlock_kmap_any(flags);
- return (void*) vaddr;
+ return (void *) vaddr;
}
#endif
@@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
/*
* Disable migration so resulting virtual address is stable
- * accross preemption.
+ * across preemption.
*/
migrate_disable();
preempt_disable();
@@ -737,7 +737,6 @@ done:
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
-
EXPORT_SYMBOL(page_address);
/**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ae907a9c2050..6d2a0119fc58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
@@ -61,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
@@ -77,18 +79,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
return false;
}
-static struct page *get_huge_zero_page(void)
+static bool get_huge_zero_page(void)
{
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return READ_ONCE(huge_zero_page);
+ return true;
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_page) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
- return NULL;
+ return false;
}
count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
@@ -97,11 +99,12 @@ retry:
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return READ_ONCE(huge_zero_page);
+ return true;
}
static void put_huge_zero_page(void)
@@ -146,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
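
huge_zero_pfn mirrors the lifetime of huge_zero_page: it is set when the zero page is allocated and reset to ~0UL when the shrinker frees it. That allows pmd checks to compare pfns instead of dereferencing a page pointer that may already be gone. A sketch of the kind of helper this enables (illustrative; the in-tree helper lives in the headers and may differ in name and detail):

	static inline bool pmd_is_huge_zero_pfn(pmd_t pmd)
	{
		/* Compare by pfn only; never touch huge_zero_page itself. */
		return pmd_present(pmd) &&
		       pmd_pfn(pmd) == READ_ONCE(huge_zero_pfn);
	}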
@@ -624,14 +628,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
- vm_fault_t ret2;
-
spin_unlock(vmf->ptl);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
- VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
- return ret2;
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
}
entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -1293,7 +1295,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
}
page = pmd_page(orig_pmd);
- VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
/* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
@@ -1464,12 +1466,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == NUMA_NO_NODE) {
- /* If the page was locked, there are no parallel migrations */
- if (page_locked)
- goto clear_pmdnuma;
- }
-
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
page_nid = NUMA_NO_NODE;
@@ -1478,6 +1474,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
spin_unlock(vmf->ptl);
put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
+ } else if (target_nid == NUMA_NO_NODE) {
+ /* There are no parallel migrations and page is in the right
+ * node. Clear the numa hinting info in this pmd.
+ */
+ goto clear_pmdnuma;
}
/*
@@ -1696,7 +1697,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1794,8 +1795,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* Returns
* - 0 if PMD could not be locked
- * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- * - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
@@ -2046,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2055,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
return;
- page = pmd_page(_pmd);
- if (!PageDirty(page) && pmd_dirty(_pmd))
- set_page_dirty(page);
- if (!PageReferenced(page) && pmd_young(_pmd))
- SetPageReferenced(page);
- page_remove_rmap(page, true);
- put_page(page);
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+ }
+
+ if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2104,7 +2114,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
@@ -2303,60 +2313,54 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
__split_huge_pmd(vma, pmd, address, freeze, page);
}
+static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
+{
+ /*
+ * If the new address isn't hpage aligned and it could previously
+ * contain a hugepage: check if we need to split a huge pmd.
+ */
+ if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
+ range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
+ ALIGN(address, HPAGE_PMD_SIZE)))
+ split_huge_pmd_address(vma, address, false, NULL);
+}
+
void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next)
{
- /*
- * If the new start address isn't hpage aligned and it could
- * previously contain an hugepage: check if we need to split
- * an huge pmd.
- */
- if (start & ~HPAGE_PMD_MASK &&
- (start & HPAGE_PMD_MASK) >= vma->vm_start &&
- (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start, false, NULL);
+ /* Check if we need to split start first. */
+ split_huge_pmd_if_needed(vma, start);
- /*
- * If the new end address isn't hpage aligned and it could
- * previously contain an hugepage: check if we need to split
- * an huge pmd.
- */
- if (end & ~HPAGE_PMD_MASK &&
- (end & HPAGE_PMD_MASK) >= vma->vm_start &&
- (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end, false, NULL);
+ /* Check if we need to split end next. */
+ split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't hpage aligned and it could previously
- * contain an hugepage: check if we need to split an huge pmd.
+ * If we're also updating the vma->vm_next->vm_start,
+ * check if we need to split it.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
nstart += adjust_next;
- if (nstart & ~HPAGE_PMD_MASK &&
- (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
- (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart, false, NULL);
+ split_huge_pmd_if_needed(next, nstart);
}
}
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page, unsigned int nr)
@@ -2477,7 +2481,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = lock_page_lruvec(head);
for (i = nr - 1; i >= 1; i--) {
@@ -2667,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
+ int extra_pins, ret;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2726,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
@@ -2744,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
- mapcount = total_mapcount(head);
- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+ if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
@@ -2766,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__split_huge_page(page, list, end);
ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
+fail:
+ if (mapping)
xa_unlock(&mapping->i_pages);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
@@ -2838,8 +2832,8 @@ void deferred_split_huge_page(struct page *page)
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
if (memcg)
- memcg_set_shrinker_bit(memcg, page_to_nid(page),
- deferred_split_shrinker.id);
+ set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
#endif
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2924,16 +2918,14 @@ static struct shrinker deferred_split_shrinker = {
};
#ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
{
struct zone *zone;
struct page *page;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
- if (val != 1)
- return -EINVAL;
-
+ pr_debug("Split all THPs\n");
for_each_populated_zone(zone) {
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2957,15 +2949,243 @@ static int split_huge_pages_set(void *data, u64 val)
unlock_page(page);
next:
put_page(page);
+ cond_resched();
}
}
- pr_info("%lu of %lu THP split\n", split, total);
+ pr_debug("%lu of %lu THP split\n", split, total);
+}
- return 0;
+static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
+{
+ return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
+ is_vm_hugetlb_page(vma);
}
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
- "%llu\n");
+
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+ unsigned long vaddr_end)
+{
+ int ret = 0;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned long total = 0, split = 0;
+ unsigned long addr;
+
+ vaddr_start &= PAGE_MASK;
+ vaddr_end &= PAGE_MASK;
+
+ /* Find the task_struct from pid */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /* Find the mm_struct */
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+ pid, vaddr_start, vaddr_end);
+
+ mmap_read_lock(mm);
+ /*
+ * always increase addr by PAGE_SIZE, since we could have a PTE page
+ * table filled with PTE-mapped THPs, each of which is distinct.
+ */
+ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ unsigned int follflags;
+ struct page *page;
+
+ if (!vma || addr < vma->vm_start)
+ break;
+
+ /* skip special VMA and hugetlb VMA */
+ if (vma_not_suitable_for_thp_split(vma)) {
+ addr = vma->vm_end;
+ continue;
+ }
+
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ follflags = FOLL_GET | FOLL_DUMP;
+ page = follow_page(vma, addr, follflags);
+
+ if (IS_ERR(page))
+ continue;
+ if (!page)
+ continue;
+
+ if (!is_transparent_hugepage(page))
+ goto next;
+
+ total++;
+ if (!can_split_huge_page(compound_head(page), NULL))
+ goto next;
+
+ if (!trylock_page(page))
+ goto next;
+
+ if (!split_huge_page(page))
+ split++;
+
+ unlock_page(page);
+next:
+ put_page(page);
+ cond_resched();
+ }
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ pr_debug("%lu of %lu THP split\n", split, total);
+
+out:
+ return ret;
+}
+
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+ pgoff_t off_end)
+{
+ struct filename *file;
+ struct file *candidate;
+ struct address_space *mapping;
+ int ret = -EINVAL;
+ pgoff_t index;
+ int nr_pages = 1;
+ unsigned long total = 0, split = 0;
+
+ file = getname_kernel(file_path);
+ if (IS_ERR(file))
+ return ret;
+
+ candidate = file_open_name(file, O_RDONLY, 0);
+ if (IS_ERR(candidate))
+ goto out;
+
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
+ file_path, off_start, off_end);
+
+ mapping = candidate->f_mapping;
+
+ for (index = off_start; index < off_end; index += nr_pages) {
+ struct page *fpage = pagecache_get_page(mapping, index,
+ FGP_ENTRY | FGP_HEAD, 0);
+
+ nr_pages = 1;
+ if (xa_is_value(fpage) || !fpage)
+ continue;
+
+ if (!is_transparent_hugepage(fpage))
+ goto next;
+
+ total++;
+ nr_pages = thp_nr_pages(fpage);
+
+ if (!trylock_page(fpage))
+ goto next;
+
+ if (!split_huge_page(fpage))
+ split++;
+
+ unlock_page(fpage);
+next:
+ put_page(fpage);
+ cond_resched();
+ }
+
+ filp_close(candidate, NULL);
+ ret = 0;
+
+ pr_debug("%lu of %lu file-backed THP split\n", split, total);
+out:
+ putname(file);
+ return ret;
+}
+
+#define MAX_INPUT_BUF_SZ 255
+
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppops)
+{
+ static DEFINE_MUTEX(split_debug_mutex);
+ ssize_t ret;
+ /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+ char input_buf[MAX_INPUT_BUF_SZ];
+ int pid;
+ unsigned long vaddr_start, vaddr_end;
+
+ ret = mutex_lock_interruptible(&split_debug_mutex);
+ if (ret)
+ return ret;
+
+ ret = -EFAULT;
+
+ memset(input_buf, 0, MAX_INPUT_BUF_SZ);
+ if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
+ goto out;
+
+ input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
+
+ if (input_buf[0] == '/') {
+ char *tok;
+ char *buf = input_buf;
+ char file_path[MAX_INPUT_BUF_SZ];
+ pgoff_t off_start = 0, off_end = 0;
+ size_t input_len = strlen(input_buf);
+
+ tok = strsep(&buf, ",");
+ if (tok) {
+ strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
+ if (ret != 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = split_huge_pages_in_file(file_path, off_start, off_end);
+ if (!ret)
+ ret = input_len;
+
+ goto out;
+ }
+
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+ if (ret == 1 && pid == 1) {
+ split_huge_pages_all();
+ ret = strlen(input_buf);
+ goto out;
+ } else if (ret != 3) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+ if (!ret)
+ ret = strlen(input_buf);
+out:
+ mutex_unlock(&split_debug_mutex);
+ return ret;
+
+}
+
+static const struct file_operations split_huge_pages_fops = {
+ .owner = THIS_MODULE,
+ .write = split_huge_pages_write,
+ .llseek = no_llseek,
+};
static int __init split_huge_pages_debugfs(void)
{
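
For reference, the new write interface accepts "<pid>,<vaddr_start>,<vaddr_end>" to split THPs in a process range, "<file_path>,<off_start>,<off_end>" for a file-backed range (addresses and offsets in hex), or the legacy "1" to split every THP in the system. A minimal userspace sketch (the debugfs mount point is assumed to be /sys/kernel/debug; the pid and addresses below are placeholders):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* pid 1234, split THPs mapped between these two virtual addresses */
		const char *cmd = "1234,0x700000000000,0x700000200000";
		int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, cmd, strlen(cmd)) < 0)
			perror("write");
		close(fd);
		return 0;
	}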
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a86a58ef132d..5ba5a0da6d57 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -39,7 +39,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
-#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"
@@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool)
return true;
}
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+ unsigned long irq_flags)
{
- spin_unlock(&spool->lock);
+ spin_unlock_irqrestore(&spool->lock, irq_flags);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
@@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
- spin_lock(&spool->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&spool->lock, flags);
BUG_ON(!spool->count);
spool->count--;
- unlock_or_release_subpool(spool);
+ unlock_or_release_subpool(spool, flags);
}
/*
@@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
if (!spool)
return ret;
- spin_lock(&spool->lock);
+ spin_lock_irq(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
unlock_ret:
- spin_unlock(&spool->lock);
+ spin_unlock_irq(&spool->lock);
return ret;
}
@@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
+ unsigned long flags;
if (!spool)
return delta;
- spin_lock(&spool->lock);
+ spin_lock_irqsave(&spool->lock, flags);
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
@@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
* If hugetlbfs_put_super couldn't free spool due to an outstanding
* quota reference, free it now.
*/
- unlock_or_release_subpool(spool);
+ unlock_or_release_subpool(spool, flags);
return ret;
}
@@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
resv->region_cache_count;
/* At this point, we should have enough entries in the cache
- * for all the existings adds_in_progress. We should only be
+ * for all the existing adds_in_progress. We should only be
* needing to allocate for regions_needed.
*/
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -553,7 +556,6 @@ retry:
resv->adds_in_progress -= in_regions_needed;
spin_unlock(&resv->lock);
- VM_BUG_ON(add < 0);
return add;
}
@@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
+ bool reserved = false;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
- if (rsv_adjust) {
+ if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
- hugetlb_acct_memory(h, 1);
+ if (!hugetlb_acct_memory(h, 1))
+ reserved = true;
+ } else if (!rsv_adjust) {
+ reserved = true;
}
+
+ if (!reserved)
+ pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
@@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
+
+ lockdep_assert_held(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
@@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
- bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+ bool pin = !!(current->flags & PF_MEMALLOC_PIN);
+ lockdep_assert_held(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (nocma && is_migrate_cma_page(page))
+ if (pin && !is_pinnable_page(page))
continue;
if (PageHWPoison(page))
@@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
}
/*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
@@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- unsigned long nr_pages = 1UL << huge_page_order(h);
+ unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
@@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
#endif
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page. A reference is held on the page.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ int nid = page_to_nid(page);
+
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+ lockdep_assert_held(&hugetlb_lock);
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return;
+
+ list_del(&page->lru);
+
+ if (HPageFreed(page)) {
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ }
+ if (adjust_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ }
+
+ set_page_refcounted(page);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+ h->nr_huge_pages--;
+ h->nr_huge_pages_node[nid]--;
+}
+
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
@@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- h->nr_huge_pages--;
- h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
- set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
- /*
- * Temporarily drop the hugetlb_lock, because
- * we might block in free_gigantic_page().
- */
- spin_unlock(&hugetlb_lock);
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
- spin_lock(&hugetlb_lock);
} else {
__free_pages(page, huge_page_order(h));
}
}
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+ struct page *page, *t_page;
+
+ list_for_each_entry_safe(page, t_page, list, lru) {
+ update_and_free_page(h, page);
+ cond_resched();
+ }
+}
+
struct hstate *size_to_hstate(unsigned long size)
{
struct hstate *h;
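
The pair remove_hugetlb_page()/update_and_free_pages_bulk() establishes the pattern used throughout the rest of the series: pages are unlinked from the pool while hugetlb_lock is held, collected on a private list, and only freed back to the low-level allocators after the lock has been dropped, so the potentially slow frees never run under the spinlock. A sketch of the caller pattern (illustrative, not taken verbatim from the patch; node 0 only for brevity):

	static void shrink_pool_example(struct hstate *h, unsigned long nr)
	{
		LIST_HEAD(page_list);
		struct page *page, *next;

		spin_lock_irq(&hugetlb_lock);
		list_for_each_entry_safe(page, next, &h->hugepage_freelists[0], lru) {
			if (!nr--)
				break;
			remove_hugetlb_page(h, page, false);	/* accounting under lock */
			list_add(&page->lru, &page_list);
		}
		spin_unlock_irq(&hugetlb_lock);

		update_and_free_pages_bulk(h, &page_list);	/* may cond_resched() */
	}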
@@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-static void __free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
@@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page)
int nid = page_to_nid(page);
struct hugepage_subpool *spool = hugetlb_page_subpool(page);
bool restore_reserve;
+ unsigned long flags;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page)
restore_reserve = true;
}
- spin_lock(&hugetlb_lock);
+ spin_lock_irqsave(&hugetlb_lock, flags);
ClearHPageMigratable(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
@@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page)
h->resv_huge_pages++;
if (HPageTemporary(page)) {
- list_del(&page->lru);
- ClearHPageTemporary(page);
+ remove_hugetlb_page(h, page, false);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
- list_del(&page->lru);
+ remove_hugetlb_page(h, page, true);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page);
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[nid]--;
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
}
- spin_unlock(&hugetlb_lock);
}
/*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
+ * Must be called with the hugetlb lock held
*/
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
-{
- struct llist_node *node;
- struct page *page;
-
- node = llist_del_all(&hpage_freelist);
-
- while (node) {
- page = container_of((struct address_space **)node,
- struct page, mapping);
- node = node->next;
- __free_huge_page(page);
- }
-}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-
-void free_huge_page(struct page *page)
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
- /*
- * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
- */
- if (!in_task()) {
- /*
- * Only call schedule_work() if hpage_freelist is previously
- * empty. Otherwise, schedule_work() had been called but the
- * workfn hasn't retrieved the list yet.
- */
- if (llist_add((struct llist_node *)&page->mapping,
- &hpage_freelist))
- schedule_work(&free_hpage_work);
- return;
- }
-
- __free_huge_page(page);
+ lockdep_assert_held(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void __prep_new_huge_page(struct page *page)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
- spin_lock(&hugetlb_lock);
- h->nr_huge_pages++;
- h->nr_huge_pages_node[nid]++;
- ClearHPageFreed(page);
- spin_unlock(&hugetlb_lock);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ __prep_new_huge_page(page);
+ spin_lock_irq(&hugetlb_lock);
+ __prep_account_new_huge_page(h, nid);
+ spin_unlock_irq(&hugetlb_lock);
}
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
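
With the llist/workqueue deferral removed, free_huge_page() is again called directly from whatever context drops the last reference, which can include softirq context; every hugetlb_lock section in this file is therefore converted to the IRQ-safe locking variants. A sketch of the resulting rule of thumb (illustrative summary, not patch text):

	unsigned long flags;

	/* Known process context, IRQs enabled: */
	spin_lock_irq(&hugetlb_lock);
	/* ... pool and counter accounting ... */
	spin_unlock_irq(&hugetlb_lock);

	/* Paths that may be entered with IRQs already disabled, e.g. free_huge_page(): */
	spin_lock_irqsave(&hugetlb_lock, flags);
	/* ... pool and counter accounting ... */
	spin_unlock_irqrestore(&hugetlb_lock, flags);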
@@ -1577,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
@@ -1616,7 +1624,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ page = __alloc_pages(gfp_mask, order, nid, nmask);
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
@@ -1693,17 +1701,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
}
/*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove huge page from pool from next node to free. Attempt to keep
+ * persistent huge pages more or less balanced over allowed nodes.
+ * This routine only 'removes' the hugetlb page. The caller must make
+ * an additional call to free the page to low level allocators.
* Called with hugetlb_lock locked.
*/
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
- bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+ nodemask_t *nodes_allowed,
+ bool acct_surplus)
{
int nr_nodes, node;
- int ret = 0;
+ struct page *page = NULL;
+ lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
@@ -1711,23 +1722,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
- struct page *page =
- list_entry(h->hugepage_freelists[node].next,
+ page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[node]--;
- if (acct_surplus) {
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[node]--;
- }
- update_and_free_page(h, page);
- ret = 1;
+ remove_hugetlb_page(h, page, acct_surplus);
break;
}
}
- return ret;
+ return page;
}
/*
@@ -1749,7 +1751,7 @@ retry:
if (!PageHuge(page))
return 0;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!PageHuge(page)) {
rc = 0;
goto out;
@@ -1758,7 +1760,6 @@ retry:
if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
- int nid = page_to_nid(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
@@ -1767,7 +1768,7 @@ retry:
* when it is dissolved.
*/
if (unlikely(!HPageFreed(head))) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
cond_resched();
/*
@@ -1789,15 +1790,14 @@ retry:
SetPageHWPoison(page);
ClearPageHWPoison(head);
}
- list_del(&head->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
+ remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
+ spin_unlock_irq(&hugetlb_lock);
update_and_free_page(h, head);
- rc = 0;
+ return 0;
}
out:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return rc;
}
@@ -1839,16 +1839,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
if (hstate_is_gigantic(h))
return NULL;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
goto out_unlock;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
@@ -1858,7 +1858,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetHPageTemporary(page);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
put_page(page);
return NULL;
} else {
@@ -1867,7 +1867,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
}
out_unlock:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return page;
}
@@ -1917,17 +1917,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
if (page) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return page;
}
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}
@@ -1964,6 +1964,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long needed, allocated;
bool alloc_ok = true;
+ lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
h->resv_huge_pages += delta;
@@ -1975,7 +1976,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
ret = -ENOMEM;
retry:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
@@ -1992,7 +1993,7 @@ retry:
* After retaking hugetlb_lock, we need to recalculate 'needed'
* because either resv_huge_pages or free_huge_pages may have changed.
*/
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
if (needed > 0) {
@@ -2032,12 +2033,12 @@ retry:
enqueue_huge_page(h, page);
}
free:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
put_page(page);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
return ret;
}
@@ -2049,17 +2050,17 @@ free:
* to the associated reservation map.
* 2) Free any unused surplus pages that may have been allocated to satisfy
* the reservation. As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held. However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing. Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
+ struct page *page;
+ LIST_HEAD(page_list);
+
+ lockdep_assert_held(&hugetlb_lock);
+ /* Uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
@@ -2076,24 +2077,21 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the freed pages across the
+ * remove_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
- *
- * Note that we decrement resv_huge_pages as we free the pages. If
- * we drop the lock, resv_huge_pages will still be sufficiently large
- * to cover subsequent pages we may free.
*/
while (nr_pages--) {
- h->resv_huge_pages--;
- unused_resv_pages--;
- if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+ page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+ if (!page)
goto out;
- cond_resched_lock(&hugetlb_lock);
+
+ list_add(&page->lru, &page_list);
}
out:
- /* Fully uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
@@ -2120,12 +2118,18 @@ out:
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed. It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
+ VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
@@ -2169,33 +2173,42 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_del(resv, idx, idx + 1);
}
break;
+ case VMA_DEL_RESV:
+ if (vma->vm_flags & VM_MAYSHARE) {
+ region_abort(resv, idx, idx + 1, 1);
+ ret = region_del(resv, idx, idx + 1);
+ } else {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ }
+ break;
default:
BUG();
}
- if (vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
- else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
- /*
- * In most cases, reserves always exist for private mappings.
- * However, a file associated with mapping could have been
- * hole punched or truncated after reserves were consumed.
- * As subsequent fault on such a range will not use reserves.
- * Subtle - The reserve map for private mappings has the
- * opposite meaning than that of shared mappings. If NO
- * entry is in the reserve map, it means a reservation exists.
- * If an entry exists in the reserve map, it means the
- * reservation has already been consumed. As a result, the
- * return value of this routine is the opposite of the
- * value returned from reserve map manipulation routines above.
- */
- if (ret)
- return 0;
- else
- return 1;
- }
- else
- return ret < 0 ? ret : 0;
+ /*
+ * We know private mapping must have HPAGE_RESV_OWNER set.
+ *
+ * In most cases, reserves always exist for private mappings.
+ * However, a file associated with mapping could have been
+ * hole punched or truncated after reserves were consumed.
+ * As subsequent fault on such a range will not use reserves.
+ * Subtle - The reserve map for private mappings has the
+ * opposite meaning than that of shared mappings. If NO
+ * entry is in the reserve map, it means a reservation exists.
+ * If an entry exists in the reserve map, it means the
+ * reservation has already been consumed. As a result, the
+ * return value of this routine is the opposite of the
+ * value returned from reserve map manipulation routines above.
+ */
+ if (ret > 0)
+ return 0;
+ if (ret == 0)
+ return 1;
+ return ret;
}
static long vma_needs_reservation(struct hstate *h,
@@ -2222,25 +2235,39 @@ static long vma_add_reservation(struct hstate *h,
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
+static long vma_del_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
/*
- * This routine is called to restore a reservation on error paths. In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page. When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set. However, free_huge_page can not adjust the
- * reserve map. Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for pages allocated via alloc_huge_page(), and
+ * the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the page consumed the reservation.
+ * HPageRestoreReserve is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreReserve is
+ * not set. However, alloc_huge_page always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count. But, free_huge_page does not have enough context
+ * to adjust the reservation map. This case deals primarily with private
+ * mappings. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page. Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
*/
-static void restore_reserve_on_error(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address,
- struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct page *page)
{
- if (unlikely(HPageRestoreReserve(page))) {
- long rc = vma_needs_reservation(h, vma, address);
+ long rc = vma_needs_reservation(h, vma, address);
- if (unlikely(rc < 0)) {
+ if (HPageRestoreReserve(page)) {
+ if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
* manipulation. Clear HPageRestoreReserve so that
@@ -2253,19 +2280,188 @@ static void restore_reserve_on_error(struct hstate *h,
* accounting of reserve counts.
*/
ClearHPageRestoreReserve(page);
- } else if (rc) {
- rc = vma_add_reservation(h, vma, address);
- if (unlikely(rc < 0))
+ else if (rc)
+ (void)vma_add_reservation(h, vma, address);
+ else
+ vma_end_reservation(h, vma, address);
+ } else {
+ if (!rc) {
+ /*
+ * This indicates there is an entry in the reserve map
+ * added by alloc_huge_page. We know it was added
+ * before the alloc_huge_page call, otherwise
+ * HPageRestoreReserve would be set on the page.
+ * Remove the entry so that a subsequent allocation
+ * does not consume a reservation.
+ */
+ rc = vma_del_reservation(h, vma, address);
+ if (rc < 0)
+ /*
+ * VERY rare out of memory condition. Since
+ * we can not delete the entry, set
+ * HPageRestoreReserve so that the reserve
+ * count will be incremented when the page
+ * is freed. This reserve will be consumed
+ * on a subsequent allocation.
+ */
+ SetHPageRestoreReserve(page);
+ } else if (rc < 0) {
+ /*
+ * Rare out of memory condition from
+ * vma_needs_reservation call. Memory allocation is
+ * only attempted if a new entry is needed. Therefore,
+ * this implies there is not an entry in the
+ * reserve map.
+ *
+ * For shared mappings, no entry in the map indicates
+ * no reservation. We are done.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
/*
- * See above comment about rare out of
- * memory condition.
+ * For private mappings, no entry indicates
+ * a reservation is present. Since we can
+ * not add an entry, set SetHPageRestoreReserve
+ * on the page so reserve count will be
+ * incremented when freed. This reserve will
+ * be consumed on a subsequent allocation.
*/
- ClearHPageRestoreReserve(page);
+ SetHPageRestoreReserve(page);
} else
- vma_end_reservation(h, vma, address);
+ /*
+ * No reservation present, do nothing
+ */
+ vma_end_reservation(h, vma, address);
}
}
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+ struct list_head *list)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nid = page_to_nid(old_page);
+ struct page *new_page;
+ int ret = 0;
+
+ /*
+ * Before dissolving the page, we need to allocate a new one for the
+ * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+ * avoid dealing with prep_new_huge_page() and with any counters. This
+ * simplifies things and lets us do the whole thing under the
+ * lock.
+ */
+ new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+ if (!new_page)
+ return -ENOMEM;
+
+retry:
+ spin_lock_irq(&hugetlb_lock);
+ if (!PageHuge(old_page)) {
+ /*
+ * Freed from under us. Drop new_page too.
+ */
+ goto free_new;
+ } else if (page_count(old_page)) {
+ /*
+ * Someone has grabbed the page, try to isolate it here.
+ * Fail with -EBUSY if not possible.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ if (!isolate_huge_page(old_page, list))
+ ret = -EBUSY;
+ spin_lock_irq(&hugetlb_lock);
+ goto free_new;
+ } else if (!HPageFreed(old_page)) {
+ /*
+ * Page's refcount is 0 but it has not been enqueued in the
+ * freelist yet. Race window is small, so we can succeed here if
+ * we retry.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ cond_resched();
+ goto retry;
+ } else {
+ /*
+ * Ok, old_page is still a genuine free hugepage. Remove it from
+ * the freelist and decrease the counters. These will be
+ * incremented again when calling __prep_account_new_huge_page()
+ * and enqueue_huge_page() for new_page. The counters will remain
+ * stable since this happens under the lock.
+ */
+ remove_hugetlb_page(h, old_page, false);
+
+ /*
+ * new_page needs to be initialized with the standard hugetlb
+ * state. This is normally done by prep_new_huge_page() but
+ * that takes hugetlb_lock which is already held so we need to
+ * open code it here.
+ * Reference count trick is needed because allocator gives us
+ * referenced page but the pool requires pages with 0 refcount.
+ */
+ __prep_new_huge_page(new_page);
+ __prep_account_new_huge_page(h, nid);
+ page_ref_dec(new_page);
+ enqueue_huge_page(h, new_page);
+
+ /*
+ * Pages have been replaced, we can safely free the old one.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_page(h, old_page);
+ }
+
+ return ret;
+
+free_new:
+ spin_unlock_irq(&hugetlb_lock);
+ __free_pages(new_page, huge_page_order(h));
+
+ return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+ struct hstate *h;
+ struct page *head;
+ int ret = -EBUSY;
+
+ /*
+ * The page might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ * Return success when racing as if we dissolved the page ourselves.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHuge(page)) {
+ head = compound_head(page);
+ h = page_hstate(head);
+ } else {
+ spin_unlock_irq(&hugetlb_lock);
+ return 0;
+ }
+ spin_unlock_irq(&hugetlb_lock);
+
+ /*
+ * Fence off gigantic pages as there is a cyclic dependency between
+ * alloc_contig_range and them. Return -ENOMEM as this has the effect
+ * of bailing out right away without further retrying.
+ */
+ if (hstate_is_gigantic(h))
+ return -ENOMEM;
+
+ if (page_count(head) && isolate_huge_page(head, list))
+ ret = 0;
+ else if (!page_count(head))
+ ret = alloc_and_dissolve_huge_page(h, head, list);
+
+ return ret;
+}
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
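
isolate_or_dissolve_huge_page() is aimed at the contiguous-allocation path: an in-use hugetlb page is isolated for migration, a free one is replaced by a freshly allocated page and dissolved, instead of the range being given up on. A sketch of the kind of call site this enables (illustrative fragment; the actual caller is added elsewhere in the series and may differ):

	/* While scanning a candidate range for a contiguous allocation: */
	if (PageHuge(page)) {
		ret = isolate_or_dissolve_huge_page(page, &movable_pages);
		if (ret)
			return ret;	/* -EBUSY: transient, -ENOMEM: gigantic page */
		/* Page was either replaced in place or queued on movable_pages. */
	}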
@@ -2316,7 +2512,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
/* If this allocation is not consuming a reservation, charge it now.
*/
- deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ deferred_reserve = map_chg || avoid_reserve;
if (deferred_reserve) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
@@ -2328,7 +2524,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (ret)
goto out_uncharge_cgroup_reservation;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* glb_chg is passed to indicate whether or not a page must be taken
* from the global free pool (global change). gbl_chg == 0 indicates
@@ -2336,7 +2532,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
@@ -2344,7 +2540,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
@@ -2357,7 +2553,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h_cg, page);
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
hugetlb_set_page_subpool(page, spool);
@@ -2547,24 +2743,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
int i;
+ LIST_HEAD(page_list);
+ lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h))
return;
+ /*
+ * Collect pages to be freed on a list, and free after dropping lock
+ */
for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(page, next, freel, lru) {
if (count >= h->nr_huge_pages)
- return;
+ goto out;
if (PageHighMem(page))
continue;
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->free_huge_pages--;
- h->free_huge_pages_node[page_to_nid(page)]--;
+ remove_hugetlb_page(h, page, false);
+ list_add(&page->lru, &page_list);
}
}
+
+out:
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
#else
static inline void try_to_free_low(struct hstate *h, unsigned long count,
@@ -2583,6 +2787,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
{
int nr_nodes, node;
+ lockdep_assert_held(&hugetlb_lock);
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0) {
@@ -2610,6 +2815,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ struct page *page;
+ LIST_HEAD(page_list);
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
/*
@@ -2622,7 +2829,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
else
return -ENOMEM;
- spin_lock(&hugetlb_lock);
+ /*
+ * resize_lock mutex prevents concurrent adjustments to number of
+ * pages in hstate via the proc/sysfs interfaces.
+ */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* Check for a node specific request.
@@ -2653,7 +2865,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
*/
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
@@ -2682,14 +2895,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* page, free_huge_page will handle it by freeing the page
* and reducing the surplus.
*/
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
/* yield cpu to avoid soft lockup */
cond_resched();
ret = alloc_pool_huge_page(h, nodes_allowed,
node_alloc_noretry);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!ret)
goto out;
@@ -2716,18 +2929,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
+
+ /*
+ * Collect pages to be removed on list without dropping lock
+ */
while (min_count < persistent_huge_pages(h)) {
- if (!free_pool_huge_page(h, nodes_allowed, 0))
+ page = remove_pool_huge_page(h, nodes_allowed, 0);
+ if (!page)
break;
- cond_resched_lock(&hugetlb_lock);
+
+ list_add(&page->lru, &page_list);
}
+ /* free the pages after dropping lock */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
+
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
break;
}
out:
h->max_huge_pages = persistent_huge_pages(h);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
@@ -2882,9 +3107,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
if (err)
return err;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return count;
}
@@ -3215,6 +3440,7 @@ void __init hugetlb_add_hstate(unsigned int order)
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
+ mutex_init(&h->resize_lock);
h->order = order;
h->mask = ~(huge_page_size(h) - 1);
for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3493,10 @@ static int __init hugepages_setup(char *s)
/*
* Global state is always initialized later in hugetlb_init.
- * But we need to allocate >= MAX_ORDER hstates here early to still
+ * But we need to allocate gigantic hstates here early to still
* use the bootmem allocator.
*/
- if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
@@ -3470,9 +3696,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
goto out;
if (write) {
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
}
out:
return ret;
@@ -3568,7 +3794,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
if (!delta)
return 0;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
* reservation as the accounting is done on a global variable. Such
@@ -3607,7 +3833,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
return_unused_surplus_pages(h, (unsigned long) -delta);
out:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return ret;
}
@@ -3795,7 +4021,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, addr, sz);
+ dst_pte = huge_pte_alloc(dst, vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
@@ -3879,6 +4105,8 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
+ restore_reserve_on_error(h, vma, addr,
+ new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
@@ -3898,6 +4126,7 @@ again:
* See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_pte_wrprotect(entry);
}
page_dup_rmap(ptepage, true);
@@ -4310,6 +4539,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return 0;
}
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx,
+ unsigned int flags,
+ unsigned long haddr,
+ unsigned long reason)
+{
+ vm_fault_t ret;
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .flags = flags,
+
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ ret = handle_userfault(&vmf, reason);
+ i_mmap_lock_read(mapping);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ return ret;
+}
+
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
@@ -4348,35 +4615,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- /*
- * Check for page in userfault range
- */
+ /* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
- u32 hash;
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .flags = flags,
- /*
- * Hard to debug if it ends up being
- * used by a callee that assumes
- * something about the other
- * uninitialized fields... same as in
- * memory.c
- */
- };
-
- /*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
- */
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
+ flags, haddr,
+ VM_UFFD_MISSING);
goto out;
}
@@ -4395,13 +4638,10 @@ retry:
* sure there really is no pte entry.
*/
ptl = huge_pte_lock(h, mm, ptep);
- if (!huge_pte_none(huge_ptep_get(ptep))) {
- ret = 0;
- spin_unlock(ptl);
- goto out;
- }
+ ret = 0;
+ if (huge_pte_none(huge_ptep_get(ptep)))
+ ret = vmf_error(PTR_ERR(page));
spin_unlock(ptl);
- ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4435,6 +4675,16 @@ retry:
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
+
+ /* Check for page in userfault range. */
+ if (userfaultfd_minor(vma)) {
+ unlock_page(page);
+ put_page(page);
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
+ flags, haddr,
+ VM_UFFD_MINOR);
+ goto out;
+ }
}
/*
@@ -4563,7 +4813,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
mapping = vma->vm_file->f_mapping;
i_mmap_lock_read(mapping);
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
i_mmap_unlock_read(mapping);
return VM_FAULT_OOM;
@@ -4675,6 +4925,7 @@ out_mutex:
return ret;
}
+#ifdef CONFIG_USERFAULTFD
/*
* Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
* modifications for huge pages.
@@ -4684,8 +4935,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
struct page **pagep)
{
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct address_space *mapping;
pgoff_t idx;
unsigned long size;
@@ -4695,12 +4948,31 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spinlock_t *ptl;
int ret;
struct page *page;
+ int writable;
+
+ mapping = dst_vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ if (is_continue) {
+ ret = -EFAULT;
+ page = find_lock_page(mapping, idx);
+ if (!page)
+ goto out;
+ } else if (!*pagep) {
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
- if (!*pagep) {
- ret = -ENOMEM;
page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
@@ -4725,13 +4997,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*/
__SetPageUptodate(page);
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
- /*
- * If shared, add to page cache
- */
- if (vm_shared) {
+ /* Add shared, newly allocated pages to the page cache. */
+ if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
@@ -4776,8 +5043,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
- _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
- if (dst_vma->vm_flags & VM_WRITE)
+ /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+ if (is_continue && !vm_shared)
+ writable = 0;
+ else
+ writable = dst_vma->vm_flags & VM_WRITE;
+
+ _dst_pte = make_huge_pte(dst_vma, page, writable);
+ if (writable)
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
@@ -4791,20 +5064,23 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
- SetHPageMigratable(page);
- if (vm_shared)
+ if (!is_continue)
+ SetHPageMigratable(page);
+ if (vm_shared || is_continue)
unlock_page(page);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
- if (vm_shared)
+ if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
+#endif /* CONFIG_USERFAULTFD */
static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
int refs, struct page **pages,
@@ -4996,14 +5272,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
@@ -5280,6 +5548,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
/*
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
+ *
+ * Note that !resv_map implies freed == 0. So (chg - freed)
+ * won't go negative.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5597,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
return false;
}
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+ if (uffd_disable_huge_pmd_share(vma))
+ return false;
+#endif
+ return vma_shareable(vma, addr);
+}
+
/*
* Determine if start,end range within vma could be mapped by shared pmd.
* If yes, adjust start and end to cover range associated with possible
@@ -5338,8 +5618,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
/*
- * vma need span at least one aligned PUD size and the start,end range
- * must at least partialy within it.
+ * vma needs to span at least one aligned PUD size, and the range
+ * must be at least partially within it.
*/
if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
(*end <= v_start) || (*start >= v_end))
@@ -5370,9 +5650,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
* only required for subsequent processing.
*/
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
@@ -5382,9 +5662,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
pte_t *pte;
spinlock_t *ptl;
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
@@ -5448,9 +5725,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
-#define want_pmd_share() (1)
+
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
{
return NULL;
}
@@ -5465,11 +5743,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
-#define want_pmd_share() (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+ return false;
+}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
@@ -5487,8 +5769,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
- if (want_pmd_share() && pud_none(*pud))
- pte = huge_pmd_share(mm, addr, pud);
+ if (want_pmd_share(vma, addr) && pud_none(*pud))
+ pte = huge_pmd_share(mm, vma, addr, pud);
else
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
@@ -5632,7 +5914,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!PageHeadHuge(page) ||
!HPageMigratable(page) ||
!get_page_unless_zero(page)) {
@@ -5642,16 +5924,31 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
ClearHPageMigratable(page);
list_move_tail(&page->lru, list);
unlock:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+ int ret = 0;
+
+ *hugetlb = false;
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHeadHuge(page)) {
+ *hugetlb = true;
+ if (HPageFreed(page) || HPageMigratable(page))
+ ret = get_page_unless_zero(page);
+ }
+ spin_unlock_irq(&hugetlb_lock);
return ret;
}
void putback_active_hugepage(struct page *page)
{
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
SetHPageMigratable(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
put_page(page);
}
@@ -5679,13 +5976,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
SetHPageTemporary(oldpage);
ClearHPageTemporary(newpage);
- spin_lock(&hugetlb_lock);
+ /*
+ * There is no need to transfer the per-node surplus state
+ * when we do not cross the node.
+ */
+ if (new_nid == old_nid)
+ return;
+ spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
h->surplus_huge_pages_node[old_nid]--;
h->surplus_huge_pages_node[new_nid]++;
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ }
+}
+
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
+ unsigned long address, start, end;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ start = ALIGN(vma->vm_start, PUD_SIZE);
+ end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+ if (start >= end)
+ return;
+
+ /*
+ * No need to call adjust_range_if_pmd_sharing_possible(), because
+ * we have already done the PUD_SIZE alignment.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ for (address = start; address < end; address += PUD_SIZE) {
+ unsigned long tmp = address;
+
+ ptep = huge_pte_offset(mm, address, sz);
+ if (!ptep)
+ continue;
+ ptl = huge_pte_lock(h, mm, ptep);
+ /* We don't want 'address' to be changed */
+ huge_pmd_unshare(mm, vma, &tmp, ptep);
+ spin_unlock(ptl);
}
+ flush_hugetlb_tlb_range(vma, start, end);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ /*
+ * No need to call mmu_notifier_invalidate_range(), see
+ * Documentation/vm/mmu_notifier.rst.
+ */
+ mmu_notifier_invalidate_range_end(&range);
}
#ifdef CONFIG_CMA
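
The hugetlb hunks above share one theme: the faulting vma is now threaded down to page-table allocation, so huge_pte_alloc() can let want_pmd_share() veto PMD sharing per vma (userfaultfd being the first user), and hugetlb_unshare_all_pmds() can tear sharing down again. A minimal caller sketch using only the signatures shown above (the helper name and the <linux/hugetlb.h> include are assumptions):

#include <linux/hugetlb.h>

static pte_t *example_hugetlb_pte(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long haddr, struct hstate *h)
{
	/* Sharing is only attempted when want_pmd_share(vma, haddr) agrees. */
	return huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
}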
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 603a131e262d..5383023d0cca 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
do {
idx = 0;
for_each_hstate(h) {
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
hugetlb_cgroup_move_parent(idx, h_cg, page);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
idx++;
}
cond_resched();
@@ -784,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
@@ -795,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
set_hugetlb_cgroup(newhpage, h_cg);
set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
list_move(&newhpage->lru, &h->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return;
}
diff --git a/mm/internal.h b/mm/internal.h
index cb3c5e0a7799..e8fdb531f887 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb,
void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
-void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
- unsigned long nr);
+void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- DEFINE_READAHEAD(ractl, file, mapping, index);
- force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
+ DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
+ force_page_cache_ra(&ractl, nr_to_read);
}
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
@@ -97,26 +96,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-/*
- * When kernel touch the user page, the user page may be have been marked
- * poison but still mapped in user space, if without this page, the kernel
- * can guarantee the data integrity and operation success, the kernel is
- * better to check the posion status and avoid touching it, be good not to
- * panic, coredump for process fatal signal is a sample case matching this
- * scenario. Or if kernel can't guarantee the data integrity, it's better
- * not to call this function, let kernel touch the poison page and get to
- * panic.
- */
-static inline bool is_page_poisoned(struct page *page)
-{
- if (PageHWPoison(page))
- return true;
- else if (PageHuge(page) && PageHWPoison(compound_head(page)))
- return true;
-
- return false;
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -146,10 +125,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* family of functions.
*
* nodemask, migratetype and highest_zoneidx are initialized only once in
- * __alloc_pages_nodemask() and then never change.
+ * __alloc_pages() and then never change.
*
* zonelist, preferred_zone and highest_zoneidx are set first in
- * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * __alloc_pages() for the fast path, and might be later changed
* in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
@@ -245,7 +224,13 @@ struct compact_control {
unsigned int nr_freepages; /* Number of isolated free pages */
unsigned int nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
+ /*
+ * Acts as an in/out parameter to page isolation for migration.
+ * isolate_migratepages uses it as a search base.
+ * isolate_migratepages_block will update the value to the next pfn
+ * after the last isolated one.
+ */
+ unsigned long migrate_pfn;
unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
unsigned long total_migrate_scanned;
@@ -281,7 +266,7 @@ struct capture_control {
unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn);
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
int find_suitable_fallback(struct free_area *area, unsigned int order,
@@ -329,7 +314,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
}
/*
- * Stack area - atomatically grows in one direction
+ * Stack area - automatically grows in one direction
*
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
* do_mmap() forbids all other combinations.
@@ -399,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in @vma?
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page);
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (PageHead(page) &&
+ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
}
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address_end(struct page *page, struct vm_area_struct *vma)
{
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
-
- return max(start, vma->vm_start);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page) + compound_nr(page);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
}
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
@@ -447,7 +457,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-
+static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+}
#endif /* !CONFIG_MMU */
/*
@@ -638,4 +650,21 @@ struct migration_target_control {
gfp_t gfp_mask;
};
+/*
+ * mm/vmalloc.c
+ */
+#ifdef CONFIG_MMU
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift);
+#else
+static inline
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ return -EINVAL;
+}
+#endif
+
+void vunmap_range_noflush(unsigned long start, unsigned long end);
+
#endif /* __MM_INTERNAL_H */
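
A quick way to sanity-check the vma_address() arithmetic above is a user-space sketch (all values hypothetical, PAGE_SHIFT assumed to be 12): a page at pgoff 0x210 in a vma with vm_pgoff 0x200 and vm_start 0x7f0000000000 maps at 0x7f0000010000; anything falling outside [vm_start, vm_end) is now reported as -EFAULT rather than clamped.

#include <stdio.h>

int main(void)
{
	unsigned long vm_start = 0x7f0000000000UL;
	unsigned long vm_pgoff = 0x200, pgoff = 0x210;

	/* vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT) */
	printf("%#lx\n", vm_start + ((pgoff - vm_pgoff) << 12));
	return 0;
}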
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 11c75fb07584..32e390c42c53 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
unsigned long, shared.rb_subtree_last,
- vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+ vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
new file mode 100644
index 000000000000..01b362799930
--- /dev/null
+++ b/mm/io-mapping.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/io-mapping.h>
+
+/**
+ * io_mapping_map_user - remap an I/O mapping to userspace
+ * @iomap: the source io_mapping
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, unsigned long size)
+{
+ vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+
+ if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
+ return -EINVAL;
+
+ /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+ return remap_pfn_range_notrack(vma, addr, pfn, size,
+ __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+}
+EXPORT_SYMBOL_GPL(io_mapping_map_user);
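
io_mapping_map_user() is meant for driver mmap handlers whose io_mapping was validated at creation time, which is what lets it skip track_pfn(). A hedged usage sketch — struct my_drv, its fields and the flag setup are illustrative assumptions; only the helper's signature comes from this patch:

#include <linux/fs.h>
#include <linux/io-mapping.h>
#include <linux/mm.h>

struct my_drv {
	struct io_mapping *iomap;	/* prevalidated at driver init */
	unsigned long base_pfn;
};

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_drv *drv = file->private_data;
	unsigned long size = vma->vm_end - vma->vm_start;

	/* The helper warns and bails unless these flags are already set. */
	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	return io_mapping_map_user(drv->iomap, vma, vma->vm_start,
				   drv->base_pfn, size);
}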
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fa1ab41d152..8ee0136f8cb0 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,237 +16,22 @@
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
+static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1;
static int __init set_nohugeiomap(char *str)
{
- ioremap_huge_disabled = 1;
+ iomap_max_page_shift = PAGE_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
- if (!ioremap_huge_disabled) {
- if (arch_ioremap_p4d_supported())
- ioremap_p4d_capable = 1;
- if (arch_ioremap_pud_supported())
- ioremap_pud_capable = 1;
- if (arch_ioremap_pmd_supported())
- ioremap_pmd_capable = 1;
- }
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
- return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
- return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
- return ioremap_pmd_capable;
-}
-
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const unsigned int iomap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pte_t *pte;
- u64 pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel_track(pmd, addr, mask);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- *mask |= PGTBL_PTE_MODIFIED;
- return 0;
-}
-
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pmd_enabled())
- return 0;
-
- if ((end - addr) != PMD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PMD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PMD_SIZE))
- return 0;
-
- if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
- return 0;
-
- return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
-
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PMD_MODIFIED;
- continue;
- }
-
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pud_enabled())
- return 0;
-
- if ((end - addr) != PUD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PUD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PUD_SIZE))
- return 0;
-
- if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
- return 0;
-
- return pud_set_huge(pud, phys_addr, prot);
-}
-
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc_track(&init_mm, p4d, addr, mask);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
-
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PUD_MODIFIED;
- continue;
- }
-
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_p4d_enabled())
- return 0;
-
- if ((end - addr) != P4D_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, P4D_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, P4D_SIZE))
- return 0;
-
- if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
- return 0;
-
- return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
- if (!p4d)
- return -ENOMEM;
- do {
- next = p4d_addr_end(addr, end);
-
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_P4D_MODIFIED;
- continue;
- }
-
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
- pgd_t *pgd;
- unsigned long start;
- unsigned long next;
- int err;
- pgtbl_mod_mask mask = 0;
-
- might_sleep();
- BUG_ON(addr >= end);
-
- start = addr;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
- &mask);
- if (err)
- break;
- } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
- flush_cache_vmap(start, end);
-
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
-
- return err;
+ return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
}
#ifdef CONFIG_GENERIC_IOREMAP
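
With its private page-table walkers gone, ioremap_page_range() simply defers to vmap_range(), and whether huge I/O mappings are used is governed by iomap_max_page_shift: effectively unlimited by default, clamped to base pages by the existing boot parameter (not new in this patch):

    nohugeiomap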
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 7b53291dafa1..6bb87f2acd4e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -60,7 +60,7 @@ void kasan_disable_current(void)
void __kasan_unpoison_range(const void *address, size_t size)
{
- kasan_unpoison(address, size);
+ kasan_unpoison(address, size, false);
}
#ifdef CONFIG_KASAN_STACK
@@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task)
{
void *base = task_stack_page(task);
- kasan_unpoison(base, THREAD_SIZE);
+ kasan_unpoison(base, THREAD_SIZE, false);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- kasan_unpoison(base, watermark - base);
+ kasan_unpoison(base, watermark - base, false);
}
#endif /* CONFIG_KASAN_STACK */
@@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void)
return 0;
}
-void __kasan_alloc_pages(struct page *page, unsigned int order)
+void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
@@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order)
tag = kasan_random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison(page_address(page), PAGE_SIZE << order);
+ kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
-void __kasan_free_pages(struct page *page, unsigned int order)
+void __kasan_free_pages(struct page *page, unsigned int order, bool init)
{
if (likely(!PageHighMem(page)))
kasan_poison(page_address(page), PAGE_SIZE << order,
- KASAN_FREE_PAGE);
+ KASAN_FREE_PAGE, init);
}
/*
@@ -251,18 +251,18 @@ void __kasan_poison_slab(struct page *page)
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
kasan_poison(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_unpoison(object, cache->object_size);
+ kasan_unpoison(object, cache->object_size, false);
}
void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
}
/*
@@ -322,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
return (void *)object;
}
-static inline bool ____kasan_slab_free(struct kmem_cache *cache,
- void *object, unsigned long ip, bool quarantine)
+static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool quarantine, bool init)
{
u8 tag;
void *tagged_object;
@@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
}
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_FREE);
+ KASAN_KMALLOC_FREE, init);
if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
@@ -362,9 +362,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
return kasan_quarantine_put(cache, object);
}
-bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool init)
{
- return ____kasan_slab_free(cache, object, ip, true);
+ return ____kasan_slab_free(cache, object, ip, true, init);
}
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
@@ -407,9 +408,9 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
if (unlikely(!PageSlab(page))) {
if (____kasan_kfree_large(ptr, ip))
return;
- kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
+ kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
} else {
- ____kasan_slab_free(page->slab_cache, ptr, ip, false);
+ ____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
}
}
@@ -428,7 +429,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object,
}
void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
- void *object, gfp_t flags)
+ void *object, gfp_t flags, bool init)
{
u8 tag;
void *tagged_object;
@@ -453,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
* Unpoison the whole object.
* For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
*/
- kasan_unpoison(tagged_object, cache->object_size);
+ kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
if (kasan_stack_collection_enabled())
@@ -496,7 +497,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
redzone_end = round_up((unsigned long)(object + cache->object_size),
KASAN_GRANULE_SIZE);
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
/*
* Save alloc info (if possible) for kmalloc() allocations.
@@ -546,7 +547,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
KASAN_GRANULE_SIZE);
redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
+ KASAN_PAGE_REDZONE, false);
return (void *)ptr;
}
@@ -563,7 +564,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
* Part of it might already have been unpoisoned, but it's unknown
* how big that part is.
*/
- kasan_unpoison(object, size);
+ kasan_unpoison(object, size, false);
page = virt_to_head_page(object);
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 2e55e0f82f39..53cbf28859b5 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -208,11 +208,11 @@ static void register_global(struct kasan_global *global)
{
size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- kasan_unpoison(global->beg, global->size);
+ kasan_unpoison(global->beg, global->size, false);
kasan_poison(global->beg + aligned_size,
global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
+ KASAN_GLOBAL_REDZONE, false);
}
void __asan_register_globals(struct kasan_global *globals, size_t size)
@@ -292,11 +292,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
kasan_unpoison((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
+ size - rounded_down_size, false);
kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
+ KASAN_ALLOCA_LEFT, false);
kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
+ KASAN_ALLOCA_RIGHT, false);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -306,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- kasan_unpoison(stack_top, stack_bottom - stack_top);
+ kasan_unpoison(stack_top, stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 2aad21fda156..4004388b4e4b 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -25,6 +25,12 @@ enum kasan_arg {
KASAN_ARG_ON,
};
+enum kasan_arg_mode {
+ KASAN_ARG_MODE_DEFAULT,
+ KASAN_ARG_MODE_SYNC,
+ KASAN_ARG_MODE_ASYNC,
+};
+
enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_DEFAULT,
KASAN_ARG_STACKTRACE_OFF,
@@ -38,6 +44,7 @@ enum kasan_arg_fault {
};
static enum kasan_arg kasan_arg __ro_after_init;
+static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
@@ -45,6 +52,10 @@ static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
+/* Whether the asynchronous mode is enabled. */
+bool kasan_flag_async __ro_after_init;
+EXPORT_SYMBOL_GPL(kasan_flag_async);
+
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
@@ -68,6 +79,23 @@ static int __init early_kasan_flag(char *arg)
}
early_param("kasan", early_kasan_flag);
+/* kasan.mode=sync/async */
+static int __init early_kasan_mode(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "sync"))
+ kasan_arg_mode = KASAN_ARG_MODE_SYNC;
+ else if (!strcmp(arg, "async"))
+ kasan_arg_mode = KASAN_ARG_MODE_ASYNC;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.mode", early_kasan_mode);
+
/* kasan.stacktrace=off/on */
static int __init early_kasan_flag_stacktrace(char *arg)
{
@@ -115,7 +143,15 @@ void kasan_init_hw_tags_cpu(void)
return;
hw_init_tags(KASAN_TAG_MAX);
- hw_enable_tagging();
+
+ /*
+ * Enable async mode only when explicitly requested through
+ * the command line.
+ */
+ if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
+ hw_enable_tagging_async();
+ else
+ hw_enable_tagging_sync();
}
/* kasan_init_hw_tags() is called once on boot CPU. */
@@ -132,6 +168,22 @@ void __init kasan_init_hw_tags(void)
/* Enable KASAN. */
static_branch_enable(&kasan_flag_enabled);
+ switch (kasan_arg_mode) {
+ case KASAN_ARG_MODE_DEFAULT:
+ /*
+ * Default to sync mode.
+ * Do nothing, kasan_flag_async keeps its default value.
+ */
+ break;
+ case KASAN_ARG_MODE_SYNC:
+ /* Do nothing, kasan_flag_async keeps its default value. */
+ break;
+ case KASAN_ARG_MODE_ASYNC:
+ /* Async mode enabled. */
+ kasan_flag_async = true;
+ break;
+ }
+
switch (kasan_arg_stacktrace) {
case KASAN_ARG_STACKTRACE_DEFAULT:
/* Default to enabling stack trace collection. */
@@ -194,10 +246,16 @@ void kasan_set_tagging_report_once(bool state)
}
EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once);
-void kasan_enable_tagging(void)
+void kasan_enable_tagging_sync(void)
+{
+ hw_enable_tagging_sync();
+}
+EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync);
+
+void kasan_force_async_fault(void)
{
- hw_enable_tagging();
+ hw_force_async_tag_fault();
}
-EXPORT_SYMBOL_GPL(kasan_enable_tagging);
+EXPORT_SYMBOL_GPL(kasan_force_async_fault);
#endif
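
The new mode is a pure boot-time choice. A hedged example command line, combining it with the kasan= switch parsed earlier in this file:

    kasan=on kasan.mode=async

Leaving kasan.mode out (or passing kasan.mode=sync) keeps the default synchronous checks, where the faulting access is reported precisely; async trades that detail for lower overhead, and kasan_report_async() further down can then only say that an invalid access happened, without address details.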
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index c4605ac9837b..348f31d15a97 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -220,8 +220,8 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
/**
* kasan_populate_early_shadow - populate shadow memory region with
* kasan_early_shadow_page
- * @shadow_start - start of the memory range to populate
- * @shadow_end - end of the memory range to populate
+ * @shadow_start: start of the memory range to populate
+ * @shadow_end: end of the memory range to populate
*/
int __ref kasan_populate_early_shadow(const void *shadow_start,
const void *shadow_end)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 3436c6bf7c0c..8f450bc28045 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -7,20 +7,37 @@
#include <linux/stackdepot.h>
#ifdef CONFIG_KASAN_HW_TAGS
+
#include <linux/static_key.h>
+
DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+extern bool kasan_flag_async __ro_after_init;
+
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
}
+
+static inline bool kasan_async_mode_enabled(void)
+{
+ return kasan_flag_async;
+}
#else
+
static inline bool kasan_stack_collection_enabled(void)
{
return true;
}
+
+static inline bool kasan_async_mode_enabled(void)
+{
+ return false;
+}
+
#endif
extern bool kasan_flag_panic __ro_after_init;
+extern bool kasan_flag_async __ro_after_init;
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
@@ -38,9 +55,9 @@ extern bool kasan_flag_panic __ro_after_init;
#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
#ifdef CONFIG_KASAN_HW_TAGS
-#define KASAN_TAG_MIN 0xF0 /* mimimum value for random tags */
+#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */
#else
-#define KASAN_TAG_MIN 0x00 /* mimimum value for random tags */
+#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */
#endif
#ifdef CONFIG_KASAN_GENERIC
@@ -146,7 +163,7 @@ struct kasan_alloc_meta {
struct kasan_track alloc_track;
#ifdef CONFIG_KASAN_GENERIC
/*
- * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The auxiliary stack is stored into struct kasan_alloc_meta.
* The free stack is stored into struct kasan_free_meta.
*/
depot_stack_handle_t aux_stack[2];
@@ -275,8 +292,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifdef CONFIG_KASAN_HW_TAGS
-#ifndef arch_enable_tagging
-#define arch_enable_tagging()
+#ifndef arch_enable_tagging_sync
+#define arch_enable_tagging_sync()
+#endif
+#ifndef arch_enable_tagging_async
+#define arch_enable_tagging_async()
#endif
#ifndef arch_init_tags
#define arch_init_tags(max_tag)
@@ -284,6 +304,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_set_tagging_report_once
#define arch_set_tagging_report_once(state)
#endif
+#ifndef arch_force_async_tag_fault
+#define arch_force_async_tag_fault()
+#endif
#ifndef arch_get_random_tag
#define arch_get_random_tag() (0xFF)
#endif
@@ -291,19 +314,23 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define arch_get_mem_tag(addr) (0xFF)
#endif
#ifndef arch_set_mem_tag_range
-#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr))
+#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr))
#endif
-#define hw_enable_tagging() arch_enable_tagging()
+#define hw_enable_tagging_sync() arch_enable_tagging_sync()
+#define hw_enable_tagging_async() arch_enable_tagging_async()
#define hw_init_tags(max_tag) arch_init_tags(max_tag)
#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state)
+#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
-#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+#define hw_set_mem_tag_range(addr, size, tag, init) \
+ arch_set_mem_tag_range((addr), (size), (tag), (init))
#else /* CONFIG_KASAN_HW_TAGS */
-#define hw_enable_tagging()
+#define hw_enable_tagging_sync()
+#define hw_enable_tagging_async()
#define hw_set_tagging_report_once(state)
#endif /* CONFIG_KASAN_HW_TAGS */
@@ -311,12 +338,14 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_set_tagging_report_once(bool state);
-void kasan_enable_tagging(void);
+void kasan_enable_tagging_sync(void);
+void kasan_force_async_fault(void);
#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
static inline void kasan_set_tagging_report_once(bool state) { }
-static inline void kasan_enable_tagging(void) { }
+static inline void kasan_enable_tagging_sync(void) { }
+static inline void kasan_force_async_fault(void) { }
#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
@@ -330,7 +359,7 @@ static inline u8 kasan_random_tag(void) { return 0; }
#ifdef CONFIG_KASAN_HW_TAGS
-static inline void kasan_poison(const void *addr, size_t size, u8 value)
+static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
addr = kasan_reset_tag(addr);
@@ -343,10 +372,10 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value)
if (WARN_ON(size & KASAN_GRANULE_MASK))
return;
- hw_set_mem_tag_range((void *)addr, size, value);
+ hw_set_mem_tag_range((void *)addr, size, value, init);
}
-static inline void kasan_unpoison(const void *addr, size_t size)
+static inline void kasan_unpoison(const void *addr, size_t size, bool init)
{
u8 tag = get_tag(addr);
@@ -360,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- hw_set_mem_tag_range((void *)addr, size, tag);
+ hw_set_mem_tag_range((void *)addr, size, tag, init);
}
static inline bool kasan_byte_accessible(const void *addr)
@@ -368,33 +397,34 @@ static inline bool kasan_byte_accessible(const void *addr)
u8 ptr_tag = get_tag(addr);
u8 mem_tag = hw_get_mem_tag((void *)addr);
- return (mem_tag != KASAN_TAG_INVALID) &&
- (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
+ return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag;
}
#else /* CONFIG_KASAN_HW_TAGS */
/**
- * kasan_poison - mark the memory range as unaccessible
+ * kasan_poison - mark the memory range as inaccessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size, must be aligned to KASAN_GRANULE_SIZE
* @value - value that's written to metadata for the range
+ * @init - whether to initialize the memory range (only for hardware tag-based)
*
* The size gets aligned to KASAN_GRANULE_SIZE before marking the range.
*/
-void kasan_poison(const void *addr, size_t size, u8 value);
+void kasan_poison(const void *addr, size_t size, u8 value, bool init);
/**
* kasan_unpoison - mark the memory range as accessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size, can be unaligned
+ * @init - whether to initialize the memory range (only for hardware tag-based)
*
* For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
* marking the range.
* For the generic mode, the last granule of the memory range gets partially
* unpoisoned based on the @size.
*/
-void kasan_unpoison(const void *addr, size_t size);
+void kasan_unpoison(const void *addr, size_t size, bool init);
bool kasan_byte_accessible(const void *addr);
@@ -404,7 +434,7 @@ bool kasan_byte_accessible(const void *addr);
/**
* kasan_poison_last_granule - mark the last granule of the memory range as
- * unaccessible
+ * inaccessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size
*
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 728fb24c5683..d8ccff4c1275 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -27,7 +27,7 @@
/* Data structure and operations for quarantine queues. */
/*
- * Each queue is a signle-linked list, which also stores the total size of
+ * Each queue is a singly linked list, which also stores the total size of
* objects inside of it.
*/
struct qlist_head {
@@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
local_irq_save(flags);
/*
- * As the object now gets freed from the quaratine, assume that its
+ * As the object now gets freed from the quarantine, assume that its
* free track is no longer valid.
*/
*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 87b271206163..14bd51ea2348 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -87,7 +87,8 @@ static void start_report(unsigned long *flags)
static void end_report(unsigned long *flags, unsigned long addr)
{
- trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
+ if (!kasan_async_mode_enabled())
+ trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
@@ -360,6 +361,25 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
end_report(&flags, (unsigned long)object);
}
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
+{
+ unsigned long flags;
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
+ start_report(&flags);
+ pr_err("BUG: KASAN: invalid-access\n");
+ pr_err("Asynchronous mode enabled: no access details available\n");
+ pr_err("\n");
+ dump_stack();
+ end_report(&flags, 0);
+}
+#endif /* CONFIG_KASAN_HW_TAGS */
+
static void __kasan_report(unsigned long addr, size_t size, bool is_write,
unsigned long ip)
{
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index de732bc341c5..139615ef326b 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -148,7 +148,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr,
}
/* Copy token (+ 1 byte for '\0'). */
- strlcpy(token, *frame_descr, tok_len + 1);
+ strscpy(token, *frame_descr, tok_len + 1);
}
/* Advance frame_descr past separator. */
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 63f43443f5d7..082ee5b6d9a1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len)
return __memcpy(dest, src, len);
}
-void kasan_poison(const void *addr, size_t size, u8 value)
+void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
void *shadow_start, *shadow_end;
@@ -106,7 +106,7 @@ void kasan_poison_last_granule(const void *addr, size_t size)
}
#endif
-void kasan_unpoison(const void *addr, size_t size)
+void kasan_unpoison(const void *addr, size_t size, bool init)
{
u8 tag = get_tag(addr);
@@ -129,7 +129,7 @@ void kasan_unpoison(const void *addr, size_t size)
return;
/* Unpoison all granules that cover the object. */
- kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag);
+ kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false);
/* Partially poison the last granule for the generic mode. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -316,7 +316,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
* // rest of vmalloc process <data dependency>
* STORE p, a LOAD shadow(x+99)
*
- * If there is no barrier between the end of unpoisioning the shadow
+ * If there is no barrier between the end of unpoisoning the shadow
* and the store of the result to p, the stores could be committed
* in a different order by CPU#0, and CPU#1 could erroneously observe
* poison in the shadow.
@@ -344,7 +344,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- kasan_poison(start, size, KASAN_VMALLOC_INVALID);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
}
void kasan_unpoison_vmalloc(const void *start, unsigned long size)
@@ -352,7 +352,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size)
if (!is_vmalloc_or_module_addr(start))
return;
- kasan_unpoison(start, size);
+ kasan_unpoison(start, size, false);
}
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
@@ -384,7 +384,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
* How does this work?
* -------------------
*
- * We have a region that is page aligned, labelled as A.
+ * We have a region that is page aligned, labeled as A.
* That might not map onto the shadow in a way that is page-aligned:
*
* start end
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 94c2d33be333..9df8e7f69e87 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
bool kasan_byte_accessible(const void *addr)
{
u8 tag = get_tag(addr);
- u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
+ void *untagged_addr = kasan_reset_tag(addr);
+ u8 shadow_byte;
- return (shadow_byte != KASAN_TAG_INVALID) &&
- (tag == KASAN_TAG_KERNEL || tag == shadow_byte);
+ if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))
+ return false;
+
+ shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr));
+ return tag == KASAN_TAG_KERNEL || tag == shadow_byte;
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
@@ -159,7 +163,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- kasan_poison((void *)addr, size, tag);
+ kasan_poison((void *)addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index d53c91f881a4..4d21ac44d5d3 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
+#include <linux/irq_work.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
@@ -19,6 +20,7 @@
#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
+#include <linux/sched/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -372,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
/* Restore page protection if there was an OOB access. */
if (meta->unprotected_page) {
+ memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
kfence_protect(meta->unprotected_page);
meta->unprotected_page = 0;
}
@@ -586,6 +589,17 @@ late_initcall(kfence_debugfs_init);
/* === Allocation Gate Timer ================================================ */
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
+static void wake_up_kfence_timer(struct irq_work *work)
+{
+ wake_up(&allocation_wait);
+}
+static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
+#endif
+
/*
* Set up delayed work, which will enable and disable the static key. We need to
* use a work queue (rather than a simple timer), since enabling and disabling a
@@ -603,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work)
if (!READ_ONCE(kfence_enabled))
return;
- /* Enable static key, and await allocation to happen. */
atomic_set(&kfence_allocation_gate, 0);
#ifdef CONFIG_KFENCE_STATIC_KEYS
+ /* Enable static key, and await allocation to happen. */
static_branch_enable(&kfence_allocation_key);
- /*
- * Await an allocation. Timeout after 1 second, in case the kernel stops
- * doing allocations, to avoid stalling this worker task for too long.
- */
- {
- unsigned long end_wait = jiffies + HZ;
-
- do {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (atomic_read(&kfence_allocation_gate) != 0)
- break;
- schedule_timeout(1);
- } while (time_before(jiffies, end_wait));
- __set_current_state(TASK_RUNNING);
+
+ if (sysctl_hung_task_timeout_secs) {
+ /*
+ * During low activity with no allocations we might wait a
+ * while; let's avoid the hung task warning.
+ */
+ wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } else {
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
}
+
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
#endif
- schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+ queue_delayed_work(system_power_efficient_wq, &kfence_timer,
+ msecs_to_jiffies(kfence_sample_interval));
}
static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
@@ -654,7 +666,7 @@ void __init kfence_init(void)
}
WRITE_ONCE(kfence_enabled, true);
- schedule_delayed_work(&kfence_timer, 0);
+ queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
(void *)(__kfence_pool + KFENCE_POOL_SIZE));
@@ -728,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
*/
if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ /*
+ * waitqueue_active() is fully ordered after the update of
+ * kfence_allocation_gate per atomic_inc_return().
+ */
+ if (waitqueue_active(&allocation_wait)) {
+ /*
+ * Calling wake_up() here may deadlock when allocations happen
+ * from within timer code. Use an irq_work to defer it.
+ */
+ irq_work_queue(&wake_up_kfence_timer_work);
+ }
+#endif
if (!READ_ONCE(kfence_enabled))
return NULL;
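
The allocation-side hunk above pairs with the waitqueue added earlier in this file: the gate-toggling worker now sleeps until an allocation actually passes the gate, and the hot path defers its wake-up through irq_work because a direct wake_up() could deadlock when the allocation comes from timer code. A stripped-down sketch of that pattern, with hypothetical names:

#include <linux/atomic.h>
#include <linux/irq_work.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);
static atomic_t example_event;

static void example_wake(struct irq_work *work)
{
	wake_up(&example_wait);
}
static DEFINE_IRQ_WORK(example_wake_work, example_wake);

static void example_worker(void)
{
	/* Sleep without adding to the load average, like the kfence worker. */
	wait_event_idle(example_wait, atomic_read(&example_event));
}

static void example_hot_path(void)
{
	atomic_set(&example_event, 1);
	smp_mb();	/* order the flag write before the waitqueue_active() check */
	if (waitqueue_active(&example_wait))
		irq_work_queue(&example_wake_work);
}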
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index e3f71451ad9e..2a319c21c939 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
if (panic_on_warn)
panic("panic_on_warn set ...\n");
- /* We encountered a memory unsafety error, taint the kernel! */
+ /* We encountered a memory safety error, taint the kernel! */
add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a7d6cb912b05..6c0185fdd815 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm)
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+ VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*
* The page table that maps the page has been already unlinked
* from the page table tree and this process cannot get
- * an additinal pin on the page.
+ * an additional pin on the page.
*
* New pins can come later if the page is shared across fork,
* but not from this process. The other process cannot write to
@@ -716,17 +716,17 @@ next:
if (pte_write(pteval))
writable = true;
}
- if (likely(writable)) {
- if (likely(referenced)) {
- result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 1;
- }
- } else {
+
+ if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
+ } else if (unlikely(!referenced)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
}
-
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
@@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid)
* If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!node_reclaim_mode)
+ if (!node_reclaim_enabled())
return false;
/* If there is a count for this node already, it must be acceptable */
@@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm,
mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
- goto out;
+ goto out_up_write;
/* check if the pmd is still valid */
if (mm_find_pmd(mm, address) != pmd)
- goto out;
+ goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
@@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
result = SCAN_FAIL;
- goto out;
+ goto out_up_write;
}
/*
@@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm,
__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
&compound_pagelist);
pte_unmap(pte);
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), but
+ * the smp_wmb() inside __SetPageUptodate() can be reused to
+ * avoid the copy_huge_page writes to become visible after
+ * the set_pmd_at() write.
+ */
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
- /*
- * spin_lock() below is not the equivalent of smp_wmb(), so
- * this is needed to avoid the copy_huge_page writes to become
- * visible after the set_pmd_at() write.
- */
- smp_wmb();
-
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
@@ -1216,8 +1215,6 @@ out_nolock:
mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
-out:
- goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
}
- if (!pte_present(pteval)) {
- result = SCAN_PTE_NON_PRESENT;
- goto out_unmap;
- }
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
@@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
int i;
if (!vma || !vma->vm_file ||
- vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+ !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
return;
/*
@@ -1533,16 +1526,16 @@ abort:
goto drop_hpage;
}
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
struct mm_struct *mm = mm_slot->mm;
int i;
if (likely(mm_slot->nr_pte_mapped_thp == 0))
- return 0;
+ return;
if (!mmap_write_trylock(mm))
- return -EBUSY;
+ return;
if (unlikely(khugepaged_test_exit(mm)))
goto out;
@@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
out:
mm_slot->nr_pte_mapped_thp = 0;
mmap_write_unlock(mm);
- return 0;
}
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
@@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
BUILD_BUG();
}
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
- return 0;
}
#endif
@@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void)
{
struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
+ unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
- barrier(); /* write khugepaged_pages_to_scan to local stack */
-
lru_add_drain_all();
while (progress < pages) {
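
[Editor's sketch] The khugepaged_do_scan() hunk above swaps the explicit barrier() for READ_ONCE() when snapshotting khugepaged_pages_to_scan at the start of a scan pass, so concurrent sysfs writes only affect the next pass. A rough userspace analogy of that "read the tunable once per pass" pattern using C11 atomics; pages_to_scan and do_scan_pass are invented for the example, not kernel symbols:

/* pages_snapshot.c: why a scan pass reads its tunable once up front.
 * Build: cc -std=c11 pages_snapshot.c -o pages_snapshot
 */
#include <stdatomic.h>
#include <stdio.h>

/* stands in for a tunable that can be rewritten from sysfs at any time */
static _Atomic unsigned int pages_to_scan = 4096;

static void do_scan_pass(void)
{
    /* one snapshot per pass: later writes are picked up by the next pass */
    unsigned int pages = atomic_load_explicit(&pages_to_scan, memory_order_relaxed);
    unsigned int progress = 0;

    while (progress < pages)
        progress += 512;               /* pretend one pmd range was scanned */

    printf("scanned %u pages this pass\n", progress);
}

int main(void)
{
    do_scan_pass();
    atomic_store(&pages_to_scan, 8192);   /* concurrent tuning */
    do_scan_pass();
    return 0;
}
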
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fe6e3ae8e8c6..92a2d4885808 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1203,7 +1203,7 @@ static void update_refs(struct kmemleak_object *object)
}
/*
- * Memory scanning is a long process and it needs to be interruptable. This
+ * Memory scanning is a long process and it needs to be interruptible. This
* function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
diff --git a/mm/ksm.c b/mm/ksm.c
index 9694ee2c71de..2f3aaeb34a42 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -215,8 +215,6 @@ struct rmap_item {
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
-#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
- /* to mask all the flags */
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -461,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
- * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ * mmap of /dev/mem, where we would not want to touch it.
*
* FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
@@ -794,6 +792,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
+ rmap_item->head = NULL;
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -817,8 +816,7 @@ out:
cond_resched(); /* we're called from many long loops */
}
-static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
- struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
{
while (*rmap_list) {
struct rmap_item *rmap_item = *rmap_list;
@@ -989,7 +987,7 @@ static int unmerge_and_remove_all_rmap_items(void)
goto error;
}
- remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ remove_trailing_rmap_items(&mm_slot->rmap_list);
mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
@@ -1068,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
- * with the pagecount against the mapcount is racey and
+ * with the pagecount against the mapcount is racy and
* O_DIRECT can happen right after the check.
* So we clear the pte and flush the tlb before the check
* this assure us that no O_DIRECT can happen after the check
@@ -1438,7 +1436,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
*/
*_stable_node = found;
/*
- * Just for robustneess as stable_node is
+ * Just for robustness, as stable_node is
* otherwise left as a stable pointer, the
* compiler shall optimize it away at build
* time.
@@ -1771,7 +1769,6 @@ chain_append:
* stable_node_dup is the dup to replace.
*/
if (stable_node_dup == stable_node) {
- VM_BUG_ON(is_stable_node_chain(stable_node_dup));
VM_BUG_ON(is_stable_node_dup(stable_node_dup));
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(stable_node_dup,
@@ -1785,7 +1782,6 @@ chain_append:
* of the current nid for this page
* content.
*/
- VM_BUG_ON(!is_stable_node_chain(stable_node));
VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
VM_BUG_ON(page_node->head != &migrate_nodes);
list_del(&page_node->list);
@@ -2337,7 +2333,7 @@ next_mm:
* Nuke all the rmap_items that are above this current rmap:
* because there were no VM_MERGEABLE vmas with such addresses.
*/
- remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+ remove_trailing_rmap_items(ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -2634,7 +2630,7 @@ again:
vma = vmac->vma;
/* Ignore the stable/unstable/sqnr flags */
- addr = rmap_item->address & ~KSM_FLAG_MASK;
+ addr = rmap_item->address & PAGE_MASK;
if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
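
[Editor's sketch] The ksm.c hunks drop KSM_FLAG_MASK and mask rmap_item->address with PAGE_MASK directly, relying on all of the flag and seqnr bits living in the low bits of a page-aligned value. A small standalone sketch of that packing trick; the flag values mirror the ones visible in the patch context, while PAGE_SIZE is assumed to be 4096 here purely for the example:

/* addr_flags.c: flags packed into the low bits of a page-aligned address.
 * Build: cc addr_flags.c -o addr_flags
 */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define UNSTABLE_FLAG 0x100UL
#define STABLE_FLAG   0x200UL

int main(void)
{
    unsigned long addr = 0x7f1234560000UL;         /* page aligned */
    unsigned long item = addr | STABLE_FLAG | 0x3; /* seqnr + tree flag in low bits */

    /* masking with PAGE_MASK strips every low-bit flag in one go */
    printf("stored 0x%lx, address back: 0x%lx\n", item, item & PAGE_MASK);
    printf("stable? %s\n", (item & STABLE_FLAG) ? "yes" : "no");
    return 0;
}
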
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 6f067b6b935f..cd58790d0fb3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
- memcg_set_shrinker_bit(memcg, nid,
- lru_shrinker_id(lru));
+ set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
@@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
if (src->nr_items) {
dst->nr_items += src->nr_items;
- memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+ set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 01fef79ac761..63e489e5bfdb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -799,7 +799,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
- * vma was splitted while the mmap_lock was
+ * vma was split while the mmap_lock was
* released the effect of the concurrent
* operation may not cause madvise() to
* have an undefined result. There may be an
@@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior)
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
* MADV_COLD - the application is not expected to use this memory soon,
* deactivate pages in this range so that they can be reclaimed
- * easily if memory pressure hanppens.
+ * easily if memory pressure happens.
* MADV_PAGEOUT - the application is not expected to use this memory soon,
* page out the pages in this range immediately.
*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e064ac0d850a..64ada9e650a5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -215,7 +215,7 @@ enum res_type {
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-/* Used for OOM nofiier */
+/* Used for OOM notifier */
#define OOM_CONTROL (0)
/*
@@ -255,10 +255,8 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;
-static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages);
-static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages);
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -295,7 +293,7 @@ static void obj_cgroup_release(struct percpu_ref *ref)
spin_lock_irqsave(&css_set_lock, flags);
memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
- __memcg_kmem_uncharge(memcg, nr_pages);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
list_del(&objcg->list);
mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
@@ -402,129 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif
-static int memcg_shrinker_map_size;
-static DEFINE_MUTEX(memcg_shrinker_map_mutex);
-
-static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
-{
- kvfree(container_of(head, struct memcg_shrinker_map, rcu));
-}
-
-static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
- int size, int old_size)
-{
- struct memcg_shrinker_map *new, *old;
- int nid;
-
- lockdep_assert_held(&memcg_shrinker_map_mutex);
-
- for_each_node(nid) {
- old = rcu_dereference_protected(
- mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
- /* Not yet online memcg */
- if (!old)
- return 0;
-
- new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
- if (!new)
- return -ENOMEM;
-
- /* Set all old bits, clear all new bits */
- memset(new->map, (int)0xff, old_size);
- memset((void *)new->map + old_size, 0, size - old_size);
-
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
- call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
- }
-
- return 0;
-}
-
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_node *pn;
- struct memcg_shrinker_map *map;
- int nid;
-
- if (mem_cgroup_is_root(memcg))
- return;
-
- for_each_node(nid) {
- pn = mem_cgroup_nodeinfo(memcg, nid);
- map = rcu_dereference_protected(pn->shrinker_map, true);
- kvfree(map);
- rcu_assign_pointer(pn->shrinker_map, NULL);
- }
-}
-
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- struct memcg_shrinker_map *map;
- int nid, size, ret = 0;
-
- if (mem_cgroup_is_root(memcg))
- return 0;
-
- mutex_lock(&memcg_shrinker_map_mutex);
- size = memcg_shrinker_map_size;
- for_each_node(nid) {
- map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
- if (!map) {
- memcg_free_shrinker_maps(memcg);
- ret = -ENOMEM;
- break;
- }
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
- }
- mutex_unlock(&memcg_shrinker_map_mutex);
-
- return ret;
-}
-
-int memcg_expand_shrinker_maps(int new_id)
-{
- int size, old_size, ret = 0;
- struct mem_cgroup *memcg;
-
- size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
- old_size = memcg_shrinker_map_size;
- if (size <= old_size)
- return 0;
-
- mutex_lock(&memcg_shrinker_map_mutex);
- if (!root_mem_cgroup)
- goto unlock;
-
- for_each_mem_cgroup(memcg) {
- if (mem_cgroup_is_root(memcg))
- continue;
- ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- if (ret) {
- mem_cgroup_iter_break(NULL, memcg);
- goto unlock;
- }
- }
-unlock:
- if (!ret)
- memcg_shrinker_map_size = size;
- mutex_unlock(&memcg_shrinker_map_mutex);
- return ret;
-}
-
-void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
- if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
- struct memcg_shrinker_map *map;
-
- rcu_read_lock();
- map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
- /* Pairs with smp mb in shrink_slab() */
- smp_mb__before_atomic();
- set_bit(shrinker_id, map->map);
- rcu_read_unlock();
- }
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -713,7 +588,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
+ mz = memcg->nodeinfo[nid];
mctz = soft_limit_tree_node(nid);
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
@@ -764,28 +639,37 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x, threshold = MEMCG_CHARGE_BATCH;
-
if (mem_cgroup_disabled())
return;
- if (memcg_stat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+}
- x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > threshold)) {
- struct mem_cgroup *mi;
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->vmstats.state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->stat[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmstats[idx]);
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
+#ifdef CONFIG_SMP
+ if (x < 0)
x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+#endif
+ return x;
}
static struct mem_cgroup_per_node *
@@ -796,7 +680,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
parent = parent_mem_cgroup(pn->memcg);
if (!parent)
return NULL;
- return mem_cgroup_nodeinfo(parent, nid);
+ return parent->nodeinfo[nid];
}
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -855,18 +739,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
- struct mem_cgroup *memcg = page_memcg(head);
+ struct mem_cgroup *memcg;
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
+ rcu_read_lock();
+ memcg = page_memcg(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
+ rcu_read_unlock();
__mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);
@@ -898,35 +786,21 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
- * @count: the number of events that occured
+ * @count: the number of events that occurred
*/
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
- unsigned long x;
-
if (mem_cgroup_disabled())
return;
- x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
- if (unlikely(x > MEMCG_CHARGE_BATCH)) {
- struct mem_cgroup *mi;
-
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->events[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmevents[idx]);
- x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+ __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return atomic_long_read(&memcg->vmevents[event]);
+ return READ_ONCE(memcg->vmstats.events[event]);
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
@@ -935,7 +809,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
int cpu;
for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
return x;
}
@@ -1030,7 +904,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
rcu_read_lock();
do {
/*
- * Page cache insertions can happen withou an
+ * Page cache insertions can happen without an
* actual mm context, e.g. during disk probing
* on boot, loopback IO, acct() writes etc.
*/
@@ -1055,20 +929,6 @@ static __always_inline struct mem_cgroup *active_memcg(void)
return current->active_memcg;
}
-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = active_memcg();
- /* remote memcg must hold a ref. */
- if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- rcu_read_unlock();
-
- return memcg;
-}
-
static __always_inline bool memcg_kmem_bypass(void)
{
/* Allow remote memcg charging from any context. */
@@ -1083,20 +943,6 @@ static __always_inline bool memcg_kmem_bypass(void)
}
/**
- * If active memcg is set, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
- if (memcg_kmem_bypass())
- return NULL;
-
- if (unlikely(active_memcg()))
- return get_active_memcg();
-
- return get_mem_cgroup_from_mm(current->mm);
-}
-
-/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
@@ -1136,7 +982,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (reclaim) {
struct mem_cgroup_per_node *mz;
- mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+ mz = root->nodeinfo[reclaim->pgdat->node_id];
iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
@@ -1238,7 +1084,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(from, nid);
+ mz = from->nodeinfo[nid];
iter = &mz->iter;
cmpxchg(&iter->position, dead_memcg, NULL);
}
@@ -1571,6 +1417,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*
* Current memory state:
*/
+ cgroup_rstat_flush(memcg->css.cgroup);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
@@ -1865,7 +1712,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * Be careful about under_oom underflows becase a child memcg
+ * Be careful about under_oom underflows because a child memcg
* could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
@@ -2037,7 +1884,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
+ * uncharges. Wake any sleepers explicitly.
*/
memcg_oom_recover(memcg);
}
@@ -2118,11 +1965,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
* This function protects unlocked LRU pages from being moved to
* another cgroup.
*
- * It ensures lifetime of the returned memcg. Caller is responsible
- * for the lifetime of the page; __unlock_page_memcg() is available
- * when @page might get freed inside the locked section.
+ * It ensures lifetime of the locked memcg. Caller is responsible
+ * for the lifetime of the page.
*/
-struct mem_cgroup *lock_page_memcg(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
@@ -2132,21 +1978,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page itself from being freed. E.g. writeback
- * doesn't hold a page reference and relies on PG_writeback to
- * keep off truncation, migration and so forth.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page_memcg(head);
if (unlikely(!memcg))
- return NULL;
+ return;
#ifdef CONFIG_PROVE_LOCKING
local_irq_save(flags);
@@ -2155,7 +1995,7 @@ again:
#endif
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page_memcg(head)) {
@@ -2164,24 +2004,17 @@ again:
}
/*
- * When charge migration first begins, we can have locked and
- * unlocked page stat updates happening concurrently. Track
- * the task who has the lock for unlock_page_memcg().
+ * When charge migration first begins, we can have multiple
+ * critical sections holding the fast-path RCU lock and one
+ * holding the slowpath move_lock. Track the task who has the
+ * move_lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
-
- return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
-/**
- * __unlock_page_memcg - unlock and unpin a memcg
- * @memcg: the memcg
- *
- * Unlock and unpin a memcg returned by lock_page_memcg().
- */
-void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __unlock_page_memcg(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -2381,50 +2214,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static int memcg_hotplug_cpu_dead(unsigned int cpu)
+static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
{
- struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg, *mi;
-
- stock = &per_cpu(memcg_stock, cpu);
- drain_stock(stock);
+ int nid;
- for_each_mem_cgroup(memcg) {
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ unsigned long stat[NR_VM_NODE_STAT_ITEMS];
+ struct batched_lruvec_stat *lstatc;
int i;
- for (i = 0; i < MEMCG_NR_STAT; i++) {
- int nid;
- long x;
-
- x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmstats[i]);
-
- if (i >= NR_VM_NODE_STAT_ITEMS)
- continue;
+ lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ stat[i] = lstatc->count[i];
+ lstatc->count[i] = 0;
+ }
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn;
+ do {
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
+ }
+}
- pn = mem_cgroup_nodeinfo(memcg, nid);
- x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
- if (x)
- do {
- atomic_long_add(x, &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
- }
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
+{
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg;
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- long x;
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
- x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmevents[i]);
- }
- }
+ for_each_mem_cgroup(memcg)
+ memcg_flush_lruvec_page_state(memcg, cpu);
return 0;
}
@@ -2793,9 +2615,6 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (gfp_mask & __GFP_NOFAIL)
- goto force;
-
if (fatal_signal_pending(current))
goto force;
@@ -2905,6 +2724,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
page->memcg_data = (unsigned long)memcg;
}
+static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(objcg);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ return memcg;
+}
+
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page)
@@ -3056,23 +2889,45 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
-/**
- * __memcg_kmem_charge: charge a number of kernel pages to a memcg
- * @memcg: memory cgroup to charge
+/*
+ * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
+ * @objcg: object cgroup to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
+}
+
+/*
+ * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
+ * @objcg: object cgroup to charge
* @gfp: reclaim mode
* @nr_pages: number of pages to charge
*
* Returns 0 on success, an error code on failure.
*/
-static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
+static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
+ unsigned int nr_pages)
{
struct page_counter *counter;
+ struct mem_cgroup *memcg;
int ret;
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
- return ret;
+ goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
@@ -3084,25 +2939,15 @@ static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
*/
if (gfp & __GFP_NOFAIL) {
page_counter_charge(&memcg->kmem, nr_pages);
- return 0;
+ goto out;
}
cancel_charge(memcg, nr_pages);
- return -ENOMEM;
+ ret = -ENOMEM;
}
- return 0;
-}
-
-/**
- * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
+out:
+ css_put(&memcg->css);
- refill_stock(memcg, nr_pages);
+ return ret;
}
/**
@@ -3115,18 +2960,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
*/
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
int ret = 0;
- memcg = get_mem_cgroup_from_current();
- if (memcg && !mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ objcg = get_obj_cgroup_from_current();
+ if (objcg) {
+ ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
- page->memcg_data = (unsigned long)memcg |
+ page->memcg_data = (unsigned long)objcg |
MEMCG_DATA_KMEM;
return 0;
}
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
return ret;
}
@@ -3138,16 +2983,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page_memcg(page);
+ struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;
- if (!memcg)
+ if (!PageMemcgKmem(page))
return;
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge(memcg, nr_pages);
+ objcg = __page_objcg(page);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
page->memcg_data = 0;
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3180,11 +3025,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- if (nr_pages) {
- rcu_read_lock();
- __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
- rcu_read_unlock();
- }
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(old, nr_pages);
/*
* The leftover is flushed to the centralized per-memcg value.
@@ -3242,7 +3084,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
- struct mem_cgroup *memcg;
unsigned int nr_pages, nr_bytes;
int ret;
@@ -3259,24 +3100,16 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
* refill_obj_stock(), called from this function or
* independently later.
*/
- rcu_read_lock();
-retry:
- memcg = obj_cgroup_memcg(objcg);
- if (unlikely(!css_tryget(&memcg->css)))
- goto retry;
- rcu_read_unlock();
-
nr_pages = size >> PAGE_SHIFT;
nr_bytes = size & (PAGE_SIZE - 1);
if (nr_bytes)
nr_pages += 1;
- ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
if (!ret && nr_bytes)
refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
- css_put(&memcg->css);
return ret;
}
@@ -3300,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned int nr)
for (i = 1; i < nr; i++)
head[i].memcg_data = head->memcg_data;
- css_get_many(&memcg->css, nr - 1);
+
+ if (PageMemcgKmem(head))
+ obj_cgroup_get_many(__page_objcg(head), nr - 1);
+ else
+ css_get_many(&memcg->css, nr - 1);
}
#ifdef CONFIG_MEMCG_SWAP
@@ -3549,6 +3386,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
+ cgroup_rstat_flush(memcg->css.cgroup);
val = memcg_page_state(memcg, NR_FILE_PAGES) +
memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
@@ -3613,57 +3451,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
-{
- unsigned long stat[MEMCG_NR_STAT] = {0};
- struct mem_cgroup *mi;
- int node, cpu, i;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < MEMCG_NR_STAT; i++)
- atomic_long_add(stat[i], &mi->vmstats[i]);
-
- for_each_node(node) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- struct mem_cgroup_per_node *pi;
-
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] += per_cpu(
- pn->lruvec_stat_cpu->count[i], cpu);
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pi->lruvec_stat[i]);
- }
-}
-
-static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
-{
- unsigned long events[NR_VM_EVENT_ITEMS];
- struct mem_cgroup *mi;
- int cpu, i;
-
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += per_cpu(memcg->vmstats_percpu->events[i],
- cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- atomic_long_add(events[i], &mi->vmevents[i]);
-}
-
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3980,6 +3767,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
@@ -4050,6 +3839,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4108,7 +3899,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
- mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ mz = memcg->nodeinfo[pgdat->node_id];
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
@@ -4137,7 +3928,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > 100)
return -EINVAL;
- if (css->parent)
+ if (!mem_cgroup_is_root(memcg))
memcg->swappiness = val;
else
vm_swappiness = val;
@@ -4487,7 +4278,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!css->parent || !((val == 0) || (val == 1)))
+ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
memcg->oom_kill_disable = val;
@@ -4526,22 +4317,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return &memcg->cgwb_domain;
}
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page().
- */
-static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = atomic_long_read(&memcg->vmstats[idx]);
- int cpu;
-
- for_each_online_cpu(cpu)
- x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
- if (x < 0)
- x = 0;
- return x;
-}
-
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -4567,13 +4342,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+ cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
- *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
- memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
- *pheadroom = PAGE_COUNTER_MAX;
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_page_state(memcg, NR_ACTIVE_FILE);
+ *pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
READ_ONCE(memcg->memory.high));
@@ -4588,7 +4364,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* Foreign dirty flushing
*
* There's an inherent mismatch between memcg and writeback. The former
- * trackes ownership per-page while the latter per-inode. This was a
+ * tracks ownership per-page while the latter per-inode. This was a
* deliberate design decision because honoring per-page ownership in the
* writeback path is complicated, may lead to higher CPU and IO overheads
* and deemed unnecessary given that write-sharing an inode across
@@ -4603,9 +4379,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* triggering background writeback. A will be slowed down without a way to
* make writeback of the dirty pages happen.
*
- * Conditions like the above can lead to a cgroup getting repatedly and
+ * Conditions like the above can lead to a cgroup getting repeatedly and
* severely throttled after making some progress after each
- * dirty_expire_interval while the underyling IO device is almost
+ * dirty_expire_interval while the underlying IO device is almost
* completely idle.
*
* Solving this problem completely requires matching the ownership tracking
@@ -5205,19 +4981,20 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
- free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ int cpu;
+
memcg_wb_domain_exit(memcg);
/*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
+ * Flush percpu lruvec stats to guarantee the value
+ * correctness on parent's and all ancestor levels.
*/
- memcg_flush_percpu_vmstats(memcg);
- memcg_flush_percpu_vmevents(memcg);
+ for_each_online_cpu(cpu)
+ memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5244,11 +5021,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
- memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
- GFP_KERNEL_ACCOUNT);
- if (!memcg->vmstats_local)
- goto fail;
-
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
@@ -5346,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/*
- * A memcg must be visible for memcg_expand_shrinker_maps()
+ * A memcg must be visible for expand_shrinker_info()
* by the time the maps are allocated. So, we allocate maps
* here, when for_each_mem_cgroup() can't skip it.
*/
- if (memcg_alloc_shrinker_maps(memcg)) {
+ if (alloc_shrinker_info(memcg)) {
mem_cgroup_id_remove(memcg);
return -ENOMEM;
}
@@ -5382,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
+ reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
drain_all_stock(memcg);
@@ -5414,7 +5187,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
- memcg_free_shrinker_maps(memcg);
+ free_shrinker_info(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -5448,6 +5221,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct memcg_vmstats_percpu *statc;
+ long delta, v;
+ int i;
+
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ /*
+ * Collect the aggregated propagation counts of groups
+ * below us. We're in a per-cpu loop here and this is
+ * a global counter, so the first cycle will get them.
+ */
+ delta = memcg->vmstats.state_pending[i];
+ if (delta)
+ memcg->vmstats.state_pending[i] = 0;
+
+ /* Add CPU changes on this level since the last flush */
+ v = READ_ONCE(statc->state[i]);
+ if (v != statc->state_prev[i]) {
+ delta += v - statc->state_prev[i];
+ statc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ /* Aggregate counts on this level and propagate upwards */
+ memcg->vmstats.state[i] += delta;
+ if (parent)
+ parent->vmstats.state_pending[i] += delta;
+ }
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ delta = memcg->vmstats.events_pending[i];
+ if (delta)
+ memcg->vmstats.events_pending[i] = 0;
+
+ v = READ_ONCE(statc->events[i]);
+ if (v != statc->events_prev[i]) {
+ delta += v - statc->events_prev[i];
+ statc->events_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ memcg->vmstats.events[i] += delta;
+ if (parent)
+ parent->vmstats.events_pending[i] += delta;
+ }
+}
+
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
static int mem_cgroup_do_precharge(unsigned long count)
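
[Editor's sketch] The new mem_cgroup_css_rstat_flush() above folds each CPU's change since the previous flush into the group's counters and queues the same delta as "pending" for the parent's next flush, which is what lets the hot path stay a plain per-cpu add. A toy, single-threaded userspace model of that aggregation scheme without any cgroup plumbing; struct group, flush_cpu and the field names are invented for the sketch:

/* rstat_flush.c: per-cpu counters flushed as deltas and propagated upward.
 * Build: cc rstat_flush.c -o rstat_flush
 */
#include <stdio.h>

#define NCPU 4

struct group {
    struct group *parent;
    long percpu[NCPU];      /* hot path increments, one slot per cpu */
    long percpu_prev[NCPU]; /* value seen at the last flush */
    long state;             /* aggregated value at this level */
    long pending;           /* deltas queued for the parent's next flush */
};

/* take the children's pending deltas, add this cpu's change since the last
 * flush, then push the sum one level up */
static void flush_cpu(struct group *g, int cpu)
{
    long delta = g->pending;

    g->pending = 0;
    delta += g->percpu[cpu] - g->percpu_prev[cpu];
    g->percpu_prev[cpu] = g->percpu[cpu];

    if (!delta)
        return;

    g->state += delta;
    if (g->parent)
        g->parent->pending += delta;
}

int main(void)
{
    struct group root = { 0 }, child = { .parent = &root };

    child.percpu[0] += 5;            /* hot path: cheap per-cpu add */
    child.percpu[2] += 3;

    for (int cpu = 0; cpu < NCPU; cpu++) {
        flush_cpu(&child, cpu);      /* flush child, then its parent */
        flush_cpu(&root, cpu);
    }
    printf("child=%ld root=%ld\n", child.state, root.state);
    return 0;
}
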
@@ -5945,7 +5774,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
return 0;
/*
- * We are now commited to this value whatever it is. Changes in this
+ * We are now committed to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
@@ -6501,6 +6330,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
+ .css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
@@ -6683,6 +6513,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
+static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+ gfp_t gfp)
+{
+ unsigned int nr_pages = thp_nr_pages(page);
+ int ret;
+
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret)
+ goto out;
+
+ css_get(&memcg->css);
+ commit_charge(page, memcg);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ memcg_check_events(memcg, page);
+ local_irq_enable();
+out:
+ return ret;
+}
+
/**
* mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
@@ -6692,55 +6543,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
+ * Do not use this for pages allocated for swapin.
+ *
* Returns 0 on success. Otherwise, an error code is returned.
*/
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
- unsigned int nr_pages = thp_nr_pages(page);
- struct mem_cgroup *memcg = NULL;
- int ret = 0;
+ struct mem_cgroup *memcg;
+ int ret;
if (mem_cgroup_disabled())
- goto out;
+ return 0;
- if (PageSwapCache(page)) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id;
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+ css_put(&memcg->css);
- /*
- * Every swap fault against a single page tries to charge the
- * page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page and memcg binding is
- * protected by the page lock, which serializes swap cache
- * removal, which in turn serializes uncharging.
- */
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (page_memcg(compound_head(page)))
- goto out;
+ return ret;
+}
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+/**
+ * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the page is allocated
+ *
+ * This function charges a page allocated for swapin. Please call this before
+ * adding the page to the swapcache.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+ gfp_t gfp, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+ int ret;
- if (!memcg)
- memcg = get_mem_cgroup_from_mm(mm);
+ if (mem_cgroup_disabled())
+ return 0;
- ret = try_charge(memcg, gfp_mask, nr_pages);
- if (ret)
- goto out_put;
+ id = lookup_swap_cgroup_id(entry);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (!memcg || !css_tryget_online(&memcg->css))
+ memcg = get_mem_cgroup_from_mm(mm);
+ rcu_read_unlock();
- css_get(&memcg->css);
- commit_charge(page, memcg);
+ ret = __mem_cgroup_charge(page, memcg, gfp);
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
- memcg_check_events(memcg, page);
- local_irq_enable();
+ css_put(&memcg->css);
+ return ret;
+}
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which swap slot is being uncharged
+ * is order 0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
/*
* Cgroup1's unified memory+swap counter has been charged with the
* new swapcache page, finish the transfer by uncharging the swap
@@ -6753,25 +6620,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
- if (do_memsw_account() && PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
+ if (!mem_cgroup_disabled() && do_memsw_account()) {
/*
* The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry, nr_pages);
+ mem_cgroup_uncharge_swap(entry, 1);
}
-
-out_put:
- css_put(&memcg->css);
-out:
- return ret;
}
struct uncharge_gather {
struct mem_cgroup *memcg;
- unsigned long nr_pages;
+ unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
struct page *dummy_page;
@@ -6786,10 +6647,10 @@ static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
- if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+ if (ug->nr_memory) {
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
@@ -6797,7 +6658,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -6808,40 +6669,60 @@ static void uncharge_batch(const struct uncharge_gather *ug)
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
unsigned long nr_pages;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page_memcg(page))
- return;
-
/*
* Nobody should be changing or seriously looking at
- * page_memcg(page) at this point, we have fully
+ * page memcg or objcg at this point, we have fully
* exclusive access to the page.
*/
+ if (PageMemcgKmem(page)) {
+ objcg = __page_objcg(page);
+ /*
+ * This get matches the put at the end of the function and
+ * kmem pages do not hold memcg references anymore.
+ */
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ } else {
+ memcg = __page_memcg(page);
+ }
- if (ug->memcg != page_memcg(page)) {
+ if (!memcg)
+ return;
+
+ if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page_memcg(page);
+ ug->memcg = memcg;
+ ug->dummy_page = page;
/* pairs with css_put in uncharge_batch */
- css_get(&ug->memcg->css);
+ css_get(&memcg->css);
}
nr_pages = compound_nr(page);
- ug->nr_pages += nr_pages;
- if (PageMemcgKmem(page))
+ if (PageMemcgKmem(page)) {
+ ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
- else
+
+ page->memcg_data = 0;
+ obj_cgroup_put(objcg);
+ } else {
+ /* LRU pages aren't accounted at the root level */
+ if (!mem_cgroup_is_root(memcg))
+ ug->nr_memory += nr_pages;
ug->pgpgout++;
- ug->dummy_page = page;
- page->memcg_data = 0;
- css_put(&ug->memcg->css);
+ page->memcg_data = 0;
+ }
+
+ css_put(&memcg->css);
}
/**
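
[Editor's sketch] The reworked uncharge_page()/uncharge_batch() above gather pages that share an owner and only touch the shared counters once per batch, flushing whenever the owner changes. A compact userspace illustration of that gather-and-flush pattern; struct owner, struct gather and the counter are hypothetical stand-ins, not the memcg structures:

/* uncharge_batch.c: batching release of pages that share the same owner.
 * Build: cc uncharge_batch.c -o uncharge_batch
 */
#include <stdio.h>

struct owner { const char *name; long charged; };

struct gather {
    struct owner *owner;   /* current batch owner, NULL when empty */
    long nr;               /* pages accumulated for that owner */
};

static void flush(struct gather *ug)
{
    if (!ug->owner || !ug->nr)
        return;
    ug->owner->charged -= ug->nr;     /* one counter update per batch */
    printf("uncharged %ld pages from %s\n", ug->nr, ug->owner->name);
    ug->nr = 0;
}

/* pages arrive one by one; switching owners flushes the old batch first */
static void uncharge_page(struct gather *ug, struct owner *o, long nr)
{
    if (ug->owner != o) {
        flush(ug);
        ug->owner = o;
    }
    ug->nr += nr;
}

int main(void)
{
    struct owner a = { "A", 10 }, b = { "B", 10 };
    struct gather ug = { 0 };

    uncharge_page(&ug, &a, 1);
    uncharge_page(&ug, &a, 2);   /* same owner: just accumulate */
    uncharge_page(&ug, &b, 4);   /* owner changed: flush A first */
    flush(&ug);                  /* final flush for B */

    printf("A=%ld B=%ld\n", a.charged, b.charged);
    return 0;
}
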
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 24210c9bd843..6f5f78885ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -75,7 +75,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
/*
* We could fail to take off the target page from buddy
- * for example due to racy page allocaiton, but that's
+ * for example due to racy page allocation, but that's
* acceptable because soft-offlined page is not broken
* and if someone really want to use it, they should
* take it.
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
+ unlock_page(p);
return MF_IGNORED;
}
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ unlock_page(p);
return MF_FAILED;
}
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, pfn, mapping);
+out:
+ unlock_page(p);
+ return ret;
}
/*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
+ int ret;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+ return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
+
delete_from_swap_cache(p);
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ unlock_page(p);
+ return ret;
}
/*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
+ unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
- lock_page(hpage);
}
return res;
@@ -866,6 +877,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
+ /* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -949,6 +963,17 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false. This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
+ */
+static inline bool HWPoisonHandlable(struct page *page)
+{
+ return PageLRU(page) || __PageMovable(page);
+}
+
/**
* __get_hwpoison_page() - Get refcount for memory error handling:
* @page: raw error page (hit by memory error)
@@ -959,8 +984,22 @@ static int page_action(struct page_state *ps, struct page *p,
static int __get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_huge_page(head, &hugetlb);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * This check prevents from calling get_hwpoison_unless_zero()
+ * for any unsupported type of page in order to reduce the risk of
+ * unexpected races caused by taking a page refcount.
+ */
+ if (!HWPoisonHandlable(head))
+ return 0;
- if (!PageHuge(head) && PageTransHuge(head)) {
+ if (PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -1017,7 +1056,7 @@ try_again:
ret = -EIO;
}
} else {
- if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+ if (PageHuge(p) || HWPoisonHandlable(p)) {
ret = 1;
} else {
/*
@@ -1228,7 +1267,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestSetPageHWPoison(head)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return 0;
+ return -EHWPOISON;
}
num_poisoned_pages_inc();
@@ -1288,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- res = identify_page_state(pfn, p, page_flags);
+ return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1368,7 +1407,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* communicated in siginfo, see kill_proc()
*/
start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ unmap_mapping_range(page->mapping, start, size, 0);
}
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
@@ -1404,9 +1443,10 @@ int memory_failure(unsigned long pfn, int flags)
struct page *hpage;
struct page *orig_head;
struct dev_pagemap *pgmap;
- int res;
+ int res = 0;
unsigned long page_flags;
bool retry = true;
+ static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1424,13 +1464,19 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
+ mutex_lock(&mf_mutex);
+
try_again:
- if (PageHuge(p))
- return memory_failure_hugetlb(pfn, flags);
+ if (PageHuge(p)) {
+ res = memory_failure_hugetlb(pfn, flags);
+ goto unlock_mutex;
+ }
+
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return 0;
+ res = -EHWPOISON;
+ goto unlock_mutex;
}
orig_head = hpage = compound_head(p);
@@ -1463,17 +1509,19 @@ try_again:
res = MF_FAILED;
}
action_result(pfn, MF_MSG_BUDDY, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
+ res = res == MF_RECOVERED ? 0 : -EBUSY;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- return -EBUSY;
+ res = -EBUSY;
}
+ goto unlock_mutex;
}
if (PageTransHuge(hpage)) {
if (try_to_split_thp_page(p, "Memory Failure") < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- return -EBUSY;
+ res = -EBUSY;
+ goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
}
@@ -1497,7 +1545,7 @@ try_again:
if (PageCompound(p) && compound_head(p) != orig_head) {
action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
/*
@@ -1517,17 +1565,22 @@ try_again:
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
- return 0;
+ goto unlock_mutex;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
- return 0;
+ goto unlock_mutex;
}
- if (!PageTransTail(p) && !PageLRU(p))
+ /*
+ * __munlock_pagevec may clear a writeback page's LRU flag without
+ * page_lock. We need to wait for writeback completion for this page or it
+ * may trigger a vfs BUG while evicting the inode.
+ */
+ if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
@@ -1543,7 +1596,7 @@ try_again:
if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
/*
@@ -1552,13 +1605,17 @@ try_again:
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
-out:
+ mutex_unlock(&mf_mutex);
+ return res;
+unlock_page:
unlock_page(p);
+unlock_mutex:
+ mutex_unlock(&mf_mutex);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
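Note: the memory_failure() changes above serialize the whole handler behind a function-local mf_mutex and turn the scattered early returns into unlock_page/unlock_mutex labels so the lock is dropped on every path (already-poisoned pages now report -EHWPOISON). Below is a minimal user-space sketch of that labelled-exit shape; the pthread mutex, function name and flag checks are illustrative stand-ins, not the kernel's code.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#ifndef EHWPOISON
#define EHWPOISON 133			/* Linux value; illustrative here */
#endif

static int fake_memory_failure(int already_poisoned, int unsplittable_thp)
{
	/* Function-local lock, mirroring static DEFINE_MUTEX(mf_mutex). */
	static pthread_mutex_t mf_mutex = PTHREAD_MUTEX_INITIALIZER;
	int res = 0;

	pthread_mutex_lock(&mf_mutex);

	if (already_poisoned) {
		res = -EHWPOISON;	/* used to be "return 0" */
		goto unlock_mutex;
	}
	if (unsplittable_thp) {
		res = -EBUSY;		/* used to be a bare "return -EBUSY" */
		goto unlock_mutex;
	}
	/* ... page lookup, unmapping and state identification go here ... */

unlock_mutex:
	pthread_mutex_unlock(&mf_mutex);
	return res;
}

int main(void)
{
	printf("%d %d %d\n", fake_memory_failure(1, 0),
	       fake_memory_failure(0, 1), fake_memory_failure(0, 0));
	return 0;
}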
diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..486f4a2874e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
+ } else if (details && details->single_page &&
+ PageTransCompound(details->single_page) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
}
+
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
@@ -2260,26 +2271,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
+ * must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
- unsigned long remap_pfn = pfn;
int err;
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2309,10 +2311,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
- if (err)
- return -EINVAL;
-
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
@@ -2324,12 +2322,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
- break;
+ return err;
} while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int err;
+
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
if (err)
- untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+ return -EINVAL;
+ err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+ if (err)
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size));
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
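Note: remap_pfn_range() is split above into a thin tracked wrapper around remap_pfn_range_notrack(), keeping the track_pfn_remap() bookkeeping in one place and undoing it only if the untracked remap fails. The sketch below shows just that wrapper shape; the helpers are simplified stand-ins, not the kernel functions.

#include <stdio.h>

/* Simplified stand-ins for track_pfn_remap()/untrack_pfn()/page-table walk. */
static int track_range(unsigned long pfn, unsigned long size)
{
	printf("track   pfn=%#lx size=%lu\n", pfn, size);
	return 0;
}

static void untrack_range(unsigned long pfn, unsigned long size)
{
	printf("untrack pfn=%#lx size=%lu\n", pfn, size);
}

static int remap_range_notrack(unsigned long pfn, unsigned long size)
{
	/* The real code walks pgd/p4d/pud/pmd and installs PTEs here. */
	return size ? 0 : -22;		/* -EINVAL-style failure on bad input */
}

/* Tracked wrapper: validate/track first, undo the tracking only on failure. */
static int remap_range(unsigned long pfn, unsigned long size)
{
	int err = track_range(pfn, size);

	if (err)
		return err;
	err = remap_range_notrack(pfn, size);
	if (err)
		untrack_range(pfn, size);
	return err;
}

int main(void)
{
	return remap_range(0x1000, 4096) ? 1 : 0;
}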
@@ -2446,13 +2468,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
}
do {
next = pmd_addr_end(addr, end);
- if (create || !pmd_none_or_clear_bad(pmd)) {
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pmd_none(*pmd) && !create)
+ continue;
+ if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+ return -EINVAL;
+ if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+ if (!create)
+ continue;
+ pmd_clear_bad(pmd);
}
+ err = apply_to_pte_range(mm, pmd, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
+
return err;
}
@@ -2474,13 +2504,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
}
do {
next = pud_addr_end(addr, end);
- if (create || !pud_none_or_clear_bad(pud)) {
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pud_none(*pud) && !create)
+ continue;
+ if (WARN_ON_ONCE(pud_leaf(*pud)))
+ return -EINVAL;
+ if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+ if (!create)
+ continue;
+ pud_clear_bad(pud);
}
+ err = apply_to_pmd_range(mm, pud, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
+
return err;
}
@@ -2502,13 +2540,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
}
do {
next = p4d_addr_end(addr, end);
- if (create || !p4d_none_or_clear_bad(p4d)) {
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (p4d_none(*p4d) && !create)
+ continue;
+ if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+ return -EINVAL;
+ if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+ if (!create)
+ continue;
+ p4d_clear_bad(p4d);
}
+ err = apply_to_pud_range(mm, p4d, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (p4d++, addr = next, addr != end);
+
return err;
}
@@ -2528,9 +2574,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (!create && pgd_none_or_clear_bad(pgd))
+ if (pgd_none(*pgd) && !create)
continue;
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return -EINVAL;
+ if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+ if (!create)
+ continue;
+ pgd_clear_bad(pgd);
+ }
+ err = apply_to_p4d_range(mm, pgd, addr, next,
+ fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
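Note: the apply_to_*_range() hunks above replace *_none_or_clear_bad() with explicit checks at every level: empty entries are skipped unless mappings are being created, huge (leaf) entries are rejected with -EINVAL, and corrupted entries are cleared only when creating. The user-space sketch below mirrors that decision ladder with an illustrative enum standing in for the pmd/pud/p4d/pgd tests.

#include <stdio.h>

enum entry_state { ENTRY_NONE, ENTRY_TABLE, ENTRY_LEAF, ENTRY_BAD };

/*
 * Returns 1 to skip the entry, a negative errno-style value to abort,
 * or 0 to descend to the next page-table level (clearing bad entries
 * first when 'create' is set).
 */
static int check_level(enum entry_state state, int create)
{
	if (state == ENTRY_NONE && !create)
		return 1;
	if (state == ENTRY_LEAF)
		return -22;			/* WARN + -EINVAL in the kernel */
	if (state == ENTRY_BAD) {		/* !*_none() && *_bad() there */
		if (!create)
			return 1;
		/* *_clear_bad() would run here before repopulating. */
	}
	return 0;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       check_level(ENTRY_NONE, 0),	/* skip */
	       check_level(ENTRY_LEAF, 1),	/* reject */
	       check_level(ENTRY_BAD, 0),	/* skip */
	       check_level(ENTRY_BAD, 1));	/* clear and descend */
	return 0;
}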
@@ -2896,6 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -3193,6 +3248,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct zap_details details = { };
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageTail(page));
+
+ details.check_mapping = mapping;
+ details.first_index = page->index;
+ details.last_index = page->index + thp_nr_pages(page) - 1;
+ details.single_page = page;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
@@ -3296,7 +3381,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
- delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
@@ -3309,28 +3394,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
- int err;
-
__SetPageLocked(page);
__SetPageSwapBacked(page);
- set_page_private(page, entry.val);
- /* Tell memcg to use swap ownership records */
- SetPageSwapCache(page);
- err = mem_cgroup_charge(page, vma->vm_mm,
- GFP_KERNEL);
- ClearPageSwapCache(page);
- if (err) {
+ if (mem_cgroup_swapin_charge_page(page,
+ vma->vm_mm, GFP_KERNEL, entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
+ mem_cgroup_swapin_uncharge_swap(entry);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(page, shadow);
lru_cache_add(page);
+
+ /* To provide entry to swap_readpage() */
+ set_page_private(page, entry.val);
swap_readpage(page, true);
+ set_page_private(page, 0);
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
@@ -3347,7 +3430,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto unlock;
}
@@ -3361,13 +3444,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto out_release;
}
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
@@ -3561,6 +3644,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -3686,7 +3770,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
return ret;
/*
- * Archs like ppc64 need additonal space to store information
+ * Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -3745,6 +3829,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (prefault && arch_wants_old_prefaulted_pte())
entry = pte_mkold(entry);
+ else
+ entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -4100,7 +4186,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
- bool migrated = false;
pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -4117,29 +4202,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
goto out;
}
- /*
- * Make it present again, Depending on how arch implementes non
- * accessible ptes, some can allow access by kernel mode.
- */
- old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ /* Get the normal PTE */
+ old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
- pte = pte_mkyoung(pte);
- if (was_writable)
- pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
- if (!page) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (!page)
+ goto out_map;
/* TODO: handle PTE-mapped THP */
- if (PageCompound(page)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (PageCompound(page))
+ goto out_map;
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4149,7 +4222,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
- if (!pte_write(pte))
+ if (!was_writable)
flags |= TNF_NO_GROUP;
/*
@@ -4163,24 +4236,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
- goto out;
+ goto out_map;
}
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
- if (migrated) {
+ if (migrate_misplaced_page(page, vma, target_nid)) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
+out_map:
+ /*
+ * Make it present again. Depending on how the arch implements
+ * non-accessible ptes, some can allow access by kernel mode.
+ */
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (was_writable)
+ pte = pte_mkwrite(pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
@@ -4454,7 +4548,7 @@ retry_pud:
}
/**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
*
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
@@ -4463,9 +4557,9 @@ retry_pud:
* @flags: the fault flags.
* @ret: the fault retcode.
*
- * This will take care of most of the page fault accountings. Meanwhile, it
+ * This will take care of most of the page fault accounting. Meanwhile, it
* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
* still be in per-arch page fault handlers at the entry of page fault.
*/
static inline void mm_account_fault(struct pt_regs *regs,
@@ -4799,7 +4893,7 @@ out:
/**
* generic_access_phys - generic implementation for iomem mmap access
* @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
* @buf: buffer to read/write
* @len: length of transfer
* @write: set to FOLL_WRITE when writing, otherwise reading
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0cdbbfbc5757..70620d0dd923 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,6 +42,16 @@
#include "internal.h"
#include "shuffle.h"
+
+/*
+ * memory_hotplug.memmap_on_memory parameter
+ */
+static bool memmap_on_memory __ro_after_init;
+#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
+module_param(memmap_on_memory, bool, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+#endif
+
/*
* online_page_callback contains pointer to current page onlining function.
* Initially it is generic_online_page(). If it is required it could be
@@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* decide to not expose all pages to the buddy (e.g., expose them
* later). We account all pages as being online and belonging to this
* zone ("present").
+ * When using memmap_on_memory, the range might not be aligned to
+ * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
+ * this and the first chunk to online will be pageblock_nr_pages.
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
- (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+ for (pfn = start_pfn; pfn < end_pfn;) {
+ int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ pfn += (1UL << order);
+ }
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
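Note: with memmap_on_memory the onlined range may only be pageblock aligned, so the loop above picks, at each step, the largest order allowed by the current pfn's alignment (capped at MAX_ORDER - 1) instead of assuming MAX_ORDER_NR_PAGES chunks. The stand-alone sketch below uses __builtin_ctzl() as a stand-in for the kernel's __ffs() and illustrative constants.

#include <stdio.h>

#define MAX_ORDER	11UL	/* typical default; illustrative */
#define PAGEBLOCK_PFNS	512UL	/* stand-in for pageblock_nr_pages */

static unsigned long pick_order(unsigned long pfn)
{
	/* __builtin_ctzl(pfn) plays the role of the kernel's __ffs(pfn). */
	unsigned long order = (unsigned long)__builtin_ctzl(pfn);

	return order < MAX_ORDER - 1 ? order : MAX_ORDER - 1;
}

int main(void)
{
	/* A range that is pageblock aligned but not MAX_ORDER aligned. */
	unsigned long pfn = PAGEBLOCK_PFNS;
	unsigned long end_pfn = 8 * PAGEBLOCK_PFNS;

	while (pfn < end_pfn) {
		unsigned long order = pick_order(pfn);

		printf("online pfn %5lu as order %lu\n", pfn, order);
		pfn += 1UL << order;
	}
	return 0;
}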
@@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
return movable_node_enabled ? movable_zone : kernel_zone;
}
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
@@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
- int online_type, int nid)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
+{
+ unsigned long flags;
+
+ zone->present_pages += nr_pages;
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages += nr_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+ struct zone *zone)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+ int ret;
+
+ ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+ if (ret)
+ return ret;
+
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections online here as otherwise they will be
+ * left offline.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections offline here as otherwise they will be
+ * left online.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ /*
+ * The pages associated with this vmemmap have been offlined, so
+ * we can reset their state here.
+ */
+ remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+ kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
{
unsigned long flags;
- struct zone *zone;
int need_zonelists_rebuild = 0;
+ const int nid = zone_to_nid(zone);
int ret;
struct memory_notify arg;
- /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ /*
+ * {on,off}lining is constrained to full memory sections (or more
+ * precisely to memory blocks from the user space POV).
+ * memmap_on_memory is an exception because it reserves the initial part
+ * of the physical memory space for vmemmaps. That space is pageblock
+ * aligned.
+ */
if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ !IS_ALIGNED(pfn, pageblock_nr_pages) ||
+ !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
mem_hotplug_begin();
/* associate pfn range with the zone */
- zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
@@ -877,11 +956,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
}
online_pages_range(pfn, nr_pages);
- zone->present_pages += nr_pages;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += nr_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ adjust_present_page_count(zone, nr_pages);
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
@@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+ unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+ unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+ unsigned long remaining_size = size - vmemmap_size;
+
+ /*
+ * Besides having arch support and the feature enabled at runtime, we
+ * need a few more assumptions to hold true:
+ *
+ * a) We span a single memory block: memory onlining/offlining happens
+ * in memory block granularity. We don't want the vmemmap of online
+ * memory blocks to reside on offline memory blocks. In the future,
+ * we might want to support variable-sized memory blocks to make the
+ * feature more versatile.
+ *
+ * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * to populate memory from the altmap for unrelated parts (i.e.,
+ * other memory blocks)
+ *
+ * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * the buddy) have to cover full pageblocks: memory onlining/offlining
+ * code requires applicable ranges to be page-aligned, for example, to
+ * set the migratetypes properly.
+ *
+ * TODO: Although we have a check here to make sure that vmemmap pages
+ * fully populate a PMD, it is not the right place to check for
+ * this. A much better solution involves improving vmemmap code
+ * to fallback to base pages when trying to populate vmemmap using
+ * altmap as an alternative source of memory, and we do not exactly
+ * populate a single PMD.
+ */
+ return memmap_on_memory &&
+ IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+ size == memory_block_size_bytes() &&
+ IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+ IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
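Note: mhp_supports_memmap_on_memory() above boils down to three arithmetic checks: the hot-added range is exactly one memory block, its vmemmap fills whole PMDs, and whatever remains after carving out the vmemmap still covers whole pageblocks. The sketch below redoes that arithmetic with illustrative x86-64-style constants (4 KiB pages, 64-byte struct page, 2 MiB PMDs, 128 MiB memory blocks); none of these macros are the kernel's.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_SHIFT		12
#define PMD_SIZE		(2UL << 20)
#define PAGEBLOCK_PAGES		512UL		/* stand-in for pageblock_nr_pages */
#define MEMORY_BLOCK_BYTES	(128UL << 20)	/* memory_block_size_bytes() */
#define STRUCT_PAGE_SIZE	64UL		/* sizeof(struct page) on x86-64 */

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

static int supports_memmap_on_memory(unsigned long size)
{
	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
	unsigned long vmemmap_size = nr_vmemmap_pages * STRUCT_PAGE_SIZE;
	unsigned long remaining_size = size - vmemmap_size;

	return size == MEMORY_BLOCK_BYTES &&
	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
	       IS_ALIGNED(remaining_size, PAGEBLOCK_PAGES << PAGE_SHIFT);
}

int main(void)
{
	printf("128M block: %d\n", supports_memmap_on_memory(128UL << 20));
	printf("256M range: %d\n", supports_memmap_on_memory(256UL << 20));
	return 0;
}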
@@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+ struct vmem_altmap mhp_altmap = {};
u64 start, size;
bool new_node = false;
int ret;
@@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
new_node = ret;
+ /*
+ * Self hosted memmap array
+ */
+ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+ if (!mhp_supports_memmap_on_memory(size)) {
+ ret = -EINVAL;
+ goto error;
+ }
+ mhp_altmap.free = PHYS_PFN(size);
+ mhp_altmap.base_pfn = PHYS_PFN(start);
+ params.altmap = &mhp_altmap;
+ }
+
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size);
+ ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
if (ret) {
arch_remove_memory(nid, start, size, NULL);
goto error;
@@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
int ret, node;
char *reason;
- /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ /*
+ * {on,off}lining is constrained to full memory sections (or more
+ * precisely to memory blocks from the user space POV).
+ * memmap_on_memory is an exception because it reserves the initial part
+ * of the physical memory space for vmemmaps. That space is pageblock
+ * aligned.
+ */
if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
+ !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
mem_hotplug_begin();
@@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
* in a way that pages from isolated pageblock are left on pcplists.
*/
zone_pcp_disable(zone);
+ lru_cache_disable();
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
@@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
}
cond_resched();
- lru_add_drain_all();
ret = scan_movable_pages(pfn, end_pfn, &pfn);
if (!ret) {
@@ -1687,15 +1822,12 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
+ lru_cache_enable();
zone_pcp_enable(zone);
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
- zone->present_pages -= nr_pages;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= nr_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ adjust_present_page_count(zone, -nr_pages);
init_per_zone_wmark_min();
@@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+ /*
+ * If not set, continue with the next block.
+ */
+ return mem->nr_vmemmap_pages;
+}
+
static int check_cpu_on_node(pg_data_t *pgdat)
{
int cpu;
@@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
+ struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap *altmap = NULL;
+ unsigned long nr_vmemmap_pages;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
if (rc)
return rc;
+ /*
+ * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+ * the same granularity it was added - a single memory block.
+ */
+ if (memmap_on_memory) {
+ nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+ get_nr_vmemmap_pages_cb);
+ if (nr_vmemmap_pages) {
+ if (size != memory_block_size_bytes()) {
+ pr_warn("Refuse to remove %#llx - %#llx,"
+ "wrong granularity\n",
+ start, start + size);
+ return -EINVAL;
+ }
+
+ /*
+ * Let remove_pmd_table->free_hugepage_table do the
+ * right thing if we used vmem_altmap when hot-adding
+ * the range.
+ */
+ mhp_altmap.alloc = nr_vmemmap_pages;
+ altmap = &mhp_altmap;
+ }
+ }
+
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
- arch_remove_memory(nid, start, size, NULL);
+ arch_remove_memory(nid, start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ab51132547b8..d79fa299b70c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+ nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
@@ -994,7 +994,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
- * wil drop the mmap_lock, so after calling
+ * will drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
@@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
int err = 0;
nodemask_t tmp;
- migrate_prep();
+ lru_cache_disable();
mmap_read_lock(mm);
@@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
- int s,d;
+ int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
@@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
break;
}
mmap_read_unlock(mm);
+
+ lru_cache_enable();
if (err < 0)
return err;
return busy;
@@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- migrate_prep();
+ lru_cache_disable();
}
{
NODEMASK_SCRATCH(scratch);
@@ -1371,6 +1373,8 @@ up_out:
mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_cache_enable();
return err;
}
@@ -1863,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
* policy->v.nodes is intersect with node_states[N_MEMORY].
- * so if the following test faile, it implies
+ * so if the following test fails, it implies
* policy->v.nodes has movable memory only.
*/
if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
@@ -2094,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
*
* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
* policy. Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
+ * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
* policy, always return true since it may allocate elsewhere on fallback.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
@@ -2140,7 +2144,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
{
struct page *page;
- page = __alloc_pages(gfp, order, nid);
+ page = __alloc_pages(gfp, order, nid, NULL);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
@@ -2153,30 +2157,22 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
}
/**
- * alloc_pages_vma - Allocate a page for a VMA.
- *
- * @gfp:
- * %GFP_USER user allocation.
- * %GFP_KERNEL kernel allocations,
- * %GFP_HIGHMEM highmem/user allocations,
- * %GFP_FS allocation should not call back into a file system.
- * %GFP_ATOMIC don't sleep.
+ * alloc_pages_vma - Allocate a page for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the GFP allocation.
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: For hugepages try only the preferred node if possible.
*
- * @order:Order of the GFP allocation.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual Address of the allocation. Must be inside the VMA.
- * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
+ * Allocate a page for a specific address in @vma, using the appropriate
+ * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
+ * of the mm_struct of the VMA to prevent it from going away. Should be
+ * used for all allocations for pages that will be mapped into user space.
*
- * This function allocates a page from the kernel page pool and applies
- * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must read-lock the mmap_lock of the
- * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into user space. Returns
- * NULL when no page can be allocated.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *
-alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
@@ -2237,7 +2233,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
- page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
+ page = __alloc_pages(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;
@@ -2245,21 +2241,20 @@ out:
EXPORT_SYMBOL(alloc_pages_vma);
/**
- * alloc_pages_current - Allocate pages.
+ * alloc_pages - Allocate pages.
+ * @gfp: GFP flags.
+ * @order: Power of two of number of pages to allocate.
*
- * @gfp:
- * %GFP_USER user allocation,
- * %GFP_KERNEL kernel allocation,
- * %GFP_HIGHMEM highmem allocation,
- * %GFP_FS don't call back into a file system.
- * %GFP_ATOMIC don't sleep.
- * @order: Power of two of allocation size in pages. 0 is a single page.
+ * Allocate 1 << @order contiguous pages. The physical address of the
+ * first page is naturally aligned (eg an order-3 allocation will be aligned
+ * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
+ * process is honoured when in process context.
*
- * Allocate a page from the kernel page pool. When not in
- * interrupt context and apply the current process NUMA policy.
- * Returns NULL when no page can be allocated.
+ * Context: Can be called from any context, providing the appropriate GFP
+ * flags are used.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
@@ -2274,13 +2269,13 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
- page = __alloc_pages_nodemask(gfp, order,
+ page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages);
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
@@ -2457,14 +2452,11 @@ static void sp_free(struct sp_node *n)
* @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
- * node id.
- *
- * Returns:
- * -1 - not misplaced, page is in the right node
- * node - node id where the page should be
- *
- * Policy determination "mimics" alloc_page_vma().
+ * node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
+ *
+ * Return: -1 if the page is in a node that is valid for this policy, or a
+ * suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/mempool.c b/mm/mempool.c
index 79959fac27d7..a258cf4de575 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
- kasan_free_pages(element, (unsigned long)pool->pool_data);
+ kasan_free_pages(element, (unsigned long)pool->pool_data, false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +114,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
- kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+ kasan_alloc_pages(element, (unsigned long)pool->pool_data, false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
@@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init);
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
- return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+ return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_create);
diff --git a/mm/memremap.c b/mm/memremap.c
index 7aa7d6e80ee5..15a074ffb8d7 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
#include <linux/device.h>
#include <linux/io.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..41ff2c9896c4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -57,28 +57,6 @@
#include "internal.h"
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-void migrate_prep(void)
-{
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-void migrate_prep_local(void)
-{
- lru_add_drain();
-}
-
int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
struct address_space *mapping;
@@ -140,15 +118,10 @@ out:
return -EBUSY;
}
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_page(struct page *page)
{
struct address_space *mapping;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageMovable(page), page);
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
mapping = page_mapping(page);
mapping->a_ops->putback_page(page);
__ClearPageIsolated(page);
@@ -322,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
goto out;
page = migration_entry_to_page(entry);
+ page = compound_head(page);
/*
* Once page cache replacement of page migration started, page_count
@@ -1375,7 +1349,7 @@ out_unlock:
out:
if (rc == MIGRATEPAGE_SUCCESS)
putback_active_hugepage(hpage);
- else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS)
+ else if (rc != -EAGAIN)
list_move_tail(&hpage->lru, ret);
/*
@@ -1445,6 +1419,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int rc, nr_subpages;
LIST_HEAD(ret_pages);
+ trace_mm_migrate_pages_start(mode, reason);
+
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
@@ -1617,7 +1593,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+ new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
if (new_page && PageTransHuge(new_page))
prep_transhuge_page(new_page);
@@ -1769,7 +1745,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
int start, i;
int err = 0, err1;
- migrate_prep();
+ lru_cache_disable();
for (i = start = 0; i < nr_pages; i++) {
const void __user *p;
@@ -1838,6 +1814,7 @@ out_flush:
if (err >= 0)
err = err1;
out:
+ lru_cache_enable();
return err;
}
@@ -2110,17 +2087,6 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
}
-static inline bool is_shared_exec_page(struct vm_area_struct *vma,
- struct page *page)
-{
- if (page_mapcount(page) != 1 &&
- (page_is_file_lru(page) || vma_is_shmem(vma)) &&
- (vma->vm_flags & VM_EXEC))
- return true;
-
- return false;
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -2138,7 +2104,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (is_shared_exec_page(vma, page))
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+ (vma->vm_flags & VM_EXEC))
goto out;
/*
@@ -2193,9 +2160,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
- if (is_shared_exec_page(vma, page))
- goto out;
-
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
@@ -2307,7 +2271,6 @@ out_fail:
out_unlock:
unlock_page(page);
-out:
put_page(page);
return 0;
}
@@ -2316,44 +2279,38 @@ out:
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_hole(unsigned long start,
+static int migrate_vma_collect_skip(unsigned long start,
unsigned long end,
- __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- /* Only allow populating anonymous memory. */
- if (!vma_is_anonymous(walk->vma)) {
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = 0;
- migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- }
- return 0;
- }
-
for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- migrate->cpages++;
+ migrate->src[migrate->npages++] = 0;
}
return 0;
}
-static int migrate_vma_collect_skip(unsigned long start,
+static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma))
+ return migrate_vma_collect_skip(start, end, walk);
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = 0;
+ migrate->npages++;
+ migrate->cpages++;
}
return 0;
@@ -2823,11 +2780,11 @@ restore:
*
* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing the caller to allocate device memory for those unback virtual
- * address. For this the caller simply has to allocate device memory and
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses. For this the caller simply has to allocate device memory and
* properly set the destination entry like for regular migration. Note that
- * this can still fails and thus inside the device driver must check if the
- * migration was successful for those entries after calling migrate_vma_pages()
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
* just like for regular migration.
*
* After that, the callers must call migrate_vma_pages() to go over each entry
@@ -2973,6 +2930,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable
+ * device memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
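Note: reordering migrate_vma_collect_hole()/migrate_vma_collect_skip() above lets the hole handler simply delegate non-anonymous VMAs to the skip handler instead of duplicating its loop. The user-space sketch below shows that split with a trimmed-down struct and an illustrative flag bit standing in for struct migrate_vma and MIGRATE_PFN_MIGRATE.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define MIGRATE_PFN_MIGRATE	(1UL << 1)	/* illustrative bit */

struct demo_migrate {
	unsigned long src[16];
	unsigned long dst[16];
	unsigned long npages;
	unsigned long cpages;
};

/* Mark every page in the range as "do not migrate". */
static void collect_skip(struct demo_migrate *m, unsigned long start,
			 unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		m->dst[m->npages] = 0;
		m->src[m->npages++] = 0;
	}
}

/* Holes in anonymous VMAs may be filled with device memory; others may not. */
static void collect_hole(struct demo_migrate *m, unsigned long start,
			 unsigned long end, int vma_is_anonymous)
{
	unsigned long addr;

	if (!vma_is_anonymous) {
		collect_skip(m, start, end);
		return;
	}
	for (addr = start; addr < end; addr += PAGE_SIZE) {
		m->src[m->npages] = MIGRATE_PFN_MIGRATE;
		m->dst[m->npages] = 0;
		m->npages++;
		m->cpages++;
	}
}

int main(void)
{
	struct demo_migrate m = { .npages = 0, .cpages = 0 };

	collect_hole(&m, 0, 4 * PAGE_SIZE, 1);	/* anonymous hole */
	collect_hole(&m, 0, 4 * PAGE_SIZE, 0);	/* delegated to skip */
	printf("npages=%lu cpages=%lu\n", m.npages, m.cpages);
	return 0;
}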
diff --git a/mm/mlock.c b/mm/mlock.c
index f8f8cc32d03d..df590fda5688 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
vm_flags_t flags)
{
unsigned long nstart, end, tmp;
- struct vm_area_struct * vma, * prev;
+ struct vm_area_struct *vma, *prev;
int error;
VM_BUG_ON(offset_in_page(start));
@@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
- struct vm_area_struct * vma, * prev = NULL;
+ struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8e02e865cc65..9ddaf0e1b0ab 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -19,10 +19,6 @@
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
-#ifndef SECTIONS_SHIFT
-#define SECTIONS_SHIFT 0
-#endif
-
/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..0584e540246e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -93,6 +93,12 @@ static void unmap_region(struct mm_struct *mm,
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
+ *
+ * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
+ * MAP_PRIVATE (with Enhanced PAN supported):
+ * r: (no) no
+ * w: (no) no
+ * x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -606,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long nr_pages = 0;
struct vm_area_struct *vma;
- /* Find first overlaping mapping */
+ /* Find first overlapping mapping */
vma = find_vma_intersection(mm, addr, end);
if (!vma)
return 0;
@@ -2869,7 +2875,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (unlikely(uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
- * will remain splitted, but userland will get a
+ * will remain split, but userland will get a
* highly unexpected error anyway. This is no
* different than the case where the first of the two
* __split_vma fails, but we don't undo the first
@@ -3023,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
- if (vma->vm_flags & VM_LOCKED) {
- struct vm_area_struct *tmp;
+ if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- /* drop PG_Mlocked flag for over-mapped range */
- for (tmp = vma; tmp->vm_start >= start + size;
- tmp = tmp->vm_next) {
- /*
- * Split pmd and munlock page on the border
- * of the range.
- */
- vma_adjust_trans_huge(tmp, start, start + size, 0);
-
- munlock_vma_pages_range(tmp,
- max(tmp->vm_start, start),
- min(tmp->vm_end, start + size));
- }
- }
-
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
@@ -3403,14 +3393,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma)
return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}
-static int special_mapping_mremap(struct vm_area_struct *new_vma,
- unsigned long flags)
+static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
struct vm_special_mapping *sm = new_vma->vm_private_data;
- if (flags & MREMAP_DONTUNMAP)
- return -EINVAL;
-
if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
return -EFAULT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94188df1ee55..e7a443157988 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
mmap_write_unlock(current->mm);
/*
- * We could provie warnings or errors if any VMA still
+ * We could provide warnings or errors if any VMA still
* has the pkey set here.
*/
return ret;
diff --git a/mm/mremap.c b/mm/mremap.c
index ec8f840399ed..47c255b60150 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (moved_len < old_len) {
err = -ENOMEM;
} else if (vma->vm_ops && vma->vm_ops->mremap) {
- err = vma->vm_ops->mremap(new_vma, flags);
+ err = vma->vm_ops->mremap(new_vma);
}
if (unlikely(err)) {
@@ -653,8 +653,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}
- if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
- vma->vm_flags & VM_SHARED))
+ if ((flags & MREMAP_DONTUNMAP) &&
+ (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);
if (is_vm_hugetlb_page(vma))
@@ -730,7 +730,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
* So, to avoid such scenario we can pre-compute if the whole
* operation has high chances to success map-wise.
* Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmaping it.
+ * split in 3 before unmapping it.
* That means 2 more maps (1 for each) to the ones we already hold.
* Check whether current map count plus 2 still leads us to 4 maps below
* the threshold, otherwise return -ENOMEM here to be more safe.
diff --git a/mm/msync.c b/mm/msync.c
index 69c6d2029531..137d1c104f3e 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
- * just ignore them, but return -ENOMEM at the end.
+ * just ignore them, but return -ENOMEM at the end. Besides, if the
+ * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
+ * anyway and there is nothing left to do, so return immediately.
*/
mmap_read_lock(mm);
vma = find_vma(mm, start);
@@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
+ if (flags == MS_ASYNC)
+ goto out_unlock;
start = vma->vm_start;
if (start >= end)
goto out_unlock;
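Note: the msync() hunk above adds the early exit described in the comment: when the first vma starts past the requested address there is an unmapped hole, and for a plain MS_ASYNC call the syscall would end up returning -ENOMEM anyway, so it bails out at once. Below is a hedged sketch of just that decision; the flag values and helper are illustrative, not the kernel's code.

#include <errno.h>
#include <stdio.h>

#define MS_ASYNC	1	/* illustrative; matches common Linux values */
#define MS_SYNC		4

/* Returns -ENOMEM to bail out early, 0 to keep walking the vma list. */
static int msync_hole_check(unsigned long start, unsigned long vm_start,
			    int flags)
{
	if (start < vm_start && flags == MS_ASYNC)
		return -ENOMEM;
	return 0;		/* skip the hole, remember -ENOMEM for later */
}

int main(void)
{
	printf("MS_ASYNC over hole: %d\n",
	       msync_hole_check(0x1000, 0x2000, MS_ASYNC));
	printf("MS_SYNC  over hole: %d\n",
	       msync_hole_check(0x1000, 0x2000, MS_SYNC));
	return 0;
}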
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c9ab799c0e6..85a3a68dffb6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count)
return count;
}
-long vwrite(char *buf, char *addr, unsigned long count)
-{
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
-
- memcpy(addr, buf, count);
- return count;
-}
-
/*
* vmalloc - allocate virtually contiguous memory
*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fa1cf18bac97..eefd3f5fde46 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
#ifdef CONFIG_NUMA
/**
- * oom_cpuset_eligible() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
* @oc: pointer to struct oom_control
*
@@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
- (void*)message);
+ (void *)message);
mem_cgroup_put(oom_group);
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9e35b636a393..0062d5c57d41 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1806,7 +1806,7 @@ pause:
break;
/*
- * In the case of an unresponding NFS server and the NFS dirty
+ * In the case of an unresponsive NFS server and the NFS dirty
* pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
@@ -2216,7 +2216,7 @@ int write_cache_pages(struct address_space *mapping,
* Page truncated or invalidated. We can freely skip it
* then, even for data integrity operations: the page
* has disappeared concurrently, so there could be no
- * real expectation of this data interity operation
+ * real expectation of this data integrity operation
* even if there is now a new, dirty page at the same
* pagecache address.
*/
@@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
int ret;
- memcg = lock_page_memcg(page);
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- dec_lruvec_state(lruvec, NR_WRITEBACK);
+ dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
- __unlock_page_memcg(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfc72873961d..ef2265f86b91 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,7 +72,6 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
-
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -108,6 +107,17 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/*
+ * Don't poison memory with KASAN (only for the tag-based modes).
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
+ * Poisoning all that memory lengthens boot time, especially on systems with
+ * large amount of RAM. This flag is used to skip that poisoning.
+ * This is only done for the tag-based KASAN modes, as those are able to
+ * detect memory corruptions with the memory tags assigned by default.
+ * All memory allocated normally after boot gets poisoned as usual.
+ */
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
@@ -167,10 +177,10 @@ unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
-DEFINE_STATIC_KEY_FALSE(init_on_free);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
static bool _init_on_alloc_enabled_early __read_mostly
@@ -384,10 +394,15 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+static inline void kasan_free_nondeferred_pages(struct page *page, int order,
+ bool init, fpi_t fpi_flags)
{
- if (!static_branch_unlikely(&deferred_pages))
- kasan_free_pages(page, order);
+ if (static_branch_unlikely(&deferred_pages))
+ return;
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON))
+ return;
+ kasan_free_pages(page, order, init);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -438,7 +453,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+static inline void kasan_free_nondeferred_pages(struct page *page, int order,
+ bool init, fpi_t fpi_flags)
+{
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON))
+ return;
+ kasan_free_pages(page, order, init);
+}
static inline bool early_page_uninitialised(unsigned long pfn)
{
@@ -764,32 +786,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
*/
void init_mem_debugging_and_hardening(void)
{
+ bool page_poisoning_requested = false;
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled())) {
+ static_branch_enable(&_page_poisoning_enabled);
+ page_poisoning_requested = true;
+ }
+#endif
+
if (_init_on_alloc_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
-#ifdef CONFIG_PAGE_POISONING
- /*
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- if (page_poisoning_enabled() ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled()))
- static_branch_enable(&_page_poisoning_enabled);
-#endif
-
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -867,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
return false;
/*
- * Do not let lower order allocations polluate a movable pageblock.
+ * Do not let lower order allocations pollute a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
@@ -1103,7 +1129,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page_memcg(page) |
+ page->memcg_data |
#endif
(page->flags & check_flags)))
return false;
@@ -1128,7 +1154,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page_memcg(page)))
+ if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@@ -1216,9 +1242,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free)
+ unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
+ bool init;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1276,16 +1303,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- if (want_init_on_free())
- kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order);
/*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_free_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ *
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- kasan_free_nondeferred_pages(page, order);
+ init = want_init_on_free();
+ if (init && !kasan_has_integrated_init())
+ kernel_init_free_pages(page, 1 << order);
+ kasan_free_nondeferred_pages(page, order, init, fpi_flags);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1307,7 +1339,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
*/
static bool free_pcp_prepare(struct page *page)
{
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, 0, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1327,9 +1359,9 @@ static bool bulkfree_pcp_prepare(struct page *page)
static bool free_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, 0, true, FPI_NONE);
else
- return free_pages_prepare(page, 0, false);
+ return free_pages_prepare(page, 0, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1537,7 +1569,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
int migratetype;
unsigned long pfn = page_to_pfn(page);
- if (!free_pages_prepare(page, order, true))
+ if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1574,7 +1606,7 @@ void __free_pages_core(struct page *page, unsigned int order)
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
- __free_pages_ok(page, order, FPI_TO_TAIL);
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2292,17 +2324,32 @@ static bool check_new_pages(struct page *page, unsigned int order)
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
+ bool init;
+
set_page_private(page, 0);
set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
- kasan_alloc_pages(page, order);
+
+ /*
+ * Page unpoisoning must happen before memory initialization.
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+ * allocations and the page unpoisoning code will complain.
+ */
kernel_unpoison_pages(page, 1 << order);
- set_page_owner(page, order, gfp_flags);
- if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_alloc_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+ kasan_alloc_pages(page, order, init);
+ if (init && !kasan_has_integrated_init())
kernel_init_free_pages(page, 1 << order);
+
+ set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
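
post_alloc_hook() now computes init = !want_init_on_free() && want_init_on_alloc(gfp_flags), so a page that will be zeroed on free is not zeroed a second time at allocation. A standalone sketch of that decision, with plain booleans standing in for the kernel's static keys:

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        for (int on_free = 0; on_free <= 1; on_free++) {
            for (int on_alloc = 0; on_alloc <= 1; on_alloc++) {
                /* Pages already zeroed on free are skipped here. */
                bool init = !on_free && on_alloc;

                printf("init_on_free=%d init_on_alloc=%d -> init at alloc: %d\n",
                       on_free, on_alloc, init);
            }
        }
        return 0;
    }
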
@@ -2386,19 +2433,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
+ unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int *num_movable)
{
struct page *page;
+ unsigned long pfn;
unsigned int order;
int pages_moved = 0;
- for (page = start_page; page <= end_page;) {
- if (!pfn_valid_within(page_to_pfn(page))) {
- page++;
+ for (pfn = start_pfn; pfn <= end_pfn;) {
+ if (!pfn_valid_within(pfn)) {
+ pfn++;
continue;
}
+ page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2408,8 +2457,7 @@ static int move_freepages(struct zone *zone,
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
-
- page++;
+ pfn++;
continue;
}
@@ -2419,7 +2467,7 @@ static int move_freepages(struct zone *zone,
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
- page += 1 << order;
+ pfn += 1 << order;
pages_moved += 1 << order;
}
@@ -2429,25 +2477,22 @@ static int move_freepages(struct zone *zone,
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
- unsigned long start_pfn, end_pfn;
- struct page *start_page, *end_page;
+ unsigned long start_pfn, end_pfn, pfn;
if (num_movable)
*num_movable = 0;
- start_pfn = page_to_pfn(page);
- start_pfn = start_pfn & ~(pageblock_nr_pages-1);
- start_page = pfn_to_page(start_pfn);
- end_page = start_page + pageblock_nr_pages - 1;
+ pfn = page_to_pfn(page);
+ start_pfn = pfn & ~(pageblock_nr_pages - 1);
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
- start_page = page;
+ start_pfn = pfn;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype,
+ return move_freepages(zone, start_pfn, end_pfn, migratetype,
num_movable);
}
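
move_freepages_block() now works purely in PFNs: round the page's PFN down to the start of its pageblock and take the block's last PFN as the end. A standalone sketch of that arithmetic, assuming pageblock_nr_pages = 512 (a typical value for 4K pages; the mask trick requires a power of two):

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL    /* assumed example value */

    int main(void)
    {
        unsigned long pfn = 262733;     /* arbitrary PFN inside a block */
        unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        /* 262733 & ~511 = 262656; last PFN of that block is 263167. */
        printf("pfn %lu -> pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
    }
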
@@ -2731,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
- * in this loop althoug we changed the pageblock type
+ * in this loop although we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
@@ -2908,7 +2953,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
- int i, alloced = 0;
+ int i, allocated = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -2931,7 +2976,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
- alloced++;
+ allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
@@ -2940,12 +2985,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
- * on i. Do not confuse with 'alloced' which is the number of
+ * on i. Do not confuse with 'allocated' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
- return alloced;
+ return allocated;
}
#ifdef CONFIG_NUMA
@@ -3035,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work)
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
* a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is allright but we also have to make sure to not move to
+ * cpu which is alright but we also have to make sure to not move to
* a different one.
*/
preempt_disable();
@@ -3415,7 +3460,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
}
/* Remove page from the per-cpu list, caller must protect the list */
-static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+static inline
+struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
@@ -3813,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
return alloc_flags;
}
-static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
- unsigned int alloc_flags)
+/* Must be called after current_gfp_context() which can change gfp_mask */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+ unsigned int alloc_flags)
{
#ifdef CONFIG_CMA
- unsigned int pflags = current->flags;
-
- if (!(pflags & PF_MEMALLOC_NOCMA) &&
- gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
-
#endif
return alloc_flags;
}
@@ -3922,7 +3965,7 @@ retry:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (node_reclaim_mode == 0 ||
+ if (!node_reclaim_enabled() ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
@@ -4130,7 +4173,7 @@ out:
}
/*
- * Maximum number of compaction retries wit a progress before OOM
+ * Maximum number of compaction retries with a progress before OOM
* killer is consider as the only way to move forward.
*/
#define MAX_COMPACT_RETRIES 16
@@ -4158,6 +4201,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
+ if (*compact_result == COMPACT_SKIPPED)
+ return NULL;
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
@@ -4478,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
return alloc_flags;
}
@@ -4780,7 +4825,7 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -4921,7 +4966,7 @@ got_pg:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
- struct alloc_context *ac, gfp_t *alloc_mask,
+ struct alloc_context *ac, gfp_t *alloc_gfp,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
@@ -4930,7 +4975,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
- *alloc_mask |= __GFP_HARDWALL;
+ *alloc_gfp |= __GFP_HARDWALL;
/*
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node ok.
@@ -4949,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+ *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4966,15 +5011,164 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list or array
+ * @page_list: Optional list to store the allocated pages
+ * @page_array: Optional array to store the pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly. Pages are added to page_list if page_list
+ * is not NULL, otherwise it is assumed that the page_array is valid.
+ *
+ * For lists, nr_pages is the number of pages that should be allocated.
+ *
+ * For arrays, only NULL elements are populated with pages and nr_pages
+ * is the maximum number of pages that will be stored in the array.
+ *
+ * Returns the number of pages on the list or array.
+ */
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+ nodemask_t *nodemask, int nr_pages,
+ struct list_head *page_list,
+ struct page **page_array)
+{
+ struct page *page;
+ unsigned long flags;
+ struct zone *zone;
+ struct zoneref *z;
+ struct per_cpu_pages *pcp;
+ struct list_head *pcp_list;
+ struct alloc_context ac;
+ gfp_t alloc_gfp;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ int nr_populated = 0;
+
+ if (unlikely(nr_pages <= 0))
+ return 0;
+
+ /*
+ * Skip populated array elements to determine if any pages need
+ * to be allocated before disabling IRQs.
+ */
+ while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ nr_populated++;
+
+ /* Already populated array? */
+ if (unlikely(page_array && nr_pages - nr_populated == 0))
+ return 0;
+
+ /* Use the single page allocator for one page. */
+ if (nr_pages - nr_populated == 1)
+ goto failed;
+
+ /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+ gfp &= gfp_allowed_mask;
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ return 0;
+ gfp = alloc_gfp;
+
+ /* Find an allowed local zone that meets the low watermark. */
+ for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ unsigned long mark;
+
+ if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp)) {
+ continue;
+ }
+
+ if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+ zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+ goto failed;
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+ if (zone_watermark_fast(zone, 0, mark,
+ zonelist_zone_idx(ac.preferred_zoneref),
+ alloc_flags, gfp)) {
+ break;
+ }
+ }
+
+ /*
+	 * If there are no allowed local zones that meet the watermarks, then
+ * try to allocate a single page and reclaim if necessary.
+ */
+ if (unlikely(!zone))
+ goto failed;
+
+ /* Attempt the batch allocation */
+ local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ pcp_list = &pcp->lists[ac.migratetype];
+
+ while (nr_populated < nr_pages) {
+
+ /* Skip existing pages */
+ if (page_array && page_array[nr_populated]) {
+ nr_populated++;
+ continue;
+ }
+
+ page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+ pcp, pcp_list);
+ if (unlikely(!page)) {
+ /* Try and get at least one page */
+ if (!nr_populated)
+ goto failed_irq;
+ break;
+ }
+
+ /*
+ * Ideally this would be batched but the best way to do
+ * that cheaply is to first convert zone_statistics to
+	 * be an inaccurate per-cpu counter like vm_events to avoid
+	 * an RMW cycle, then do the accounting with IRQs enabled.
+ */
+ __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
+ zone_statistics(ac.preferred_zoneref->zone, zone);
+
+ prep_new_page(page, 0, gfp, 0);
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ local_irq_restore(flags);
+
+ return nr_populated;
+
+failed_irq:
+ local_irq_restore(flags);
+
+failed:
+ page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+ if (page) {
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ return nr_populated;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+
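
The kerneldoc above defines the array contract: only NULL slots are filled, and the return value counts everything populated, including slots the caller had already filled. A minimal userspace simulation of just that contract; fake_alloc() stands in for the per-cpu list dequeue and nothing here is kernel API:

    #include <stdio.h>
    #include <stdlib.h>

    static void *fake_alloc(void) { return malloc(64); }

    static int bulk_fill(void **array, int nr)
    {
        int nr_populated = 0;

        /* Skip slots the caller already filled. */
        while (nr_populated < nr && array[nr_populated])
            nr_populated++;

        while (nr_populated < nr) {
            if (array[nr_populated]) {      /* skip existing entries */
                nr_populated++;
                continue;
            }
            array[nr_populated] = fake_alloc();
            if (!array[nr_populated])
                break;                      /* a partial result is fine */
            nr_populated++;
        }
        return nr_populated;
    }

    int main(void)
    {
        void *arr[4] = { NULL, fake_alloc(), NULL, NULL };

        printf("populated: %d of 4\n", bulk_fill(arr, 4));
        return 0;
    }
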
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
@@ -4982,33 +5176,36 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
return NULL;
}
- gfp_mask &= gfp_allowed_mask;
- alloc_mask = gfp_mask;
- if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
+ gfp &= gfp_allowed_mask;
+ /*
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * resp. GFP_NOIO which has to be inherited for all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
+ * movable zones are not used during allocation.
+ */
+ gfp = current_gfp_context(gfp);
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
+ &alloc_gfp, &alloc_flags))
return NULL;
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
- alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
/* First allocation attempt */
- page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;
- /*
- * Apply scoped allocation constraints. This is mainly about GFP_NOFS
- * resp. GFP_NOIO which has to be inherited for all allocation requests
- * from a particular context which has been marked by
- * memalloc_no{fs,io}_{save,restore}.
- */
- alloc_mask = current_gfp_context(gfp_mask);
+ alloc_gfp = gfp;
ac.spread_dirty_pages = false;
/*
@@ -5017,20 +5214,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
*/
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+ page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
out:
- if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
+ if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
+ unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
return page;
}
-EXPORT_SYMBOL(__alloc_pages_nodemask);
+EXPORT_SYMBOL(__alloc_pages);
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
@@ -5736,7 +5933,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
static int __parse_numa_zonelist_order(char *s)
{
/*
- * We used to support different zonlists modes but they turned
+ * We used to support different zonelists modes but they turned
* out to be just not useful. Let's keep the warning in place
* if somebody still use the cmd line parameter so that we do
* not fail it silently
@@ -7477,7 +7674,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
/*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
* such cases we allow max_zone_pfn sorted in the descending order
*/
bool __weak arch_has_descending_max_zone_pfns(void)
@@ -7689,7 +7886,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-void __init mem_init_print_info(const char *str)
+void __init mem_init_print_info(void)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size;
@@ -7728,17 +7925,17 @@ void __init mem_init_print_info(const char *str)
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
- "%s%s)\n",
+ ")\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
- totalcma_pages << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10)
#ifdef CONFIG_HIGHMEM
- totalhigh_pages() << (PAGE_SHIFT - 10),
+ , totalhigh_pages() << (PAGE_SHIFT - 10)
#endif
- str ? ", " : "", str ? str : "");
+ );
}
/**
@@ -8222,6 +8419,7 @@ void *__init alloc_large_system_hash(const char *tablename,
void *table = NULL;
gfp_t gfp_flags;
bool virt;
+ bool huge;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -8289,6 +8487,7 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
+ huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -8305,7 +8504,7 @@ void *__init alloc_large_system_hash(const char *tablename,
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- virt ? "vmalloc" : "linear");
+ virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
@@ -8450,6 +8649,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
pageblock_nr_pages));
}
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
+static void alloc_contig_dump_pages(struct list_head *page_list)
+{
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
+
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
+ struct page *page;
+
+ dump_stack();
+ list_for_each_entry(page, page_list, lru)
+ dump_page(page, "migration failure");
+ }
+}
+#else
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
+{
+}
+#endif
+
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
@@ -8464,7 +8684,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
- migrate_prep();
+ lru_cache_disable();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
@@ -8474,14 +8694,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc, pfn, end);
- if (!pfn) {
- ret = -EINTR;
+ ret = isolate_migratepages_range(cc, pfn, end);
+ if (ret && ret != -EAGAIN)
break;
- }
+ pfn = cc->migrate_pfn;
tries = 0;
} else if (++tries == 5) {
- ret = ret < 0 ? ret : -EBUSY;
+ ret = -EBUSY;
break;
}
@@ -8491,8 +8710,18 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+
+ /*
+ * On -ENOMEM, migrate_pages() bails out right away. It is pointless
+	 * to retry on this error, so do the same here.
+ */
+ if (ret == -ENOMEM)
+ break;
}
+
+ lru_cache_enable();
if (ret < 0) {
+ alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret;
}
@@ -8503,7 +8732,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlaying pageblocks (either
+ * @migratetype: migratetype of the underlying pageblocks (either
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
@@ -8583,7 +8812,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
- ret =0;
+ ret = 0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES
@@ -8602,8 +8831,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* isolated thus they won't get removed from buddy.
*/
- lru_add_drain_all();
-
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
@@ -8629,8 +8856,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0)) {
- pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
- __func__, outer_start, end);
ret = -EBUSY;
goto done;
}
@@ -8680,12 +8905,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
if (PageReserved(page))
return false;
-
- if (page_count(page) > 0)
- return false;
-
- if (PageHuge(page))
- return false;
}
return true;
}
@@ -8757,9 +8976,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
}
#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
- unsigned int count = 0;
+ unsigned long count = 0;
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -8767,13 +8986,13 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
count += page_count(page) != 1;
__free_page(page);
}
- WARN(count != 0, "%d pages are still in use!\n", count);
+ WARN(count != 0, "%lu pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
+ * page high values need to be recalculated.
*/
void __meminit zone_pcp_update(struct zone *zone)
{
@@ -8805,12 +9024,9 @@ void zone_pcp_enable(struct zone *zone)
void zone_pcp_reset(struct zone *zone)
{
- unsigned long flags;
int cpu;
struct per_cpu_pageset *pset;
- /* avoid races with drain_pages() */
- local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8819,7 +9035,6 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
- local_irq_restore(flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -8947,6 +9162,8 @@ bool take_page_off_buddy(struct page *page)
del_page_from_free_list(page_head, zone, page_order);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
break;
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c6860f51b6c6..7d83641eb86b 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -52,9 +52,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_protected_usage(counter, new);
/* More uncharges than charges? */
- WARN_ON_ONCE(new < 0);
+ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
+ new, nr_pages)) {
+ new = 0;
+ atomic_long_set(&counter->usage, new);
+ }
+ propagate_protected_usage(counter, new);
}
/**
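
The change clamps a negative result to zero before propagate_protected_usage() runs, so an over-uncharge warns instead of leaving the counter negative. A standalone sketch of that clamp, with C11 atomics and fprintf standing in for atomic_long_* and WARN_ONCE:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long usage;              /* counter->usage stand-in */

    static void cancel(long nr_pages)
    {
        long new = atomic_fetch_sub(&usage, nr_pages) - nr_pages;

        /* More uncharges than charges?  Warn and pin usage at zero. */
        if (new < 0) {
            fprintf(stderr, "page_counter underflow: %ld nr_pages=%ld\n",
                    new, nr_pages);
            new = 0;
            atomic_store(&usage, new);
        }
        /* propagate_protected_usage(counter, new) would run here. */
    }

    int main(void)
    {
        atomic_store(&usage, 3);
        cancel(5);                          /* underflow: warns, clamps */
        printf("usage now %ld\n", atomic_load(&usage));
        return 0;
    }
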
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d15c7c4994f5..adfabb560eb9 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -27,6 +27,7 @@ struct page_owner {
depot_stack_handle_t handle;
depot_stack_handle_t free_handle;
u64 ts_nsec;
+ u64 free_ts_nsec;
pid_t pid;
};
@@ -41,13 +42,7 @@ static void init_early_allocated_pages(void);
static int __init early_page_owner_param(char *buf)
{
- if (!buf)
- return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- page_owner_enabled = true;
-
- return 0;
+ return kstrtobool(buf, &page_owner_enabled);
}
early_param("page_owner", early_page_owner_param);
@@ -103,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
return (void *)page_ext + page_owner_ops.offset;
}
-static inline bool check_recursive_alloc(unsigned long *entries,
- unsigned int nr_entries,
- unsigned long ip)
-{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (entries[i] == ip)
- return true;
- }
- return false;
-}
-
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
depot_stack_handle_t handle;
unsigned int nr_entries;
- nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
-
/*
- * We need to check recursion here because our request to
- * stackdepot could trigger memory allocation to save new
- * entry. New memory allocation would reach here and call
- * stack_depot_save_entries() again if we don't catch it. There is
- * still not enough memory in stackdepot so it would try to
- * allocate memory again and loop forever.
+ * Avoid recursion.
+ *
+ * Sometimes page metadata allocation tracking requires more
+ * memory to be allocated:
+	 * - when a new stack trace is saved to the stack depot
+	 * - when the backtrace itself is calculated (ia64)
*/
- if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
+ if (current->in_page_owner)
return dummy_handle;
+ current->in_page_owner = 1;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
+ current->in_page_owner = 0;
return handle;
}
@@ -146,25 +129,27 @@ void __reset_page_owner(struct page *page, unsigned int order)
{
int i;
struct page_ext *page_ext;
- depot_stack_handle_t handle = 0;
+ depot_stack_handle_t handle;
struct page_owner *page_owner;
-
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ u64 free_ts_nsec = local_clock();
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
+
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
for (i = 0; i < (1 << order); i++) {
__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
page_owner = get_page_owner(page_ext);
page_owner->free_handle = handle;
+ page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
}
-static inline void __set_page_owner_handle(struct page *page,
- struct page_ext *page_ext, depot_stack_handle_t handle,
- unsigned int order, gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
{
struct page_owner *page_owner;
int i;
@@ -194,7 +179,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -243,11 +228,12 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->handle = old_page_owner->handle;
new_page_owner->pid = old_page_owner->pid;
new_page_owner->ts_nsec = old_page_owner->ts_nsec;
+	new_page_owner->free_ts_nsec = old_page_owner->free_ts_nsec;
/*
* We don't clear the bit on the oldpage as it's going to be freed
* after migration. Until then, the info can be useful in case of
- * a bug, and the overal stats will be off a bit only temporarily.
+ * a bug, and the overall stats will be off a bit only temporarily.
* Also, migrate_misplaced_transhuge_page() can still fail the
* migration and then we want the oldpage to retain the info. But
* in that case we also don't need to explicitly clear the info from
@@ -356,10 +342,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n",
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
- page_owner->ts_nsec);
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
if (ret >= count)
goto err;
@@ -435,9 +421,9 @@ void __dump_page_owner(struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n",
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
- page_owner->pid, page_owner->ts_nsec);
+ page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle) {
@@ -612,7 +598,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/* Found early allocated page */
- __set_page_owner_handle(page, page_ext, early_handle,
+ __set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 655dc5895604..98438985e1ed 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/mmdebug.h>
#include <linux/highmem.h>
#include <linux/page_ext.h>
#include <linux/poison.h>
@@ -45,7 +46,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
return error && !(error & (error - 1));
}
-static void check_poison_mem(unsigned char *mem, size_t bytes)
+static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes)
{
static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
unsigned char *start;
@@ -70,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
dump_stack();
+ dump_page(page, "pagealloc: corrupted page details");
}
static void unpoison_page(struct page *page)
@@ -83,7 +85,7 @@ static void unpoison_page(struct page *page)
* that is freed to buddy. Thus no extra check is done to
* see if a page was poisoned.
*/
- check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE);
+ check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE);
kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 86e3a3688d59..a4435311754b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,6 +116,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return pfn_is_match(pvmw->page, pfn);
}
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+ pvmw->address = (pvmw->address + size) & ~(size - 1);
+ if (!pvmw->address)
+ pvmw->address = ULONG_MAX;
+}
+
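
step_forward() advances the walk to the start of the next size-aligned region and saturates at ULONG_MAX if the addition wraps past zero. A standalone sketch of that rounding; the 2MB PMD_SIZE is an assumed example value:

    #include <limits.h>
    #include <stdio.h>

    #define PMD_SIZE 0x200000UL             /* 2MB, e.g. x86-64 with 4K pages */

    static unsigned long step_forward(unsigned long address, unsigned long size)
    {
        address = (address + size) & ~(size - 1);
        if (!address)
            address = ULONG_MAX;            /* wrapped past the end of memory */
        return address;
    }

    int main(void)
    {
        printf("%#lx -> %#lx\n", 0x201234UL,
               step_forward(0x201234UL, PMD_SIZE));
        printf("%#lx -> %#lx\n", ULONG_MAX & ~(PMD_SIZE - 1),
               step_forward(ULONG_MAX & ~(PMD_SIZE - 1), PMD_SIZE));
        return 0;
    }
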
/**
* page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
* @pvmw->address
@@ -134,7 +141,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
* regardless of which page table level the page is mapped at. @pvmw->pmd is
* NULL.
*
- * Retruns false if there are no more page table entries for the page in
+ * Returns false if there are no more page table entries for the page in
* the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
*
* If you need to stop the walk before page_vma_mapped_walk() returned false,
@@ -144,6 +151,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
{
struct mm_struct *mm = pvmw->vma->vm_mm;
struct page *page = pvmw->page;
+ unsigned long end;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -153,10 +161,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (pvmw->pte)
- goto next_pte;
+ if (unlikely(PageHuge(page))) {
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
- if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
@@ -168,78 +177,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return not_found(pvmw);
return true;
}
-restart:
- pgd = pgd_offset(mm, pvmw->address);
- if (!pgd_present(*pgd))
- return false;
- p4d = p4d_offset(pgd, pvmw->address);
- if (!p4d_present(*p4d))
- return false;
- pud = pud_offset(p4d, pvmw->address);
- if (!pud_present(*pud))
- return false;
- pvmw->pmd = pmd_offset(pud, pvmw->address);
+
/*
- * Make sure the pmd value isn't cached in a register by the
- * compiler and used as a stale value after we've observed a
- * subsequent update.
+ * Seek to next pte only makes sense for THP.
+	 * But more important than that optimization is to filter out
+ * any PageKsm page: whose page->index misleads vma_address()
+ * and vma_address_end() to disaster.
*/
- pmde = READ_ONCE(*pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
- if (likely(pmd_trans_huge(*pvmw->pmd))) {
- if (pvmw->flags & PVMW_MIGRATION)
- return not_found(pvmw);
- if (pmd_page(*pvmw->pmd) != page)
- return not_found(pvmw);
- return true;
- } else if (!pmd_present(*pvmw->pmd)) {
- if (thp_migration_supported()) {
- if (!(pvmw->flags & PVMW_MIGRATION))
+ end = PageTransCompound(page) ?
+ vma_address_end(page, pvmw->vma) :
+ pvmw->address + PAGE_SIZE;
+ if (pvmw->pte)
+ goto next_pte;
+restart:
+ do {
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd)) {
+ step_forward(pvmw, PGDIR_SIZE);
+ continue;
+ }
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d)) {
+ step_forward(pvmw, P4D_SIZE);
+ continue;
+ }
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud)) {
+ step_forward(pvmw, PUD_SIZE);
+ continue;
+ }
+
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+ * subsequent update.
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ pmde = *pvmw->pmd;
+ if (likely(pmd_trans_huge(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
- if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
+ if (pmd_page(pmde) != page)
+ return not_found(pvmw);
+ return true;
+ }
+ if (!pmd_present(pmde)) {
+ swp_entry_t entry;
- if (migration_entry_to_page(entry) != page)
- return not_found(pvmw);
- return true;
- }
+ if (!thp_migration_supported() ||
+ !(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != page)
+ return not_found(pvmw);
+ return true;
}
- return not_found(pvmw);
- } else {
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
+ } else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ PageTransCompound(page)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
+ step_forward(pvmw, PMD_SIZE);
+ continue;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
- if (!map_pte(pvmw))
- goto next_pte;
- while (1) {
+ if (!map_pte(pvmw))
+ goto next_pte;
+this_pte:
if (check_pte(pvmw))
return true;
next_pte:
- /* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= pvmw->vma->vm_end ||
- pvmw->address >=
- __vma_address(pvmw->page, pvmw->vma) +
- thp_size(pvmw->page))
+ if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
- if (pvmw->address % PMD_SIZE == 0) {
- pte_unmap(pvmw->pte);
+ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
if (pvmw->ptl) {
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
}
+ pte_unmap(pvmw->pte);
+ pvmw->pte = NULL;
goto restart;
- } else {
- pvmw->pte++;
+ }
+ pvmw->pte++;
+ if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
}
} while (pte_none(*pvmw->pte));
@@ -247,7 +286,10 @@ next_pte:
pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
spin_lock(pvmw->ptl);
}
- }
+ goto this_pte;
+ } while (pvmw->address < end);
+
+ return false;
}
/**
@@ -266,14 +308,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.vma = vma,
.flags = PVMW_SYNC,
};
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
- if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
return 0;
- pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 095d7eaa0db4..ae26b118e246 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -170,7 +170,7 @@ struct percpu_stats {
u64 nr_max_alloc; /* max # of live allocations */
u32 nr_chunks; /* current # of live chunks */
u32 nr_max_chunks; /* max # of live chunks */
- size_t min_alloc_size; /* min allocaiton size */
+ size_t min_alloc_size; /* min allocation size */
size_t max_alloc_size; /* max allocation size */
};
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..8d3844bc0c7c 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -8,6 +8,7 @@
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
+#include "internal.h"
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
@@ -133,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
- unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+ vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}
/**
@@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
- return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- PAGE_KERNEL, pages);
+ return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
+ PAGE_KERNEL, pages, PAGE_SHIFT);
}
/**
diff --git a/mm/percpu.c b/mm/percpu.c
index 23308113a5ff..f99e9306b939 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1862,7 +1862,7 @@ fail:
pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
- /* see the flag handling in pcpu_blance_workfn() */
+ /* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index 1dcc865029a2..e9e879de8649 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PGALLLC_TRACK_H
-#define _LINUX_PGALLLC_TRACK_H
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
#if defined(CONFIG_MMU)
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
@@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
(__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
NULL: pte_offset_kernel(pmd, address))
-#endif /* _LINUX_PGALLLC_TRACK_H */
+#endif /* _LINUX_PGALLOC_TRACK_H */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c2210e1cdb51..4e640baf9794 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_present(*pmdp));
- /* Below assumes pmd_present() is true */
- VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index f5fee9cf90f8..4bcc11958089 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/sched.h>
-#include <linux/compat.h>
#include <linux/sched/mm.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index c5b0457415be..d589f147f4c2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -198,8 +198,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
-
if (page && !xa_is_value(page)) {
/*
* Page already present? Kick off the current batch
@@ -210,6 +208,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* not worth getting one just for that.
*/
read_pages(ractl, &page_pool, true);
+ i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
@@ -223,6 +222,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
gfp_mask) < 0) {
put_page(page);
read_pages(ractl, &page_pool, true);
+ i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
@@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl,
* memory at once.
*/
void force_page_cache_ra(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned long nr_to_read)
+ unsigned long nr_to_read)
{
struct address_space *mapping = ractl->mapping;
+ struct file_ra_state *ra = ractl->ra;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages, index;
@@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping,
* A minimal readahead algorithm for trivial sequential/random reads.
*/
static void ondemand_readahead(struct readahead_control *ractl,
- struct file_ra_state *ra, bool hit_readahead_marker,
- unsigned long req_size)
+ bool hit_readahead_marker, unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
+ struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
unsigned long index = readahead_index(ractl);
@@ -550,7 +551,7 @@ readit:
}
void page_cache_sync_ra(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned long req_count)
+ unsigned long req_count)
{
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
@@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
* read-ahead will do the right thing and limit the read to just the
* requested range, which we'll set to 1 page for this case.
*/
- if (!ra->ra_pages || blk_cgroup_congested()) {
+ if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
if (!ractl->file)
return;
req_count = 1;
@@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl,
/* be dumb */
if (do_forced_ra) {
- force_page_cache_ra(ractl, ra, req_count);
+ force_page_cache_ra(ractl, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(ractl, ra, false, req_count);
+ ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
void page_cache_async_ra(struct readahead_control *ractl,
- struct file_ra_state *ra, struct page *page,
- unsigned long req_count)
+ struct page *page, unsigned long req_count)
{
/* no read-ahead */
- if (!ra->ra_pages)
+ if (!ractl->ra->ra_pages)
return;
/*
@@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
return;
/* do read-ahead */
- ondemand_readahead(ractl, ra, true, req_count);
+ ondemand_readahead(ractl, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
@@ -638,3 +638,78 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
return ksys_readahead(fd, offset, count);
}
+
+/**
+ * readahead_expand - Expand a readahead request
+ * @ractl: The request to be expanded
+ * @new_start: The revised start
+ * @new_len: The revised size of the request
+ *
+ * Attempt to expand a readahead request outwards from the current size to the
+ * specified size by inserting locked pages before and after the current window
+ * to increase the size to the new window. This may involve the insertion of
+ * THPs, in which case the window may get expanded even beyond what was
+ * requested.
+ *
+ * The algorithm will stop if it encounters a conflicting page already in the
+ * pagecache and leave a smaller expansion than requested.
+ *
+ * The caller must check for this by examining the revised @ractl object for a
+ * different expansion than was requested.
+ */
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len)
+{
+ struct address_space *mapping = ractl->mapping;
+ struct file_ra_state *ra = ractl->ra;
+ pgoff_t new_index, new_nr_pages;
+ gfp_t gfp_mask = readahead_gfp_mask(mapping);
+
+ new_index = new_start / PAGE_SIZE;
+
+ /* Expand the leading edge downwards */
+ while (ractl->_index > new_index) {
+ unsigned long index = ractl->_index - 1;
+ struct page *page = xa_load(&mapping->i_pages, index);
+
+ if (page && !xa_is_value(page))
+ return; /* Page apparently present */
+
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return;
+ if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+ put_page(page);
+ return;
+ }
+
+ ractl->_nr_pages++;
+ ractl->_index = page->index;
+ }
+
+ new_len += new_start - readahead_pos(ractl);
+ new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
+
+ /* Expand the trailing edge upwards */
+ while (ractl->_nr_pages < new_nr_pages) {
+ unsigned long index = ractl->_index + ractl->_nr_pages;
+ struct page *page = xa_load(&mapping->i_pages, index);
+
+ if (page && !xa_is_value(page))
+ return; /* Page apparently present */
+
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return;
+ if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+ put_page(page);
+ return;
+ }
+ ractl->_nr_pages++;
+ if (ra) {
+ ra->size++;
+ ra->async_size++;
+ }
+ }
+}
+EXPORT_SYMBOL(readahead_expand);
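
readahead_expand() converts the requested byte range into page indices: the new start rounds down to a page boundary, and the length, measured from that boundary, rounds up with DIV_ROUND_UP. A standalone sketch of the rounding, assuming PAGE_SIZE = 4096 (the kernel works with loff_t offsets and the window's own position):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long new_start = 5000;     /* requested start, bytes */
        unsigned long new_len = 10000;      /* requested length, bytes */

        /* The leading edge rounds down to a page boundary... */
        unsigned long new_index = new_start / PAGE_SIZE;
        /* ...and the length, measured from that boundary, rounds up. */
        new_len += new_start - new_index * PAGE_SIZE;
        unsigned long new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);

        /* Bytes 5000..14999 span pages 1..3: index 1, 3 pages. */
        printf("index %lu, %lu pages\n", new_index, new_nr_pages);
        return 0;
    }
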
diff --git a/mm/rmap.c b/mm/rmap.c
index b0fc27e77d6d..e05c300048e6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
* anon_vma_fork(). The first three want an exact copy of src, while the last
* one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
* endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
@@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping) {
- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
- return -EFAULT;
- } else
+ } else if (!vma->vm_file) {
return -EFAULT;
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
- return address;
+ }
+
+ return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
@@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address + page_size(page)));
+ vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
@@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address,
- min(vma->vm_end, address + page_size(page)));
+ address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
else
rmap_walk(page, &rwc);
- return !page_mapcount(page) ? true : false;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ return !page_mapcount(page);
}
/**
@@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
@@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
diff --git a/mm/shmem.c b/mm/shmem.c
index b2db4ed0fbc7..5d46611cba8d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2258,25 +2258,11 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- /*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
/* arm64 - allow memory tagging on RAM-based files */
vma->vm_flags |= VM_MTE_ALLOWED;
@@ -2375,8 +2361,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pgoff_t offset, max_off;
ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*pagep)) {
+ put_page(*pagep);
+ *pagep = NULL;
+ }
goto out;
+ }
if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
@@ -2846,6 +2842,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
+
+ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
+
return 0;
}
@@ -3505,7 +3504,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
}
}
if (*this_char) {
- char *value = strchr(this_char,'=');
+ char *value = strchr(this_char, '=');
size_t len = 0;
int err;
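For reference, the open-coded checks removed from shmem_mmap() above describe what the consolidated seal_check_future_write() helper is expected to do. The helper itself is defined outside this diff, so the following is a reconstruction from the removed lines, not the authoritative body:

	static int seal_check_future_write(int seals, struct vm_area_struct *vma)
	{
		if (seals & F_SEAL_FUTURE_WRITE) {
			/* Refuse new shared, writable mappings outright. */
			if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
				return -EPERM;

			/*
			 * For read-only shared mappings, also drop VM_MAYWRITE so a
			 * later mprotect() cannot re-add write permission. Private
			 * mappings keep VM_MAYWRITE and stay COW-writable.
			 */
			if (vma->vm_flags & VM_SHARED)
				vma->vm_flags &= ~(VM_MAYWRITE);
		}

		return 0;
	}
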
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 71b784f0b7c3..cec62984f7d3 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -10,7 +10,7 @@
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
-static inline void shuffle_free_memory(pg_data_t *pgdat)
+static inline void __meminit shuffle_free_memory(pg_data_t *pgdat)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
@@ -18,7 +18,7 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
}
extern void __shuffle_zone(struct zone *z);
-static inline void shuffle_zone(struct zone *z)
+static inline void __meminit shuffle_zone(struct zone *z)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
diff --git a/mm/slab.c b/mm/slab.c
index ae651bf540b7..d0f725637663 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define BATCHREFILL_LIMIT 16
/*
- * Optimization question: fewer reaps means less probability for unnessary
+ * Optimization question: fewer reaps means less probability for unnecessary
* cpucache drain/refill cycles.
*
* OTOH the cpuarrays can contain lots of objects,
@@ -2284,7 +2284,7 @@ void __kmem_cache_release(struct kmem_cache *cachep)
* Because if it is the case, that means we defer the creation of
* the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
* And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
* This is a "chicken-and-egg" problem.
*
* So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
@@ -2381,8 +2381,8 @@ union freelist_init_state {
};
/*
- * Initialize the state based on the randomization methode available.
- * return true if the pre-computed list is available, false otherwize.
+ * Initialize the state based on the randomization method available.
+ * return true if the pre-computed list is available, false otherwise.
*/
static bool freelist_state_initialize(union freelist_init_state *state,
struct kmem_cache *cachep,
@@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
void *ptr;
int slab_node = numa_mem_id();
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
- memset(ptr, 0, cachep->object_size);
+ init = slab_want_init_on_alloc(flags, cachep);
out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
return ptr;
}
@@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
unsigned long save_flags;
void *objp;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
- memset(objp, 0, cachep->object_size);
+ init = slab_want_init_on_alloc(flags, cachep);
out:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
return objp;
}
@@ -3427,17 +3425,24 @@ free_done:
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
+ bool init;
+
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
__kfence_free(objp);
return;
}
- if (unlikely(slab_want_init_on_free(cachep)))
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ init = slab_want_init_on_free(cachep);
+ if (init && !kasan_has_integrated_init())
memset(objp, 0, cachep->object_size);
-
- /* Put the object into the quarantine, don't touch it for now. */
- if (kasan_slab_free(cachep, objp))
+ /* KASAN might put objp into memory quarantine, delaying its reuse. */
+ if (kasan_slab_free(cachep, objp, init))
return;
/* Use KCSAN to help debug racy use-after-free. */
@@ -3542,18 +3547,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
- /* Clear memory outside IRQ disabled section */
- if (unlikely(slab_want_init_on_alloc(flags, s)))
- for (i = 0; i < size; i++)
- memset(p[i], 0, s->object_size);
-
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled section.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, objcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3651,6 +3656,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif /* CONFIG_NUMA */
+#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
struct kmem_cache *cachep;
@@ -3670,6 +3676,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
if (DEBUG && cachep->flags & SLAB_STORE_USER)
kpp->kp_ret = *dbg_userword(cachep, objp);
}
+#endif
/**
* __do_kmalloc - allocate memory
diff --git a/mm/slab.h b/mm/slab.h
index 076582f58f68..18c1927cd196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -506,15 +506,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
}
static inline void slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
- gfp_t flags, size_t size, void **p)
+ struct obj_cgroup *objcg, gfp_t flags,
+ size_t size, void **p, bool init)
{
size_t i;
flags &= gfp_allowed_mask;
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_alloc and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
+ */
for (i = 0; i < size; i++) {
- p[i] = kasan_slab_alloc(s, p[i], flags);
- /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ p[i] = kasan_slab_alloc(s, p[i], flags, init);
+ if (p[i] && init && !kasan_has_integrated_init())
+ memset(p[i], 0, s->object_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
}
@@ -601,7 +610,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
{
- if (static_branch_unlikely(&init_on_alloc)) {
+ if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+ &init_on_alloc)) {
if (c->ctor)
return false;
if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
@@ -613,12 +623,14 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
static inline bool slab_want_init_on_free(struct kmem_cache *c)
{
- if (static_branch_unlikely(&init_on_free))
+ if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
+ &init_on_free))
return !(c->ctor ||
(c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
return false;
}
+#ifdef CONFIG_PRINTK
#define KS_ADDRS_COUNT 16
struct kmem_obj_info {
void *kp_ptr;
@@ -630,5 +642,6 @@ struct kmem_obj_info {
void *kp_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+#endif
#endif /* MM_SLAB_H */
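The switch from static_branch_unlikely() to static_branch_maybe() above lets the branch prediction default follow the kernel configuration: with CONFIG_INIT_ON_ALLOC_DEFAULT_ON (or CONFIG_INIT_ON_FREE_DEFAULT_ON) set, the enabled path becomes the straight-line case. Roughly, the helper expands as follows (see <linux/jump_label.h>; shown only to make the intent of the change explicit):

	#define static_branch_maybe(config, x)					\
		(IS_ENABLED(config) ? static_branch_likely(x)			\
				    : static_branch_unlikely(x))
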
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 88e833986332..7cab77655f11 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -71,11 +71,19 @@ static int __init setup_slab_nomerge(char *str)
return 1;
}
+static int __init setup_slab_merge(char *str)
+{
+ slab_nomerge = false;
+ return 1;
+}
+
#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
+__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
#endif
__setup("slab_nomerge", setup_slab_nomerge);
+__setup("slab_merge", setup_slab_merge);
/*
* Determine the size of a slab object
@@ -89,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
@@ -310,6 +317,16 @@ kmem_cache_create_usercopy(const char *name,
const char *cache_name;
int err;
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * If no slub_debug was enabled globally, the static key is not yet
+ * enabled by setup_slub_debug(). Enable it if the cache is being
+ * created with any of the debugging flags passed explicitly.
+ */
+ if (flags & SLAB_DEBUG_FLAGS)
+ static_branch_enable(&slub_debug_enabled);
+#endif
+
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
@@ -526,6 +543,7 @@ bool slab_is_available(void)
return slab_state >= UP;
}
+#ifdef CONFIG_PRINTK
/**
* kmem_valid_obj - does the pointer reference a valid slab object?
* @object: pointer to query.
@@ -544,6 +562,7 @@ bool kmem_valid_obj(void *object)
page = virt_to_head_page(object);
return PageSlab(page);
}
+EXPORT_SYMBOL_GPL(kmem_valid_obj);
/**
* kmem_dump_obj - Print available slab provenance information
@@ -600,6 +619,8 @@ void kmem_dump_obj(void *object)
pr_info(" %pS\n", kp.kp_stack[i]);
}
}
+EXPORT_SYMBOL_GPL(kmem_dump_obj);
+#endif
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
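With kmem_valid_obj() and kmem_dump_obj() now exported (GPL) and built only under CONFIG_PRINTK, modules can report the provenance of a suspect pointer. A small, hypothetical debugging helper illustrating the intended pairing; the helper name and its surroundings are assumptions, not part of this patch:

	/* Hypothetical module helper: print where @p came from, if it is a slab object. */
	static void report_suspect_pointer(void *p)
	{
		if (kmem_valid_obj(p))
			kmem_dump_obj(p);	/* cache name, offset, allocation stack, ... */
		else
			pr_info("%px is not a slab object\n", p);
	}
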
diff --git a/mm/slob.c b/mm/slob.c
index 0578429b991b..74d3f6e60666 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -461,11 +461,13 @@ out:
spin_unlock_irqrestore(&slob_lock, flags);
}
+#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
kpp->kp_ptr = object;
kpp->kp_page = page;
}
+#endif
/*
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
diff --git a/mm/slub.c b/mm/slub.c
index 3021ce9bf1b3..61bd40e3eb9a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3,7 +3,7 @@
* SLUB: A slab allocator that limits cache line use instead of queuing
* objects in per cpu and per node lists.
*
- * The allocator synchronizes using per slab locks or atomic operatios
+ * The allocator synchronizes using per slab locks or atomic operations
* and only uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
+#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
@@ -160,7 +161,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#undef SLUB_DEBUG_CMPXCHG
/*
- * Mininum number of partial slabs. These will be left on the partial
+ * Minimum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
*/
#define MIN_PARTIAL 5
@@ -301,6 +302,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
+ object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
@@ -624,7 +626,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
if (!t->addr)
return;
- pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
@@ -650,8 +652,9 @@ void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
- page, page->objects, page->inuse, page->freelist, page->flags);
+ pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n",
+ page, page->objects, page->inuse, page->freelist,
+ page->flags, &page->flags);
}
@@ -706,19 +709,19 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_page_info(page);
- pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
s->red_left_pad);
else if (p > addr + 16)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section(KERN_ERR, "Object ", p,
+ print_section(KERN_ERR, "Object ", p,
min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
off = get_info_end(s);
@@ -730,7 +733,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section(KERN_ERR, "Padding ", p + off,
+ print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
dump_stack();
@@ -799,7 +802,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
end--;
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault - addr,
fault[0], value);
print_trailer(s, page, object);
@@ -832,7 +835,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at mininum
+ * C. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -907,11 +910,11 @@ static int check_object(struct kmem_cache *s, struct page *page,
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -926,7 +929,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "Poison",
+ !check_bytes_and_report(s, page, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
@@ -1532,7 +1535,8 @@ static __always_inline void kfree_hook(void *x)
kasan_kfree_large(x);
}
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
+ void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
@@ -1558,8 +1562,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
__kcsan_check_access(x, s->object_size,
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
- /* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x);
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memsets must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * The initialization memsets clear the object and the metadata,
+ * but don't touch the SLAB redzone.
+ */
+ if (init) {
+ int rsize;
+
+ if (!kasan_has_integrated_init())
+ memset(kasan_reset_tag(x), 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
+ memset((char *)kasan_reset_tag(x) + s->inuse, 0,
+ s->size - s->inuse - rsize);
+ }
+ /* KASAN might put x into memory quarantine, delaying its reuse. */
+ return kasan_slab_free(s, x, init);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1569,10 +1590,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
- int rsize;
if (is_kfence_address(next)) {
- slab_free_hook(s, next);
+ slab_free_hook(s, next, false);
return true;
}
@@ -1584,20 +1604,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
object = next;
next = get_freepointer(s, object);
- if (slab_want_init_on_free(s)) {
- /*
- * Clear the object and the metadata, but don't touch
- * the redzone.
- */
- memset(kasan_reset_tag(object), 0, s->object_size);
- rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
- : 0;
- memset((char *)kasan_reset_tag(object) + s->inuse, 0,
- s->size - s->inuse - rsize);
-
- }
/* If object's reuse doesn't have to be delayed */
- if (!slab_free_hook(s, object)) {
+ if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
/* Move object to the new freelist */
set_freepointer(s, object, *head);
*head = object;
@@ -2822,6 +2830,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page;
unsigned long tid;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
@@ -2899,12 +2908,10 @@ redo:
}
maybe_wipe_obj_freeptr(s, object);
-
- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(kasan_reset_tag(object), 0, s->object_size);
+ init = slab_want_init_on_alloc(gfpflags, s);
out:
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
return object;
}
@@ -3236,7 +3243,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
}
if (is_kfence_address(object)) {
- slab_free_hook(df->s, object);
+ slab_free_hook(df->s, object, false);
__kfence_free(object);
p[size] = NULL; /* mark object processed */
return size;
@@ -3356,20 +3363,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c->tid = next_tid(c->tid);
local_irq_enable();
- /* Clear memory outside IRQ disabled fastpath loop */
- if (unlikely(slab_want_init_on_alloc(flags, s))) {
- int j;
-
- for (j = 0; j < i; j++)
- memset(kasan_reset_tag(p[j]), 0, s->object_size);
- }
-
- /* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, objcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3390,7 +3393,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
*/
/*
- * Mininum / Maximum order of slab pages. This influences locking overhead
+ * Minimum / Maximum order of slab pages. This influences locking overhead
* and slab fragmentation. A higher order reduces the number of partial slabs
* and increases the number of allocations possible without having to
* take the list_lock.
@@ -3421,7 +3424,7 @@ static unsigned int slub_min_objects;
*
* Higher order allocations also allow the placement of more objects in a
* slab and thereby reduce object handling overhead. If the user has
- * requested a higher mininum order then we start with that one instead of
+ * requested a higher minimum order then we start with that one instead of
* the smallest order which will fit the object.
*/
static inline unsigned int slab_order(unsigned int size,
@@ -3579,7 +3582,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL);
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse = 1;
page->frozen = 0;
@@ -3687,7 +3690,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
- unsigned int freepointer_area;
unsigned int order;
/*
@@ -3696,13 +3698,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
- /*
- * This is the area of the object where a freepointer can be
- * safely written. If redzoning adds more to the inuse size, we
- * can't use that portion for writing the freepointer, so
- * s->offset must be limited within this for the general case.
- */
- freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
@@ -3728,19 +3723,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* With that we have determined the number of bytes in actual use
- * by the object. This is the potential offset to the free pointer.
+ * by the object and redzoning.
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
@@ -3749,13 +3746,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->offset = size;
size += sizeof(void *);
- } else if (freepointer_area > sizeof(void *)) {
+ } else {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
- s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
@@ -3898,7 +3895,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
for_each_object(p, s, addr, page->objects) {
if (!test_bit(__obj_to_index(s, addr, p), map)) {
- pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
+ pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
@@ -3963,6 +3960,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
return 0;
}
+#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
void *base;
@@ -4002,6 +4000,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
#endif
#endif
}
+#endif
/********************************************************************
* Kmalloc subsystem
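The calculate_sizes() change above places the inline freelist pointer with ALIGN_DOWN() on object_size instead of ALIGN() on the debug-adjusted freepointer_area. A worked example, assuming a 64-bit build with sizeof(void *) == 8 and a value chosen purely for illustration:

	/*
	 * object_size = 24, so freepointer_area was also 24:
	 *   old: s->offset = ALIGN(24 / 2, 8)      = 16  -> pointer in bytes 16..23
	 *   new: s->offset = ALIGN_DOWN(24 / 2, 8) =  8  -> pointer in bytes  8..15
	 *
	 * Rounding down keeps the pointer at or before the object's midpoint, and
	 * computing it from object_size alone keeps the placement independent of
	 * any debug metadata added later in calculate_sizes().
	 */
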
diff --git a/mm/sparse.c b/mm/sparse.c
index 7bd23f9d6cef..55c18aff3e42 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
if (unlikely(!mem_section)) {
unsigned long size, align;
- size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
mem_section = memblock_alloc(size, align);
if (!mem_section)
@@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
return sizeof(struct mem_section_usage) + usemap_size();
}
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ return __pa_symbol(pgdat);
+#else
+ return __pa(pgdat);
+#endif
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
@@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
@@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
}
usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
- pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
@@ -547,6 +556,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
+ sparse_buffer_fini();
goto failed;
}
check_usemap_section_nr(nid, usage);
@@ -623,7 +633,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -644,7 +653,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
ms->section_mem_map &= ~SECTION_IS_ONLINE;
}
}
-#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
diff --git a/mm/swap.c b/mm/swap.c
index 31b844d4ed94..dfb48cf9c2c9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,6 +36,7 @@
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
+#include <linux/buffer_head.h>
#include "internal.h"
@@ -235,6 +236,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
}
}
+/* return true if the pagevec needs to be drained */
+static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
+{
+ bool ret = false;
+
+ if (!pagevec_add(pvec, page) || PageCompound(page) ||
+ lru_cache_disabled())
+ ret = true;
+
+ return ret;
+}
+
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
@@ -252,7 +265,7 @@ void rotate_reclaimable_page(struct page *page)
get_page(page);
local_lock_irqsave(&lru_rotate.lock, flags);
pvec = this_cpu_ptr(&lru_rotate.pvec);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
@@ -343,7 +356,7 @@ static void activate_page(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, __activate_page);
local_unlock(&lru_pvecs.lock);
}
@@ -458,7 +471,7 @@ void lru_cache_add(struct page *page)
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
__pagevec_lru_add(pvec);
local_unlock(&lru_pvecs.lock);
}
@@ -483,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
int nr_pages = thp_nr_pages(page);
/*
- * We use the irq-unsafe __mod_zone_page_stat because this
+ * We use the irq-unsafe __mod_zone_page_state because this
* counter is not modified from interrupt context, and the pte
* lock is held(spinlock), which implies preemption disabled.
*/
@@ -629,6 +642,7 @@ void lru_add_drain_cpu(int cpu)
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
+ invalidate_bh_lrus_cpu(cpu);
}
/**
@@ -654,7 +668,7 @@ void deactivate_file_page(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
local_unlock(&lru_pvecs.lock);
}
@@ -676,7 +690,7 @@ void deactivate_page(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn);
local_unlock(&lru_pvecs.lock);
}
@@ -698,7 +712,7 @@ void mark_page_lazyfree(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
local_unlock(&lru_pvecs.lock);
}
@@ -735,7 +749,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
* Calling this function with cpu hotplug locks held can actually lead
* to obscure indirect dependencies via WQ context.
*/
-void lru_add_drain_all(void)
+inline void __lru_add_drain_all(bool force_all_cpus)
{
/*
* lru_drain_gen - Global pages generation number
@@ -780,7 +794,7 @@ void lru_add_drain_all(void)
* (C) Exit the draining operation if a newer generation, from another
* lru_add_drain_all(), was already scheduled for draining. Check (A).
*/
- if (unlikely(this_gen != lru_drain_gen))
+ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
goto done;
/*
@@ -794,7 +808,7 @@ void lru_add_drain_all(void)
* below which drains the page vectors.
*
* Let x, y, and z represent some system CPU numbers, where x < y < z.
- * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+ * Assume CPU #z is in the middle of the for_each_online_cpu loop
* below and has already reached CPU #y's per-cpu data. CPU #x comes
* along, adds some pages to its per-cpu vectors, then calls
* lru_add_drain_all().
@@ -810,12 +824,14 @@ void lru_add_drain_all(void)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+ if (force_all_cpus ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
- need_activate_page_drain(cpu)) {
+ need_activate_page_drain(cpu) ||
+ has_bh_in_lru(cpu, NULL)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
__cpumask_set_cpu(cpu, &has_work);
@@ -828,6 +844,11 @@ void lru_add_drain_all(void)
done:
mutex_unlock(&lock);
}
+
+void lru_add_drain_all(void)
+{
+ __lru_add_drain_all(false);
+}
#else
void lru_add_drain_all(void)
{
@@ -835,6 +856,34 @@ void lru_add_drain_all(void)
}
#endif /* CONFIG_SMP */
+atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+/*
+ * lru_cache_disable() needs to be called before we start compiling
+ * a list of pages to be migrated using isolate_lru_page().
+ * It drains the LRU caches on all cpus and then keeps them disabled
+ * until lru_cache_enable() is called.
+ *
+ * Must be paired with a call to lru_cache_enable().
+ */
+void lru_cache_disable(void)
+{
+ atomic_inc(&lru_disable_count);
+#ifdef CONFIG_SMP
+ /*
+ * lru_add_drain_all() in force mode will schedule draining on
+ * all online CPUs, so any calls to lru_cache_disabled() wrapped in
+ * local_lock or with preemption disabled will be ordered by that.
+ * The atomic operation doesn't need stronger ordering
+ * requirements because that is enforced by the scheduling
+ * guarantees.
+ */
+ __lru_add_drain_all(true);
+#else
+ lru_add_drain();
+#endif
+}
+
/**
* release_pages - batched put_page()
* @pages: array of pages to release
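The new lru_cache_disable() above is documented as requiring a matching lru_cache_enable() call. A hedged sketch of the intended caller pattern (the isolation loop is a placeholder; only the disable/enable bracketing is the point):

	/*
	 * Keep the per-cpu LRU pagevecs drained and disabled while building an
	 * isolation list, so candidate pages cannot hide in a pagevec.
	 */
	lru_cache_disable();
	/* ... isolate_lru_page() on each page to be migrated ... */
	lru_cache_enable();
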
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index be9de6d5b516..6248d1030a9b 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -16,7 +16,7 @@
* to local caches without needing to acquire swap_info
* lock. We do not reuse the returned slots directly but
* move them back to the global pool in a batch. This
- * allows the slots to coaellesce and reduce fragmentation.
+ * allows the slots to coalesce and reduce fragmentation.
*
* The swap entry allocated is marked with SWAP_HAS_CACHE
* flag in map_count that prevents it from being allocated
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3cdee7b11da9..272ea2108c9d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
xas_store(&xas, page);
xas_next(&xas);
}
- address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
@@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page,
xas_next(&xas);
}
ClearPageSwapCache(page);
- if (shadow)
- address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
__mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
@@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
xas_store(&xas, NULL);
nr_shadows++;
}
- address_space->nrexceptional -= nr_shadows;
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
@@ -497,16 +493,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageLocked(page);
__SetPageSwapBacked(page);
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
- put_swap_page(page, entry);
+ if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
goto fail_unlock;
- }
- if (mem_cgroup_charge(page, NULL, gfp_mask)) {
- delete_from_swap_cache(page);
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- }
+
+ mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
workingset_refault(page, shadow);
@@ -517,6 +511,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return page;
fail_unlock:
+ put_swap_page(page, entry);
unlock_page(page);
put_page(page);
return NULL;
@@ -797,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf,
*
* Returns the struct page for entry and addr, after queueing swapin.
*
- * Primitive swap readahead code. We simply read in a few pages whoes
+ * Primitive swap readahead code. We simply read in a few pages whose
* virtual addresses are around the fault address in the same vma.
*
* Caller must hold read mmap_lock if vmf->vma is not NULL.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 084a5b9a18e5..996afa8131c8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free)
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
}
/*
@@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v)
unsigned int bytes, inuse;
if (si == SEQ_START_TOKEN) {
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+ seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
return 0;
}
@@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sizeof(long),
GFP_KERNEL);
- if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
diff --git a/mm/truncate.c b/mm/truncate.c
index 455944264663..234ddd879caa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
if (xas_load(&xas) != entry)
return;
xas_store(&xas, NULL);
- mapping->nrexceptional--;
}
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
@@ -168,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
{
- if (page_mapped(page)) {
- unsigned int nr = thp_nr_pages(page);
- unmap_mapping_pages(mapping, page->index, nr, false);
- }
+ if (page_mapped(page))
+ unmap_mapping_page(page);
if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
@@ -219,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return -EIO;
- truncate_cleanup_page(mapping, page);
+ truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
@@ -295,7 +291,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pgoff_t index;
int i;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
goto out;
/* Offsets within partial pages */
@@ -326,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++)
- truncate_cleanup_page(mapping, pvec.pages[i]);
+ truncate_cleanup_page(pvec.pages[i]);
delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
@@ -440,9 +436,6 @@ EXPORT_SYMBOL(truncate_inode_pages);
*/
void truncate_inode_pages_final(struct address_space *mapping)
{
- unsigned long nrexceptional;
- unsigned long nrpages;
-
/*
* Page reclaim can not participate in regular inode lifetime
* management (can't call iput()) and thus can race with the
@@ -452,16 +445,7 @@ void truncate_inode_pages_final(struct address_space *mapping)
*/
mapping_set_exiting(mapping);
- /*
- * When reclaim installs eviction entries, it increases
- * nrexceptional first, then decreases nrpages. Make sure we see
- * this in the right order or we might miss an entry.
- */
- nrpages = mapping->nrpages;
- smp_rmb();
- nrexceptional = mapping->nrexceptional;
-
- if (nrpages || nrexceptional) {
+ if (!mapping_empty(mapping)) {
/*
* As truncation uses a lockless tree lookup, cycle
* the tree lock to make sure any ongoing tree
@@ -633,7 +617,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int ret2 = 0;
int did_range_unmap = 0;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
goto out;
pagevec_init(&pvec);
@@ -652,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
+ if (!did_range_unmap && page_mapped(page)) {
+ /*
+ * If page is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
+ did_range_unmap = 1;
+ }
+
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
@@ -659,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
wait_on_page_writeback(page);
- if (page_mapped(page)) {
- if (!did_range_unmap) {
- /*
- * Zap the rest of the file in one hit.
- */
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
- did_range_unmap = 1;
- } else {
- /*
- * Just zap this page
- */
- unmap_mapping_pages(mapping, index,
- 1, false);
- }
- }
+
+ if (page_mapped(page))
+ unmap_mapping_page(page);
BUG_ON(page_mapped(page));
+
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
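Both truncate_cleanup_page() and invalidate_inode_pages2_range() above now call unmap_mapping_page(page) where they used to open-code unmap_mapping_pages() on the page's own range. The helper is defined outside this diff; based on the calls it replaces, it is presumably a per-page wrapper along these lines (a reconstruction, not the real body, which may also revalidate the mapping under the i_mmap lock):

	/* Sketch only: unmap every user mapping of this one (possibly huge) page. */
	static inline void unmap_mapping_page_sketch(struct page *page)
	{
		unmap_mapping_pages(page->mapping, page->index,
				    thp_nr_pages(page), false);
	}
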
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..63a73e164d55 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ enum mcopy_atomic_mode mode)
{
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (zeropage) {
+ if (mode == MCOPY_ATOMIC_ZEROPAGE) {
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -273,8 +273,6 @@ retry:
}
while (src_addr < src_start + len) {
- pte_t dst_pteval;
-
BUG_ON(dst_addr >= dst_start + len);
/*
@@ -290,23 +288,23 @@ retry:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+ dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
goto out_unlock;
}
- err = -EEXIST;
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
+ if (mode != MCOPY_ATOMIC_CONTINUE &&
+ !huge_pte_none(huge_ptep_get(dst_pte))) {
+ err = -EEXIST;
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
goto out_unlock;
}
err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, &page);
+ dst_addr, src_addr, mode, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
@@ -362,38 +360,38 @@ out:
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
* the reservation was consumed when the page was allocated.
- * We clear the PagePrivate flag now so that the global
+ * We clear the HPageRestoreReserve flag now so that the global
* reserve count will not be incremented in free_huge_page.
* The reservation map will still indicate the reservation
* was consumed and possibly prevent later page allocation.
* This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear PagePrivate
- * as no adjustments to reservation counts were made during
- * allocation.
+ * reservation existed, it is still safe to clear
+ * HPageRestoreReserve as no adjustments to reservation counts
+ * were made during allocation.
*
* The reservation map for shared mappings indicates which
* pages have reservations. When a huge page is allocated
* for an address with a reservation, no change is made to
- * the reserve map. In this case PagePrivate will be set
- * to indicate that the global reservation count should be
+ * the reserve map. In this case HPageRestoreReserve will be
+ * set to indicate that the global reservation count should be
* incremented when the page is freed. This is the desired
* behavior. However, when a huge page is allocated for an
* address without a reservation a reservation entry is added
- * to the reservation map, and PagePrivate will not be set.
- * When the page is freed, the global reserve count will NOT
- * be incremented and it will appear as though we have leaked
- * reserved page. In this case, set PagePrivate so that the
- * global reserve count will be incremented to match the
- * reservation map entry which was created.
+ * to the reservation map, and HPageRestoreReserve will not be
+ * set. When the page is freed, the global reserve count will
+ * NOT be incremented and it will appear as though we have
+ * leaked reserved page. In this case, set HPageRestoreReserve
+ * so that the global reserve count will be incremented to
+ * match the reservation map entry which was created.
*
* Note that vm_alloc_shared is based on the flags of the vma
* for which the page was originally allocated. dst_vma could
* be different or NULL on error.
*/
if (vm_alloc_shared)
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
else
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
put_page(page);
}
BUG_ON(copied < 0);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage);
+ enum mcopy_atomic_mode mode);
#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage,
+ enum mcopy_atomic_mode mcopy_mode,
bool *mmap_changing,
__u64 mode)
{
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
long copied;
struct page *page;
bool wp_copy;
+ bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/*
* Sanitize the command parameters:
@@ -527,10 +526,12 @@ retry:
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, zeropage);
+ src_start, len, mcopy_mode);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
+ if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ goto out_unlock;
/*
* Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
bool *mmap_changing, __u64 mode)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing, mode);
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+ MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+ return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+ mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool *mmap_changing)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+ mmap_changing, 0);
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
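The bool zeropage parameter threaded through __mcopy_atomic() is replaced above by a three-way enum mcopy_atomic_mode. The enum is defined in a header outside this diff; reconstructed from its uses here, it presumably reads:

	enum mcopy_atomic_mode {
		/* A normal copy into a freshly allocated destination page. */
		MCOPY_ATOMIC_NORMAL,
		/* Install a zeroed page (rejected for hugetlb VMAs above). */
		MCOPY_ATOMIC_ZEROPAGE,
		/* Map a page that already exists, skipping the huge_pte_none() check. */
		MCOPY_ATOMIC_CONTINUE,
	};
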
diff --git a/mm/util.c b/mm/util.c
index 54870226cea6..a8bf17f18a81 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page)
}
EXPORT_SYMBOL(page_mapping);
-/*
- * For file cache pages, return the address_space, otherwise return NULL
- */
-struct address_space *page_mapping_file(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return NULL;
- return page_mapping(page);
-}
-
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
@@ -775,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
* The deviation of sync_overcommit_as could be big with loose policy
* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
- * with the strict "NEVER", and to avoid possible race condtion (even
+ * with the strict "NEVER", and to avoid possible race condition (even
* though user usually won't too frequently do the switching to policy
* OVERCOMMIT_NEVER), the switch is done in the following order:
* 1. changing the batch
@@ -983,6 +973,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
return ret;
}
+#ifdef CONFIG_PRINTK
/**
* mem_dump_obj - Print available provenance information
* @object: object for which to find provenance information.
@@ -996,20 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
*/
void mem_dump_obj(void *object)
{
+ const char *type;
+
if (kmem_valid_obj(object)) {
kmem_dump_obj(object);
return;
}
+
if (vmalloc_dump_obj(object))
return;
- if (!virt_addr_valid(object)) {
- if (object == NULL)
- pr_cont(" NULL pointer.\n");
- else if (object == ZERO_SIZE_PTR)
- pr_cont(" zero-size pointer.\n");
- else
- pr_cont(" non-paged memory.\n");
- return;
- }
- pr_cont(" non-slab/vmalloc memory.\n");
+
+ if (virt_addr_valid(object))
+ type = "non-slab/vmalloc memory";
+ else if (object == NULL)
+ type = "NULL pointer";
+ else if (object == ZERO_SIZE_PTR)
+ type = "zero-size pointer";
+ else
+ type = "non-paged memory";
+
+ pr_cont(" %s\n", type);
}
+EXPORT_SYMBOL_GPL(mem_dump_obj);
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4f5f8c907897..d0a7d89be091 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,7 +34,7 @@
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
-
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -42,6 +42,19 @@
#include "internal.h"
#include "pgalloc-track.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x;
@@ -68,6 +81,218 @@ static void free_work(struct work_struct *w)
}
/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+ flush_cache_vmap(addr, end);
+
+ return err;
+}
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
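vmap_try_huge_pmd() above only installs a huge leaf when the remaining range is exactly one PMD and both the virtual and physical addresses are PMD-aligned; otherwise vmap_pmd_range() falls back to PTE mappings. A numeric illustration, assuming 4 KiB pages and a 2 MiB PMD (the addresses are made-up examples, not taken from this patch):

	/*
	 *   addr      = 0xffffc90000200000     (2 MiB aligned)
	 *   end       = addr + PMD_SIZE        (range is exactly 2 MiB)
	 *   phys_addr = 0x0000000180000000     (2 MiB aligned)
	 *
	 * -> vmap_try_huge_pmd() installs a single PMD-sized leaf entry.
	 *
	 * Shift addr or phys_addr by one 4 KiB page, or shorten the range, and
	 * the alignment/size checks return 0, so the same range is mapped with
	 * 512 individual PTEs by vmap_pte_range() instead.
	 */
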
@@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
} while (p4d++, addr = next, addr != end);
}
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @start: start of the VM area to unmap
- * @size: size of the VM area to unmap
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
*
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
- * should have been allocated using get_vm_area() and its friends.
+ * The caller is responsible for calling flush_cache_vunmap() before calling
+ * this function, and flush_tlb_kernel_range() after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
*
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible
- * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
- * function and flush_tlb_kernel_range() after.
+ * This is an internal function only. Do not use outside mm/.
*/
-void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+void vunmap_range_noflush(unsigned long start, unsigned long end)
{
- unsigned long end = start + size;
unsigned long next;
pgd_t *pgd;
unsigned long addr = start;
@@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
arch_sync_kernel_mappings(start, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+ flush_cache_vunmap(addr, end);
+ vunmap_range_noflush(addr, end);
+ flush_tlb_kernel_range(addr, end);
+}
+
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
- unsigned long end = addr + size;
- unsigned long next;
pgd_t *pgd;
+ unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
@@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
return 0;
}
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
- struct page **pages)
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int ret;
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+ page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
- ret = map_kernel_range_noflush(start, size, prot, pages);
- flush_cache_vmap(start, start + size);
- return ret;
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ __pa(page_address(pages[i])), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+/**
+ * vmap_pages_range - map pages to a kernel virtual address
+ * @addr: start of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
+ * @prot: page protection flags to use
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int err;
+
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ flush_cache_vmap(addr, end);
+ return err;
}
int is_vmalloc_or_module_addr(const void *x)
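To make the vmap_pages_range() contract concrete, a hedged fragment mirroring what __vmalloc_area_node() does later in this patch; @addr, @size and the @pages array are assumed to be set up by the caller, and error unwinding is omitted:

        unsigned int order = PMD_SHIFT - PAGE_SHIFT;
        unsigned int i, p, nr = size >> PAGE_SHIFT;
        int err;

        for (i = 0; i < nr; i += 1U << order) {
                /* One compound allocation backs each PMD-sized run of entries. */
                struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, order);

                if (!page)
                        return -ENOMEM;
                for (p = 0; p < (1U << order); p++)
                        pages[i + p] = page + p;
        }
        /* With page_shift == PMD_SHIFT, the mapping may use PMD leaves. */
        err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PMD_SHIFT);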
@@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
@@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd))
return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
- pud = pud_offset(p4d, addr);
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
- /*
- * Don't dereference bad PUD or PMD (below) entries. This will also
- * identify huge mappings, which we may encounter on architectures
- * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
- * identified as vmalloc addresses by is_vmalloc_addr(), but are
- * not [unambiguously] associated with a struct page, so there is
- * no correct value to return for them.
- */
- WARN_ON_ONCE(pud_bad(*pud));
- if (pud_none(*pud) || pud_bad(*pud))
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
+ return NULL;
+
pmd = pmd_offset(pud, addr);
- WARN_ON_ONCE(pmd_bad(*pmd));
- if (pmd_none(*pmd) || pmd_bad(*pmd))
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pte_present(pte))
page = pte_page(pte);
pte_unmap(ptep);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
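A brief hedged illustration of what the huge-mapping-aware walk buys callers: iterating a (possibly PMD-mapped) vmalloc buffer in PAGE_SIZE steps still yields one distinct struct page per step, which is what users such as remap_vmalloc_range() rely on. @buf and @len are assumed to describe a vmalloc()ed region:

        unsigned long off;

        for (off = 0; off < len; off += PAGE_SIZE) {
                struct page *page = vmalloc_to_page(buf + off);

                /*
                 * Within a huge-mapped chunk these are consecutive tail
                 * pages of one compound allocation; for small mappings
                 * they are the individually allocated pages.
                 */
                pr_debug("pfn %lx\n", page_to_pfn(page));
        }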
@@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va)
spin_unlock(&free_vmap_area_lock);
}
+static inline void
+preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
+{
+ struct vmap_area *va = NULL;
+
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. It guarantees that
+ * a CPU that does an allocation is preloaded.
+ *
+ * We do it in non-atomic context, thus it allows us to use more
+ * permissive allocation masks to be more stable under low memory
+ * condition and high memory pressure.
+ */
+ if (!this_cpu_read(ne_fit_preload_node))
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(lock);
+
+ if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+ kmem_cache_free(vmap_area_cachep, va);
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
- struct vmap_area *va, *pva;
+ struct vmap_area *va;
unsigned long addr;
int purged = 0;
int ret;
@@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
- /*
- * Preload this CPU with one extra vmap_area object. It is used
- * when fit type of free area is NE_FIT_TYPE. Please note, it
- * does not guarantee that an allocation occurs on a CPU that
- * is preloaded, instead we minimize the case when it is not.
- * It can happen because of cpu migration, because there is a
- * race until the below spinlock is taken.
- *
- * The preload is done in non-atomic context, thus it allows us
- * to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure. In rare case,
- * if not preloaded, GFP_NOWAIT is used.
- *
- * Set "pva" to NULL here, because of "retry" path.
- */
- pva = NULL;
-
- if (!this_cpu_read(ne_fit_preload_node))
- /*
- * Even if it fails we do not really care about that.
- * Just proceed as it is. If needed "overflow" path
- * will refill the cache we allocate from.
- */
- pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
-
- spin_lock(&free_vmap_area_lock);
-
- if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
- kmem_cache_free(vmap_area_cachep, pva);
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
- addr = __alloc_vmap_area(size, align, vstart, vend);
- spin_unlock(&free_vmap_area_lock);
-
if (unlikely(addr == vend))
goto overflow;
@@ -1231,7 +1503,6 @@ retry:
va->va_end = addr + size;
va->vm = NULL;
-
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -1312,7 +1583,7 @@ static unsigned long lazy_max_pages(void)
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
- * Serialize vmap purging. There is no actual criticial section protected
+ * Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
@@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+ vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size)
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- unmap_kernel_range_noflush(addr, size);
+ vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
@@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
spin_lock(&vb->lock);
- if (vb->dirty) {
+ if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
kasan_unpoison_vmalloc(mem, size);
- if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+ pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return vm->page_order;
+#else
+ return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ vm->page_order = order;
+#else
+ BUG_ON(order != 0);
+#endif
+}
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -2023,23 +2314,6 @@ void __init vmalloc_init(void)
vmap_initialized = true;
}
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
- unsigned long end = addr + size;
-
- flush_cache_vunmap(addr, end);
- unmap_kernel_range_noflush(addr, size);
- flush_tlb_kernel_range(addr, end);
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2070,15 +2344,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+ unsigned long align, unsigned long shift, unsigned long flags,
+ unsigned long start, unsigned long end, int node,
+ gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
unsigned long requested_size = size;
BUG_ON(in_interrupt());
- size = PAGE_ALIGN(size);
+ size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
@@ -2110,8 +2385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, caller);
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
@@ -2127,7 +2402,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
@@ -2135,7 +2411,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
@@ -2199,6 +2476,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
@@ -2208,6 +2486,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long start = ULONG_MAX, end = 0;
+ unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
@@ -2232,11 +2511,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
+ unsigned long page_size;
+
+ page_size = PAGE_SIZE << page_order;
start = min(addr, start);
- end = max(addr + PAGE_SIZE, end);
+ end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@@ -2277,13 +2559,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
+ unsigned int page_order = vm_area_page_order(area);
int i;
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, 0);
+ __free_pages(page, page_order);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2348,7 +2631,7 @@ static void __vfree(const void *addr)
* May sleep if called *not* from interrupt context.
* Must not be called in NMI context (strictly speaking, it could be
* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea).
+ * conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
@@ -2402,6 +2685,7 @@ void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
+ unsigned long addr;
unsigned long size; /* In bytes */
might_sleep();
@@ -2414,8 +2698,9 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
- pages) < 0) {
+ addr = (unsigned long)area->addr;
+ if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+ pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2474,15 +2759,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
unsigned long array_size;
- unsigned int i;
+ unsigned int nr_small_pages = size >> PAGE_SHIFT;
+ unsigned int page_order;
struct page **pages;
+ unsigned int i;
- array_size = (unsigned long)nr_pages * sizeof(struct page *);
+ array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -2497,42 +2786,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (!pages) {
free_vm_area(area);
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "page array size %lu allocation failed",
+ nr_small_pages * PAGE_SIZE, array_size);
return NULL;
}
area->pages = pages;
- area->nr_pages = nr_pages;
+ area->nr_pages = nr_small_pages;
+ set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;
+ page_order = vm_area_page_order(area);
- if (node == NUMA_NO_NODE)
- page = alloc_page(gfp_mask);
- else
- page = alloc_pages_node(node, gfp_mask, 0);
+ /*
+ * Careful, we allocate and map page_order pages, but tracking is done
+ * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
+ * the physical/mapped size.
+ */
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ struct page *page;
+ int p;
+ /* Compound pages required for remap_vmalloc_page */
+ page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "page order %u allocation failed",
+ area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}
- area->pages[i] = page;
+
+ for (p = 0; p < (1U << page_order); p++)
+ area->pages[i + p] = page + p;
+
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
- prot, pages) < 0)
+ if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "failed to map pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
+ }
return area->addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
__vfree(area->addr);
return NULL;
}
@@ -2563,19 +2870,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages())
- goto fail;
+ if (WARN_ON_ONCE(!size))
+ return NULL;
- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
- vm_flags, start, end, node, gfp_mask, caller);
- if (!area)
+ if ((size >> PAGE_SHIFT) > totalram_pages()) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "exceeds total pages", real_size);
+ return NULL;
+ }
+
+ if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
+ arch_vmap_pmd_supported(prot)) {
+ unsigned long size_per_node;
+
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations,
+ * others like modules don't yet expect huge pages in
+ * their allocations due to apply_to_page_range not
+ * supporting them.
+ */
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (size_per_node >= PMD_SIZE) {
+ shift = PMD_SHIFT;
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
+ }
+ }
+
+again:
+ area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ VM_UNINITIALIZED | vm_flags, start, end, node,
+ gfp_mask, caller);
+ if (!area) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "vm_struct allocation failed", real_size);
goto fail;
+ }
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
- return NULL;
+ goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2584,13 +2926,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
*/
clear_vm_uninitialized_flag(area);
+ size = PAGE_ALIGN(size);
kmemleak_vmalloc(area, size, gfp_mask);
return addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure: %lu bytes", real_size);
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ align = real_align;
+ size = real_size;
+ goto again;
+ }
+
return NULL;
}
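To make the new sizing logic concrete: on x86-64 (PMD_SIZE = 2 MiB) with two online nodes and node == NUMA_NO_NODE, a 3 MiB request gives size_per_node = 1.5 MiB < PMD_SIZE, so the allocation stays at shift = PAGE_SHIFT; an 8 MiB request gives size_per_node = 4 MiB, so shift becomes PMD_SHIFT, align is raised to at least 2 MiB and size is rounded up to a 2 MiB multiple. If the order-9 page allocations or the mapping then fail, the fail: path drops back to shift = PAGE_SHIFT, restores real_align and real_size, and retries once via the again: label before giving up.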
@@ -2655,6 +3003,23 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
+ * vmalloc_no_huge - allocate virtually contiguous memory using small pages
+ * @size: allocation size
+ *
+ * Allocate enough non-huge pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_no_huge(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_no_huge);
+
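A hedged usage sketch for the new helper: a caller whose buffer must be backed by base pages only (for example because the mapping is later manipulated with apply_to_page_range(), which as noted above does not handle huge pages) opts out of huge vmalloc explicitly; @nr is an assumed page count:

        void *buf = vmalloc_no_huge(nr * PAGE_SIZE);

        if (!buf)
                return -ENOMEM;
        /* ... use the mapping, confident it is PTE-mapped ... */
        vfree(buf);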
+/**
* vzalloc - allocate virtually contiguous memory with zero fill
* @size: allocation size
*
@@ -2739,7 +3104,7 @@ EXPORT_SYMBOL(vzalloc_node);
* 64b systems should always have either DMA or DMA32 zones. For others
* GFP_DMA32 should do the right thing and use the normal zone.
*/
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
/**
@@ -2797,15 +3162,12 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
/*
* To do safe access to this _mapped_ area, we need
* lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
+ * overhead of vmalloc()/vfree() calls for this _debug_
* interface, rarely used. Instead of that, we'll use
* kmap() and get small overhead in this access function.
*/
if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
+ /* We can expect USER0 is not used -- see vread() */
void *map = kmap_atomic(p);
memcpy(buf, map + offset, length);
kunmap_atomic(map);
@@ -2820,43 +3182,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
return copied;
}
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
-{
- struct page *p;
- int copied = 0;
-
- while (count) {
- unsigned long offset, length;
-
- offset = offset_in_page(addr);
- length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
- /*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
- */
- if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
- void *map = kmap_atomic(p);
- memcpy(map + offset, buf, length);
- kunmap_atomic(map);
- }
- addr += length;
- buf += length;
- copied += length;
- count -= length;
- }
- return copied;
-}
-
/**
* vread() - read vmalloc area in a safe way.
* @buf: buffer for reading data
@@ -2875,7 +3200,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
+ * any information, as /proc/kcore.
*
* Return: number of bytes for which addr and buf should be increased
* (same number as @count) or %0 if [addr...addr+count) doesn't
@@ -2894,7 +3219,10 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
+ va = __find_vmap_area((unsigned long)addr);
+ if (!va)
+ goto finished;
+ list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
@@ -2937,80 +3265,6 @@ finished:
}
/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
- *
- * Return: number of bytes for which addr and buf should be
- * increased (same number as @count) or %0 if [addr...addr+count)
- * doesn't include any intersection with valid vmalloc area
- */
-long vwrite(char *buf, char *addr, unsigned long count)
-{
- struct vmap_area *va;
- struct vm_struct *vm;
- char *vaddr;
- unsigned long n, buflen;
- int copied = 0;
-
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
- buflen = count;
-
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
-
- if (!va->vm)
- continue;
-
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
- continue;
- while (addr < vaddr) {
- if (count == 0)
- goto finished;
- buf++;
- addr++;
- count--;
- }
- n = vaddr + get_vm_area_size(vm) - addr;
- if (n > count)
- n = count;
- if (!(vm->flags & VM_IOREMAP)) {
- aligned_vwrite(buf, addr, n);
- copied++;
- }
- buf += n;
- addr += n;
- count -= n;
- }
-finished:
- spin_unlock(&vmap_area_lock);
- if (!copied)
- return 0;
- return buflen;
-}
-
-/**
* remap_vmalloc_range_partial - map vmalloc pages to userspace
* @vma: vma to cover
* @uaddr: target user address to start at
@@ -3072,7 +3326,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
* remap_vmalloc_range - map vmalloc pages to userspace
@@ -3450,6 +3703,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
struct vm_struct *vm;
@@ -3462,6 +3716,7 @@ bool vmalloc_dump_obj(void *object)
vm->nr_pages, (unsigned long)vm->addr, vm->caller);
return true;
}
+#endif
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..5199b9696bab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+static int shrinker_nr_max;
+
+/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
+static inline int shrinker_map_size(int nr_items)
+{
+ return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+}
+
+static inline int shrinker_defer_size(int nr_items)
+{
+ return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
+
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+ int nid)
+{
+ return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+ lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+ int map_size, int defer_size,
+ int old_map_size, int old_defer_size)
+{
+ struct shrinker_info *new, *old;
+ struct mem_cgroup_per_node *pn;
+ int nid;
+ int size = map_size + defer_size;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ old = shrinker_info_protected(memcg, nid);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_deferred = (atomic_long_t *)(new + 1);
+ new->map = (void *)new->nr_deferred + defer_size;
+
+ /* map: set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_map_size);
+ memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+ /* nr_deferred: copy old values, clear all new values */
+ memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+ memset((void *)new->nr_deferred + old_defer_size, 0,
+ defer_size - old_defer_size);
+
+ rcu_assign_pointer(pn->shrinker_info, new);
+ kvfree_rcu(old, rcu);
+ }
+
+ return 0;
+}
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct shrinker_info *info;
+ int nid;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ info = rcu_dereference_protected(pn->shrinker_info, true);
+ kvfree(info);
+ rcu_assign_pointer(pn->shrinker_info, NULL);
+ }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+ int nid, size, ret = 0;
+ int map_size, defer_size = 0;
+
+ down_write(&shrinker_rwsem);
+ map_size = shrinker_map_size(shrinker_nr_max);
+ defer_size = shrinker_defer_size(shrinker_nr_max);
+ size = map_size + defer_size;
+ for_each_node(nid) {
+ info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+ if (!info) {
+ free_shrinker_info(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ info->nr_deferred = (atomic_long_t *)(info + 1);
+ info->map = (void *)info->nr_deferred + defer_size;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+ }
+ up_write(&shrinker_rwsem);
+
+ return ret;
+}
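As a worked example of the layout set up above, on a 64-bit kernel with shrinker_nr_max = 100: shrinker_map_size(100) = DIV_ROUND_UP(100, 64) * 8 = 16 bytes and shrinker_defer_size(100) = round_up(100, 64) * 8 = 1024 bytes, so each per-node kvzalloc is sizeof(struct shrinker_info) + 1040 bytes, laid out as

        [ struct shrinker_info ][ nr_deferred: 128 atomic_long_t ][ map: 2 unsigned long ]

with info->nr_deferred pointing just past the struct and info->map sitting defer_size bytes further on.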
+
+static inline bool need_expand(int nr_max)
+{
+ return round_up(nr_max, BITS_PER_LONG) >
+ round_up(shrinker_nr_max, BITS_PER_LONG);
+}
+
+static int expand_shrinker_info(int new_id)
+{
+ int ret = 0;
+ int new_nr_max = new_id + 1;
+ int map_size, defer_size = 0;
+ int old_map_size, old_defer_size = 0;
+ struct mem_cgroup *memcg;
+
+ if (!need_expand(new_nr_max))
+ goto out;
+
+ if (!root_mem_cgroup)
+ goto out;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ map_size = shrinker_map_size(new_nr_max);
+ defer_size = shrinker_defer_size(new_nr_max);
+ old_map_size = shrinker_map_size(shrinker_nr_max);
+ old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+ old_map_size, old_defer_size);
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
+ goto out;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+ if (!ret)
+ shrinker_nr_max = new_nr_max;
+
+ return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct shrinker_info *info;
+
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, info->map);
+ rcu_read_unlock();
+ }
+}
static DEFINE_IDR(shrinker_idr);
-static int shrinker_nr_max;
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
int id, ret = -ENOMEM;
+ if (mem_cgroup_disabled())
+ return -ENOSYS;
+
down_write(&shrinker_rwsem);
/* This may call shrinker, so it must use down_read_trylock() */
- id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
if (id >= shrinker_nr_max) {
- if (memcg_expand_shrinker_maps(id)) {
+ if (expand_shrinker_info(id)) {
idr_remove(&shrinker_idr, id);
goto unlock;
}
-
- shrinker_nr_max = id + 1;
}
shrinker->id = id;
ret = 0;
@@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- down_write(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_rwsem);
+
idr_remove(&shrinker_idr, id);
- up_write(&shrinker_rwsem);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+
+ info = shrinker_info_protected(memcg, nid);
+ return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+
+ info = shrinker_info_protected(memcg, nid);
+ return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+ int i, nid;
+ long nr;
+ struct mem_cgroup *parent;
+ struct shrinker_info *child_info, *parent_info;
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /* Prevent concurrent shrinker_info expansion */
+ down_read(&shrinker_rwsem);
+ for_each_node(nid) {
+ child_info = shrinker_info_protected(memcg, nid);
+ parent_info = shrinker_info_protected(parent, nid);
+ for (i = 0; i < shrinker_nr_max; i++) {
+ nr = atomic_long_read(&child_info->nr_deferred[i]);
+ atomic_long_add(nr, &parent_info->nr_deferred[i]);
+ }
+ }
+ up_read(&shrinker_rwsem);
}
static bool cgroup_reclaim(struct scan_control *sc)
@@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc)
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
- return 0;
+ return -ENOSYS;
}
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
@@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc)
}
#endif
+static long xchg_nr_deferred(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return xchg_nr_deferred_memcg(nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return add_nr_deferred_memcg(nr, nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
- unsigned int size = sizeof(*shrinker->nr_deferred);
+ unsigned int size;
+ int err;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ err = prealloc_memcg_shrinker(shrinker);
+ if (err != -ENOSYS)
+ return err;
+
+ shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+ }
+ size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker)
if (!shrinker->nr_deferred)
return -ENOMEM;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- if (prealloc_memcg_shrinker(shrinker))
- goto free_deferred;
- }
-
return 0;
-
-free_deferred:
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
- return -ENOMEM;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
- if (!shrinker->nr_deferred)
- return;
-
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
+ up_write(&shrinker_rwsem);
+ return;
+ }
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
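For context, a hedged sketch of the two-step registration that the reworked prealloc path serves; my_count()/my_scan() are placeholder callbacks, err is assumed declared, and when memcg is disabled the -ENOSYS fallback above silently clears SHRINKER_MEMCG_AWARE and uses the plain nr_deferred array instead:

        static struct shrinker my_shrinker = {
                .count_objects  = my_count,
                .scan_objects   = my_scan,
                .seeks          = DEFAULT_SEEKS,
                .flags          = SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE,
        };

        err = prealloc_shrinker(&my_shrinker);
        if (err)
                return err;
        /* ... set up the caches/LRUs that the callbacks will walk ... */
        register_shrinker_prepared(&my_shrinker);

        /* and on teardown: */
        unregister_shrinker(&my_shrinker);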
@@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+ shrinker->flags |= SHRINKER_REGISTERED;
up_write(&shrinker_rwsem);
}
@@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
- if (!shrinker->nr_deferred)
+ if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- unregister_memcg_shrinker(shrinker);
+
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
+ shrinker->flags &= ~SHRINKER_REGISTERED;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
up_write(&shrinker_rwsem);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
long freeable;
long nr;
long new_nr;
- int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
long scanned = 0, next_deferred;
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0 || freeable == SHRINK_EMPTY)
return freeable;
@@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
- nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+ nr = xchg_nr_deferred(shrinker, shrinkctl);
- total_scan = nr;
if (shrinker->seeks) {
delta = freeable >> priority;
delta *= 4;
@@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
delta = freeable / 2;
}
+ total_scan = nr >> priority;
total_scan += delta;
- if (total_scan < 0) {
- pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
- shrinker->scan_objects, total_scan);
- total_scan = freeable;
- next_deferred = nr;
- } else
- next_deferred = total_scan;
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * freeable. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < freeable / 4)
- total_scan = min(total_scan, freeable / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > freeable * 2)
- total_scan = freeable * 2;
+ total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
freeable, delta, total_scan, priority);
@@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
cond_resched();
}
- if (next_deferred >= scanned)
- next_deferred -= scanned;
- else
- next_deferred = 0;
+ /*
+ * The deferred work is increased by any new work (delta) that wasn't
+ * done, decreased by old deferred work that was done now.
+ *
+ * It is capped at two times the number of freeable items.
+ */
+ next_deferred = max_t(long, (nr + delta - scanned), 0);
+ next_deferred = min(next_deferred, (2 * freeable));
+
/*
* move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
+ * manner that handles concurrent updates.
*/
- if (next_deferred > 0)
- new_nr = atomic_long_add_return(next_deferred,
- &shrinker->nr_deferred[nid]);
- else
- new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
- trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+ trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
return freed;
}
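A worked example of the new deferral arithmetic: for a DEFAULT_SEEKS shrinker with freeable = 10,000, a previously deferred nr = 6,000 and priority = 4, delta = 4 * (10000 >> 4) / 2 = 1,250 and total_scan = (6000 >> 4) + 1,250 = 1,625, comfortably under the 2 * freeable cap. If the scan loop gets through all 1,625 objects, next_deferred = max(6000 + 1250 - 1625, 0) = 5,625, so the backlog is worked off gradually across reclaim passes instead of being dumped into (or wound up by) a single invocation, which is what the removed windup-avoidance heuristics were papering over.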
@@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
- struct memcg_shrinker_map *map;
+ struct shrinker_info *info;
unsigned long ret, freed = 0;
int i;
@@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (!down_read_trylock(&shrinker_rwsem))
return 0;
- map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
- true);
- if (unlikely(!map))
+ info = shrinker_info_protected(memcg, nid);
+ if (unlikely(!info))
goto unlock;
- for_each_set_bit(i, map->map, shrinker_nr_max) {
+ for_each_set_bit(i, info->map, shrinker_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
shrinker = idr_find(&shrinker_idr, i);
- if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
if (!shrinker)
- clear_bit(i, map->map);
+ clear_bit(i, info->map);
continue;
}
@@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
- clear_bit(i, map->map);
+ clear_bit(i, info->map);
/*
* After the shrinker reported that it had no objects to
* free, but before we cleared the corresponding bit in
@@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
* case, we invoke the shrinker one more time and reset
* the bit if it reports that it is not empty anymore.
* The memory barrier here pairs with the barrier in
- * memcg_set_shrinker_bit():
+ * set_shrinker_bit():
*
* list_lru_add() shrink_slab_memcg()
* list_add_tail() clear_bit()
@@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
else
- memcg_set_shrinker_bit(memcg, nid, i);
+ set_shrinker_bit(memcg, nid, i);
}
freed += ret;
@@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_lru(page) && !PageDirty(page) &&
- !__PageMovable(page) && !PageUnevictable(page)) {
+ if (!PageHuge(page) && page_is_file_lru(page) &&
+ !PageDirty(page) && !__PageMovable(page) &&
+ !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -3862,7 +4059,7 @@ static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -4086,14 +4283,6 @@ module_init(kswapd_init)
int node_reclaim_mode __read_mostly;
/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI. New bits are OK, but existing bits can never change.
- */
-#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
-
-/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 74b2c374b86c..cccee36b289c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -934,7 +934,7 @@ void cpu_vm_stats_fold(int cpu)
/*
* this is only called if !populated_zone(zone), which implies no other users of
- * pset->vm_stat_diff[] exsist.
+ * pset->vm_stat_diff[] exist.
*/
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
@@ -1313,6 +1313,10 @@ const char * const vmstat_text[] = {
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
#endif
+#ifdef CONFIG_CMA
+ "cma_alloc_success",
+ "cma_alloc_fail",
+#endif
"unevictable_pgs_culled",
"unevictable_pgs_scanned",
"unevictable_pgs_rescued",
@@ -1365,6 +1369,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
#endif
+#ifdef CONFIG_X86
+ "direct_map_level2_splits",
+ "direct_map_level3_splits",
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1854,25 +1862,34 @@ int vmstat_refresh(struct ctl_table *table, int write,
if (err)
return err;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ /*
+ * Skip checking stats known to go negative occasionally.
+ */
+ switch (i) {
+ case NR_ZONE_WRITE_PENDING:
+ case NR_FREE_CMA_PAGES:
+ continue;
+ }
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
__func__, zone_stat_name(i), val);
- err = -EINVAL;
}
}
-#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
- val = atomic_long_read(&vm_numa_stat[i]);
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ /*
+ * Skip checking stats known to go negative occasionally.
+ */
+ switch (i) {
+ case NR_WRITEBACK:
+ continue;
+ }
+ val = atomic_long_read(&vm_node_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
- __func__, numa_stat_name(i), val);
- err = -EINVAL;
+ __func__, node_stat_name(i), val);
}
}
-#endif
- if (err)
- return err;
if (write)
*ppos += *lenp;
else
diff --git a/mm/workingset.c b/mm/workingset.c
index cd39902c1062..b7cdeca5a76d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out_invalid;
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
- mapping->nrexceptional -= node->nr_values;
xa_delete_node(node, workingset_update_node);
__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 9d889ad2bb86..7fe7adaaad01 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
{
if (pool->inode)
iput(pool->inode);
- }
+}
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
diff --git a/mm/zpool.c b/mm/zpool.c
index 5ed71207ced7..6d9ed48141e5 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages,
* This may hold locks, disable interrupts, and/or preemption,
* and the zpool_unmap_handle() must be called to undo those
* actions. The code that uses the mapped handle should complete
- * its operatons on the mapped handle memory quickly and unmap
+ * its operations on the mapped handle memory quickly and unmap
* as soon as possible. As the implementation may use per-cpu
* data, multiple handles should not be mapped concurrently on
* any cpu.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 30c358b72025..19b563bc6c48 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -61,7 +61,7 @@
#define ZSPAGE_MAGIC 0x58
/*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
* These two conditions ensure that any 'struct link_free' itself doesn't
* span more than 1 page which avoids complex case of mapping 2 pages simply
* to restore link_free pointer values.
@@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage,
* class maintains a list of zspages where each zspage is divided
* into equal sized chunks. Each allocation falls into one of these
* classes depending on its size. This function returns index of the
- * size class which has chunk size big enough to hold the give size.
+ * size class which has chunk size big enough to hold the given size.
*/
static int get_size_class_index(int size)
{
@@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
* zs_map_object - get address of allocated object from handle.
* @pool: pool from which the object was allocated
* @handle: handle returned from zs_malloc
- * @mm: maping mode to use
+ * @mm: mapping mode to use
*
* Before using an object allocated from zs_malloc, it must be mapped using
* this function. When done with the object, it must be unmapped using
@@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
head = obj_to_head(page, addr);
if (head & OBJ_ALLOCATED_TAG) {
handle = head & ~OBJ_ALLOCATED_TAG;
- if (!testpin_tag(handle))
- BUG();
+ BUG_ON(!testpin_tag(handle));
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
@@ -2035,8 +2034,7 @@ unpin_objects:
head = obj_to_head(page, addr);
if (head & OBJ_ALLOCATED_TAG) {
handle = head & ~OBJ_ALLOCATED_TAG;
- if (!testpin_tag(handle))
- BUG();
+ BUG_ON(!testpin_tag(handle));
unpin_tag(handle);
}
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 578d9f256920..20763267a219 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
}
pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
- strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+ strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
if (!pool->acomp_ctx) {