Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          10
-rw-r--r--  mm/Makefile          5
-rw-r--r--  mm/backing-dev.c   117
-rw-r--r--  mm/compaction.c     25
-rw-r--r--  mm/fadvise.c        10
-rw-r--r--  mm/filemap.c        29
-rw-r--r--  mm/filemap_xip.c   477
-rw-r--r--  mm/gup.c            12
-rw-r--r--  mm/huge_memory.c    50
-rw-r--r--  mm/internal.h        6
-rw-r--r--  mm/iov_iter.c       17
-rw-r--r--  mm/kasan/Makefile    8
-rw-r--r--  mm/kasan/kasan.c   516
-rw-r--r--  mm/kasan/kasan.h    75
-rw-r--r--  mm/kasan/report.c  269
-rw-r--r--  mm/kmemleak.c        6
-rw-r--r--  mm/list_lru.c      467
-rw-r--r--  mm/madvise.c        19
-rw-r--r--  mm/memcontrol.c    188
-rw-r--r--  mm/memory-failure.c 13
-rw-r--r--  mm/memory.c         62
-rw-r--r--  mm/mempolicy.c       9
-rw-r--r--  mm/migrate.c         8
-rw-r--r--  mm/mm_init.c         4
-rw-r--r--  mm/mprotect.c       48
-rw-r--r--  mm/nommu.c          69
-rw-r--r--  mm/page-writeback.c 29
-rw-r--r--  mm/page_alloc.c     22
-rw-r--r--  mm/page_io.c         9
-rw-r--r--  mm/percpu.c          6
-rw-r--r--  mm/pgtable-generic.c 2
-rw-r--r--  mm/readahead.c       4
-rw-r--r--  mm/shmem.c          24
-rw-r--r--  mm/slab.c           17
-rw-r--r--  mm/slab.h           67
-rw-r--r--  mm/slab_common.c   214
-rw-r--r--  mm/slob.c            2
-rw-r--r--  mm/slub.c          195
-rw-r--r--  mm/swap.c            2
-rw-r--r--  mm/swap_state.c      6
-rw-r--r--  mm/truncate.c        2
-rw-r--r--  mm/util.c           38
-rw-r--r--  mm/vmalloc.c        16
-rw-r--r--  mm/vmscan.c         89
-rw-r--r--  mm/workingset.c      9
-rw-r--r--  mm/zbud.c            3
-rw-r--r--  mm/zpool.c           6
-rw-r--r--  mm/zsmalloc.c      239
-rw-r--r--  mm/zswap.c           5
49 files changed, 2395 insertions(+), 1130 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12869c8..de5239c152f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -602,6 +602,16 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in zsmalloc to collect various
+ statistics about what's happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
config GENERIC_EARLY_IOREMAP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 3548460ab7b6..3c1caa2693bd 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,6 +2,9 @@
# Makefile for the linux memory manager.
#
+KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slub.o := n
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -49,9 +52,9 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
-obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..6dc4580df2af 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,19 +14,10 @@
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-struct backing_dev_info default_backing_dev_info = {
- .name = "default",
- .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
- .state = 0,
- .capabilities = BDI_CAP_MAP_COPY,
-};
-EXPORT_SYMBOL_GPL(default_backing_dev_info);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -40,17 +31,6 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
-static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
-{
- if (wb1 < wb2) {
- spin_lock(&wb1->list_lock);
- spin_lock_nested(&wb2->list_lock, 1);
- } else {
- spin_lock(&wb2->list_lock);
- spin_lock_nested(&wb1->list_lock, 1);
- }
-}
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -69,10 +49,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
- nr_dirty = nr_io = nr_more_io = 0;
+ nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
@@ -80,6 +60,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
+ list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+ if (inode->i_state & I_DIRTY_TIME)
+ nr_dirty_time++;
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +81,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty: %10lu\n"
"b_io: %10lu\n"
"b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +95,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_dirty,
nr_io,
nr_more_io,
+ nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -264,9 +249,6 @@ static int __init default_bdi_init(void)
if (!bdi_wq)
return -ENOMEM;
- err = bdi_init(&default_backing_dev_info);
- if (!err)
- bdi_register(&default_backing_dev_info, NULL, "default");
err = bdi_init(&noop_backing_dev_info);
return err;
@@ -355,19 +337,19 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- if (!bdi_cap_writeback_dirty(bdi))
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&bdi->wb_lock);
+ if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
+ spin_unlock_bh(&bdi->wb_lock);
return;
+ }
+ spin_unlock_bh(&bdi->wb_lock);
/*
* Make sure nobody finds us on the bdi_list anymore
*/
bdi_remove_from_list(bdi);
- /* Make sure nobody queues further work */
- spin_lock_bh(&bdi->wb_lock);
- clear_bit(BDI_registered, &bdi->state);
- spin_unlock_bh(&bdi->wb_lock);
-
/*
* Drain work list and shutdown the delayed_work. At this point,
* @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
@@ -375,37 +357,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
*/
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
flush_delayed_work(&bdi->wb.dwork);
- WARN_ON(!list_empty(&bdi->work_list));
- WARN_ON(delayed_work_pending(&bdi->wb.dwork));
}
/*
- * This bdi is going away now, make sure that no super_blocks point to it
+ * Called when the device behind @bdi has been removed or ejected.
+ *
+ * We can't really do much here except for reducing the dirty ratio at
+ * the moment. In the future we should be able to set a flag so that
+ * the filesystem can handle errors at mark_inode_dirty time instead
+ * of only at writeback time.
*/
-static void bdi_prune_sb(struct backing_dev_info *bdi)
-{
- struct super_block *sb;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_bdi == bdi)
- sb->s_bdi = &default_backing_dev_info;
- }
- spin_unlock(&sb_lock);
-}
-
void bdi_unregister(struct backing_dev_info *bdi)
{
- if (bdi->dev) {
- bdi_set_min_ratio(bdi, 0);
- trace_writeback_bdi_unregister(bdi);
- bdi_prune_sb(bdi);
+ if (WARN_ON_ONCE(!bdi->dev))
+ return;
- bdi_wb_shutdown(bdi);
- bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
- bdi->dev = NULL;
- }
+ bdi_set_min_ratio(bdi, 0);
}
EXPORT_SYMBOL(bdi_unregister);
@@ -418,6 +385,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
@@ -474,37 +442,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
- /*
- * Splice our entries to the default_backing_dev_info. This
- * condition shouldn't happen. @wb must be empty at this point and
- * dirty inodes on it might cause other issues. This workaround is
- * added by ce5f8e779519 ("writeback: splice dirty inode entries to
- * default bdi on bdi_destroy()") without root-causing the issue.
- *
- * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
- * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
- *
- * We should probably add WARN_ON() to find out whether it still
- * happens and track it down if so.
- */
- if (bdi_has_dirty_io(bdi)) {
- struct bdi_writeback *dst = &default_backing_dev_info.wb;
-
- bdi_lock_two(&bdi->wb, dst);
- list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
- list_splice(&bdi->wb.b_io, &dst->b_io);
- list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&bdi->wb.list_lock);
- spin_unlock(&dst->list_lock);
- }
-
- bdi_unregister(bdi);
+ bdi_wb_shutdown(bdi);
+ WARN_ON(!list_empty(&bdi->work_list));
WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+ if (bdi->dev) {
+ bdi_debug_unregister(bdi);
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
-
fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
@@ -513,13 +463,12 @@ EXPORT_SYMBOL(bdi_destroy);
* For use from filesystems to quickly init and register a bdi associated
* with dirty writeback
*/
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
- unsigned int cap)
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
{
int err;
bdi->name = name;
- bdi->capabilities = cap;
+ bdi->capabilities = 0;
err = bdi_init(bdi);
if (err)
return err;
diff --git a/mm/compaction.c b/mm/compaction.c
index b68736c8a1ce..8c0d9459b54a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
+#include <linux/kasan.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -72,6 +73,7 @@ static void map_pages(struct list_head *list)
list_for_each_entry(page, list, lru) {
arch_alloc_page(page, 0);
kernel_map_pages(page, 1, 1);
+ kasan_alloc_pages(page, 0);
}
}
@@ -490,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* If a page was split, advance to the end of it */
if (isolated) {
+ cc->nr_freepages += isolated;
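+ /* Outside strict mode, stop once enough freepages are isolated */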
+ if (!strict &&
+ cc->nr_migratepages <= cc->nr_freepages) {
+ blockpfn += isolated;
+ break;
+ }
+
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;
@@ -899,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
- int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
@@ -924,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc)
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+ for (; block_start_pfn >= low_pfn &&
+ cc->nr_migratepages > cc->nr_freepages;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
- unsigned long isolated;
/*
* This can iterate a massively long zone without finding any
@@ -953,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ isolate_freepages_block(cc, &isolate_start_pfn,
block_end_pfn, freelist, false);
- nr_freepages += isolated;
/*
* Remember where the free scanner should restart next time,
@@ -987,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc)
*/
if (block_start_pfn < low_pfn)
cc->free_pfn = cc->migrate_pfn;
-
- cc->nr_freepages = nr_freepages;
}
/*
@@ -1100,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn || cc->contended) {
+ acct_isolated(zone, cc);
return ISOLATE_ABORT;
+ }
/*
* Either we isolated something and proceed with migration. Or
@@ -1173,7 +1180,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
- if (cc->order >= pageblock_order && area->nr_free)
+ if (order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..4a3907cf79f8 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -28,6 +28,7 @@
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
{
struct fd f = fdget(fd);
+ struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
@@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
if (!f.file)
return -EBADF;
- if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+ inode = file_inode(f.file);
+ if (S_ISFIFO(inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
@@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
goto out;
}
- if (mapping->a_ops->get_xip_mem) {
+ if (IS_DAX(inode)) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
@@ -73,7 +75,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
else
endbyte--; /* inclusive */
- bdi = mapping->backing_dev_info;
+ bdi = inode_to_bdi(mapping->host);
switch (advice) {
case POSIX_FADV_NORMAL:
@@ -113,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!bdi_write_congested(mapping->backing_dev_info))
+ if (!bdi_write_congested(bdi))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
diff --git a/mm/filemap.c b/mm/filemap.c
index bf7a27142704..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
}
}
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (file->f_flags & O_DIRECT) {
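+ /* io_is_direct() covers both O_DIRECT files and DAX-backed inodes */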
+ if (io_is_direct(file)) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
@@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
- * the rest of the read.
+ * the rest of the read. Buffered reads will not work for
+ * DAX files, so don't bother trying.
*/
- if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+ if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
+ IS_DAX(inode)) {
file_accessed(file);
goto out;
}
@@ -2564,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
@@ -2582,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (unlikely(file->f_flags & O_DIRECT)) {
+ if (io_is_direct(file)) {
loff_t endbyte;
written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
- goto out;
-
/*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
+ * If the write stopped short of completing, fall back to
+ * buffered writes. Some filesystems do this for writes to
+ * holes, for example. For DAX files, a buffered write will
+ * not succeed (even if it did, DAX does not handle dirty
+ * page-cache pages correctly).
*/
+ if (written < 0 || written == count || IS_DAX(inode))
+ goto out;
+
pos += written;
count -= written;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644
index 70c09da1a419..000000000000
--- a/mm/filemap_xip.c
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * linux/mm/filemap_xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte <cotte@de.ibm.com>
- *
- * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
- *
- */
-
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/rmap.h>
-#include <linux/mmu_notifier.h>
-#include <linux/sched.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-/*
- * We do use our own empty page to avoid interference with other users
- * of ZERO_PAGE(), such as /dev/zero
- */
-static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
-static struct page *__xip_sparse_page;
-
-/* called under xip_sparse_mutex */
-static struct page *xip_sparse_page(void)
-{
- if (!__xip_sparse_page) {
- struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
-
- if (page)
- __xip_sparse_page = page;
- }
- return __xip_sparse_page;
-}
-
-/*
- * This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_mem() function for the actual low-level
- * stuff.
- *
- * Note the struct file* is not used at all. It may be NULL.
- */
-static ssize_t
-do_xip_mapping_read(struct address_space *mapping,
- struct file_ra_state *_ra,
- struct file *filp,
- char __user *buf,
- size_t len,
- loff_t *ppos)
-{
- struct inode *inode = mapping->host;
- pgoff_t index, end_index;
- unsigned long offset;
- loff_t isize, pos;
- size_t copied = 0, error = 0;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- pos = *ppos;
- index = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
-
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- do {
- unsigned long nr, left;
- void *xip_mem;
- unsigned long xip_pfn;
- int zero = 0;
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index >= end_index) {
- if (index > end_index)
- goto out;
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- goto out;
- }
- }
- nr = nr - offset;
- if (nr > len - copied)
- nr = len - copied;
-
- error = mapping->a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(error)) {
- if (error == -ENODATA) {
- /* sparse */
- zero = 1;
- } else
- goto out;
- }
-
- /* If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(mapping))
- /* address based flush */ ;
-
- /*
- * Ok, we have the mem, so now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
- */
- if (!zero)
- left = __copy_to_user(buf+copied, xip_mem+offset, nr);
- else
- left = __clear_user(buf + copied, nr);
-
- if (left) {
- error = -EFAULT;
- goto out;
- }
-
- copied += (nr - left);
- offset += (nr - left);
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
- } while (copied < len);
-
-out:
- *ppos = pos + copied;
- if (filp)
- file_accessed(filp);
-
- return (copied ? copied : error);
-}
-
-ssize_t
-xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
- if (!access_ok(VERIFY_WRITE, buf, len))
- return -EFAULT;
-
- return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
- buf, len, ppos);
-}
-EXPORT_SYMBOL_GPL(xip_file_read);
-
-/*
- * __xip_unmap is invoked from xip_unmap and xip_write
- *
- * This function walks all vmas of the address_space and unmaps the
- * __xip_sparse_page when found at pgoff.
- */
-static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
-{
- struct vm_area_struct *vma;
- struct page *page;
- unsigned count;
- int locked = 0;
-
- count = read_seqcount_begin(&xip_sparse_seq);
-
- page = __xip_sparse_page;
- if (!page)
- return;
-
-retry:
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- pte_t *pte, pteval;
- spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long address = vma->vm_start +
- ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- pte = page_check_address(page, mm, address, &ptl, 1);
- if (pte) {
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
- page_remove_rmap(page);
- dec_mm_counter(mm, MM_FILEPAGES);
- BUG_ON(pte_dirty(pteval));
- pte_unmap_unlock(pte, ptl);
- /* must invalidate_page _before_ freeing the page */
- mmu_notifier_invalidate_page(mm, address);
- page_cache_release(page);
- }
- }
- i_mmap_unlock_read(mapping);
-
- if (locked) {
- mutex_unlock(&xip_sparse_mutex);
- } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
- mutex_lock(&xip_sparse_mutex);
- locked = 1;
- goto retry;
- }
-}
-
-/*
- * xip_fault() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * This function is derived from filemap_fault, but used for execute in place
- */
-static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- pgoff_t size;
- void *xip_mem;
- unsigned long xip_pfn;
- struct page *page;
- int error;
-
- /* XXX: are VM_FAULT_ codes OK? */
-again:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (vmf->pgoff >= size)
- return VM_FAULT_SIGBUS;
-
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
- &xip_mem, &xip_pfn);
- if (likely(!error))
- goto found;
- if (error != -ENODATA)
- return VM_FAULT_OOM;
-
- /* sparse block */
- if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
- (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
- (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
- int err;
-
- /* maybe shared writable, allocate new block */
- mutex_lock(&xip_sparse_mutex);
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
- &xip_mem, &xip_pfn);
- mutex_unlock(&xip_sparse_mutex);
- if (error)
- return VM_FAULT_SIGBUS;
- /* unmap sparse mappings at pgoff from all other vmas */
- __xip_unmap(mapping, vmf->pgoff);
-
-found:
- err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
- xip_pfn);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- /*
- * err == -EBUSY is fine, we've raced against another thread
- * that faulted-in the same page
- */
- if (err != -EBUSY)
- BUG_ON(err);
- return VM_FAULT_NOPAGE;
- } else {
- int err, ret = VM_FAULT_OOM;
-
- mutex_lock(&xip_sparse_mutex);
- write_seqcount_begin(&xip_sparse_seq);
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(!error)) {
- write_seqcount_end(&xip_sparse_seq);
- mutex_unlock(&xip_sparse_mutex);
- goto again;
- }
- if (error != -ENODATA)
- goto out;
- /* not shared and writable, use xip_sparse_page() */
- page = xip_sparse_page();
- if (!page)
- goto out;
- err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
- page);
- if (err == -ENOMEM)
- goto out;
-
- ret = VM_FAULT_NOPAGE;
-out:
- write_seqcount_end(&xip_sparse_seq);
- mutex_unlock(&xip_sparse_mutex);
-
- return ret;
- }
-}
-
-static const struct vm_operations_struct xip_file_vm_ops = {
- .fault = xip_file_fault,
- .page_mkwrite = filemap_page_mkwrite,
-};
-
-int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
- BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
-
- file_accessed(file);
- vma->vm_ops = &xip_file_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
- return 0;
-}
-EXPORT_SYMBOL_GPL(xip_file_mmap);
-
-static ssize_t
-__xip_file_write(struct file *filp, const char __user *buf,
- size_t count, loff_t pos, loff_t *ppos)
-{
- struct address_space * mapping = filp->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- size_t bytes;
- ssize_t written = 0;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
- void *xip_mem;
- unsigned long xip_pfn;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (status == -ENODATA) {
- /* we allocate a new page unmap it */
- mutex_lock(&xip_sparse_mutex);
- status = a_ops->get_xip_mem(mapping, index, 1,
- &xip_mem, &xip_pfn);
- mutex_unlock(&xip_sparse_mutex);
- if (!status)
- /* unmap page at pgoff from all other vmas */
- __xip_unmap(mapping, index);
- }
-
- if (status)
- break;
-
- copied = bytes -
- __copy_from_user_nocache(xip_mem + offset, buf, bytes);
-
- if (likely(copied > 0)) {
- status = copied;
-
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- if (status < 0)
- break;
- } while (count);
- *ppos = pos;
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- */
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
-
- return written ? written : status;
-}
-
-ssize_t
-xip_file_write(struct file *filp, const char __user *buf, size_t len,
- loff_t *ppos)
-{
- struct address_space *mapping = filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count;
- loff_t pos;
- ssize_t ret;
-
- mutex_lock(&inode->i_mutex);
-
- if (!access_ok(VERIFY_READ, buf, len)) {
- ret=-EFAULT;
- goto out_up;
- }
-
- pos = *ppos;
- count = len;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
-
- ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
- if (ret)
- goto out_backing;
- if (count == 0)
- goto out_backing;
-
- ret = file_remove_suid(filp);
- if (ret)
- goto out_backing;
-
- ret = file_update_time(filp);
- if (ret)
- goto out_backing;
-
- ret = __xip_file_write (filp, buf, count, pos, ppos);
-
- out_backing:
- current->backing_dev_info = NULL;
- out_up:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_write);
-
-/*
- * truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_mem
- * to get the page instead of page cache
- */
-int
-xip_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize;
- unsigned length;
- void *xip_mem;
- unsigned long xip_pfn;
- int err;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- blocksize = 1 << mapping->host->i_blkbits;
- length = offset & (blocksize - 1);
-
- /* Block boundary? Nothing to do */
- if (!length)
- return 0;
-
- length = blocksize - length;
-
- err = mapping->a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(err)) {
- if (err == -ENODATA)
- /* Hole? No need to truncate */
- return 0;
- else
- return err;
- }
- memset(xip_mem + offset, 0, length);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/gup.c b/mm/gup.c
index c2da1163986a..a6e24e246f86 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -64,7 +64,7 @@ retry:
migration_entry_wait(mm, pmd, address);
goto retry;
}
- if ((flags & FOLL_NUMA) && pte_numa(pte))
+ if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte)) {
pte_unmap_unlock(ptep, ptl);
@@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
@@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
/*
* Similar to the PMD case below, NUMA hinting must take slow
- * path
+ * path using the pte_protnone check.
*/
if (!pte_present(pte) || pte_special(pte) ||
- pte_numa(pte) || (write && !pte_write(pte)))
+ pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1092,7 +1092,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmdp = pmd_offset(&pud, addr);
do {
- pmd_t pmd = ACCESS_ONCE(*pmdp);
+ pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
@@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
- if (pmd_numa(pmd))
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb7be110cad3..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
page = pmd_page(*pmd);
@@ -1262,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
@@ -1272,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
+ page = pmd_page(*pmdp);
spin_unlock(ptl);
- wait_migrate_huge_page(vma->anon_vma, pmdp);
+ wait_on_page_locked(page);
goto out;
}
@@ -1341,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Migrate the THP to the requested node, returns with page unlocked
- * and pmd_numa cleared.
+ * and access rights restored.
*/
spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1354,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
- pmd = pmd_mknonnuma(pmd);
+ pmd = pmd_modify(pmd, vma->vm_page_prot);
set_pmd_at(mm, haddr, pmdp, pmd);
- VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out_unlock:
@@ -1479,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
- ret = 1;
- if (!prot_numa) {
+
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!prot_numa || !pmd_protnone(*pmd)) {
+ ret = 1;
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
- } else {
- struct page *page = pmd_page(*pmd);
-
- /*
- * Do not trap faults against the zero page. The
- * read-only data is likely to be read-cached on the
- * local CPU cache and it is less useful to know about
- * local vs remote hits on the zero page.
- */
- if (!is_huge_zero_page(page) &&
- !pmd_numa(*pmd)) {
- pmdp_set_numa(mm, addr, pmd);
- ret = HPAGE_PMD_NR;
- }
}
spin_unlock(ptl);
}
@@ -1766,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
/*
- * Note that pmd_numa is not transferred deliberately
- * to avoid any possibility that pte_numa leaks to
- * a PROT_NONE VMA by accident.
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
*/
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
diff --git a/mm/internal.h b/mm/internal.h
index c4d6c9b43491..a96da5b0029d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -351,8 +351,10 @@ extern int mminit_loglevel;
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
if (level < mminit_loglevel) { \
- printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
- printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ if (level <= MMINIT_WARNING) \
+ printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ else \
+ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
} while (0)
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index a1599ca4ab0e..827732047da1 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
EXPORT_SYMBOL(iov_iter_single_seg_count);
void iov_iter_kvec(struct iov_iter *i, int direction,
- const struct kvec *iov, unsigned long nr_segs,
+ const struct kvec *kvec, unsigned long nr_segs,
size_t count)
{
BUG_ON(!(direction & ITER_KVEC));
i->type = direction;
- i->kvec = (struct kvec *)iov;
+ i->kvec = kvec;
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count;
}
EXPORT_SYMBOL(iov_iter_kvec);
+void iov_iter_bvec(struct iov_iter *i, int direction,
+ const struct bio_vec *bvec, unsigned long nr_segs,
+ size_t count)
+{
+ BUG_ON(!(direction & ITER_BVEC));
+ i->type = direction;
+ i->bvec = bvec;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_bvec);
+
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
unsigned long res = 0;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
new file mode 100644
index 000000000000..bd837b8c2f41
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,8 @@
+KASAN_SANITIZE := n
+
+CFLAGS_REMOVE_kasan.o = -pg
+# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
+# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-y := kasan.o report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
new file mode 100644
index 000000000000..78fee632a7ee
--- /dev/null
+++ b/mm/kasan/kasan.c
@@ -0,0 +1,516 @@
+/*
+ * This file contains shadow memory manipulation code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some code is borrowed from https://github.com/xairy/linux by
+ * Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+static void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+ void *shadow_start, *shadow_end;
+
+ shadow_start = kasan_mem_to_shadow(address);
+ shadow_end = kasan_mem_to_shadow(address + size);
+
+ memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+ kasan_poison_shadow(address, size, 0);
+
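+ /*
+ * A trailing partial granule is encoded by storing the number of
+ * accessible bytes (1..KASAN_SHADOW_MASK) in its shadow byte.
+ */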
+ if (size & KASAN_SHADOW_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+ *shadow = size & KASAN_SHADOW_MASK;
+ }
+}
+
+
+/*
+ * All functions below are always inlined so the compiler can better
+ * optimize each of __asan_loadX/__asan_storeX depending on the
+ * memory access size X.
+ */
+
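+/*
+ * A shadow byte of 0 means the whole granule (KASAN_SHADOW_SCALE_SIZE
+ * bytes) is accessible; a value of 1..KASAN_SHADOW_MASK means only that
+ * many leading bytes are, so the access offset within the granule is
+ * compared against the shadow value.
+ */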
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(shadow_value)) {
+ s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ return unlikely(last_accessible_byte >= shadow_value);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ if (memory_is_poisoned_1(addr + 1))
+ return true;
+
+ if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
+ return false;
+
+ return unlikely(*(u8 *)shadow_addr);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_4(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ if (memory_is_poisoned_1(addr + 3))
+ return true;
+
+ if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
+ return false;
+
+ return unlikely(*(u8 *)shadow_addr);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_8(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ if (memory_is_poisoned_1(addr + 7))
+ return true;
+
+ if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+ return false;
+
+ return unlikely(*(u8 *)shadow_addr);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+ u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ u16 shadow_first_bytes = *(u16 *)shadow_addr;
+ s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
+
+ if (unlikely(shadow_first_bytes))
+ return true;
+
+ if (likely(!last_byte))
+ return false;
+
+ return memory_is_poisoned_1(addr + 15);
+ }
+
+ return false;
+}
+
+static __always_inline unsigned long bytes_is_zero(const u8 *start,
+ size_t size)
+{
+ while (size) {
+ if (unlikely(*start))
+ return (unsigned long)start;
+ start++;
+ size--;
+ }
+
+ return 0;
+}
+
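+/*
+ * Return the address of the first non-zero byte in [start, end), or 0 if
+ * all bytes are zero: the unaligned prefix and tail are checked one byte
+ * at a time, the aligned middle in 8-byte words.
+ */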
+static __always_inline unsigned long memory_is_zero(const void *start,
+ const void *end)
+{
+ unsigned int words;
+ unsigned long ret;
+ unsigned int prefix = (unsigned long)start % 8;
+
+ if (end - start <= 16)
+ return bytes_is_zero(start, end - start);
+
+ if (prefix) {
+ prefix = 8 - prefix;
+ ret = bytes_is_zero(start, prefix);
+ if (unlikely(ret))
+ return ret;
+ start += prefix;
+ }
+
+ words = (end - start) / 8;
+ while (words) {
+ if (unlikely(*(u64 *)start))
+ return bytes_is_zero(start, 8);
+ start += 8;
+ words--;
+ }
+
+ return bytes_is_zero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+ size_t size)
+{
+ unsigned long ret;
+
+ ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+ kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+ if (unlikely(ret)) {
+ unsigned long last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+ if (unlikely(ret != (unsigned long)last_shadow ||
+ ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+ if (__builtin_constant_p(size)) {
+ switch (size) {
+ case 1:
+ return memory_is_poisoned_1(addr);
+ case 2:
+ return memory_is_poisoned_2(addr);
+ case 4:
+ return memory_is_poisoned_4(addr);
+ case 8:
+ return memory_is_poisoned_8(addr);
+ case 16:
+ return memory_is_poisoned_16(addr);
+ default:
+ BUILD_BUG();
+ }
+ }
+
+ return memory_is_poisoned_n(addr, size);
+}
+
+
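+/*
+ * Common slow path for all instrumented accesses: addresses below the
+ * shadowed range are reported as user-memory accesses; otherwise the
+ * shadow is consulted and a report is emitted if any byte of the access
+ * is poisoned.
+ */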
+static __always_inline void check_memory_region(unsigned long addr,
+ size_t size, bool write)
+{
+ struct kasan_access_info info;
+
+ if (unlikely(size == 0))
+ return;
+
+ if (unlikely((void *)addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ info.access_addr = (void *)addr;
+ info.access_size = size;
+ info.is_write = write;
+ info.ip = _RET_IP_;
+ kasan_report_user_access(&info);
+ return;
+ }
+
+ if (likely(!memory_is_poisoned(addr, size)))
+ return;
+
+ kasan_report(addr, size, write, _RET_IP_);
+}
+
+void __asan_loadN(unsigned long addr, size_t size);
+void __asan_storeN(unsigned long addr, size_t size);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ __asan_storeN((unsigned long)addr, len);
+
+ return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ __asan_loadN((unsigned long)src, len);
+ __asan_storeN((unsigned long)dest, len);
+
+ return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ __asan_loadN((unsigned long)src, len);
+ __asan_storeN((unsigned long)dest, len);
+
+ return __memcpy(dest, src, len);
+}
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+ if (likely(!PageHighMem(page)))
+ kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+ if (likely(!PageHighMem(page)))
+ kasan_poison_shadow(page_address(page),
+ PAGE_SIZE << order,
+ KASAN_FREE_PAGE);
+}
+
+void kasan_poison_slab(struct page *page)
+{
+ kasan_poison_shadow(page_address(page),
+ PAGE_SIZE << compound_order(page),
+ KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_poison_shadow(object,
+ round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+ KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+{
+ kasan_kmalloc(cache, object, cache->object_size);
+}
+
+void kasan_slab_free(struct kmem_cache *cache, void *object)
+{
+ unsigned long size = cache->object_size;
+ unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+ return;
+
+ kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
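+/*
+ * Worked example, assuming an 8-byte shadow granule: kasan_kmalloc() for
+ * a 13-byte allocation in a 32-byte cache unpoisons bytes 0..12 (shadow
+ * bytes 0 and 5) and poisons bytes 16..31 as KASAN_KMALLOC_REDZONE.
+ */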
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+{
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (unlikely(object == NULL))
+ return;
+
+ redzone_start = round_up((unsigned long)(object + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = round_up((unsigned long)object + cache->object_size,
+ KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(object, size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
+
+void kasan_kmalloc_large(const void *ptr, size_t size)
+{
+ struct page *page;
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (unlikely(ptr == NULL))
+ return;
+
+ page = virt_to_page(ptr);
+ redzone_start = round_up((unsigned long)(ptr + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+
+ kasan_unpoison_shadow(ptr, size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE);
+}
+
+void kasan_krealloc(const void *object, size_t size)
+{
+ struct page *page;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return;
+
+ page = virt_to_head_page(object);
+
+ if (unlikely(!PageSlab(page)))
+ kasan_kmalloc_large(object, size);
+ else
+ kasan_kmalloc(page->slab_cache, object, size);
+}
+
+void kasan_kfree_large(const void *ptr)
+{
+ struct page *page = virt_to_page(ptr);
+
+ kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+ KASAN_FREE_PAGE);
+}
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+ void *ret;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
+ PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ return ret ? 0 : -ENOMEM;
+}
+
+void kasan_module_free(void *addr)
+{
+ vfree(kasan_mem_to_shadow(addr));
+}
+
+static void register_global(struct kasan_global *global)
+{
+ size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(global->beg, global->size);
+
+ kasan_poison_shadow(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
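+/*
+ * Generate the fixed-size __asan_loadN/__asan_storeN entry points emitted
+ * by compiler instrumentation; the *_noabort variants are plain aliases,
+ * since the kernel reports but does not abort.
+ */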
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, false); \
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, true); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, false);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, true);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ pr_err("WARNING: KASan doesn't support memory hot-add\n");
+ pr_err("Memory hot-add will be disabled\n");
+
+ hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+ return 0;
+}
+
+module_init(kasan_memhotplug_init);
+#endif
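The shadow checks above rely on kasan_mem_to_shadow(), which is declared
outside this diff in include/linux/kasan.h. A minimal sketch of the affine
mapping it is expected to implement (KASAN_SHADOW_OFFSET is the
arch-provided constant, and this is the inverse of kasan_shadow_to_mem()
added in kasan.h below):

static inline void *kasan_mem_to_shadow(const void *addr)
{
	/* One shadow byte tracks KASAN_SHADOW_SCALE_SIZE bytes of memory. */
	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
		+ KASAN_SHADOW_OFFSET;
}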
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
new file mode 100644
index 000000000000..4986b0acab21
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,75 @@
+#ifndef __MM_KASAN_KASAN_H
+#define __MM_KASAN_KASAN_H
+
+#include <linux/kasan.h>
+
+#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+
+#define KASAN_FREE_PAGE 0xFF /* page was freed */
+#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
+#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
+#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
+
+/*
+ * Stack redzone shadow values
+ * (These are part of the compiler's ABI; don't change them.)
+ */
+#define KASAN_STACK_LEFT 0xF1
+#define KASAN_STACK_MID 0xF2
+#define KASAN_STACK_RIGHT 0xF3
+#define KASAN_STACK_PARTIAL 0xF4
+
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
+
+struct kasan_access_info {
+ const void *access_addr;
+ const void *first_bad_addr;
+ size_t access_size;
+ bool is_write;
+ unsigned long ip;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_source_location {
+ const char *filename;
+ int line_no;
+ int column_no;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_global {
+ const void *beg; /* Address of the beginning of the global variable. */
+ size_t size; /* Size of the global variable. */
+ size_t size_with_redzone; /* Size of the variable + size of the red zone; 32-byte aligned. */
+ const void *name;
+ const void *module_name; /* Name of the module where the global variable is declared. */
+ unsigned long has_dynamic_init; /* This is needed for C++. */
+#if KASAN_ABI_VERSION >= 4
+ struct kasan_source_location *location;
+#endif
+};
+
+void kasan_report_error(struct kasan_access_info *info);
+void kasan_report_user_access(struct kasan_access_info *info);
+
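+/* Inverse of kasan_mem_to_shadow(): the first byte a shadow byte covers */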
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+ return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
+ << KASAN_SHADOW_SCALE_SHIFT);
+}
+
+static inline bool kasan_enabled(void)
+{
+ return !current->kasan_depth;
+}
+
+void kasan_report(unsigned long addr, size_t size,
+ bool is_write, unsigned long ip);
+
+#endif
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
new file mode 100644
index 000000000000..680ceedf810a
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,269 @@
+/*
+ * This file contains error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some code is borrowed from https://github.com/xairy/linux by
+ * Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/* Shadow layout customization. */
+#define SHADOW_BYTES_PER_BLOCK 1
+#define SHADOW_BLOCKS_PER_ROW 16
+#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
+#define SHADOW_ROWS_AROUND_ADDR 2
+
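+/* Walk the shadow a granule at a time to find the first poisoned byte */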
+static const void *find_first_bad_addr(const void *addr, size_t size)
+{
+ u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
+ const void *first_bad_addr = addr;
+
+ while (!shadow_val && first_bad_addr < addr + size) {
+ first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
+ shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
+ }
+ return first_bad_addr;
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown crash";
+ u8 shadow_val;
+
+ info->first_bad_addr = find_first_bad_addr(info->access_addr,
+ info->access_size);
+
+ shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ switch (shadow_val) {
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ bug_type = "use after free";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_KMALLOC_REDZONE:
+ case KASAN_GLOBAL_REDZONE:
+ case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+ bug_type = "out of bounds access";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "out of bounds on stack";
+ break;
+ }
+
+ pr_err("BUG: KASan: %s in %pS at addr %p\n",
+ bug_type, (void *)info->ip,
+ info->access_addr);
+ pr_err("%s of size %zu by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_size, current->comm, task_pid_nr(current));
+}
+
+static inline bool kernel_or_module_addr(const void *addr)
+{
+ return (addr >= (void *)_stext && addr < (void *)_end)
+ || (addr >= (void *)MODULES_VADDR
+ && addr < (void *)MODULES_END);
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+ return addr >= (void *)&init_thread_union.stack &&
+ (addr <= (void *)&init_thread_union.stack +
+ sizeof(init_thread_union.stack));
+}
+
+static void print_address_description(struct kasan_access_info *info)
+{
+ const void *addr = info->access_addr;
+
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory)) {
+ struct page *page = virt_to_head_page(addr);
+
+ if (PageSlab(page)) {
+ void *object;
+ struct kmem_cache *cache = page->slab_cache;
+ void *last_object;
+
+ object = virt_to_obj(cache, page_address(page), addr);
+ last_object = page_address(page) +
+ page->objects * cache->size;
+
+ if (unlikely(object > last_object))
+ object = last_object; /* we hit into padding */
+
+ object_err(cache, page, object,
+ "kasan: bad access detected");
+ return;
+ }
+ dump_page(page, "kasan: bad access detected");
+ }
+
+ if (kernel_or_module_addr(addr)) {
+ if (!init_task_stack_addr(addr))
+ pr_err("Address belongs to variable %pS\n", addr);
+ }
+
+ dump_stack();
+}
+
+static bool row_is_guilty(const void *row, const void *guilty)
+{
+ return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+}
+
+static int shadow_pointer_offset(const void *row, const void *shadow)
+{
+ /* The length of ">ff00ff00ff00ff00: " is
+ * 3 + (BITS_PER_LONG/8)*2 chars.
+ */
+ return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
+ (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+}
+
+static void print_shadow_for_address(const void *addr)
+{
+ int i;
+ const void *shadow = kasan_mem_to_shadow(addr);
+ const void *shadow_row;
+
+ shadow_row = (void *)round_down((unsigned long)shadow,
+ SHADOW_BYTES_PER_ROW)
+ - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+
+ pr_err("Memory state around the buggy address:\n");
+
+ for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
+ const void *kaddr = kasan_shadow_to_mem(shadow_row);
+ char buffer[4 + (BITS_PER_LONG/8)*2];
+
+ snprintf(buffer, sizeof(buffer),
+ (i == 0) ? ">%p: " : " %p: ", kaddr);
+
+ kasan_disable_current();
+ print_hex_dump(KERN_ERR, buffer,
+ DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
+ shadow_row, SHADOW_BYTES_PER_ROW, 0);
+ kasan_enable_current();
+
+ if (row_is_guilty(shadow_row, shadow))
+ pr_err("%*c\n",
+ shadow_pointer_offset(shadow_row, shadow),
+ '^');
+
+ shadow_row += SHADOW_BYTES_PER_ROW;
+ }
+}
+
+static DEFINE_SPINLOCK(report_lock);
+
+void kasan_report_error(struct kasan_access_info *info)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&report_lock, flags);
+ pr_err("================================="
+ "=================================\n");
+ print_error_description(info);
+ print_address_description(info);
+ print_shadow_for_address(info->first_bad_addr);
+ pr_err("================================="
+ "=================================\n");
+ spin_unlock_irqrestore(&report_lock, flags);
+}
+
+void kasan_report_user_access(struct kasan_access_info *info)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&report_lock, flags);
+ pr_err("================================="
+ "=================================\n");
+ pr_err("BUG: KASan: user-memory-access on address %p\n",
+ info->access_addr);
+ pr_err("%s of size %zu by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_size, current->comm, task_pid_nr(current));
+ dump_stack();
+ pr_err("================================="
+ "=================================\n");
+ spin_unlock_irqrestore(&report_lock, flags);
+}
+
+void kasan_report(unsigned long addr, size_t size,
+ bool is_write, unsigned long ip)
+{
+ struct kasan_access_info info;
+
+ if (likely(!kasan_enabled()))
+ return;
+
+ info.access_addr = (void *)addr;
+ info.access_size = size;
+ info.is_write = is_write;
+ info.ip = ip;
+ kasan_report_error(&info);
+}
+
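+/*
+ * Out-of-line report callbacks that the compiler's instrumentation emits
+ * calls to, one load/store pair per access size.
+ */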
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3cda50c1e394..5405aff5a590 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -98,6 +98,7 @@
#include <asm/processor.h>
#include <linux/atomic.h>
+#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
#include <linux/memory_hotplug.h>
@@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object)
if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
return false;
+ kasan_disable_current();
object->checksum = crc32(0, (void *)object->pointer, object->size);
+ kasan_enable_current();
+
return object->checksum != old_csum;
}
@@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end,
BYTES_PER_POINTER))
continue;
+ kasan_disable_current();
pointer = *ptr;
+ kasan_enable_current();
object = find_and_get_object(pointer, 1);
if (!object)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db194173..909eca2c820e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_MEMCG_KMEM
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
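+/* Every lru is registered so memcg can later resize or drain its arrays. */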
+static void list_lru_register(struct list_lru *lru)
+{
+ mutex_lock(&list_lrus_mutex);
+ list_add(&lru->list, &list_lrus);
+ mutex_unlock(&list_lrus_mutex);
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+ mutex_lock(&list_lrus_mutex);
+ list_del(&lru->list);
+ mutex_unlock(&list_lrus_mutex);
+}
+#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return !!lru->node[0].memcg_lrus;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ /*
+ * The lock protects the array of per cgroup lists from relocation
+ * (see memcg_update_list_lru_node).
+ */
+ lockdep_assert_held(&nlru->lock);
+ if (nlru->memcg_lrus && idx >= 0)
+ return nlru->memcg_lrus->lru[idx];
+
+ return &nlru->lru;
+}
+
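+/* Pick the list @ptr belongs to, based on the memcg the object is charged to. */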
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+ struct mem_cgroup *memcg;
+
+ if (!nlru->memcg_lrus)
+ return &nlru->lru;
+
+ memcg = mem_cgroup_from_kmem(ptr);
+ if (!memcg)
+ return &nlru->lru;
+
+ return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+}
+#else
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return false;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+ return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l = list_lru_from_kmem(nlru, item);
if (list_empty(item)) {
- list_add_tail(item, &nlru->list);
- if (nlru->nr_items++ == 0)
- node_set(nid, lru->active_nodes);
+ list_add_tail(item, &l->list);
+ l->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
spin_lock(&nlru->lock);
+ l = list_lru_from_kmem(nlru, item);
if (!list_empty(item)) {
list_del_init(item);
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l->nr_items--;
spin_unlock(&nlru->lock);
return true;
}
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
}
EXPORT_SYMBOL_GPL(list_lru_del);
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
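+/* Both isolate helpers expect the node's lru lock to be held by the caller. */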
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+ list_del_init(item);
+ list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+ struct list_head *head)
+{
+ list_move(item, head);
+ list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
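+/* Count items on one node for the given memcg index; -1 means the global list. */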
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+ int nid, int memcg_idx)
{
- unsigned long count = 0;
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ unsigned long count;
spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
- count += nlru->nr_items;
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
+ count = l->nr_items;
spin_unlock(&nlru->lock);
return count;
}
+
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
+{
+ return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+ long count = 0;
+ int memcg_idx;
+
+ count += __list_lru_count_one(lru, nid, -1);
+ if (list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx)
+ count += __list_lru_count_one(lru, nid, memcg_idx);
+ }
+ return count;
+}
EXPORT_SYMBOL_GPL(list_lru_count_node);
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
- void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
spin_lock(&nlru->lock);
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
- list_for_each_safe(item, n, &nlru->list) {
+ list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
/*
@@ -85,14 +206,11 @@ restart:
break;
--*nr_to_walk;
- ret = isolate(item, &nlru->lock, cb_arg);
+ ret = isolate(item, l, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
/*
* If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
goto restart;
break;
case LRU_ROTATE:
- list_move_tail(item, &nlru->list);
+ list_move_tail(item, &l->list);
break;
case LRU_SKIP:
break;
@@ -122,31 +240,322 @@ restart:
spin_unlock(&nlru->lock);
return isolated;
}
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
+ isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ long isolated = 0;
+ int memcg_idx;
+
+ isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+ nr_to_walk);
+ if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx) {
+ isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+ isolate, cb_arg, nr_to_walk);
+ if (*nr_to_walk <= 0)
+ break;
+ }
+ }
+ return isolated;
+}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+static void init_one_lru(struct list_lru_one *l)
+{
+ INIT_LIST_HEAD(&l->list);
+ l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+ int begin, int end)
+{
+ int i;
+
+ for (i = begin; i < end; i++)
+ kfree(memcg_lrus->lru[i]);
+}
+
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+ int begin, int end)
+{
+ int i;
+
+ for (i = begin; i < end; i++) {
+ struct list_lru_one *l;
+
+ l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+ if (!l)
+ goto fail;
+
+ init_one_lru(l);
+ memcg_lrus->lru[i] = l;
+ }
+ return 0;
+fail:
+	/* slot i failed to allocate; free the begin..i-1 slots we did get */
+	__memcg_destroy_list_lru_node(memcg_lrus, begin, i);
+ return -ENOMEM;
+}
+
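+/* Allocate and populate the per-memcg lists array for one node. */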
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+ int size = memcg_nr_cache_ids;
+
+ nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+ if (!nlru->memcg_lrus)
+ return -ENOMEM;
+
+ if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
+ kfree(nlru->memcg_lrus);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+ __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
+ kfree(nlru->memcg_lrus);
+}
+
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+ int old_size, int new_size)
+{
+ struct list_lru_memcg *old, *new;
+
+ BUG_ON(old_size > new_size);
+
+ old = nlru->memcg_lrus;
+ new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (__memcg_init_list_lru_node(new, old_size, new_size)) {
+ kfree(new);
+ return -ENOMEM;
+ }
+
+ memcpy(new, old, old_size * sizeof(void *));
+
+ /*
+ * The lock guarantees that we won't race with a reader
+ * (see list_lru_from_memcg_idx).
+ *
+ * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+ * we have to use IRQ-safe primitives here to avoid deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+ nlru->memcg_lrus = new;
+ spin_unlock_irq(&nlru->lock);
+
+ kfree(old);
+ return 0;
+}
+
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+ int old_size, int new_size)
+{
+	/*
+	 * Do not bother shrinking the array back to the old size, because we
+	 * cannot handle allocation failures here.
+	 */
+ __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+}
+
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ if (!memcg_aware)
+ lru->node[i].memcg_lrus = NULL;
+ else if (memcg_init_list_lru_node(&lru->node[i]))
+ goto fail;
+ }
+ return 0;
+fail:
+ for (i = i - 1; i >= 0; i--)
+ memcg_destroy_list_lru_node(&lru->node[i]);
+ return -ENOMEM;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for (i = 0; i < nr_node_ids; i++)
+ memcg_destroy_list_lru_node(&lru->node[i]);
+}
+
+static int memcg_update_list_lru(struct list_lru *lru,
+ int old_size, int new_size)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return 0;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ if (memcg_update_list_lru_node(&lru->node[i],
+ old_size, new_size))
+ goto fail;
+ }
+ return 0;
+fail:
+ for (i = i - 1; i >= 0; i--)
+ memcg_cancel_update_list_lru_node(&lru->node[i],
+ old_size, new_size);
+ return -ENOMEM;
+}
+
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+ int old_size, int new_size)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for (i = 0; i < nr_node_ids; i++)
+ memcg_cancel_update_list_lru_node(&lru->node[i],
+ old_size, new_size);
+}
+
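+/*
+ * Grow the per-memcg arrays of every registered lru to @new_size,
+ * rolling everything back if any allocation fails.
+ */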
+int memcg_update_all_list_lrus(int new_size)
+{
+ int ret = 0;
+ struct list_lru *lru;
+ int old_size = memcg_nr_cache_ids;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list) {
+ ret = memcg_update_list_lru(lru, old_size, new_size);
+ if (ret)
+ goto fail;
+ }
+out:
+ mutex_unlock(&list_lrus_mutex);
+ return ret;
+fail:
+ list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+ memcg_cancel_update_list_lru(lru, old_size, new_size);
+ goto out;
+}
+
+static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
+ int src_idx, int dst_idx)
+{
+ struct list_lru_one *src, *dst;
+
+ /*
+ * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+ * we have to use IRQ-safe primitives here to avoid deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+
+ src = list_lru_from_memcg_idx(nlru, src_idx);
+ dst = list_lru_from_memcg_idx(nlru, dst_idx);
+
+ list_splice_init(&src->list, &dst->list);
+ dst->nr_items += src->nr_items;
+ src->nr_items = 0;
+
+ spin_unlock_irq(&nlru->lock);
+}
+
+static void memcg_drain_list_lru(struct list_lru *lru,
+ int src_idx, int dst_idx)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for (i = 0; i < nr_node_ids; i++)
+ memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+}
+
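+/* Reparent entries of an offlined memcg: splice its lists into dst's. */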
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+{
+ struct list_lru *lru;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list)
+ memcg_drain_list_lru(lru, src_idx, dst_idx);
+ mutex_unlock(&list_lrus_mutex);
+}
+#else
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+ return 0;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
+ int err = -ENOMEM;
+
+ memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
- return -ENOMEM;
+ goto out;
- nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
- INIT_LIST_HEAD(&lru->node[i].list);
- lru->node[i].nr_items = 0;
+ init_one_lru(&lru->node[i].lru);
}
- return 0;
+
+ err = memcg_init_list_lru(lru, memcg_aware);
+ if (err) {
+ kfree(lru->node);
+ goto out;
+ }
+
+ list_lru_register(lru);
+out:
+ memcg_put_cache_ids();
+ return err;
}
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);
void list_lru_destroy(struct list_lru *lru)
{
+ /* Already destroyed or not yet initialized? */
+ if (!lru->node)
+ return;
+
+ memcg_get_cache_ids();
+
+ list_lru_unregister(lru);
+
+ memcg_destroy_list_lru(lru);
kfree(lru->node);
+ lru->node = NULL;
+
+ memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/madvise.c b/mm/madvise.c
index d79fb5e8f80a..d551475517bf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -222,21 +222,24 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct file *file = vma->vm_file;
#ifdef CONFIG_SWAP
- if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+ if (!file) {
*prev = vma;
- if (!file)
- force_swapin_readahead(vma, start, end);
- else
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ force_swapin_readahead(vma, start, end);
return 0;
}
-#endif
+ if (shmem_mapping(file->f_mapping)) {
+ *prev = vma;
+ force_shm_swapin_readahead(vma, start, end,
+ file->f_mapping);
+ return 0;
+ }
+#else
if (!file)
return -EBADF;
+#endif
- if (file->f_mapping->a_ops->get_xip_mem) {
+ if (IS_DAX(file_inode(file))) {
/* no bad return value, but ignore advice */
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 095c1f96fbec..d18d3a6e7337 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -332,8 +332,10 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
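+	/* set once kmem accounting has been enabled for this cgroup */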
+ bool kmem_acct_activated;
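+	/* true while accounting is active; cleared when the css goes offline */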
+ bool kmem_acct_active;
#endif
int last_scanned_node;
@@ -352,9 +354,9 @@ struct mem_cgroup {
};
#ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
- return memcg->kmemcg_id >= 0;
+ return memcg->kmem_acct_active;
}
#endif
@@ -517,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(tcp_proto_cgroup);
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
- if (!memcg_proto_activated(&memcg->tcp_mem))
- return;
- static_key_slow_dec(&memcg_socket_limit_enabled);
-}
-#else
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
-}
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
*
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size. It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
*/
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+ down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+ up_read(&memcg_cache_ids_sem);
+}
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
@@ -569,32 +573,8 @@ int memcg_limited_groups_array_size;
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-static void memcg_free_cache_id(int id);
-
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
- if (memcg_kmem_is_active(memcg)) {
- static_key_slow_dec(&memcg_kmem_enabled_key);
- memcg_free_cache_id(memcg->kmemcg_id);
- }
- /*
- * This check can't live in kmem destruction function,
- * since the charges will outlive the cgroup
- */
- WARN_ON(page_counter_read(&memcg->kmem));
-}
-#else
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
-static void disarm_static_keys(struct mem_cgroup *memcg)
-{
- disarm_sock_keys(memcg);
- disarm_kmem_keys(memcg);
-}
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
@@ -2538,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
int id, size;
int err;
- id = ida_simple_get(&kmem_limited_groups,
+ id = ida_simple_get(&memcg_cache_ida,
0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
if (id < 0)
return id;
- if (id < memcg_limited_groups_array_size)
+ if (id < memcg_nr_cache_ids)
return id;
/*
* There's no space for the new id in memcg_caches arrays,
* so we have to grow them.
*/
+ down_write(&memcg_cache_ids_sem);
size = 2 * (id + 1);
if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2558,8 +2539,15 @@ static int memcg_alloc_cache_id(void)
size = MEMCG_CACHES_MAX_SIZE;
err = memcg_update_all_caches(size);
+ if (!err)
+ err = memcg_update_all_list_lrus(size);
+ if (!err)
+ memcg_nr_cache_ids = size;
+
+ up_write(&memcg_cache_ids_sem);
+
if (err) {
- ida_simple_remove(&kmem_limited_groups, id);
+ ida_simple_remove(&memcg_cache_ida, id);
return err;
}
return id;
@@ -2567,17 +2555,7 @@ static int memcg_alloc_cache_id(void)
static void memcg_free_cache_id(int id)
{
- ida_simple_remove(&kmem_limited_groups, id);
-}
-
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
- memcg_limited_groups_array_size = num;
+ ida_simple_remove(&memcg_cache_ida, id);
}
struct memcg_kmem_cache_create_work {
@@ -2656,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ int kmemcg_id;
- VM_BUG_ON(!cachep->memcg_params);
- VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(cachep));
if (current->memcg_kmem_skip_account)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- if (!memcg_kmem_is_active(memcg))
+ kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ if (kmemcg_id < 0)
goto out;
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
if (likely(memcg_cachep))
return memcg_cachep;
@@ -2692,7 +2671,7 @@ out:
void __memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params->memcg->css);
+ css_put(&cachep->memcg_params.memcg->css);
}
/*
@@ -2757,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL;
}
+
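+/* Find the memcg that a slab object or kmem page is charged to, if any. */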
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+ if (PageSlab(page)) {
+ cachep = page->slab_cache;
+ if (!is_root_cache(cachep))
+ memcg = cachep->memcg_params.memcg;
+	} else {
+		/* page allocated by alloc_kmem_pages */
+		memcg = page->mem_cgroup;
+	}
+
+ return memcg;
+}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3291,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
int err = 0;
int memcg_id;
- if (memcg_kmem_is_active(memcg))
- return 0;
+ BUG_ON(memcg->kmemcg_id >= 0);
+ BUG_ON(memcg->kmem_acct_activated);
+ BUG_ON(memcg->kmem_acct_active);
/*
* For simplicity, we won't allow this to be disabled. It also can't
@@ -3335,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
* patched.
*/
memcg->kmemcg_id = memcg_id;
+ memcg->kmem_acct_activated = true;
+ memcg->kmem_acct_active = true;
out:
return err;
}
@@ -4014,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return mem_cgroup_sockets_init(memcg, ss);
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (!memcg->kmem_acct_active)
+ return;
+
+ /*
+	 * Clear the 'active' flag before clearing memcg_caches array entries.
+ * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+ * guarantees no cache will be created for this cgroup after we are
+ * done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_acct_active = false;
+
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+	 * to the parent's. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
+ */
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
- memcg_destroy_kmem_caches(memcg);
+ if (memcg->kmem_acct_activated) {
+ memcg_destroy_kmem_caches(memcg);
+ static_key_slow_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4025,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return 0;
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
@@ -4443,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
-
- disarm_static_keys(memcg);
kfree(memcg);
}
@@ -4581,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
spin_unlock(&memcg->event_list_lock);
vmpressure_cleanup(&memcg->vmpressure);
+
+ memcg_deactivate_kmem(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..d487f8dc6d39 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
* Only call shrink_node_slabs here (which would also shrink
* other caches) if access is not potentially fatal.
*/
- if (access) {
- int nr;
- int nid = page_to_nid(p);
- do {
- nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
- if (page_count(p) == 1)
- break;
- } while (nr > 10);
- }
+ if (access)
+ drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags)
* setting PG_hwpoison.
*/
if (!is_free_buddy_page(page))
- lru_add_drain_all();
- if (!is_free_buddy_page(page))
drain_all_pages(page_zone(page));
SetPageHWPoison(page);
if (!is_free_buddy_page(page))
diff --git a/mm/memory.c b/mm/memory.c
index bbe6a73a899d..8068893697bb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
vmf.pgoff = page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = page;
+ vmf.cow_page = NULL;
ret = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2329,6 +2330,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
+ /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2638,7 +2640,8 @@ oom:
* See filemap_fault() and __lock_page_retry().
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
- pgoff_t pgoff, unsigned int flags, struct page **page)
+ pgoff_t pgoff, unsigned int flags,
+ struct page *cow_page, struct page **page)
{
struct vm_fault vmf;
int ret;
@@ -2647,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
+ vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ if (!vmf.page)
+ goto out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
@@ -2664,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ out:
*page = vmf.page;
return ret;
}
@@ -2834,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -2874,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
- copy_user_highpage(new_page, fault_page, address, vma);
+ if (fault_page)
+ copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+ if (fault_page) {
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ } else {
+ /*
+ * The fault handler has no page to lock, so it holds
+ * i_mmap_lock for read to protect against truncate.
+ */
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
+ }
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+ if (fault_page) {
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ } else {
+ /*
+ * The fault handler has no page to lock, so it holds
+ * i_mmap_lock for read to protect against truncate.
+ */
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
+ }
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg);
@@ -2912,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3013,14 +3037,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*
- * ptep_modify_prot_start is not called as this is clearing
- * the _PAGE_NUMA bit and it is not really expected that there
- * would be concurrent hardware modifications to the PTE.
+ * We can safely just do a "set_pte_at()", because the old
+ * page table entry is not accessible, so there would be no
+ * concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
@@ -3029,7 +3056,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
- pte = pte_mknonnuma(pte);
+ /* Make it present again */
+ pte = pte_modify(pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
@@ -3038,7 +3067,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
- BUG_ON(is_zero_pfn(page_to_pfn(page)));
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
@@ -3124,7 +3152,7 @@ static int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
- if (pte_numa(entry))
+ if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
@@ -3202,7 +3230,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (pmd_trans_splitting(orig_pmd))
return 0;
- if (pmd_numa(orig_pmd))
+ if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
@@ -3458,7 +3486,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
- maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+ maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (write)
memcpy_toio(maddr + offset, buf, len);
else
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1bd23803576..4721046a134a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
@@ -2817,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += snprintf(p, buffer + maxlen - p, "relative");
}
- if (!nodes_empty(nodes)) {
- p += snprintf(p, buffer + maxlen - p, ":");
- p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
- }
+ if (!nodes_empty(nodes))
+ p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
+ nodemask_pr_args(&nodes));
}
diff --git a/mm/migrate.c b/mm/migrate.c
index f98067e5d353..85e042686031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
}
-void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
- struct page *page = pmd_page(*pmd);
- wait_on_page_locked(page);
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -1853,7 +1847,7 @@ out_fail:
out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
- entry = pmd_mknonnuma(entry);
+ entry = pmd_modify(entry, vma->vm_page_prot);
set_pmd_at(mm, mmun_start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf9936b..5f420f7fafa1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,14 +14,14 @@
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
-int mminit_loglevel;
+int __meminitdata mminit_loglevel;
#ifndef SECTIONS_SHIFT
#define SECTIONS_SHIFT 0
#endif
/* The zonelists are simply reported, validation is manual. */
-void mminit_verify_zonelist(void)
+void __init mminit_verify_zonelist(void)
{
int nid;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33121662f08b..44727811bf4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,36 +75,34 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
- bool updated = false;
- if (!prot_numa) {
- ptent = ptep_modify_prot_start(mm, addr, pte);
- if (pte_numa(ptent))
- ptent = pte_mknonnuma(ptent);
- ptent = pte_modify(ptent, newprot);
- /*
- * Avoid taking write faults for pages we
- * know to be dirty.
- */
- if (dirty_accountable && pte_dirty(ptent) &&
- (pte_soft_dirty(ptent) ||
- !(vma->vm_flags & VM_SOFTDIRTY)))
- ptent = pte_mkwrite(ptent);
- ptep_modify_prot_commit(mm, addr, pte, ptent);
- updated = true;
- } else {
+ /*
+ * Avoid trapping faults against the zero or KSM
+ * pages. See similar comment in change_huge_pmd.
+ */
+ if (prot_numa) {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
- if (page && !PageKsm(page)) {
- if (!pte_numa(oldpte)) {
- ptep_set_numa(mm, addr, pte);
- updated = true;
- }
- }
+ if (!page || PageKsm(page))
+ continue;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ continue;
}
- if (updated)
- pages++;
+
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ ptent = pte_modify(ptent, newprot);
+
+ /* Avoid taking write faults for known dirty pages */
+ if (dirty_accountable && pte_dirty(ptent) &&
+ (pte_soft_dirty(ptent) ||
+ !(vma->vm_flags & VM_SOFTDIRTY))) {
+ ptent = pte_mkwrite(ptent);
+ }
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
+ pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/nommu.c b/mm/nommu.c
index 1a19fb3b0463..7296360fc057 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -980,9 +980,6 @@ static int validate_mmap_request(struct file *file,
return -EOVERFLOW;
if (file) {
- /* validate file mapping requests */
- struct address_space *mapping;
-
/* files must support mmap */
if (!file->f_op->mmap)
return -ENODEV;
@@ -991,28 +988,22 @@ static int validate_mmap_request(struct file *file,
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
- mapping = file->f_mapping;
- if (!mapping)
- mapping = file_inode(file)->i_mapping;
-
- capabilities = 0;
- if (mapping && mapping->backing_dev_info)
- capabilities = mapping->backing_dev_info->capabilities;
-
- if (!capabilities) {
+ if (file->f_op->mmap_capabilities) {
+ capabilities = file->f_op->mmap_capabilities(file);
+ } else {
/* no explicit capabilities set, so assume some
* defaults */
switch (file_inode(file)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
break;
case S_IFCHR:
capabilities =
- BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP |
- BDI_CAP_WRITE_MAP;
+ NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE;
break;
default:
@@ -1023,9 +1014,9 @@ static int validate_mmap_request(struct file *file,
/* eliminate any capabilities that we can't support on this
* device */
if (!file->f_op->get_unmapped_area)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (!file->f_op->read)
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
/* The file shall have been opened with read permission. */
if (!(file->f_mode & FMODE_READ))
@@ -1044,29 +1035,29 @@ static int validate_mmap_request(struct file *file,
if (locks_verify_locked(file))
return -EAGAIN;
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
return -ENODEV;
/* we mustn't privatise shared mappings */
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
} else {
/* we're going to read the file into private memory we
* allocate */
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
return -ENODEV;
/* we don't permit a private writable mapping to be
* shared with the backing device */
if (prot & PROT_WRITE)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
- if (capabilities & BDI_CAP_MAP_DIRECT) {
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
+ ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+ ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
) {
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (flags & MAP_SHARED) {
printk(KERN_WARNING
"MAP_SHARED not completely supported on !MMU\n");
@@ -1083,21 +1074,21 @@ static int validate_mmap_request(struct file *file,
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
- if (capabilities & BDI_CAP_EXEC_MAP)
+ if (capabilities & NOMMU_MAP_EXEC)
prot |= PROT_EXEC;
}
} else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
- !(capabilities & BDI_CAP_EXEC_MAP)
+ !(capabilities & NOMMU_MAP_EXEC)
) {
/* backing file is not executable, try to copy */
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
} else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
/* handle PROT_EXEC implication by PROT_READ */
if ((prot & PROT_READ) &&
@@ -1129,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
- if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+ if (!(capabilities & NOMMU_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
@@ -1138,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file,
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
+ vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
if (flags & MAP_SHARED)
vm_flags |= VM_SHARED;
}
@@ -1191,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret == 0) {
/* shouldn't return success if we're not sharing */
@@ -1380,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file,
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
/* new mapping is not a subset of the region */
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
goto sharing_violation;
continue;
}
@@ -1419,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file,
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
if (IS_ERR_VALUE(addr)) {
@@ -1431,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file,
* the mapping so we'll have to attempt to copy
* it */
ret = -ENODEV;
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
goto error_just_free;
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
} else {
vma->vm_start = region->vm_start = addr;
vma->vm_end = region->vm_end = addr + len;
@@ -1445,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_region = region;
/* set up the mapping
- * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+ * - the region is filled in if NOMMU_MAP_DIRECT is still set
*/
if (file && vma->vm_flags & VM_SHARED)
ret = do_mmap_shared_file(vma);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6a73e47e81c6..45e187b2d971 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
unsigned long pos_ratio;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
@@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
int ratelimit;
int *p;
@@ -1929,7 +1929,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
+ __inc_bdi_stat(bdi, BDI_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
@@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
- dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2298,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page)
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
return 1;
}
@@ -2316,7 +2318,7 @@ int test_clear_page_writeback(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2351,7 +2353,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2405,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged);
*/
void wait_for_stable_page(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
-
- if (!bdi_cap_stable_pages_required(bdi))
- return;
-
- wait_on_page_writeback(page);
+ if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d52ab18fe0d..a47f0b229a1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
- * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
@@ -787,6 +788,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
+ kasan_free_pages(page, order);
if (PageAnon(page))
page->mapping = NULL;
@@ -970,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kasan_alloc_pages(page, order);
if (gfp_flags & __GFP_ZERO)
prep_zero_page(page, order, gfp_flags);
@@ -3871,18 +3874,29 @@ static int __build_all_zonelists(void *data)
return 0;
}
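+/* Boot-only path, split out and marked __init so it can be freed after boot. */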
+static noinline void __init
+build_all_zonelists_init(void)
+{
+ __build_all_zonelists(NULL);
+ mminit_verify_zonelist();
+ cpuset_init_current_mems_allowed();
+}
+
/*
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
+ *
+ * __ref due to (1) call of __meminit annotated setup_zone_pageset
+ * [we're only called with non-NULL zone through __meminit paths] and
+ * (2) call of __init annotated helper build_all_zonelists_init
+ * [protected by SYSTEM_BOOTING].
*/
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();
if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(NULL);
- mminit_verify_zonelist();
- cpuset_init_current_mems_allowed();
+ build_all_zonelists_init();
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
diff --git a/mm/page_io.c b/mm/page_io.c
index 955db8b0d497..e6045804c8d8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
.bv_len = PAGE_SIZE,
.bv_offset = 0
};
- struct iov_iter from = {
- .type = ITER_BVEC | WRITE,
- .count = PAGE_SIZE,
- .iov_offset = 0,
- .nr_segs = 1,
- };
- from.bvec = &bv; /* older gcc versions are broken */
+ struct iov_iter from;
+ iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
kiocb.ki_nbytes = PAGE_SIZE;
diff --git a/mm/percpu.c b/mm/percpu.c
index d39e2f4e335c..73c97a5f4495 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl,
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
- static char cpus_buf[4096] __initdata;
static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
size_t dyn_size = ai->dyn_size;
@@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int *unit_map;
int group, unit, i;
- cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
-
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
+ pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
+ cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
} \
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e028ecb..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
pmd_t entry = *pmdp;
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 17b9172ec37f..935675844b2e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -27,7 +27,7 @@
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
- ra->ra_pages = mapping->backing_dev_info->ra_pages;
+ ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (bdi_read_congested(mapping->backing_dev_info))
+ if (bdi_read_congested(inode_to_bdi(mapping->host)))
return;
/* do read-ahead */
diff --git a/mm/shmem.c b/mm/shmem.c
index 864c878401e6..a63031fa3e0c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
-static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
goto redirty;
/*
- * shmem_backing_dev_info's capabilities prevent regular writeback or
- * sync from ever calling shmem_writepage; but a stacking filesystem
- * might use ->writepage of its underlying filesystem, in which case
- * tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync.
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
*/
if (!wbc->for_reclaim) {
WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
@@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
- inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = get_seconds();
info = SHMEM_I(inode);
@@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
bool shmem_mapping(struct address_space *mapping)
{
- return mapping->backing_dev_info == &shmem_backing_dev_info;
+ return mapping->host->i_sb->s_op == &shmem_ops;
}
#ifdef CONFIG_TMPFS
@@ -3225,10 +3219,6 @@ int __init shmem_init(void)
if (shmem_inode_cachep)
return 0;
- error = bdi_init(&shmem_backing_dev_info);
- if (error)
- goto out4;
-
error = shmem_init_inodecache();
if (error)
goto out3;
@@ -3252,8 +3242,6 @@ out1:
out2:
shmem_destroy_inodecache();
out3:
- bdi_destroy(&shmem_backing_dev_info);
-out4:
shm_mnt = ERR_PTR(error);
return error;
}
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb6f671..c4b89eaf4c96 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2382,7 +2382,7 @@ out:
return nr_freed;
}
-int __kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
{
int ret = 0;
int node;
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep);
+ int rc = __kmem_cache_shrink(cachep, false);
if (rc)
return rc;
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
int ret;
- struct kmem_cache *c = NULL;
- int i = 0;
+ struct kmem_cache *c;
ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
if ((ret < 0) || !is_root_cache(cachep))
return ret;
- VM_BUG_ON(!mutex_is_locked(&slab_mutex));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(cachep, i);
- if (c)
- /* return value determined by the parent cache only */
- __do_tune_cpucache(c, limit, batchcount, shared, gfp);
+ lockdep_assert_held(&slab_mutex);
+ for_each_memcg_cache(c, cachep) {
+ /* return value determined by the root cache only */
+ __do_tune_cpucache(c, limit, batchcount, shared, gfp);
}
return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 90430d6f665e..4c3ac12dd644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
extern void create_boot_cache(struct kmem_cache *, const char *name,
size_t size, unsigned long flags);
-struct mem_cgroup;
-
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(size_t size, size_t align,
unsigned long flags, const char *name, void (*ctor)(void *));
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * Iterate over all memcg caches of the given root cache. The caller must hold
+ * slab_mutex.
+ */
+#define for_each_memcg_cache(iter, root) \
+ list_for_each_entry(iter, &(root)->memcg_params.list, \
+ memcg_params.list)
+
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+ list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
+ memcg_params.list)
+
static inline bool is_root_cache(struct kmem_cache *s)
{
- return !s->memcg_params || s->memcg_params->is_root_cache;
+ return s->memcg_params.is_root_cache;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
+ struct kmem_cache *p)
{
- return (p == s) ||
- (s->memcg_params && (p == s->memcg_params->root_cache));
+ return p == s || p == s->memcg_params.root_cache;
}
/*
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
static inline const char *cache_name(struct kmem_cache *s)
{
if (!is_root_cache(s))
- return s->memcg_params->root_cache->name;
+ s = s->memcg_params.root_cache;
return s->name;
}
/*
* Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away. Since once
- * created a memcg's cache is destroyed only along with the root cache, it is
- * true if we are going to allocate from the cache or hold a reference to the
- * root cache by other means. Otherwise, we should hold either the slab_mutex
- * or the memcg's slab_caches_mutex while calling this function and accessing
- * the returned value.
+ * That said, the caller must ensure the memcg's cache won't go away by
+ * either taking a css reference to the owner cgroup or holding the
+ * slab_mutex.
*/
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
struct kmem_cache *cachep;
- struct memcg_cache_params *params;
-
- if (!s->memcg_params)
- return NULL;
+ struct memcg_cache_array *arr;
rcu_read_lock();
- params = rcu_dereference(s->memcg_params);
+ arr = rcu_dereference(s->memcg_params.memcg_caches);
/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match this (see
- * memcg_register_cache()).
+ * memcg_create_kmem_cache()).
*/
- cachep = lockless_dereference(params->memcg_caches[idx]);
+ cachep = lockless_dereference(arr->entries[idx]);
rcu_read_unlock();
return cachep;
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
if (is_root_cache(s))
return s;
- return s->memcg_params->root_cache;
+ return s->memcg_params.root_cache;
}
static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
return 0;
if (is_root_cache(s))
return 0;
- return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
+ return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
}
static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
return;
if (is_root_cache(s))
return;
- memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order);
+ memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
}
-#else
+
+extern void slab_init_memcg_params(struct kmem_cache *);
+
+#else /* !CONFIG_MEMCG_KMEM */
+
+#define for_each_memcg_cache(iter, root) \
+ for ((void)(iter), (void)(root); 0; )
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+ for ((void)(iter), (void)(tmp), (void)(root); 0; )
+
static inline bool is_root_cache(struct kmem_cache *s)
{
return true;
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
{
}
-#endif
+
+static inline void slab_init_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
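
The new for_each_memcg_cache() iterator above replaces index-based lookups with a walk of the root cache's memcg_params.list. For readers unfamiliar with the kernel's intrusive lists, here is a self-contained userspace sketch of the same pattern; struct cache, the demo list, and the type-passing variant of list_for_each_entry() are illustrative stand-ins, not kernel API.

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Portable take on list_for_each_entry(): the entry type is passed
 * explicitly instead of being derived with typeof(). */
#define list_for_each_entry(pos, head, type, member)			\
	for (pos = container_of((head)->next, type, member);		\
	     &(pos)->member != (head);					\
	     pos = container_of((pos)->member.next, type, member))

struct cache {
	const char *name;
	struct list_head node;	/* plays the role of memcg_params.list */
};

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

int main(void)
{
	struct list_head root = LIST_HEAD_INIT(root);
	struct cache a = { .name = "memcg-1" }, b = { .name = "memcg-2" };
	struct cache *c;

	list_add_tail(&a.node, &root);
	list_add_tail(&b.node, &root);

	list_for_each_entry(c, &root, struct cache, node)
		printf("child cache: %s\n", c->name);
	return 0;
}
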
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6e1e4cf65836..999bb3424d44 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
#endif
#ifdef CONFIG_MEMCG_KMEM
-static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
- struct kmem_cache *s, struct kmem_cache *root_cache)
+void slab_init_memcg_params(struct kmem_cache *s)
{
- size_t size;
+ s->memcg_params.is_root_cache = true;
+ INIT_LIST_HEAD(&s->memcg_params.list);
+ RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+}
+
+static int init_memcg_params(struct kmem_cache *s,
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct memcg_cache_array *arr;
- if (!memcg_kmem_enabled())
+ if (memcg) {
+ s->memcg_params.is_root_cache = false;
+ s->memcg_params.memcg = memcg;
+ s->memcg_params.root_cache = root_cache;
return 0;
+ }
- if (!memcg) {
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += memcg_limited_groups_array_size * sizeof(void *);
- } else
- size = sizeof(struct memcg_cache_params);
+ slab_init_memcg_params(s);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params)
- return -ENOMEM;
+ if (!memcg_nr_cache_ids)
+ return 0;
- if (memcg) {
- s->memcg_params->memcg = memcg;
- s->memcg_params->root_cache = root_cache;
- } else
- s->memcg_params->is_root_cache = true;
+ arr = kzalloc(sizeof(struct memcg_cache_array) +
+ memcg_nr_cache_ids * sizeof(void *),
+ GFP_KERNEL);
+ if (!arr)
+ return -ENOMEM;
+ RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
return 0;
}
-static void memcg_free_cache_params(struct kmem_cache *s)
+static void destroy_memcg_params(struct kmem_cache *s)
{
- kfree(s->memcg_params);
+ if (is_root_cache(s))
+ kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
}
-static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
+static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
- int size;
- struct memcg_cache_params *new_params, *cur_params;
-
- BUG_ON(!is_root_cache(s));
+ struct memcg_cache_array *old, *new;
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += num_memcgs * sizeof(void *);
+ if (!is_root_cache(s))
+ return 0;
- new_params = kzalloc(size, GFP_KERNEL);
- if (!new_params)
+ new = kzalloc(sizeof(struct memcg_cache_array) +
+ new_array_size * sizeof(void *), GFP_KERNEL);
+ if (!new)
return -ENOMEM;
- cur_params = s->memcg_params;
- memcpy(new_params->memcg_caches, cur_params->memcg_caches,
- memcg_limited_groups_array_size * sizeof(void *));
-
- new_params->is_root_cache = true;
-
- rcu_assign_pointer(s->memcg_params, new_params);
- if (cur_params)
- kfree_rcu(cur_params, rcu_head);
+ old = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ if (old)
+ memcpy(new->entries, old->entries,
+ memcg_nr_cache_ids * sizeof(void *));
+ rcu_assign_pointer(s->memcg_params.memcg_caches, new);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs)
{
struct kmem_cache *s;
int ret = 0;
- mutex_lock(&slab_mutex);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
- if (!is_root_cache(s))
- continue;
-
- ret = memcg_update_cache_params(s, num_memcgs);
+ ret = update_memcg_params(s, num_memcgs);
/*
* Instead of freeing the memory, we'll just leave the caches
* up to this point in an updated state.
*/
if (ret)
- goto out;
+ break;
}
-
- memcg_update_array_size(num_memcgs);
-out:
mutex_unlock(&slab_mutex);
return ret;
}
#else
-static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
- struct kmem_cache *s, struct kmem_cache *root_cache)
+static inline int init_memcg_params(struct kmem_cache *s,
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
return 0;
}
-static inline void memcg_free_cache_params(struct kmem_cache *s)
+static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags,
}
static struct kmem_cache *
-do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *),
+do_kmem_cache_create(const char *name, size_t object_size, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
s->align = align;
s->ctor = ctor;
- err = memcg_alloc_cache_params(memcg, s, root_cache);
+ err = init_memcg_params(s, memcg, root_cache);
if (err)
goto out_free_cache;
@@ -330,7 +329,7 @@ out:
return s;
out_free_cache:
- memcg_free_cache_params(s);
+ destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- char *cache_name;
+ const char *cache_name;
int err;
get_online_cpus();
get_online_mems();
+ memcg_get_cache_ids();
mutex_lock(&slab_mutex);
@@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
if (s)
goto out_unlock;
- cache_name = kstrdup(name, GFP_KERNEL);
+ cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
@@ -401,12 +401,13 @@ kmem_cache_create(const char *name, size_t size, size_t align,
flags, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
- kfree(cache_name);
+ kfree_const(cache_name);
}
out_unlock:
mutex_unlock(&slab_mutex);
+ memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
@@ -439,13 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
*need_rcu_barrier = true;
#ifdef CONFIG_MEMCG_KMEM
- if (!is_root_cache(s)) {
- struct kmem_cache *root_cache = s->memcg_params->root_cache;
- int memcg_id = memcg_cache_id(s->memcg_params->memcg);
-
- BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
- root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
- }
+ if (!is_root_cache(s))
+ list_del(&s->memcg_params.list);
#endif
list_move(&s->list, release);
return 0;
@@ -482,9 +478,11 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *root_cache)
{
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- int memcg_id = memcg_cache_id(memcg);
+ struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
+ int idx;
get_online_cpus();
get_online_mems();
@@ -492,17 +490,27 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&slab_mutex);
/*
+ * The memory cgroup could have been deactivated while the cache
+ * creation work was pending.
+ */
+ if (!memcg_kmem_is_active(memcg))
+ goto out_unlock;
+
+ idx = memcg_cache_id(memcg);
+ arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+
+ /*
* Since per-memcg caches are created asynchronously on first
* allocation (see memcg_kmem_get_cache()), several threads can try to
* create the same cache, but only one of them may succeed.
*/
- if (cache_from_memcg_idx(root_cache, memcg_id))
+ if (arr->entries[idx])
goto out_unlock;
- cgroup_name(mem_cgroup_css(memcg)->cgroup,
- memcg_name_buf, sizeof(memcg_name_buf));
+ cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), memcg_name_buf);
+ css->id, memcg_name_buf);
if (!cache_name)
goto out_unlock;
@@ -520,13 +528,15 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
goto out_unlock;
}
+ list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
* initialized.
*/
smp_wmb();
- root_cache->memcg_params->memcg_caches[memcg_id] = s;
+ arr->entries[idx] = s;
out_unlock:
mutex_unlock(&slab_mutex);
@@ -535,6 +545,37 @@ out_unlock:
put_online_cpus();
}
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+{
+ int idx;
+ struct memcg_cache_array *arr;
+ struct kmem_cache *s, *c;
+
+ idx = memcg_cache_id(memcg);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!is_root_cache(s))
+ continue;
+
+ arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ c = arr->entries[idx];
+ if (!c)
+ continue;
+
+ __kmem_cache_shrink(c, true);
+ arr->entries[idx] = NULL;
+ }
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+}
+
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
LIST_HEAD(release);
@@ -546,7 +587,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
mutex_lock(&slab_mutex);
list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (is_root_cache(s) || s->memcg_params->memcg != memcg)
+ if (is_root_cache(s) || s->memcg_params.memcg != memcg)
continue;
/*
* The cgroup is about to be freed and therefore has no charges
@@ -565,18 +606,20 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
void slab_kmem_cache_release(struct kmem_cache *s)
{
- memcg_free_cache_params(s);
- kfree(s->name);
+ destroy_memcg_params(s);
+ kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
}
void kmem_cache_destroy(struct kmem_cache *s)
{
- int i;
+ struct kmem_cache *c, *c2;
LIST_HEAD(release);
bool need_rcu_barrier = false;
bool busy = false;
+ BUG_ON(!is_root_cache(s));
+
get_online_cpus();
get_online_mems();
@@ -586,10 +629,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
-
- if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+ for_each_memcg_cache_safe(c, c2, s) {
+ if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
busy = true;
}
@@ -619,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
- ret = __kmem_cache_shrink(cachep);
+ ret = __kmem_cache_shrink(cachep, false);
put_online_mems();
put_online_cpus();
return ret;
@@ -641,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ slab_init_memcg_params(s);
+
err = __kmem_cache_create(s, flags);
if (err)
@@ -854,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_kmem_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
+ kasan_kmalloc_large(ret, size);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -920,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
{
struct kmem_cache *c;
struct slabinfo sinfo;
- int i;
if (!is_root_cache(s))
return;
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
+ for_each_memcg_cache(c, s) {
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(c, &sinfo);
@@ -981,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
if (p == slab_caches.next)
print_slabinfo_header(m);
- if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
+ if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
cache_show(s, m);
return 0;
}
@@ -1038,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
if (p)
ks = ksize(p);
- if (ks >= new_size)
+ if (ks >= new_size) {
+ kasan_krealloc((void *)p, new_size);
return (void *)p;
+ }
ret = kmalloc_track_caller(new_size, flags);
if (ret && p)
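
update_memcg_params() above grows the memcg_caches array by allocating a larger copy, publishing it with rcu_assign_pointer(), and deferring the free of the old array with kfree_rcu(). The userspace sketch below models just the copy-then-publish step with C11 release/acquire atomics; grow_caches(), lookup(), and struct cache_array are hypothetical names, and the sketch deliberately leaks the old array where the kernel would wait for an RCU grace period.

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct cache_array {
	size_t nr;
	void *entries[];	/* flexible array, like memcg_cache_array */
};

static _Atomic(struct cache_array *) caches;

/* Writer side; assumed to run under a mutex (slab_mutex in the kernel).
 * Assumes the array only ever grows. */
static int grow_caches(size_t new_nr)
{
	struct cache_array *old = atomic_load_explicit(&caches,
						       memory_order_relaxed);
	struct cache_array *new = calloc(1, sizeof(*new) +
					    new_nr * sizeof(void *));
	if (!new)
		return -1;
	new->nr = new_nr;
	if (old)
		memcpy(new->entries, old->entries, old->nr * sizeof(void *));

	/* Publish: a reader that observes `new` also sees its contents. */
	atomic_store_explicit(&caches, new, memory_order_release);

	/* The kernel now does kfree_rcu(old); freeing right away would
	 * race with lockless readers, so this sketch simply leaks it. */
	return 0;
}

/* Reader side: lockless, as in cache_from_memcg_idx(). */
static void *lookup(size_t idx)
{
	struct cache_array *arr = atomic_load_explicit(&caches,
						       memory_order_acquire);
	return (arr && idx < arr->nr) ? arr->entries[idx] : NULL;
}

int main(void)
{
	grow_caches(4);
	grow_caches(8);
	return lookup(3) != NULL;	/* NULL: nothing registered yet */
}
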
diff --git a/mm/slob.c b/mm/slob.c
index 96a86206a26b..94a7fede6d48 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
-int __kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
}
diff --git a/mm/slub.c b/mm/slub.c
index 8b8508adf9c2..6832c4eab104 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/seq_file.h>
+#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -468,12 +469,30 @@ static char *slub_debug_slabs;
static int disable_higher_order_debug;
/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+ kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+ kasan_enable_current();
+}
+
+/*
* Object debugging
*/
static void print_section(char *text, u8 *addr, unsigned int length)
{
+ metadata_access_enable();
print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
+ metadata_access_disable();
}
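
metadata_access_enable()/metadata_access_disable() wrap kasan_disable_current()/kasan_enable_current(), which adjust a per-task kasan_depth so that deliberate out-of-object accesses by SLUB's debug code are not reported. A rough userspace model of that suppression mechanism, with checked_read() and in_object() as made-up stand-ins for a real shadow-memory check:

#include <stdio.h>

static _Thread_local int kasan_depth;

static inline void metadata_access_enable(void)  { kasan_depth++; }
static inline void metadata_access_disable(void) { kasan_depth--; }

/* Stand-in for a shadow-memory lookup: only offsets inside the object
 * proper are valid; redzones and tracking metadata beyond it are not. */
static int in_object(size_t off, size_t obj_size)
{
	return off < obj_size;
}

static unsigned char checked_read(const unsigned char *base, size_t off,
				  size_t obj_size)
{
	if (!in_object(off, obj_size) && kasan_depth == 0)
		fprintf(stderr, "BUG: out-of-object read at offset %zu\n", off);
	return base[off];
}

int main(void)
{
	unsigned char slot[32] = { 0 };	/* 16-byte object + metadata */

	checked_read(slot, 20, 16);	/* reported: touches metadata */

	metadata_access_enable();
	checked_read(slot, 20, 16);	/* suppressed: debug code at work */
	metadata_access_disable();
	return 0;
}
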
static struct track *get_track(struct kmem_cache *s, void *object,
@@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object,
trace.max_entries = TRACK_ADDRS_COUNT;
trace.entries = p->addrs;
trace.skip = 3;
+ metadata_access_enable();
save_stack_trace(&trace);
+ metadata_access_disable();
/* See rant in lockdep.c */
if (trace.nr_entries != 0 &&
@@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
dump_stack();
}
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
slab_bug(s, "%s", reason);
@@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *fault;
u8 *end;
+ metadata_access_enable();
fault = memchr_inv(start, value, bytes);
+ metadata_access_disable();
if (!fault)
return 1;
@@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!remainder)
return 1;
+ metadata_access_enable();
fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+ metadata_access_disable();
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
@@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
kmemleak_alloc(ptr, size, 1, flags);
+ kasan_kmalloc_large(ptr, size);
}
static inline void kfree_hook(const void *x)
{
kmemleak_free(x);
+ kasan_kfree_large(x);
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
@@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
memcg_kmem_put_cache(s);
+ kasan_slab_alloc(s, object);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
+
+ kasan_slab_free(s, x);
}
/*
@@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
setup_object_debug(s, page, object);
- if (unlikely(s->ctor))
+ if (unlikely(s->ctor)) {
+ kasan_unpoison_object_data(s, object);
s->ctor(object);
+ kasan_poison_object_data(s, object);
+ }
}
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << order);
+ kasan_poison_slab(page);
+
for_each_object_idx(p, idx, s, start, page->objects) {
setup_object(s, page, p);
if (likely(idx < page->objects))
@@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
int pages;
int pobjects;
+ preempt_disable();
do {
pages = 0;
pobjects = 0;
@@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
+ if (unlikely(!s->cpu_partial)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ }
+ preempt_enable();
#endif
}
@@ -2488,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ kasan_kmalloc(s, ret, size);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2514,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
+
+ kasan_kmalloc(s, ret, size);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2899,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
+ kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3271,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags)
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
+ kasan_kmalloc(s, ret, size);
+
return ret;
}
EXPORT_SYMBOL(__kmalloc);
@@ -3314,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+ kasan_kmalloc(s, ret, size);
+
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
-size_t ksize(const void *object)
+static size_t __ksize(const void *object)
{
struct page *page;
@@ -3335,6 +3387,15 @@ size_t ksize(const void *object)
return slab_ksize(page->slab_cache);
}
+
+size_t ksize(const void *object)
+{
+ size_t size = __ksize(object);
+ /*
+  * We assume that ksize callers could use the whole allocated area,
+  * so we need to unpoison this area.
+  */
+ kasan_krealloc(object, size);
+ return size;
+}
EXPORT_SYMBOL(ksize);
void kfree(const void *x)
@@ -3358,69 +3419,92 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+#define SHRINK_PROMOTE_MAX 32
+
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the most-filled
+ * slabs to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
*
 * The slabs with the least items are placed last. This results in them
 * being allocated from last, increasing the chance that the last objects
 * are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
int node;
int i;
struct kmem_cache_node *n;
struct page *page;
struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ struct list_head discard;
+ struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
+ int ret = 0;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ if (deactivate) {
+ /*
+ * Disable caching of empty slabs. This avoids pinning offline
+ * memory cgroups via kmem pages that could otherwise be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial),
+ * so we have to make sure the change is visible.
+ */
+ kick_all_cpus_sync();
+ }
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
- if (!n->nr_partial)
- continue;
-
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ INIT_LIST_HEAD(&discard);
+ for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+ INIT_LIST_HEAD(promote + i);
spin_lock_irqsave(&n->list_lock, flags);
/*
- * Build lists indexed by the items in use in each slab.
+ * Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- list_move(&page->lru, slabs_by_inuse + page->inuse);
- if (!page->inuse)
+ int free = page->objects - page->inuse;
+
+ /* Do not reread page->inuse */
+ barrier();
+
+ /* We do not keep full slabs on the list */
+ BUG_ON(free <= 0);
+
+ if (free == page->objects) {
+ list_move(&page->lru, &discard);
n->nr_partial--;
+ } else if (free <= SHRINK_PROMOTE_MAX)
+ list_move(&page->lru, promote + free - 1);
}
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * Promote the slabs filled up most to the head of the
+ * partial list.
*/
- for (i = objects - 1; i > 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+ list_splice(promote + i, &n->partial);
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ list_for_each_entry_safe(page, t, &discard, lru)
discard_slab(s, page);
+
+ if (slabs_node(s, node))
+ ret = 1;
}
- kfree(slabs_by_inuse);
- return 0;
+ return ret;
}
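
The rewritten __kmem_cache_shrink() above replaces the old allocate-a-list-per-object-count scheme with a fixed set of SHRINK_PROMOTE_MAX buckets: fully free slabs move to a discard list, and slabs with few free objects are promoted to the head of the partial list so they fill up first. A compact array-based sketch of that ordering follows; the kernel uses per-bucket list_heads and list_splice(), and struct slab and shrink() here are illustrative only.

#include <stdio.h>

#define SHRINK_PROMOTE_MAX 32

struct slab { int objects, inuse; };

/* Reorder partial slabs: fullest first, fully free ones dropped
 * (discard_slab() in the kernel).  Returns the new list length. */
static int shrink(const struct slab *in, int n, struct slab *out)
{
	int k = 0, b, i, free;

	/* Bucket 1 (one free object) comes first, i.e. the fullest slabs
	 * end up at the head, matching the promote[] splice order. */
	for (b = 1; b <= SHRINK_PROMOTE_MAX; b++)
		for (i = 0; i < n; i++) {
			free = in[i].objects - in[i].inuse;
			if (free == in[i].objects)
				continue;	/* empty: discarded */
			if (free == b)
				out[k++] = in[i];
		}

	/* Slabs with more than SHRINK_PROMOTE_MAX free objects keep their
	 * old position in the kernel; this sketch appends them instead. */
	for (i = 0; i < n; i++) {
		free = in[i].objects - in[i].inuse;
		if (free > SHRINK_PROMOTE_MAX && free < in[i].objects)
			out[k++] = in[i];
	}
	return k;
}

int main(void)
{
	struct slab in[] = { { 8, 5 }, { 8, 0 }, { 8, 7 }, { 8, 6 } };
	struct slab out[4];
	int i, n = shrink(in, 4, out);

	for (i = 0; i < n; i++)
		printf("objects=%d inuse=%d\n", out[i].objects, out[i].inuse);
	return 0;			/* inuse 7, 6, 5: fullest first */
}
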
static int slab_mem_going_offline_callback(void *arg)
@@ -3429,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ __kmem_cache_shrink(s, false);
mutex_unlock(&slab_mutex);
return 0;
@@ -3577,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
+ slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
return s;
}
@@ -3635,13 +3720,10 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s, *c;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int i;
- struct kmem_cache *c;
-
s->refcount++;
/*
@@ -3651,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
+ for_each_memcg_cache(c, s) {
c->object_size = s->object_size;
c->inuse = max_t(int, c->inuse,
ALIGN(size, sizeof(void *)));
@@ -4081,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf,
if (num_online_cpus() > 1 &&
!cpumask_empty(to_cpumask(l->cpus)) &&
- len < PAGE_SIZE - 60) {
- len += sprintf(buf + len, " cpus=");
- len += cpulist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- to_cpumask(l->cpus));
- }
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
- len < PAGE_SIZE - 60) {
- len += sprintf(buf + len, " nodes=");
- len += nodelist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- l->nodes);
- }
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
len += sprintf(buf + len, "\n");
}
@@ -4691,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1') {
- int rc = kmem_cache_shrink(s);
-
- if (rc)
- return rc;
- } else
+ if (buf[0] == '1')
+ kmem_cache_shrink(s);
+ else
return -EINVAL;
return length;
}
@@ -4920,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- int i;
+ struct kmem_cache *c;
mutex_lock(&slab_mutex);
if (s->max_attr_size < len)
@@ -4943,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* directly either failed or succeeded, in which case we loop
* through the descendants with best-effort propagation.
*/
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
- if (c)
- attribute->store(c, buf, len);
- }
+ for_each_memcg_cache(c, s)
+ attribute->store(c, buf, len);
mutex_unlock(&slab_mutex);
}
#endif
@@ -4964,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (is_root_cache(s))
return;
- root_cache = s->memcg_params->root_cache;
+ root_cache = s->memcg_params.root_cache;
/*
* This mean this cache had no attribute written. Therefore, no point
@@ -5044,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
- return s->memcg_params->root_cache->memcg_kset;
+ return s->memcg_params.root_cache->memcg_kset;
#endif
return slab_kset;
}
diff --git a/mm/swap.c b/mm/swap.c
index 5b3087228b99..cd3a5e64cea9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1138,8 +1138,6 @@ void __init swap_setup(void)
#ifdef CONFIG_SWAP
int i;
- if (bdi_init(swapper_spaces[0].backing_dev_info))
- panic("Failed to init swap bdi");
for (i = 0; i < MAX_SWAPFILES; i++)
spin_lock_init(&swapper_spaces[i].tree_lock);
#endif
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9711342987a0..405923f77334 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = {
#endif
};
-static struct backing_dev_info swap_backing_dev_info = {
- .name = "swap",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
struct address_space swapper_spaces[MAX_SWAPFILES] = {
[0 ... MAX_SWAPFILES - 1] = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.i_mmap_writable = ATOMIC_INIT(0),
.a_ops = &swap_aops,
- .backing_dev_info = &swap_backing_dev_info,
}
};
diff --git a/mm/truncate.c b/mm/truncate.c
index f1e4d6052369..ddec5a5966d7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
if (account_size)
task_io_account_cancelled_write(account_size);
diff --git a/mm/util.c b/mm/util.c
index f3ef639c4857..3981ae9d1b15 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,10 +12,30 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
+#include <asm/sections.h>
#include <asm/uaccess.h>
#include "internal.h"
+static inline int is_kernel_rodata(unsigned long addr)
+{
+ return addr >= (unsigned long)__start_rodata &&
+ addr < (unsigned long)__end_rodata;
+}
+
+/**
+ * kfree_const - conditionally free memory
+ * @x: pointer to the memory
+ *
+ * This function calls kfree() only if @x is not in the .rodata section.
+ */
+void kfree_const(const void *x)
+{
+ if (!is_kernel_rodata((unsigned long)x))
+ kfree(x);
+}
+EXPORT_SYMBOL(kfree_const);
+
/**
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
@@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp)
EXPORT_SYMBOL(kstrdup);
/**
+ * kstrdup_const - conditionally duplicate an existing const string
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns the source string if it is in the .rodata section; otherwise it
+ * falls back to kstrdup().
+ * Strings allocated by kstrdup_const() must be freed with kfree_const().
+ */
+const char *kstrdup_const(const char *s, gfp_t gfp)
+{
+ if (is_kernel_rodata((unsigned long)s))
+ return s;
+
+ return kstrdup(s, gfp);
+}
+EXPORT_SYMBOL(kstrdup_const);
+
+/**
* kstrndup - allocate space for and copy an existing string
* @s: the string to duplicate
* @max: read at most @max chars from @s
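
kstrdup_const() and kfree_const() above let callers share a string that already lives in .rodata and copy it otherwise, with the free side applying the same predicate. The userspace sketch below imitates the idea by scanning /proc/self/maps for a read-only private mapping instead of using the kernel's __start_rodata/__end_rodata bounds; in_rodata(), strdup_const(), and free_const() are illustrative names, and the check is Linux-specific and approximate.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Is the mapping containing p read-only and private ("r--p")?
 * String literals usually land in such a mapping. */
static bool in_rodata(const void *p)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[256], perms[8];
	unsigned long lo, hi;
	bool ro = false;

	if (!f)
		return false;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%lx-%lx %7s", &lo, &hi, perms) != 3)
			continue;
		if ((unsigned long)p >= lo && (unsigned long)p < hi) {
			ro = !strcmp(perms, "r--p");
			break;
		}
	}
	fclose(f);
	return ro;
}

static const char *strdup_const(const char *s)
{
	return in_rodata(s) ? s : strdup(s);
}

static void free_const(const char *s)
{
	if (!in_rodata(s))
		free((void *)s);
}

int main(void)
{
	char stack_buf[] = "mutable name";
	const char *lit = "literal name";
	const char *a = strdup_const(lit);		/* likely shared */
	const char *b = strdup_const(stack_buf);	/* copied */

	printf("a shared: %s\n", a == lit ? "yes" : "no");
	free_const(a);
	free_const(b);
	return 0;
}
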
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 39c338896416..35b25e1340ca 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (unlikely(!area))
return NULL;
- /*
- * We always allocate a guard page.
- */
- size += PAGE_SIZE;
+ if (!(flags & VM_NO_GUARD))
+ size += PAGE_SIZE;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
@@ -1621,6 +1619,7 @@ fail:
* @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
@@ -1630,7 +1629,8 @@ fail:
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, int node, const void *caller)
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
goto fail;
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
- start, end, node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
+ vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;
@@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, node, caller);
+ gfp_mask, prot, 0, node, caller);
}
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e645ee52045..5e8eadd71bac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
-static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
- struct shrinker *shrinker,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
}
/**
- * shrink_node_slabs - shrink slab caches of a given node
+ * shrink_slab - shrink slab caches
* @gfp_mask: allocation context
* @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
* @nr_scanned: pressure numerator
* @nr_eligible: pressure denominator
*
@@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
+ * @memcg specifies the memory cgroup to target. If it is not NULL,
+ * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
+ * objects from the memory cgroup specified. Otherwise all shrinkers
+ * are called, and memcg aware shrinkers are supposed to scan the
+ * global list then.
+ *
 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
* the available objects should be scanned. Page reclaim for example
* passes the number of pages scanned and the number of pages on the
@@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
*
* Returns the number of reclaimed slab objects.
*/
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ if (memcg && !memcg_kmem_is_active(memcg))
+ return 0;
+
if (nr_scanned == 0)
nr_scanned = SWAP_CLUSTER_MAX;
@@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
+ .memcg = memcg,
};
+ if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ continue;
+
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
sc.nid = 0;
- freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
+ freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
}
up_read(&shrinker_rwsem);
@@ -404,6 +419,29 @@ out:
return freed;
}
+void drop_slab_node(int nid)
+{
+ unsigned long freed;
+
+ do {
+ struct mem_cgroup *memcg = NULL;
+
+ freed = 0;
+ do {
+ freed += shrink_slab(GFP_KERNEL, nid, memcg,
+ 1000, 1000);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ } while (freed > 10);
+}
+
+void drop_slab(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ drop_slab_node(nid);
+}
+
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -500,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info, sc))
+ if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -879,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(mapping->backing_dev_info)) ||
+ bdi_write_congested(inode_to_bdi(mapping->host))) ||
(writeback && PageReclaim(page)))
nr_congested++;
@@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
static bool shrink_zone(struct zone *zone, struct scan_control *sc,
bool is_classzone)
{
+ struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
@@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
+ scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
zone_lru_pages += lru_pages;
+ if (memcg && is_classzone)
+ shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+ memcg, sc->nr_scanned - scanned,
+ lru_pages);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
- if (global_reclaim(sc) && is_classzone) {
- struct reclaim_state *reclaim_state;
-
- shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
- sc->nr_scanned - nr_scanned,
- zone_lru_pages);
-
- reclaim_state = current->reclaim_state;
- if (reclaim_state) {
- sc->nr_reclaimed +=
- reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ if (global_reclaim(sc) && is_classzone)
+ shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+ sc->nr_scanned - nr_scanned,
+ zone_lru_pages);
+
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
}
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
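
shrink_zone() now calls shrink_slab() per memcg, passing the pages scanned and the eligible LRU pages as a pressure ratio. Ignoring the seeks weighting, batching, and deferred-work carry in do_shrink_slab(), the proportional scan target reduces to the simplified model below; scan_target() is an illustrative reduction, not the kernel's exact formula.

#include <stdio.h>

static unsigned long scan_target(unsigned long freeable,
				 unsigned long nr_scanned,
				 unsigned long nr_eligible)
{
	/* The +1 mirrors the kernel's guard against dividing by zero. */
	return freeable * nr_scanned / (nr_eligible + 1);
}

int main(void)
{
	/* Reclaim scanned 512 of 4096 eligible LRU pages (1/8 pressure),
	 * so a cache with 800 freeable objects is asked to scan ~100. */
	printf("%lu\n", scan_target(800, 512, 4096));	/* prints 99 */
	return 0;
}
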
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..aa017133744b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+ shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
pages = node_present_pages(sc->nid);
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
{
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
- list_del_init(item);
+ list_lru_isolate(lru, item);
spin_unlock(lru_lock);
/*
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
- shadow_lru_isolate, NULL, &sc->nr_to_scan);
+ ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+ shadow_lru_isolate, NULL);
local_irq_enable();
return ret;
}
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387bea702e..2ee4e4520493 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zbud_zpool_create(char *name, gfp_t gfp,
+ struct zpool_ops *zpool_ops)
{
return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_create_pool() - Create a new zpool
* @type The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name The name of the zpool (e.g. zram0, zswap)
* @gfp The GFP flags to use when allocating the pool.
* @ops The optional ops callback.
*
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
+struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+ struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops);
zpool->ops = ops;
if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b72403927aa4..0dec1fa5f656 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
@@ -168,6 +169,22 @@ enum fullness_group {
ZS_FULL
};
+enum zs_stat_type {
+ OBJ_ALLOCATED,
+ OBJ_USED,
+ NR_ZS_STAT_TYPE,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static struct dentry *zs_stat_root;
+
+struct zs_size_stat {
+ unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#endif
+
/*
* number of size_classes
*/
@@ -200,6 +217,10 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct zs_size_stat stats;
+#endif
+
spinlock_t lock;
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
};
struct zs_pool {
+ char *name;
+
struct size_class **size_class;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct dentry *stat_dentry;
+#endif
};
/*
@@ -246,9 +273,9 @@ struct mapping_area {
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zs_create_pool(gfp);
+ return zs_create_pool(name, gfp);
}
static void zs_zpool_destroy(void *pool)
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
+ "obj_allocated", "obj_used", "pages_used");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
+ class->size, obj_allocated, obj_used, pages_used);
+
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
+ total_objs, total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "obj_in_classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
+
spin_lock(&class->lock);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
}
obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
kunmap_atomic(vaddr);
first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(pool, first_page);
spin_unlock(&class->lock);
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page->inuse--;
fullness = fix_fullness_group(pool, first_page);
+
+ zs_stat_dec(class, OBJ_USED, 1);
+ if (fullness == ZS_EMPTY)
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+
spin_unlock(&class->lock);
if (fullness == ZS_EMPTY) {
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free);
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(gfp_t flags)
+struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
int i;
struct zs_pool *pool;
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags)
if (!pool)
return NULL;
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name) {
+ kfree(pool);
+ return NULL;
+ }
+
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
+ kfree(pool->name);
kfree(pool);
return NULL;
}
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags)
pool->flags = flags;
+ if (zs_pool_stat_create(name, pool))
+ goto err;
+
return pool;
err:
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
+ zs_pool_stat_destroy(pool);
+
for (i = 0; i < zs_size_classes; i++) {
int fg;
struct size_class *class = pool->size_class[i];
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool)
}
kfree(pool->size_class);
+ kfree(pool->name);
kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1460,30 @@ static int __init zs_init(void)
{
int ret = zs_register_cpu_notifier();
- if (ret) {
- zs_unregister_cpu_notifier();
- return ret;
- }
+ if (ret)
+ goto notifier_fail;
init_zs_size_classes();
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
+
+ ret = zs_stat_init();
+ if (ret) {
+ pr_err("zs stat initialization failed\n");
+ goto stat_fail;
+ }
return 0;
+
+stat_fail:
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+notifier_fail:
+ zs_unregister_cpu_notifier();
+
+ return ret;
}
static void __exit zs_exit(void)
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void)
zpool_unregister_driver(&zs_zpool_driver);
#endif
zs_unregister_cpu_notifier();
+
+ zs_stat_exit();
}
module_init(zs_init);
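
The obj_in_classes file added above derives its pages_used column from the two new counters: since zsmalloc allocates a whole zspage at a time, obj_allocated is always a multiple of the per-zspage capacity. A minimal recomputation with made-up sample numbers:

#include <stdio.h>

int main(void)
{
	unsigned long obj_allocated = 960;	/* OBJ_ALLOCATED, one class */
	int objs_per_zspage = 32;		/* get_maxobj_per_zspage() */
	int pages_per_zspage = 4;

	/* 960 objects / 32 per zspage = 30 zspages of 4 pages each. */
	unsigned long pages_used = obj_allocated / objs_per_zspage *
				   pages_per_zspage;

	printf("pages_used = %lu\n", pages_used);	/* 120 */
	return 0;
}
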
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
pr_info("loading zswap\n");
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+ &zswap_zpool_ops);
if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
pr_info("%s zpool not available\n", zswap_zpool_type);
zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
&zswap_zpool_ops);
}
if (!zswap_pool) {