diff options
Diffstat (limited to 'mm')
49 files changed, 2395 insertions, 1130 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 4395b12869c8..de5239c152f9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -602,6 +602,16 @@ config PGTABLE_MAPPING You can check speed with zsmalloc benchmark: https://github.com/spartacus06/zsmapbench +config ZSMALLOC_STAT + bool "Export zsmalloc statistics" + depends on ZSMALLOC + select DEBUG_FS + help + This option enables code in the zsmalloc to collect various + statistics about whats happening in zsmalloc and exports that + information to userspace via debugfs. + If unsure, say N. + config GENERIC_EARLY_IOREMAP bool diff --git a/mm/Makefile b/mm/Makefile index 3548460ab7b6..3c1caa2693bd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -2,6 +2,9 @@ # Makefile for the linux memory manager. # +KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slub.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -49,9 +52,9 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o +obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o -obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..6dc4580df2af 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,19 +14,10 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -struct backing_dev_info default_backing_dev_info = { - .name = "default", - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct 
class *bdi_class; @@ -40,17 +31,6 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) -{ - if (wb1 < wb2) { - spin_lock(&wb1->list_lock); - spin_lock_nested(&wb2->list_lock, 1); - } else { - spin_lock(&wb2->list_lock); - spin_lock_nested(&wb1->list_lock, 1); - } -} - #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -69,10 +49,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -80,6 +60,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -98,6 +81,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -111,6 +95,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty, nr_io, nr_more_io, + nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -264,9 +249,6 @@ static int __init default_bdi_init(void) if (!bdi_wq) return -ENOMEM; - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); err = bdi_init(&noop_backing_dev_info); return err; 
@@ -355,19 +337,19 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - if (!bdi_cap_writeback_dirty(bdi)) + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + spin_unlock_bh(&bdi->wb_lock); return; + } + spin_unlock_bh(&bdi->wb_lock); /* * Make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); - /* Make sure nobody queues further work */ - spin_lock_bh(&bdi->wb_lock); - clear_bit(BDI_registered, &bdi->state); - spin_unlock_bh(&bdi->wb_lock); - /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi @@ -375,37 +357,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); flush_delayed_work(&bdi->wb.dwork); - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); } /* - * This bdi is going away now, make sure that no super_blocks point to it + * Called when the device behind @bdi has been removed or ejected. + * + * We can't really do much here except for reducing the dirty ratio at + * the moment. In the future we should be able to set a flag so that + * the filesystem can handle errors at mark_inode_dirty time instead + * of only at writeback time. 
*/ -static void bdi_prune_sb(struct backing_dev_info *bdi) -{ - struct super_block *sb; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdi == bdi) - sb->s_bdi = &default_backing_dev_info; - } - spin_unlock(&sb_lock); -} - void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { - bdi_set_min_ratio(bdi, 0); - trace_writeback_bdi_unregister(bdi); - bdi_prune_sb(bdi); + if (WARN_ON_ONCE(!bdi->dev)) + return; - bdi_wb_shutdown(bdi); - bdi_debug_unregister(bdi); - device_unregister(bdi->dev); - bdi->dev = NULL; - } + bdi_set_min_ratio(bdi, 0); } EXPORT_SYMBOL(bdi_unregister); @@ -418,6 +385,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } @@ -474,37 +442,19 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; - /* - * Splice our entries to the default_backing_dev_info. This - * condition shouldn't happen. @wb must be empty at this point and - * dirty inodes on it might cause other issues. This workaround is - * added by ce5f8e779519 ("writeback: splice dirty inode entries to - * default bdi on bdi_destroy()") without root-causing the issue. - * - * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com - * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 - * - * We should probably add WARN_ON() to find out whether it still - * happens and track it down if so. 
- */ - if (bdi_has_dirty_io(bdi)) { - struct bdi_writeback *dst = &default_backing_dev_info.wb; - - bdi_lock_two(&bdi->wb, dst); - list_splice(&bdi->wb.b_dirty, &dst->b_dirty); - list_splice(&bdi->wb.b_io, &dst->b_io); - list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&bdi->wb.list_lock); - spin_unlock(&dst->list_lock); - } - - bdi_unregister(bdi); + bdi_wb_shutdown(bdi); + WARN_ON(!list_empty(&bdi->work_list)); WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); @@ -513,13 +463,12 @@ EXPORT_SYMBOL(bdi_destroy); * For use from filesystems to quickly init and register a bdi associated * with dirty writeback */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, - unsigned int cap) +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { int err; bdi->name = name; - bdi->capabilities = cap; + bdi->capabilities = 0; err = bdi_init(bdi); if (err) return err; diff --git a/mm/compaction.c b/mm/compaction.c index b68736c8a1ce..8c0d9459b54a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,7 @@ #include <linux/sysfs.h> #include <linux/balloon_compaction.h> #include <linux/page-isolation.h> +#include <linux/kasan.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -72,6 +73,7 @@ static void map_pages(struct list_head *list) list_for_each_entry(page, list, lru) { arch_alloc_page(page, 0); kernel_map_pages(page, 1, 1); + kasan_alloc_pages(page, 0); } } @@ -490,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* If a page was split, advance to the end of it */ if (isolated) { + cc->nr_freepages += isolated; + if (!strict && + cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; + } + blockpfn += 
isolated - 1; cursor += isolated - 1; continue; @@ -899,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc) unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* @@ -924,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc) * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + for (; block_start_pfn >= low_pfn && + cc->nr_migratepages > cc->nr_freepages; block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - unsigned long isolated; /* * This can iterate a massively long zone without finding any @@ -953,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolated = isolate_freepages_block(cc, &isolate_start_pfn, + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, freelist, false); - nr_freepages += isolated; /* * Remember where the free scanner should restart next time, @@ -987,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc) */ if (block_start_pfn < low_pfn) cc->free_pfn = cc->migrate_pfn; - - cc->nr_freepages = nr_freepages; } /* @@ -1100,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, isolate_mode); - if (!low_pfn || cc->contended) + if (!low_pfn || cc->contended) { + acct_isolated(zone, cc); return ISOLATE_ABORT; + } /* * Either we isolated something and proceed with migration. 
Or @@ -1173,7 +1180,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) + if (order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; } diff --git a/mm/fadvise.c b/mm/fadvise.c index 2ad7adf4f0a4..4a3907cf79f8 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -28,6 +28,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); + struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) if (!f.file) return -EBADF; - if (S_ISFIFO(file_inode(f.file)->i_mode)) { + inode = file_inode(f.file); + if (S_ISFIFO(inode->i_mode)) { ret = -ESPIPE; goto out; } @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (mapping->a_ops->get_xip_mem) { + if (IS_DAX(inode)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: @@ -73,7 +75,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = mapping->backing_dev_info; + bdi = inode_to_bdi(mapping->host); switch (advice) { case POSIX_FADV_NORMAL: @@ -113,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(mapping->backing_dev_info)) + if (!bdi_write_congested(bdi)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index bf7a27142704..ad7242043bdb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 
dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); } } @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (file->f_flags & O_DIRECT) { + if (io_is_direct(file)) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for - * the rest of the read. + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { file_accessed(file); goto out; } @@ -2564,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2582,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { + if (io_is_direct(file)) { loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) - goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. 
+ * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). */ + if (written < 0 || written == count || IS_DAX(inode)) + goto out; + pos += written; count -= written; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c deleted file mode 100644 index 70c09da1a419..000000000000 --- a/mm/filemap_xip.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - * linux/mm/filemap_xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte <cotte@de.ibm.com> - * - * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds - * - */ - -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/export.h> -#include <linux/uio.h> -#include <linux/rmap.h> -#include <linux/mmu_notifier.h> -#include <linux/sched.h> -#include <linux/seqlock.h> -#include <linux/mutex.h> -#include <linux/gfp.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -/* - * We do use our own empty page to avoid interference with other users - * of ZERO_PAGE(), such as /dev/zero - */ -static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); -static struct page *__xip_sparse_page; - -/* called under xip_sparse_mutex */ -static struct page *xip_sparse_page(void) -{ - if (!__xip_sparse_page) { - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - - if (page) - __xip_sparse_page = page; - } - return __xip_sparse_page; -} - -/* - * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_mem() function for the actual low-level - * stuff. - * - * Note the struct file* is not used at all. It may be NULL. 
- */ -static ssize_t -do_xip_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, - struct file *filp, - char __user *buf, - size_t len, - loff_t *ppos) -{ - struct inode *inode = mapping->host; - pgoff_t index, end_index; - unsigned long offset; - loff_t isize, pos; - size_t copied = 0, error = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - pos = *ppos; - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; - - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - do { - unsigned long nr, left; - void *xip_mem; - unsigned long xip_pfn; - int zero = 0; - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - if (nr > len - copied) - nr = len - copied; - - error = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(error)) { - if (error == -ENODATA) { - /* sparse */ - zero = 1; - } else - goto out; - } - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - /* address based flush */ ; - - /* - * Ok, we have the mem, so now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). 
- */ - if (!zero) - left = __copy_to_user(buf+copied, xip_mem+offset, nr); - else - left = __clear_user(buf + copied, nr); - - if (left) { - error = -EFAULT; - goto out; - } - - copied += (nr - left); - offset += (nr - left); - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; - } while (copied < len); - -out: - *ppos = pos + copied; - if (filp) - file_accessed(filp); - - return (copied ? copied : error); -} - -ssize_t -xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - if (!access_ok(VERIFY_WRITE, buf, len)) - return -EFAULT; - - return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - buf, len, ppos); -} -EXPORT_SYMBOL_GPL(xip_file_read); - -/* - * __xip_unmap is invoked from xip_unmap and xip_write - * - * This function walks all vmas of the address_space and unmaps the - * __xip_sparse_page when found at pgoff. - */ -static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) -{ - struct vm_area_struct *vma; - struct page *page; - unsigned count; - int locked = 0; - - count = read_seqcount_begin(&xip_sparse_seq); - - page = __xip_sparse_page; - if (!page) - return; - -retry: - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - pte_t *pte, pteval; - spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long address = vma->vm_start + - ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl, 1); - if (pte) { - /* Nuke the page table entry. 
*/ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); - dec_mm_counter(mm, MM_FILEPAGES); - BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - /* must invalidate_page _before_ freeing the page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(page); - } - } - i_mmap_unlock_read(mapping); - - if (locked) { - mutex_unlock(&xip_sparse_mutex); - } else if (read_seqcount_retry(&xip_sparse_seq, count)) { - mutex_lock(&xip_sparse_mutex); - locked = 1; - goto retry; - } -} - -/* - * xip_fault() is invoked via the vma operations vector for a - * mapped memory region to read in file data during a page fault. - * - * This function is derived from filemap_fault, but used for execute in place - */ -static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - pgoff_t size; - void *xip_mem; - unsigned long xip_pfn; - struct page *page; - int error; - - /* XXX: are VM_FAULT_ codes OK? 
*/ -again: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (likely(!error)) - goto found; - if (error != -ENODATA) - return VM_FAULT_OOM; - - /* sparse block */ - if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && - (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { - int err; - - /* maybe shared writable, allocate new block */ - mutex_lock(&xip_sparse_mutex); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (error) - return VM_FAULT_SIGBUS; - /* unmap sparse mappings at pgoff from all other vmas */ - __xip_unmap(mapping, vmf->pgoff); - -found: - err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - xip_pfn); - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* - * err == -EBUSY is fine, we've raced against another thread - * that faulted-in the same page - */ - if (err != -EBUSY) - BUG_ON(err); - return VM_FAULT_NOPAGE; - } else { - int err, ret = VM_FAULT_OOM; - - mutex_lock(&xip_sparse_mutex); - write_seqcount_begin(&xip_sparse_seq); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (unlikely(!error)) { - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - goto again; - } - if (error != -ENODATA) - goto out; - /* not shared and writable, use xip_sparse_page() */ - page = xip_sparse_page(); - if (!page) - goto out; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, - page); - if (err == -ENOMEM) - goto out; - - ret = VM_FAULT_NOPAGE; -out: - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - - return ret; - } -} - -static const struct vm_operations_struct xip_file_vm_ops = { - .fault = xip_file_fault, - .page_mkwrite = filemap_page_mkwrite, -}; - -int xip_file_mmap(struct 
file * file, struct vm_area_struct * vma) -{ - BUG_ON(!file->f_mapping->a_ops->get_xip_mem); - - file_accessed(file); - vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; - return 0; -} -EXPORT_SYMBOL_GPL(xip_file_mmap); - -static ssize_t -__xip_file_write(struct file *filp, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) -{ - struct address_space * mapping = filp->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - size_t bytes; - ssize_t written = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - do { - unsigned long index; - unsigned long offset; - size_t copied; - void *xip_mem; - unsigned long xip_pfn; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (status == -ENODATA) { - /* we allocate a new page unmap it */ - mutex_lock(&xip_sparse_mutex); - status = a_ops->get_xip_mem(mapping, index, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (!status) - /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, index); - } - - if (status) - break; - - copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); - - if (likely(copied > 0)) { - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; - if (status < 0) - break; - } while (count); - *ppos = pos; - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - - return written ? 
written : status; -} - -ssize_t -xip_file_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; - size_t count; - loff_t pos; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - - if (!access_ok(VERIFY_READ, buf, len)) { - ret=-EFAULT; - goto out_up; - } - - pos = *ppos; - count = len; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out_backing; - if (count == 0) - goto out_backing; - - ret = file_remove_suid(filp); - if (ret) - goto out_backing; - - ret = file_update_time(filp); - if (ret) - goto out_backing; - - ret = __xip_file_write (filp, buf, count, pos, ppos); - - out_backing: - current->backing_dev_info = NULL; - out_up: - mutex_unlock(&inode->i_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xip_file_write); - -/* - * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_mem - * to get the page instead of page cache - */ -int -xip_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - unsigned length; - void *xip_mem; - unsigned long xip_pfn; - int err; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - blocksize = 1 << mapping->host->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - - err = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(err)) { - if (err == -ENODATA) - /* Hole? 
No need to truncate */ - return 0; - else - return err; - } - memset(xip_mem + offset, 0, length); - return 0; -} -EXPORT_SYMBOL_GPL(xip_truncate_page); @@ -64,7 +64,7 @@ retry: migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_numa(pte)) + if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) { pte_unmap_unlock(ptep, ptl); @@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { @@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, /* * Similar to the PMD case below, NUMA hinting must take slow - * path + * path using the pte_protnone check. */ if (!pte_present(pte) || pte_special(pte) || - pte_numa(pte) || (write && !pte_write(pte))) + pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1092,7 +1092,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmdp = pmd_offset(&pud, addr); do { - pmd_t pmd = ACCESS_ONCE(*pmdp); + pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); if (pmd_none(pmd) || pmd_trans_splitting(pmd)) @@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. 
*/ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, write, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb7be110cad3..fc00c8cb5a82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) goto out; page = pmd_page(*pmd); @@ -1262,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1272,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * check_same as the page may no longer be mapped. */ if (unlikely(pmd_trans_migrating(*pmdp))) { + page = pmd_page(*pmdp); spin_unlock(ptl); - wait_migrate_huge_page(vma->anon_vma, pmdp); + wait_on_page_locked(page); goto out; } @@ -1341,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Migrate the THP to the requested node, returns with page unlocked - * and pmd_numa cleared. + * and access rights restored. 
*/ spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1354,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - pmd = pmd_mknonnuma(pmd); + pmd = pmd_modify(pmd, vma->vm_page_prot); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1479,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; - ret = 1; - if (!prot_numa) { + + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!prot_numa || !pmd_protnone(*pmd)) { + ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_numa(entry)) - entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); - } else { - struct page *page = pmd_page(*pmd); - - /* - * Do not trap faults against the zero page. The - * read-only data is likely to be read-cached on the - * local CPU cache and it is less useful to know about - * local vs remote hits on the zero page. - */ - if (!is_huge_zero_page(page) && - !pmd_numa(*pmd)) { - pmdp_set_numa(mm, addr, pmd); - ret = HPAGE_PMD_NR; - } } spin_unlock(ptl); } @@ -1766,9 +1764,9 @@ static int __split_huge_page_map(struct page *page, pte_t *pte, entry; BUG_ON(PageCompound(page+i)); /* - * Note that pmd_numa is not transferred deliberately - * to avoid any possibility that pte_numa leaks to - * a PROT_NONE VMA by accident. + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. 
*/ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/internal.h b/mm/internal.h index c4d6c9b43491..a96da5b0029d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -351,8 +351,10 @@ extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ - printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ - printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ + if (level <= MMINIT_WARNING) \ + printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + else \ + printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) diff --git a/mm/iov_iter.c b/mm/iov_iter.c index a1599ca4ab0e..827732047da1 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, int direction, - const struct kvec *iov, unsigned long nr_segs, + const struct kvec *kvec, unsigned long nr_segs, size_t count) { BUG_ON(!(direction & ITER_KVEC)); i->type = direction; - i->kvec = (struct kvec *)iov; + i->kvec = kvec; i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; } EXPORT_SYMBOL(iov_iter_kvec); +void iov_iter_bvec(struct iov_iter *i, int direction, + const struct bio_vec *bvec, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_BVEC)); + i->type = direction; + i->bvec = bvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_bvec); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 000000000000..bd837b8c2f41 --- /dev/null +++ b/mm/kasan/Makefile @@ -0,0 +1,8 @@ +KASAN_SANITIZE := n + +CFLAGS_REMOVE_kasan.o = -pg +# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 +# see: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) + +obj-y := kasan.o report.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c new file mode 100644 index 000000000000..78fee632a7ee --- /dev/null +++ b/mm/kasan/kasan.c @@ -0,0 +1,516 @@ +/* + * This file contains shadow memory manipulation code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * + * Some of code borrowed from https://github.com/xairy/linux by + * Andrey Konovalov <adech.fo@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include <linux/export.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> + +#include "kasan.h" +#include "../slab.h" + +/* + * Poisons the shadow memory for 'size' bytes starting from 'addr'. + * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 
+ */ +static void kasan_poison_shadow(const void *address, size_t size, u8 value) +{ + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(address); + shadow_end = kasan_mem_to_shadow(address + size); + + memset(shadow_start, value, shadow_end - shadow_start); +} + +void kasan_unpoison_shadow(const void *address, size_t size) +{ + kasan_poison_shadow(address, size, 0); + + if (size & KASAN_SHADOW_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + *shadow = size & KASAN_SHADOW_MASK; + } +} + + +/* + * All functions below are always inlined so that the compiler can + * perform better optimizations in each of __asan_loadX/__asan_storeX + * depending on the memory access size X. + */ + +static __always_inline bool memory_is_poisoned_1(unsigned long addr) +{ + s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(shadow_value)) { + s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; + return unlikely(last_accessible_byte >= shadow_value); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_2(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 1)) + return true; + + if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_4(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 3)) + return true; + + if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_8(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 7)) + return true; + + if (likely(((addr + 7) &
KASAN_SHADOW_MASK) >= 7)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_16(unsigned long addr) +{ + u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + u16 shadow_first_bytes = *(u16 *)shadow_addr; + s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; + + if (unlikely(shadow_first_bytes)) + return true; + + if (likely(!last_byte)) + return false; + + return memory_is_poisoned_1(addr + 15); + } + + return false; +} + +static __always_inline unsigned long bytes_is_zero(const u8 *start, + size_t size) +{ + while (size) { + if (unlikely(*start)) + return (unsigned long)start; + start++; + size--; + } + + return 0; +} + +static __always_inline unsigned long memory_is_zero(const void *start, + const void *end) +{ + unsigned int words; + unsigned long ret; + unsigned int prefix = (unsigned long)start % 8; + + if (end - start <= 16) + return bytes_is_zero(start, end - start); + + if (prefix) { + prefix = 8 - prefix; + ret = bytes_is_zero(start, prefix); + if (unlikely(ret)) + return ret; + start += prefix; + } + + words = (end - start) / 8; + while (words) { + if (unlikely(*(u64 *)start)) + return bytes_is_zero(start, 8); + start += 8; + words--; + } + + return bytes_is_zero(start, (end - start) % 8); +} + +static __always_inline bool memory_is_poisoned_n(unsigned long addr, + size_t size) +{ + unsigned long ret; + + ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), + kasan_mem_to_shadow((void *)addr + size - 1) + 1); + + if (unlikely(ret)) { + unsigned long last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + + if (unlikely(ret != (unsigned long)last_shadow || + ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + return true; + } + return false; +} + +static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +{ + if (__builtin_constant_p(size)) { + switch (size) { + case 1: 
+ return memory_is_poisoned_1(addr); + case 2: + return memory_is_poisoned_2(addr); + case 4: + return memory_is_poisoned_4(addr); + case 8: + return memory_is_poisoned_8(addr); + case 16: + return memory_is_poisoned_16(addr); + default: + BUILD_BUG(); + } + } + + return memory_is_poisoned_n(addr, size); +} + + +static __always_inline void check_memory_region(unsigned long addr, + size_t size, bool write) +{ + struct kasan_access_info info; + + if (unlikely(size == 0)) + return; + + if (unlikely((void *)addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + info.access_addr = (void *)addr; + info.access_size = size; + info.is_write = write; + info.ip = _RET_IP_; + kasan_report_user_access(&info); + return; + } + + if (likely(!memory_is_poisoned(addr, size))) + return; + + kasan_report(addr, size, write, _RET_IP_); +} + +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + __asan_storeN((unsigned long)addr, len); + + return __memset(addr, c, len); +} + +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + __asan_loadN((unsigned long)src, len); + __asan_storeN((unsigned long)dest, len); + + return __memmove(dest, src, len); +} + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + __asan_loadN((unsigned long)src, len); + __asan_storeN((unsigned long)dest, len); + + return __memcpy(dest, src, len); +} + +void kasan_alloc_pages(struct page *page, unsigned int order) +{ + if (likely(!PageHighMem(page))) + kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); +} + +void kasan_free_pages(struct page *page, unsigned int order) +{ + if (likely(!PageHighMem(page))) + kasan_poison_shadow(page_address(page), + PAGE_SIZE << order, + KASAN_FREE_PAGE); +} + +void kasan_poison_slab(struct page *page) +{ + kasan_poison_shadow(page_address(page), + PAGE_SIZE << compound_order(page), + 
KASAN_KMALLOC_REDZONE); +} + +void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_unpoison_shadow(object, cache->object_size); +} + +void kasan_poison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_poison_shadow(object, + round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), + KASAN_KMALLOC_REDZONE); +} + +void kasan_slab_alloc(struct kmem_cache *cache, void *object) +{ + kasan_kmalloc(cache, object, cache->object_size); +} + +void kasan_slab_free(struct kmem_cache *cache, void *object) +{ + unsigned long size = cache->object_size; + unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return; + + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +{ + unsigned long redzone_start; + unsigned long redzone_end; + + if (unlikely(object == NULL)) + return; + + redzone_start = round_up((unsigned long)(object + size), + KASAN_SHADOW_SCALE_SIZE); + redzone_end = round_up((unsigned long)object + cache->object_size, + KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(object, size); + kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); +} +EXPORT_SYMBOL(kasan_kmalloc); + +void kasan_kmalloc_large(const void *ptr, size_t size) +{ + struct page *page; + unsigned long redzone_start; + unsigned long redzone_end; + + if (unlikely(ptr == NULL)) + return; + + page = virt_to_page(ptr); + redzone_start = round_up((unsigned long)(ptr + size), + KASAN_SHADOW_SCALE_SIZE); + redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); + + kasan_unpoison_shadow(ptr, size); + kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); +} + +void kasan_krealloc(const void *object, size_t size) +{ + struct 
page *page; + + if (unlikely(object == ZERO_SIZE_PTR)) + return; + + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) + kasan_kmalloc_large(object, size); + else + kasan_kmalloc(page->slab_cache, object, size); +} + +void kasan_kfree_large(const void *ptr) +{ + struct page *page = virt_to_page(ptr); + + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); +} + +int kasan_module_alloc(void *addr, size_t size) +{ + void *ret; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, + PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + return ret ? 0 : -ENOMEM; +} + +void kasan_module_free(void *addr) +{ + vfree(kasan_mem_to_shadow(addr)); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(global->beg, global->size); + + kasan_poison_shadow(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); +} + +void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void __asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region(addr, size, false); \ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void 
__asan_store##size(unsigned long addr) \ + { \ + check_memory_region(addr, size, true); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_store##size##_noabort) + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); +DEFINE_ASAN_LOAD_STORE(16); + +void __asan_loadN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, false); +} +EXPORT_SYMBOL(__asan_loadN); + +__alias(__asan_loadN) +void __asan_loadN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_loadN_noabort); + +void __asan_storeN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, true); +} +EXPORT_SYMBOL(__asan_storeN); + +__alias(__asan_storeN) +void __asan_storeN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_storeN_noabort); + +/* to shut up compiler complaints */ +void __asan_handle_no_return(void) {} +EXPORT_SYMBOL(__asan_handle_no_return); + +#ifdef CONFIG_MEMORY_HOTPLUG +static int kasan_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + return (action == MEM_GOING_ONLINE) ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static int __init kasan_memhotplug_init(void) +{ + pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("Memory hot-add will be disabled\n"); + + hotplug_memory_notifier(kasan_mem_notifier, 0); + + return 0; +} + +module_init(kasan_memhotplug_init); +#endif diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h new file mode 100644 index 000000000000..4986b0acab21 --- /dev/null +++ b/mm/kasan/kasan.h @@ -0,0 +1,75 @@ +#ifndef __MM_KASAN_KASAN_H +#define __MM_KASAN_KASAN_H + +#include <linux/kasan.h> + +#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) +#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) + +#define KASAN_FREE_PAGE 0xFF /* page was freed */ +#define KASAN_FREE_PAGE 0xFF /* page was freed */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ +#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ +#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ + +/* + * Stack redzone shadow values + * (Those are compiler's ABI, don't change them) + */ +#define KASAN_STACK_LEFT 0xF1 +#define KASAN_STACK_MID 0xF2 +#define KASAN_STACK_RIGHT 0xF3 +#define KASAN_STACK_PARTIAL 0xF4 + +/* Don't break randconfig/all*config builds */ +#ifndef KASAN_ABI_VERSION +#define KASAN_ABI_VERSION 1 +#endif + +struct kasan_access_info { + const void *access_addr; + const void *first_bad_addr; + size_t access_size; + bool is_write; + unsigned long ip; +}; + +/* The layout of struct dictated by compiler */ +struct kasan_source_location { + const char *filename; + int line_no; + int column_no; +}; + +/* The layout of struct dictated by compiler */ +struct kasan_global { + const void *beg; /* Address of the beginning of the global variable. */ + size_t size; /* Size of the global variable. */ + size_t size_with_redzone; /* Size of the variable + size of the red zone. 
32 bytes aligned */ + const void *name; + const void *module_name; /* Name of the module where the global variable is declared. */ + unsigned long has_dynamic_init; /* This needed for C++ */ +#if KASAN_ABI_VERSION >= 4 + struct kasan_source_location *location; +#endif +}; + +void kasan_report_error(struct kasan_access_info *info); +void kasan_report_user_access(struct kasan_access_info *info); + +static inline const void *kasan_shadow_to_mem(const void *shadow_addr) +{ + return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) + << KASAN_SHADOW_SCALE_SHIFT); +} + +static inline bool kasan_enabled(void) +{ + return !current->kasan_depth; +} + +void kasan_report(unsigned long addr, size_t size, + bool is_write, unsigned long ip); + +#endif diff --git a/mm/kasan/report.c b/mm/kasan/report.c new file mode 100644 index 000000000000..680ceedf810a --- /dev/null +++ b/mm/kasan/report.c @@ -0,0 +1,269 @@ +/* + * This file contains error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * + * Some of code borrowed from https://github.com/xairy/linux by + * Andrey Konovalov <adech.fo@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> + +#include <asm/sections.h> + +#include "kasan.h" +#include "../slab.h" + +/* Shadow layout customization. 
*/ +#define SHADOW_BYTES_PER_BLOCK 1 +#define SHADOW_BLOCKS_PER_ROW 16 +#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) +#define SHADOW_ROWS_AROUND_ADDR 2 + +static const void *find_first_bad_addr(const void *addr, size_t size) +{ + u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); + const void *first_bad_addr = addr; + + while (!shadow_val && first_bad_addr < addr + size) { + first_bad_addr += KASAN_SHADOW_SCALE_SIZE; + shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); + } + return first_bad_addr; +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = "unknown crash"; + u8 shadow_val; + + info->first_bad_addr = find_first_bad_addr(info->access_addr, + info->access_size); + + shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + switch (shadow_val) { + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use after free"; + break; + case KASAN_PAGE_REDZONE: + case KASAN_KMALLOC_REDZONE: + case KASAN_GLOBAL_REDZONE: + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + bug_type = "out of bounds access"; + break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "out of bounds on stack"; + break; + } + + pr_err("BUG: KASan: %s in %pS at addr %p\n", + bug_type, (void *)info->ip, + info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, task_pid_nr(current)); +} + +static inline bool kernel_or_module_addr(const void *addr) +{ + return (addr >= (void *)_stext && addr < (void *)_end) + || (addr >= (void *)MODULES_VADDR + && addr < (void *)MODULES_END); +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + +static void print_address_description(struct kasan_access_info *info) +{ + const void *addr = info->access_addr; + + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) { + struct page *page = virt_to_head_page(addr); + + if (PageSlab(page)) { + void *object; + struct kmem_cache *cache = page->slab_cache; + void *last_object; + + object = virt_to_obj(cache, page_address(page), addr); + last_object = page_address(page) + + page->objects * cache->size; + + if (unlikely(object > last_object)) + object = last_object; /* we hit into padding */ + + object_err(cache, page, object, + "kasan: bad access detected"); + return; + } + dump_page(page, "kasan: bad access detected"); + } + + if (kernel_or_module_addr(addr)) { + if (!init_task_stack_addr(addr)) + pr_err("Address belongs to variable %pS\n", addr); + } + + dump_stack(); +} + +static bool row_is_guilty(const void *row, const void *guilty) +{ + return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); +} + +static int shadow_pointer_offset(const void *row, const void *shadow) +{ + /* The length of ">ff00ff00ff00ff00: " is + * 3 + (BITS_PER_LONG/8)*2 chars. 
+ */ + return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + + (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; +} + +static void print_shadow_for_address(const void *addr) +{ + int i; + const void *shadow = kasan_mem_to_shadow(addr); + const void *shadow_row; + + shadow_row = (void *)round_down((unsigned long)shadow, + SHADOW_BYTES_PER_ROW) + - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; + + pr_err("Memory state around the buggy address:\n"); + + for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { + const void *kaddr = kasan_shadow_to_mem(shadow_row); + char buffer[4 + (BITS_PER_LONG/8)*2]; + + snprintf(buffer, sizeof(buffer), + (i == 0) ? ">%p: " : " %p: ", kaddr); + + kasan_disable_current(); + print_hex_dump(KERN_ERR, buffer, + DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, + shadow_row, SHADOW_BYTES_PER_ROW, 0); + kasan_enable_current(); + + if (row_is_guilty(shadow_row, shadow)) + pr_err("%*c\n", + shadow_pointer_offset(shadow_row, shadow), + '^'); + + shadow_row += SHADOW_BYTES_PER_ROW; + } +} + +static DEFINE_SPINLOCK(report_lock); + +void kasan_report_error(struct kasan_access_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&report_lock, flags); + pr_err("=================================" + "=================================\n"); + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + pr_err("=================================" + "=================================\n"); + spin_unlock_irqrestore(&report_lock, flags); +} + +void kasan_report_user_access(struct kasan_access_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&report_lock, flags); + pr_err("=================================" + "=================================\n"); + pr_err("BUG: KASan: user-memory-access on address %p\n", + info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, task_pid_nr(current)); + dump_stack(); + pr_err("=================================" + "=================================\n"); + spin_unlock_irqrestore(&report_lock, flags); +} + +void kasan_report(unsigned long addr, size_t size, + bool is_write, unsigned long ip) +{ + struct kasan_access_info info; + + if (likely(!kasan_enabled())) + return; + + info.access_addr = (void *)addr; + info.access_size = size; + info.is_write = is_write; + info.ip = ip; + kasan_report_error(&info); +} + + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3cda50c1e394..5405aff5a590 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -98,6 +98,7 @@ #include <asm/processor.h> #include <linux/atomic.h> +#include <linux/kasan.h> #include <linux/kmemcheck.h> #include <linux/kmemleak.h> #include <linux/memory_hotplug.h> @@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object) 
if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) return false; + kasan_disable_current(); object->checksum = crc32(0, (void *)object->pointer, object->size); + kasan_enable_current(); + return object->checksum != old_csum; } @@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end, BYTES_PER_POINTER)) continue; + kasan_disable_current(); pointer = *ptr; + kasan_enable_current(); object = find_and_get_object(pointer, 1); if (!object) diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -9,18 +9,100 @@ #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/memcontrol.h> + +#ifdef CONFIG_MEMCG_KMEM +static LIST_HEAD(list_lrus); +static DEFINE_MUTEX(list_lrus_mutex); + +static void list_lru_register(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_add(&lru->list, &list_lrus); + mutex_unlock(&list_lrus_mutex); +} + +static void list_lru_unregister(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_del(&lru->list); + mutex_unlock(&list_lrus_mutex); +} +#else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return !!lru->node[0].memcg_lrus; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + /* + * The lock protects the array of per cgroup lists from relocation + * (see memcg_update_list_lru_node). 
+ */ + lockdep_assert_held(&nlru->lock); + if (nlru->memcg_lrus && idx >= 0) + return nlru->memcg_lrus->lru[idx]; + + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + struct mem_cgroup *memcg; + + if (!nlru->memcg_lrus) + return &nlru->lru; + + memcg = mem_cgroup_from_kmem(ptr); + if (!memcg) + return &nlru->lru; + + return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +} +#else +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return false; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + return &nlru->lru; +} +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); + l = list_lru_from_kmem(nlru, item); if (list_empty(item)) { - list_add_tail(item, &nlru->list); - if (nlru->nr_items++ == 0) - node_set(nid, lru->active_nodes); + list_add_tail(item, &l->list); + l->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); + l = list_lru_from_kmem(nlru, item); if (!list_empty(item)) { list_del_init(item); - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); - WARN_ON_ONCE(nlru->nr_items < 0); + l->nr_items--; spin_unlock(&nlru->lock); return true; } @@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); -unsigned long -list_lru_count_node(struct list_lru *lru, int nid) +void 
list_lru_isolate(struct list_lru_one *list, struct list_head *item) +{ + list_del_init(item); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate); + +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head) +{ + list_move(item, head); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate_move); + +static unsigned long __list_lru_count_one(struct list_lru *lru, + int nid, int memcg_idx) { - unsigned long count = 0; struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + unsigned long count; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); - count += nlru->nr_items; + l = list_lru_from_memcg_idx(nlru, memcg_idx); + count = l->nr_items; spin_unlock(&nlru->lock); return count; } + +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) +{ + return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); +} +EXPORT_SYMBOL_GPL(list_lru_count_one); + +unsigned long list_lru_count_node(struct list_lru *lru, int nid) +{ + long count = 0; + int memcg_idx; + + count += __list_lru_count_one(lru, nid, -1); + if (list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) + count += __list_lru_count_one(lru, nid, memcg_idx); + } + return count; +} EXPORT_SYMBOL_GPL(list_lru_count_node); -unsigned long -list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, - void *cb_arg, unsigned long *nr_to_walk) +static unsigned long +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; spin_lock(&nlru->lock); + l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: - list_for_each_safe(item, n, &nlru->list) { + list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* @@ 
-85,14 +206,11 @@ restart: break; --*nr_to_walk; - ret = isolate(item, &nlru->lock, cb_arg); + ret = isolate(item, l, &nlru->lock, cb_arg); switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); - WARN_ON_ONCE(nlru->nr_items < 0); isolated++; /* * If the lru lock has been dropped, our list @@ -103,7 +221,7 @@ restart: goto restart; break; case LRU_ROTATE: - list_move_tail(item, &nlru->list); + list_move_tail(item, &l->list); break; case LRU_SKIP: break; @@ -122,31 +240,322 @@ restart: spin_unlock(&nlru->lock); return isolated; } + +unsigned long +list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), + isolate, cb_arg, nr_to_walk); +} +EXPORT_SYMBOL_GPL(list_lru_walk_one); + +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + long isolated = 0; + int memcg_idx; + + isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, + nr_to_walk); + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) { + isolated += __list_lru_walk_one(lru, nid, memcg_idx, + isolate, cb_arg, nr_to_walk); + if (*nr_to_walk <= 0) + break; + } + } + return isolated; +} EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) +static void init_one_lru(struct list_lru_one *l) +{ + INIT_LIST_HEAD(&l->list); + l->nr_items = 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) + kfree(memcg_lrus->lru[i]); +} + +static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i
< end; i++) { + struct list_lru_one *l; + + l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); + if (!l) + goto fail; + + init_one_lru(l); + memcg_lrus->lru[i] = l; + } + return 0; +fail: + __memcg_destroy_list_lru_node(memcg_lrus, begin, i); /* [begin, i) */ + return -ENOMEM; +} + +static int memcg_init_list_lru_node(struct list_lru_node *nlru) +{ + int size = memcg_nr_cache_ids; + + nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + if (!nlru->memcg_lrus) + return -ENOMEM; + + if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { + kfree(nlru->memcg_lrus); + return -ENOMEM; + } + + return 0; +} + +static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) +{ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); + kfree(nlru->memcg_lrus); +} + +static int memcg_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + struct list_lru_memcg *old, *new; + + BUG_ON(old_size > new_size); + + old = nlru->memcg_lrus; + new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (__memcg_init_list_lru_node(new, old_size, new_size)) { + kfree(new); + return -ENOMEM; + } + + memcpy(new, old, old_size * sizeof(void *)); + + /* + * The lock guarantees that we won't race with a reader + * (see list_lru_from_memcg_idx). + * + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock.
+ */ + spin_lock_irq(&nlru->lock); + nlru->memcg_lrus = new; + spin_unlock_irq(&nlru->lock); + + kfree(old); + return 0; +} + +static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + /* do not bother shrinking the array back to the old size, because we + * cannot handle allocation failures here */ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); +} + +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) { + if (!memcg_aware) + lru->node[i].memcg_lrus = NULL; + else if (memcg_init_list_lru_node(&lru->node[i])) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_destroy_list_lru_node(&lru->node[i]); + return -ENOMEM; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_destroy_list_lru_node(&lru->node[i]); +} + +static int memcg_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return 0; + + for (i = 0; i < nr_node_ids; i++) { + if (memcg_update_list_lru_node(&lru->node[i], + old_size, new_size)) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); + return -ENOMEM; +} + +static void memcg_cancel_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); +} + +int memcg_update_all_list_lrus(int new_size) +{ + int ret = 0; + struct list_lru *lru; + int old_size = memcg_nr_cache_ids; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) { + ret = memcg_update_list_lru(lru, old_size, new_size); + if (ret) + goto fail; + } +out: + 
mutex_unlock(&list_lrus_mutex); + return ret; +fail: + list_for_each_entry_continue_reverse(lru, &list_lrus, list) + memcg_cancel_update_list_lru(lru, old_size, new_size); + goto out; +} + +static void memcg_drain_list_lru_node(struct list_lru_node *nlru, + int src_idx, int dst_idx) +{ + struct list_lru_one *src, *dst; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. + */ + spin_lock_irq(&nlru->lock); + + src = list_lru_from_memcg_idx(nlru, src_idx); + dst = list_lru_from_memcg_idx(nlru, dst_idx); + + list_splice_init(&src->list, &dst->list); + dst->nr_items += src->nr_items; + src->nr_items = 0; + + spin_unlock_irq(&nlru->lock); +} + +static void memcg_drain_list_lru(struct list_lru *lru, + int src_idx, int dst_idx) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); +} + +void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +{ + struct list_lru *lru; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) + memcg_drain_list_lru(lru, src_idx, dst_idx); + mutex_unlock(&list_lrus_mutex); +} +#else +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + return 0; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; + int err = -ENOMEM; + + memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); if (!lru->node) - return -ENOMEM; + goto out; - nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); - INIT_LIST_HEAD(&lru->node[i].list); - lru->node[i].nr_items = 0; + init_one_lru(&lru->node[i].lru); } - return 0; + + err = 
memcg_init_list_lru(lru, memcg_aware); + if (err) { + kfree(lru->node); + goto out; + } + + list_lru_register(lru); +out: + memcg_put_cache_ids(); + return err; } -EXPORT_SYMBOL_GPL(list_lru_init_key); +EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { + /* Already destroyed or not yet initialized? */ + if (!lru->node) + return; + + memcg_get_cache_ids(); + + list_lru_unregister(lru); + + memcg_destroy_list_lru(lru); kfree(lru->node); + lru->node = NULL; + + memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/madvise.c b/mm/madvise.c index d79fb5e8f80a..d551475517bf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -222,21 +222,24 @@ static long madvise_willneed(struct vm_area_struct *vma, struct file *file = vma->vm_file; #ifdef CONFIG_SWAP - if (!file || mapping_cap_swap_backed(file->f_mapping)) { + if (!file) { *prev = vma; - if (!file) - force_swapin_readahead(vma, start, end); - else - force_shm_swapin_readahead(vma, start, end, - file->f_mapping); + force_swapin_readahead(vma, start, end); return 0; } -#endif + if (shmem_mapping(file->f_mapping)) { + *prev = vma; + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#else if (!file) return -EBADF; +#endif - if (file->f_mapping->a_ops->get_xip_mem) { + if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 095c1f96fbec..d18d3a6e7337 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -332,8 +332,10 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* Index in the kmem_cache->memcg_params->memcg_caches array */ + /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; + bool kmem_acct_activated; + bool kmem_acct_active; #endif int last_scanned_node; @@ -352,9 +354,9 @@ struct mem_cgroup { }; #ifdef CONFIG_MEMCG_KMEM -static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +bool 
memcg_kmem_is_active(struct mem_cgroup *memcg) { - return memcg->kmemcg_id >= 0; + return memcg->kmem_acct_active; } #endif @@ -517,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(tcp_proto_cgroup); -static void disarm_sock_keys(struct mem_cgroup *memcg) -{ - if (!memcg_proto_activated(&memcg->tcp_mem)) - return; - static_key_slow_dec(&memcg_socket_limit_enabled); -} -#else -static void disarm_sock_keys(struct mem_cgroup *memcg) -{ -} #endif #ifdef CONFIG_MEMCG_KMEM /* - * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: * this works better in sparse environments, where we have a lot of memcgs, * but only a few kmem-limited. Or also, if we have, for instance, 200 * memcgs, and none but the 200th is kmem-limited, we'd have to have a * 200 entry array for that. * - * The current size of the caches array is stored in - * memcg_limited_groups_array_size. It will double each time we have to - * increase it. + * The current size of the caches array is stored in memcg_nr_cache_ids. It + * will double each time we have to increase it. 
*/ -static DEFINE_IDA(kmem_limited_groups); -int memcg_limited_groups_array_size; +static DEFINE_IDA(memcg_cache_ida); +int memcg_nr_cache_ids; + +/* Protects memcg_nr_cache_ids */ +static DECLARE_RWSEM(memcg_cache_ids_sem); + +void memcg_get_cache_ids(void) +{ + down_read(&memcg_cache_ids_sem); +} + +void memcg_put_cache_ids(void) +{ + up_read(&memcg_cache_ids_sem); +} /* * MIN_SIZE is different than 1, because we would like to avoid going through @@ -569,32 +573,8 @@ int memcg_limited_groups_array_size; struct static_key memcg_kmem_enabled_key; EXPORT_SYMBOL(memcg_kmem_enabled_key); -static void memcg_free_cache_id(int id); - -static void disarm_kmem_keys(struct mem_cgroup *memcg) -{ - if (memcg_kmem_is_active(memcg)) { - static_key_slow_dec(&memcg_kmem_enabled_key); - memcg_free_cache_id(memcg->kmemcg_id); - } - /* - * This check can't live in kmem destruction function, - * since the charges will outlive the cgroup - */ - WARN_ON(page_counter_read(&memcg->kmem)); -} -#else -static void disarm_kmem_keys(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static void disarm_static_keys(struct mem_cgroup *memcg) -{ - disarm_sock_keys(memcg); - disarm_kmem_keys(memcg); -} - static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { @@ -2538,18 +2518,19 @@ static int memcg_alloc_cache_id(void) int id, size; int err; - id = ida_simple_get(&kmem_limited_groups, + id = ida_simple_get(&memcg_cache_ida, 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); if (id < 0) return id; - if (id < memcg_limited_groups_array_size) + if (id < memcg_nr_cache_ids) return id; /* * There's no space for the new id in memcg_caches arrays, * so we have to grow them. 
*/ + down_write(&memcg_cache_ids_sem); size = 2 * (id + 1); if (size < MEMCG_CACHES_MIN_SIZE) @@ -2558,8 +2539,15 @@ static int memcg_alloc_cache_id(void) size = MEMCG_CACHES_MAX_SIZE; err = memcg_update_all_caches(size); + if (!err) + err = memcg_update_all_list_lrus(size); + if (!err) + memcg_nr_cache_ids = size; + + up_write(&memcg_cache_ids_sem); + if (err) { - ida_simple_remove(&kmem_limited_groups, id); + ida_simple_remove(&memcg_cache_ida, id); return err; } return id; @@ -2567,17 +2555,7 @@ static int memcg_alloc_cache_id(void) static void memcg_free_cache_id(int id) { - ida_simple_remove(&kmem_limited_groups, id); -} - -/* - * We should update the current array size iff all caches updates succeed. This - * can only be done from the slab side. The slab mutex needs to be held when - * calling this. - */ -void memcg_update_array_size(int num) -{ - memcg_limited_groups_array_size = num; + ida_simple_remove(&memcg_cache_ida, id); } struct memcg_kmem_cache_create_work { @@ -2656,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + int kmemcg_id; - VM_BUG_ON(!cachep->memcg_params); - VM_BUG_ON(!cachep->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(cachep)); if (current->memcg_kmem_skip_account) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - if (!memcg_kmem_is_active(memcg)) + kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + if (kmemcg_id < 0) goto out; - memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); + memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); if (likely(memcg_cachep)) return memcg_cachep; @@ -2692,7 +2671,7 @@ out: void __memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params->memcg->css); + css_put(&cachep->memcg_params.memcg->css); } /* @@ -2757,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, 1 << 
order); page->mem_cgroup = NULL; } + +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) +{ + struct mem_cgroup *memcg = NULL; + struct kmem_cache *cachep; + struct page *page; + + page = virt_to_head_page(ptr); + if (PageSlab(page)) { + cachep = page->slab_cache; + if (!is_root_cache(cachep)) + memcg = cachep->memcg_params.memcg; + } else + /* page allocated by alloc_kmem_pages */ + memcg = page->mem_cgroup; + + return memcg; +} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -3291,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, int err = 0; int memcg_id; - if (memcg_kmem_is_active(memcg)) - return 0; + BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_activated); + BUG_ON(memcg->kmem_acct_active); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -3335,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * patched. */ memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_activated = true; + memcg->kmem_acct_active = true; out: return err; } @@ -4014,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + + if (!memcg->kmem_acct_active) + return; + + /* + * Clear the 'active' flag before clearing memcg_caches arrays entries. + * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it + * guarantees no cache will be created for this cgroup after we are + * done (see memcg_create_kmem_cache()). 
+ */ + memcg->kmem_acct_active = false; + + memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). + */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_destroy_kmem_caches(memcg); + if (memcg->kmem_acct_activated) { + memcg_destroy_kmem_caches(memcg); + static_key_slow_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } mem_cgroup_sockets_destroy(memcg); } #else @@ -4025,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } @@ -4443,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); - - disarm_static_keys(memcg); kfree(memcg); } @@ -4581,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) spin_unlock(&memcg->event_list_lock); vmpressure_cleanup(&memcg->vmpressure); + + memcg_deactivate_kmem(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 
feb803bf3443..d487f8dc6d39 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) * Only call shrink_node_slabs here (which would also shrink * other caches) if access is not potentially fatal. */ - if (access) { - int nr; - int nid = page_to_nid(p); - do { - nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); - if (page_count(p) == 1) - break; - } while (nr > 10); - } + if (access) + drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); @@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags) * setting PG_hwpoison. */ if (!is_free_buddy_page(page)) - lru_add_drain_all(); - if (!is_free_buddy_page(page)) drain_all_pages(page_zone(page)); SetPageHWPoison(page); if (!is_free_buddy_page(page)) diff --git a/mm/memory.c b/mm/memory.c index bbe6a73a899d..8068893697bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = page; + vmf.cow_page = NULL; ret = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2329,6 +2330,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; + /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); @@ -2638,7 +2640,8 @@ oom: * See filemap_fault() and __lock_page_retry(). 
*/ static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, struct page **page) + pgoff_t pgoff, unsigned int flags, + struct page *cow_page, struct page **page) { struct vm_fault vmf; int ret; @@ -2647,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + if (!vmf.page) + goto out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2664,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + out: *page = vmf.page; return ret; } @@ -2834,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -2874,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - copy_user_highpage(new_page, fault_page, address, vma); + if (fault_page) + copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to 
lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg); @@ -2912,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3013,14 +3037,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. * - * ptep_modify_prot_start is not called as this is clearing - * the _PAGE_NUMA bit and it is not really expected that there - * would be concurrent hardware modifications to the PTE. + * We can safely just do a "set_pte_at()", because the old + * page table entry is not accessible, so there would be no + * concurrent hardware modifications to the PTE. 
*/ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); @@ -3029,7 +3056,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } - pte = pte_mknonnuma(pte); + /* Make it present again */ + pte = pte_modify(pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3038,7 +3067,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } - BUG_ON(is_zero_pfn(page_to_pfn(page))); /* * Avoid grouping on DSO/COW pages in specific and RO pages @@ -3124,7 +3152,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } - if (pte_numa(entry)) + if (pte_protnone(entry)) return do_numa_page(mm, vma, address, entry, pte, pmd); ptl = pte_lockptr(mm, pmd); @@ -3202,7 +3230,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_trans_splitting(orig_pmd)) return 0; - if (pmd_numa(orig_pmd)) + if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); @@ -3458,7 +3486,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, if (follow_phys(vma, addr, write, &prot, &phys_addr)) return -EINVAL; - maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (write) memcpy_toio(maddr + offset, buf, len); else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1bd23803576..4721046a134a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); @@ -2817,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += snprintf(p, buffer + maxlen - p, 
"relative"); } - if (!nodes_empty(nodes)) { - p += snprintf(p, buffer + maxlen - p, ":"); - p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); - } + if (!nodes_empty(nodes)) + p += scnprintf(p, buffer + maxlen - p, ":%*pbl", + nodemask_pr_args(&nodes)); } diff --git a/mm/migrate.c b/mm/migrate.c index f98067e5d353..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ - struct page *page = pmd_page(*pmd); - wait_on_page_locked(page); -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -1853,7 +1847,7 @@ out_fail: out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { - entry = pmd_mknonnuma(entry); + entry = pmd_modify(entry, vma->vm_page_prot); set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } diff --git a/mm/mm_init.c b/mm/mm_init.c index 4074caf9936b..5f420f7fafa1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -14,14 +14,14 @@ #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT -int mminit_loglevel; +int __meminitdata mminit_loglevel; #ifndef SECTIONS_SHIFT #define SECTIONS_SHIFT 0 #endif /* The zonelists are simply reported, validation is manual. */ -void mminit_verify_zonelist(void) +void __init mminit_verify_zonelist(void) { int nid; diff --git a/mm/mprotect.c b/mm/mprotect.c index 33121662f08b..44727811bf4c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,36 +75,34 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool updated = false; - if (!prot_numa) { - ptent = ptep_modify_prot_start(mm, addr, pte); - if (pte_numa(ptent)) - ptent = pte_mknonnuma(ptent); - ptent = pte_modify(ptent, newprot); - /* - * Avoid taking write faults for pages we - * know to be dirty. 
- */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) - ptent = pte_mkwrite(ptent); - ptep_modify_prot_commit(mm, addr, pte, ptent); - updated = true; - } else { + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { struct page *page; page = vm_normal_page(vma, addr, oldpte); - if (page && !PageKsm(page)) { - if (!pte_numa(oldpte)) { - ptep_set_numa(mm, addr, pte); - updated = true; - } - } + if (!page || PageKsm(page)) + continue; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + continue; } - if (updated) - pages++; + + ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + + /* Avoid taking write faults for known dirty pages */ + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) { + ptent = pte_mkwrite(ptent); + } + ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/nommu.c b/mm/nommu.c index 1a19fb3b0463..7296360fc057 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -980,9 +980,6 @@ static int validate_mmap_request(struct file *file, return -EOVERFLOW; if (file) { - /* validate file mapping requests */ - struct address_space *mapping; - /* files must support mmap */ if (!file->f_op->mmap) return -ENODEV; @@ -991,28 +988,22 @@ static int validate_mmap_request(struct file *file, * - we support chardevs that provide their own "memory" * - we support files/blockdevs that are memory backed */ - mapping = file->f_mapping; - if (!mapping) - mapping = file_inode(file)->i_mapping; - - capabilities = 0; - if (mapping && mapping->backing_dev_info) - capabilities = mapping->backing_dev_info->capabilities; - - if (!capabilities) { + if (file->f_op->mmap_capabilities) { + capabilities = file->f_op->mmap_capabilities(file); + } 
else { /* no explicit capabilities set, so assume some * defaults */ switch (file_inode(file)->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; break; case S_IFCHR: capabilities = - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP; + NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | + NOMMU_MAP_WRITE; break; default: @@ -1023,9 +1014,9 @@ static int validate_mmap_request(struct file *file, /* eliminate any capabilities that we can't support on this * device */ if (!file->f_op->get_unmapped_area) - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (!file->f_op->read) - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; /* The file shall have been opened with read permission. */ if (!(file->f_mode & FMODE_READ)) @@ -1044,29 +1035,29 @@ static int validate_mmap_request(struct file *file, if (locks_verify_locked(file)) return -EAGAIN; - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; /* we mustn't privatise shared mappings */ - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; } else { /* we're going to read the file into private memory we * allocate */ - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) return -ENODEV; /* we don't permit a private writable mapping to be * shared with the backing device */ if (prot & PROT_WRITE) - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } - if (capabilities & BDI_CAP_MAP_DIRECT) { - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + if (capabilities & NOMMU_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || + ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || + ((prot & PROT_EXEC) && !(capabilities & 
NOMMU_MAP_EXEC)) ) { - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { printk(KERN_WARNING "MAP_SHARED not completely supported on !MMU\n"); @@ -1083,21 +1074,21 @@ static int validate_mmap_request(struct file *file, } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { - if (capabilities & BDI_CAP_EXEC_MAP) + if (capabilities & NOMMU_MAP_EXEC) prot |= PROT_EXEC; } } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && - !(capabilities & BDI_CAP_EXEC_MAP) + !(capabilities & NOMMU_MAP_EXEC) ) { /* backing file is not executable, try to copy */ - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } } else { /* anonymous mappings are always memory backed and can be * privately mapped */ - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; /* handle PROT_EXEC implication by PROT_READ */ if ((prot & PROT_READ) && @@ -1129,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) { + if (!(capabilities & NOMMU_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) @@ -1138,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file, /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); + vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); if (flags & MAP_SHARED) vm_flags |= VM_SHARED; } @@ -1191,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if 
(capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ @@ -1380,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file, if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { /* new mapping is not a subset of the region */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) goto sharing_violation; continue; } @@ -1419,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) { @@ -1431,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file, * the mapping so we'll have to attempt to copy * it */ ret = -ENODEV; - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) goto error_just_free; - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } else { vma->vm_start = region->vm_start = addr; vma->vm_end = region->vm_end = addr + len; @@ -1445,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = region; /* set up the mapping - * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + * - the region is filled in if NOMMU_MAP_DIRECT is still set */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6a73e47e81c6..45e187b2d971 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long task_ratelimit; unsigned long dirty_ratelimit; unsigned long pos_ratio; - struct backing_dev_info *bdi = 
mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); int ratelimit; int *p; @@ -1929,7 +1929,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + __inc_bdi_stat(bdi, BDI_RECLAIMABLE); + __inc_bdi_stat(bdi, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); } } EXPORT_SYMBOL(account_page_redirty); @@ -2298,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page) */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); return 1; } 
@@ -2316,7 +2318,7 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2351,7 +2353,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2405,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged); */ void wait_for_stable_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - struct backing_dev_info *bdi = mapping->backing_dev_info; - - if (!bdi_cap_stable_pages_required(bdi)) - return; - - wait_on_page_writeback(page); + if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) + wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d52ab18fe0d..a47f0b229a1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -25,6 +25,7 @@ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kmemcheck.h> +#include <linux/kasan.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); * 1G machine -> (16M dma, 784M normal, 224M high) * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL - * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA * * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation @@ 
-787,6 +788,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); + kasan_free_pages(page, order); if (PageAnon(page)) page->mapping = NULL; @@ -970,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -3871,18 +3874,29 @@ static int __build_all_zonelists(void *data) return 0; } +static noinline void __init +build_all_zonelists_init(void) +{ + __build_all_zonelists(NULL); + mminit_verify_zonelist(); + cpuset_init_current_mems_allowed(); +} + /* * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. + * + * __ref due to (1) call of __meminit annotated setup_zone_pageset + * [we're only called with non-NULL zone through __meminit paths] and + * (2) call of __init annotated helper build_all_zonelists_init + * [protected by SYSTEM_BOOTING]. 
*/ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); if (system_state == SYSTEM_BOOTING) { - __build_all_zonelists(NULL); - mminit_verify_zonelist(); - cpuset_init_current_mems_allowed(); + build_all_zonelists_init(); } else { #ifdef CONFIG_MEMORY_HOTPLUG if (zone) diff --git a/mm/page_io.c b/mm/page_io.c index 955db8b0d497..e6045804c8d8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, .bv_len = PAGE_SIZE, .bv_offset = 0 }; - struct iov_iter from = { - .type = ITER_BVEC | WRITE, - .count = PAGE_SIZE, - .iov_offset = 0, - .nr_segs = 1, - }; - from.bvec = &bv; /* older gcc versions are broken */ + struct iov_iter from; + iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; diff --git a/mm/percpu.c b/mm/percpu.c index d39e2f4e335c..73c97a5f4495 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { - static char cpus_buf[4096] __initdata; static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t dyn_size = ai->dyn_size; @@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int *unit_map; int group, unit, i; - cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); - #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ pr_emerg("PERCPU: failed to initialize, %s", #cond); \ - pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ + cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ } \ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..c25f94b33811 100644 --- 
a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; - if (pmd_numa(entry)) - entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -27,7 +27,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(inode_to_bdi(mapping->host))) return; /* do read-ahead */ diff --git a/mm/shmem.c b/mm/shmem.c index 864c878401e6..a63031fa3e0c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info __read_mostly = { - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * might use ->writepage of its underlying filesystem, in which case - * tmpfs should write out to swap only in response to memory 
pressure, - * and not for the writeback threads or sync. + * Our capabilities prevent regular writeback or sync from ever calling + * shmem_writepage; but a stacking filesystem might use ->writepage of + * its underlying filesystem, in which case tmpfs should write out to + * swap only in response to memory pressure, and not for the writeback + * threads or sync. */ if (!wbc->for_reclaim) { WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ @@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); @@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - return mapping->backing_dev_info == &shmem_backing_dev_info; + return mapping->host->i_sb->s_op == &shmem_ops; } #ifdef CONFIG_TMPFS @@ -3225,10 +3219,6 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = bdi_init(&shmem_backing_dev_info); - if (error) - goto out4; - error = shmem_init_inodecache(); if (error) goto out3; @@ -3252,8 +3242,6 @@ out1: out2: shmem_destroy_inodecache(); out3: - bdi_destroy(&shmem_backing_dev_info); -out4: shm_mnt = ERR_PTR(error); return error; } diff --git a/mm/slab.c b/mm/slab.c index 65b5dcb6f671..c4b89eaf4c96 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2382,7 +2382,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) { int ret = 0; int node; @@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __kmem_cache_shrink(cachep); + int rc = __kmem_cache_shrink(cachep, false); if (rc) return 
rc; @@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { int ret; - struct kmem_cache *c = NULL; - int i = 0; + struct kmem_cache *c; ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); @@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, if ((ret < 0) || !is_root_cache(cachep)) return ret; - VM_BUG_ON(!mutex_is_locked(&slab_mutex)); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(cachep, i); - if (c) - /* return value determined by the parent cache only */ - __do_tune_cpucache(c, limit, batchcount, shared, gfp); + lockdep_assert_held(&slab_mutex); + for_each_memcg_cache(c, cachep) { + /* return value determined by the root cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); } return ret; diff --git a/mm/slab.h b/mm/slab.h index 90430d6f665e..4c3ac12dd644 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); -struct mem_cgroup; - int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)); @@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *, bool); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); #ifdef CONFIG_MEMCG_KMEM +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. 
+ */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.list, \ + memcg_params.list) + +#define for_each_memcg_cache_safe(iter, tmp, root) \ + list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ + memcg_params.list) + static inline bool is_root_cache(struct kmem_cache *s) { - return !s->memcg_params || s->memcg_params->is_root_cache; + return s->memcg_params.is_root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, - struct kmem_cache *p) + struct kmem_cache *p) { - return (p == s) || - (s->memcg_params && (p == s->memcg_params->root_cache)); + return p == s || p == s->memcg_params.root_cache; } /* @@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, static inline const char *cache_name(struct kmem_cache *s) { if (!is_root_cache(s)) - return s->memcg_params->root_cache->name; + s = s->memcg_params.root_cache; return s->name; } /* * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. - * That said the caller must assure the memcg's cache won't go away. Since once - * created a memcg's cache is destroyed only along with the root cache, it is - * true if we are going to allocate from the cache or hold a reference to the - * root cache by other means. Otherwise, we should hold either the slab_mutex - * or the memcg's slab_caches_mutex while calling this function and accessing - * the returned value. + * That said the caller must assure the memcg's cache won't go away by either + * taking a css reference to the owner cgroup, or holding the slab_mutex. */ static inline struct kmem_cache * cache_from_memcg_idx(struct kmem_cache *s, int idx) { struct kmem_cache *cachep; - struct memcg_cache_params *params; - - if (!s->memcg_params) - return NULL; + struct memcg_cache_array *arr; rcu_read_lock(); - params = rcu_dereference(s->memcg_params); + arr = rcu_dereference(s->memcg_params.memcg_caches); /* * Make sure we will access the up-to-date value. 
The code updating * memcg_caches issues a write barrier to match this (see - * memcg_register_cache()). + * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(params->memcg_caches[idx]); + cachep = lockless_dereference(arr->entries[idx]); rcu_read_unlock(); return cachep; @@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { if (is_root_cache(s)) return s; - return s->memcg_params->root_cache; + return s->memcg_params.root_cache; } static __always_inline int memcg_charge_slab(struct kmem_cache *s, @@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); + return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); + memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); } -#else + +extern void slab_init_memcg_params(struct kmem_cache *); + +#else /* !CONFIG_MEMCG_KMEM */ + +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) +#define for_each_memcg_cache_safe(iter, tmp, root) \ + for ((void)(iter), (void)(tmp), (void)(root); 0; ) + static inline bool is_root_cache(struct kmem_cache *s) { return true; @@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) { } -#endif + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 6e1e4cf65836..999bb3424d44 100644 --- 
a/mm/slab_common.c +++ b/mm/slab_common.c @@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) #endif #ifdef CONFIG_MEMCG_KMEM -static int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +void slab_init_memcg_params(struct kmem_cache *s) { - size_t size; + s->memcg_params.is_root_cache = true; + INIT_LIST_HEAD(&s->memcg_params.list); + RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); +} + +static int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct memcg_cache_array *arr; - if (!memcg_kmem_enabled()) + if (memcg) { + s->memcg_params.is_root_cache = false; + s->memcg_params.memcg = memcg; + s->memcg_params.root_cache = root_cache; return 0; + } - if (!memcg) { - size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_limited_groups_array_size * sizeof(void *); - } else - size = sizeof(struct memcg_cache_params); + slab_init_memcg_params(s); - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) - return -ENOMEM; + if (!memcg_nr_cache_ids) + return 0; - if (memcg) { - s->memcg_params->memcg = memcg; - s->memcg_params->root_cache = root_cache; - } else - s->memcg_params->is_root_cache = true; + arr = kzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void *), + GFP_KERNEL); + if (!arr) + return -ENOMEM; + RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); return 0; } -static void memcg_free_cache_params(struct kmem_cache *s) +static void destroy_memcg_params(struct kmem_cache *s) { - kfree(s->memcg_params); + if (is_root_cache(s)) + kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); } -static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) +static int update_memcg_params(struct kmem_cache *s, int new_array_size) { - int size; - struct memcg_cache_params *new_params, *cur_params; - - BUG_ON(!is_root_cache(s)); + struct 
memcg_cache_array *old, *new; - size = offsetof(struct memcg_cache_params, memcg_caches); - size += num_memcgs * sizeof(void *); + if (!is_root_cache(s)) + return 0; - new_params = kzalloc(size, GFP_KERNEL); - if (!new_params) + new = kzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); + if (!new) return -ENOMEM; - cur_params = s->memcg_params; - memcpy(new_params->memcg_caches, cur_params->memcg_caches, - memcg_limited_groups_array_size * sizeof(void *)); - - new_params->is_root_cache = true; - - rcu_assign_pointer(s->memcg_params, new_params); - if (cur_params) - kfree_rcu(cur_params, rcu_head); + old = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + if (old) + memcpy(new->entries, old->entries, + memcg_nr_cache_ids * sizeof(void *)); + rcu_assign_pointer(s->memcg_params.memcg_caches, new); + if (old) + kfree_rcu(old, rcu); return 0; } @@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs) { struct kmem_cache *s; int ret = 0; - mutex_lock(&slab_mutex); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - - ret = memcg_update_cache_params(s, num_memcgs); + ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches * up to this point in an updated state. 
*/ if (ret) - goto out; + break; } - - memcg_update_array_size(num_memcgs); -out: mutex_unlock(&slab_mutex); return ret; } #else -static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +static inline int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { return 0; } -static inline void memcg_free_cache_params(struct kmem_cache *s) +static inline void destroy_memcg_params(struct kmem_cache *s) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags, } static struct kmem_cache * -do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *), +do_kmem_cache_create(const char *name, size_t object_size, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->align = align; s->ctor = ctor; - err = memcg_alloc_cache_params(memcg, s, root_cache); + err = init_memcg_params(s, memcg, root_cache); if (err) goto out_free_cache; @@ -330,7 +329,7 @@ out: return s; out_free_cache: - memcg_free_cache_params(s); + destroy_memcg_params(s); kmem_cache_free(kmem_cache, s); goto out; } @@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - char *cache_name; + const char *cache_name; int err; get_online_cpus(); get_online_mems(); + memcg_get_cache_ids(); mutex_lock(&slab_mutex); @@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, if (s) goto out_unlock; - cache_name = kstrdup(name, GFP_KERNEL); + cache_name = kstrdup_const(name, GFP_KERNEL); if (!cache_name) { err = -ENOMEM; goto out_unlock; @@ -401,12 +401,13 @@ 
kmem_cache_create(const char *name, size_t size, size_t align, flags, ctor, NULL, NULL); if (IS_ERR(s)) { err = PTR_ERR(s); - kfree(cache_name); + kfree_const(cache_name); } out_unlock: mutex_unlock(&slab_mutex); + memcg_put_cache_ids(); put_online_mems(); put_online_cpus(); @@ -439,13 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, *need_rcu_barrier = true; #ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) { - struct kmem_cache *root_cache = s->memcg_params->root_cache; - int memcg_id = memcg_cache_id(s->memcg_params->memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); - root_cache->memcg_params->memcg_caches[memcg_id] = NULL; - } + if (!is_root_cache(s)) + list_del(&s->memcg_params.list); #endif list_move(&s->list, release); return 0; @@ -482,9 +478,11 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ - int memcg_id = memcg_cache_id(memcg); + struct cgroup_subsys_state *css = mem_cgroup_css(memcg); + struct memcg_cache_array *arr; struct kmem_cache *s = NULL; char *cache_name; + int idx; get_online_cpus(); get_online_mems(); @@ -492,17 +490,27 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* + * The memory cgroup could have been deactivated while the cache + * creation work was pending. + */ + if (!memcg_kmem_is_active(memcg)) + goto out_unlock; + + idx = memcg_cache_id(memcg); + arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + + /* * Since per-memcg caches are created asynchronously on first * allocation (see memcg_kmem_get_cache()), several threads can try to * create the same cache, but only one of them may succeed. 
*/ - if (cache_from_memcg_idx(root_cache, memcg_id)) + if (arr->entries[idx]) goto out_unlock; - cgroup_name(mem_cgroup_css(memcg)->cgroup, - memcg_name_buf, sizeof(memcg_name_buf)); + cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name_buf); + css->id, memcg_name_buf); if (!cache_name) goto out_unlock; @@ -520,13 +528,15 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } + list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially * initialized. */ smp_wmb(); - root_cache->memcg_params->memcg_caches[memcg_id] = s; + arr->entries[idx] = s; out_unlock: mutex_unlock(&slab_mutex); @@ -535,6 +545,37 @@ out_unlock: put_online_cpus(); } +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_cache_array *arr; + struct kmem_cache *s, *c; + + idx = memcg_cache_id(memcg); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + c = arr->entries[idx]; + if (!c) + continue; + + __kmem_cache_shrink(c, true); + arr->entries[idx] = NULL; + } + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); @@ -546,7 +587,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) mutex_lock(&slab_mutex); list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params->memcg != memcg) + if (is_root_cache(s) || s->memcg_params.memcg != memcg) continue; /* * The cgroup is about to be freed and therefore has no charges @@ -565,18 +606,20 @@ void 
memcg_destroy_kmem_caches(struct mem_cgroup *memcg) void slab_kmem_cache_release(struct kmem_cache *s) { - memcg_free_cache_params(s); - kfree(s->name); + destroy_memcg_params(s); + kfree_const(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { - int i; + struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; bool busy = false; + BUG_ON(!is_root_cache(s)); + get_online_cpus(); get_online_mems(); @@ -586,10 +629,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - - if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + for_each_memcg_cache_safe(c, c2, s) { + if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) busy = true; } @@ -619,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); - ret = __kmem_cache_shrink(cachep); + ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); return ret; @@ -641,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz s->name = name; s->size = s->object_size = size; s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + slab_init_memcg_params(s); + err = __kmem_cache_create(s, flags); if (err) @@ -854,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? 
page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); + kasan_kmalloc_large(ret, size); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -920,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) { struct kmem_cache *c; struct slabinfo sinfo; - int i; if (!is_root_cache(s)) return; - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - + for_each_memcg_cache(c, s) { memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(c, &sinfo); @@ -981,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p) if (p == slab_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + if (!is_root_cache(s) && s->memcg_params.memcg == memcg) cache_show(s, m); return 0; } @@ -1038,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, if (p) ks = ksize(p); - if (ks >= new_size) + if (ks >= new_size) { + kasan_krealloc((void *)p, new_size); return (void *)p; + } ret = kmalloc_track_caller(new_size, flags); if (ret && p) diff --git a/mm/slob.c b/mm/slob.c index 96a86206a26b..94a7fede6d48 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } -int __kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 8b8508adf9c2..6832c4eab104 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -20,6 +20,7 @@ #include <linux/proc_fs.h> #include <linux/notifier.h> #include <linux/seq_file.h> +#include <linux/kasan.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -468,12 +469,30 @@ static char *slub_debug_slabs; static int disable_higher_order_debug; /* + * slub is about to manipulate internal object metadata. This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. 
metadata_access_enable() is used + * to tell kasan that these accesses are OK. + */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kasan_enable_current(); +} + +/* * Object debugging */ static void print_section(char *text, u8 *addr, unsigned int length) { + metadata_access_enable(); print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, length, 1); + metadata_access_disable(); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object, trace.max_entries = TRACK_ADDRS_COUNT; trace.entries = p->addrs; trace.skip = 3; + metadata_access_enable(); save_stack_trace(&trace); + metadata_access_disable(); /* See rant in lockdep.c */ if (trace.nr_entries != 0 && @@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) dump_stack(); } -static void object_err(struct kmem_cache *s, struct page *page, +void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { slab_bug(s, "%s", reason); @@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; + metadata_access_enable(); fault = memchr_inv(start, value, bytes); + metadata_access_disable(); if (!fault) return 1; @@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; + metadata_access_enable(); fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + metadata_access_disable(); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) @@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); + kasan_kmalloc_large(ptr, size); } static inline void kfree_hook(const void *x) { kmemleak_free(x); + 
kasan_kfree_large(x); } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, @@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); memcg_kmem_put_cache(s); + kasan_slab_alloc(s, object); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) #endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + + kasan_slab_free(s, x); } /* @@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); - if (unlikely(s->ctor)) + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); s->ctor(object); + kasan_poison_object_data(s, object); + } } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); + kasan_poison_slab(page); + for_each_object_idx(p, idx, s, start, page->objects) { setup_object(s, page, p); if (likely(idx < page->objects)) @@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages; int pobjects; + preempt_disable(); do { pages = 0; pobjects = 0; @@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + if (unlikely(!s->cpu_partial)) { + unsigned long flags; + + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + } + preempt_enable(); #endif } @@ -2488,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void 
*ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2514,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); + + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2899,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3271,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -3314,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif -size_t ksize(const void *object) +static size_t __ksize(const void *object) { struct page *page; @@ -3335,6 +3387,15 @@ size_t ksize(const void *object) return slab_ksize(page->slab_cache); } + +size_t ksize(const void *object) +{ + size_t size = __ksize(object); + /* We assume that ksize callers could use whole allocated area, + so we need unpoison this area. */ + kasan_krealloc(object, size); + return size; +} EXPORT_SYMBOL(ksize); void kfree(const void *x) @@ -3358,69 +3419,92 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +#define SHRINK_PROMOTE_MAX 32 + /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. 
New allocations will then fill those up - * and thus they can be removed from the partial lists. + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. * * The slabs with the least items are placed last. This results in them * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) { int node; int i; struct kmem_cache_node *n; struct page *page; struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; unsigned long flags; + int ret = 0; - if (!slabs_by_inuse) - return -ENOMEM; + if (deactivate) { + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), + * so we have to make sure the change is visible. + */ + kick_all_cpus_sync(); + } flush_all(s); for_each_kmem_cache_node(s, node, n) { - if (!n->nr_partial) - continue; - - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); spin_lock_irqsave(&n->list_lock, flags); /* - * Build lists indexed by the items in use in each slab. + * Build lists of slabs to discard or promote. * * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. 
*/ list_for_each_entry_safe(page, t, &n->partial, lru) { - list_move(&page->lru, slabs_by_inuse + page->inuse); - if (!page->inuse) + int free = page->objects - page->inuse; + + /* Do not reread page->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == page->objects) { + list_move(&page->lru, &discard); n->nr_partial--; + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&page->lru, promote + free - 1); } /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * Promote the slabs filled up most to the head of the + * partial list. */ - for (i = objects - 1; i > 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + list_for_each_entry_safe(page, t, &discard, lru) discard_slab(s, page); + + if (slabs_node(s, node)) + ret = 1; } - kfree(slabs_by_inuse); - return 0; + return ret; } static int slab_mem_going_offline_callback(void *arg) @@ -3429,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + __kmem_cache_shrink(s, false); mutex_unlock(&slab_mutex); return 0; @@ -3577,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) p->slab_cache = s; #endif } + slab_init_memcg_params(s); list_add(&s->list, &slab_caches); return s; } @@ -3635,13 +3720,10 @@ struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s, *c; s = find_mergeable(size, align, flags, name, ctor); if (s) { - int i; - struct kmem_cache *c; - s->refcount++; /* @@ -3651,10 +3733,7 @@ __kmem_cache_alias(const char 
*name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; + for_each_memcg_cache(c, s) { c->object_size = s->object_size; c->inuse = max_t(int, c->inuse, ALIGN(size, sizeof(void *))); @@ -4081,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf, if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, - PAGE_SIZE - len - 50, - to_cpumask(l->cpus)); - } + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, - PAGE_SIZE - len - 50, - l->nodes); - } + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); len += sprintf(buf + len, "\n"); } @@ -4691,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') { - int rc = kmem_cache_shrink(s); - - if (rc) - return rc; - } else + if (buf[0] == '1') + kmem_cache_shrink(s); + else return -EINVAL; return length; } @@ -4920,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, err = attribute->store(s, buf, len); #ifdef CONFIG_MEMCG_KMEM if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; + struct kmem_cache *c; mutex_lock(&slab_mutex); if (s->max_attr_size < len) @@ -4943,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, * directly either failed or succeeded, in which case we loop * through the descendants with best-effort propagation. 
*/ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (c) - attribute->store(c, buf, len); - } + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); mutex_unlock(&slab_mutex); } #endif @@ -4964,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (is_root_cache(s)) return; - root_cache = s->memcg_params->root_cache; + root_cache = s->memcg_params.root_cache; /* * This mean this cache had no attribute written. Therefore, no point @@ -5044,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) { #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) - return s->memcg_params->root_cache->memcg_kset; + return s->memcg_params.root_cache->memcg_kset; #endif return slab_kset; } diff --git a/mm/swap.c b/mm/swap.c index 5b3087228b99..cd3a5e64cea9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1138,8 +1138,6 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - if (bdi_init(swapper_spaces[0].backing_dev_info)) - panic("Failed to init swap bdi"); for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); #endif diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342987a0..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = { #endif }; -static struct backing_dev_info swap_backing_dev_info = { - .name = "swap", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... 
MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &swap_backing_dev_info, } }; diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); diff --git a/mm/util.c b/mm/util.c index f3ef639c4857..3981ae9d1b15 100644 --- a/mm/util.c +++ b/mm/util.c @@ -12,10 +12,30 @@ #include <linux/hugetlb.h> #include <linux/vmalloc.h> +#include <asm/sections.h> #include <asm/uaccess.h> #include "internal.h" +static inline int is_kernel_rodata(unsigned long addr) +{ + return addr >= (unsigned long)__start_rodata && + addr < (unsigned long)__end_rodata; +} + +/** + * kfree_const - conditionally free memory + * @x: pointer to the memory + * + * Function calls kfree only if @x is not in .rodata section. + */ +void kfree_const(const void *x) +{ + if (!is_kernel_rodata((unsigned long)x)) + kfree(x); +} +EXPORT_SYMBOL(kfree_const); + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp) EXPORT_SYMBOL(kstrdup); /** + * kstrdup_const - conditionally duplicate an existing const string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Function returns source string if it is in .rodata section otherwise it + * fallbacks to kstrdup. + * Strings allocated by kstrdup_const should be freed by kfree_const. 
+ */ +const char *kstrdup_const(const char *s, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)s)) + return s; + + return kstrdup(s, gfp); +} +EXPORT_SYMBOL(kstrdup_const); + +/** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 39c338896416..35b25e1340ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!area)) return NULL; - /* - * We always allocate a guard page. - */ - size += PAGE_SIZE; + if (!(flags & VM_NO_GUARD)) + size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask); if (IS_ERR(va)) { @@ -1621,6 +1619,7 @@ fail: * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * @@ -1630,7 +1629,8 @@ fail: */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller) + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { struct vm_struct *area; void *addr; @@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, - start, end, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | + vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; @@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, node, caller); + 
gfp_mask, prot, 0, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e645ee52045..5e8eadd71bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long shrink_slabs(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, } /** - * shrink_node_slabs - shrink slab caches of a given node + * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target * @nr_scanned: pressure numerator * @nr_eligible: pressure denominator * @@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * + * @memcg specifies the memory cgroup to target. If it is not NULL, + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan + * objects from the memory cgroup specified. Otherwise all shrinkers + * are called, and memcg aware shrinkers are supposed to scan the + * global list then. + * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example * passes the number of pages scanned and the number of pages on the @@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * * Returns the number of reclaimed slab objects. 
*/ -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; + if (memcg && !memcg_kmem_is_active(memcg)) + return 0; + if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; @@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, + .memcg = memcg, }; + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + continue; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } up_read(&shrinker_rwsem); @@ -404,6 +419,29 @@ out: return freed; } +void drop_slab_node(int nid) +{ + unsigned long freed; + + do { + struct mem_cgroup *memcg = NULL; + + freed = 0; + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, + 1000, 1000); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while (freed > 10); +} + +void drop_slab(void) +{ + int nid; + + for_each_online_node(nid) + drop_slab_node(nid); +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -500,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) + if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -879,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(mapping->backing_dev_info)) || + bdi_write_congested(inode_to_bdi(mapping->host))) || (writeback && PageReclaim(page))) 
nr_congested++; @@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone, static bool shrink_zone(struct zone *zone, struct scan_control *sc, bool is_classzone) { + struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long scanned; struct lruvec *lruvec; int swappiness; @@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); zone_lru_pages += lru_pages; + if (memcg && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), + memcg, sc->nr_scanned - scanned, + lru_pages); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. 
*/ - if (global_reclaim(sc) && is_classzone) { - struct reclaim_state *reclaim_state; - - shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), - sc->nr_scanned - nr_scanned, - zone_lru_pages); - - reclaim_state = current->reclaim_state; - if (reclaim_state) { - sc->nr_reclaimed += - reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } + if (global_reclaim(sc) && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); pages = node_present_pages(sc->nid); @@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } static enum lru_status shadow_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - list_del_init(item); + list_lru_isolate(lru, item); spin_unlock(lru_lock); /* @@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, - shadow_lru_isolate, NULL, &sc->nr_to_scan); + ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, + shadow_lru_isolate, NULL); local_irq_enable(); return ret; } diff --git a/mm/zbud.c b/mm/zbud.c index 
4e387bea702e..2ee4e4520493 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zbud_zpool_create(char *name, gfp_t gfp, + struct zpool_ops *zpool_ops) { return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); } diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) /** * zpool_create_pool() - Create a new zpool * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @name The name of the zpool (e.g. zram0, zswap) * @gfp The GFP flags to use when allocating the pool. * @ops The optional ops callback. * @@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, + struct zpool_ops *ops) { struct zpool_driver *driver; struct zpool *zpool; @@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(gfp, ops); + zpool->pool = driver->create(name, gfp, ops); zpool->ops = ops; if (!zpool->pool) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..0dec1fa5f656 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -91,6 +91,7 @@ #include <linux/hardirq.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> @@ -168,6 +169,22 @@ enum fullness_group { ZS_FULL }; +enum zs_stat_type { + OBJ_ALLOCATED, + OBJ_USED, + NR_ZS_STAT_TYPE, +}; + +#ifdef CONFIG_ZSMALLOC_STAT + +static struct dentry *zs_stat_root; + +struct zs_size_stat { + unsigned long 
objs[NR_ZS_STAT_TYPE]; +}; + +#endif + /* * number of size_classes */ @@ -200,6 +217,10 @@ struct size_class { /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; +#ifdef CONFIG_ZSMALLOC_STAT + struct zs_size_stat stats; +#endif + spinlock_t lock; struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; @@ -217,10 +238,16 @@ struct link_free { }; struct zs_pool { + char *name; + struct size_class **size_class; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; + +#ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; +#endif }; /* @@ -246,9 +273,9 @@ struct mapping_area { #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) { - return zs_create_pool(gfp); + return zs_create_pool(name, gfp); } static void zs_zpool_destroy(void *pool) @@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } +#ifdef CONFIG_ZSMALLOC_STAT + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] -= cnt; +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return class->stats.objs[type]; +} + +static int __init zs_stat_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if (!zs_stat_root) + return -ENOMEM; + + return 0; +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long obj_allocated, 
obj_used, pages_used; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + + seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", + "obj_allocated", "obj_used", "pages_used"); + + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + spin_unlock(&class->lock); + + objs_per_zspage = get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, + class->size, obj_allocated, obj_used, pages_used); + + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", + total_objs, total_used_objs, total_pages); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, zs_stats_size_show, inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + struct dentry *entry; + + if (!zs_stat_root) + return -ENODEV; + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return -ENOMEM; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "obj_in_classes"); + return -ENOMEM; + } + + return 0; +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* 
CONFIG_ZSMALLOC_STAT */ + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return 0; +} + +static int __init zs_stat_init(void) +{ + return 0; +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + return 0; +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ +} + +#endif + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); @@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) set_zspage_mapping(first_page, class->index, ZS_EMPTY); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); + spin_lock(&class->lock); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); } obj = (unsigned long)first_page->freelist; @@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) kunmap_atomic(vaddr); first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(pool, first_page); spin_unlock(&class->lock); @@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page->inuse--; fullness = fix_fullness_group(pool, first_page); + + zs_stat_dec(class, OBJ_USED, 1); + if (fullness == ZS_EMPTY) + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + spin_unlock(&class->lock); if (fullness == ZS_EMPTY) { @@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free); * On success, a pointer to the newly created pool is returned, * otherwise NULL. 
*/ -struct zs_pool *zs_create_pool(gfp_t flags) +struct zs_pool *zs_create_pool(char *name, gfp_t flags) { int i; struct zs_pool *pool; @@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags) if (!pool) return NULL; + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) { + kfree(pool); + return NULL; + } + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { + kfree(pool->name); kfree(pool); return NULL; } @@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags) pool->flags = flags; + if (zs_pool_stat_create(name, pool)) + goto err; + return pool; err: @@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool) { int i; + zs_pool_stat_destroy(pool); + for (i = 0; i < zs_size_classes; i++) { int fg; struct size_class *class = pool->size_class[i]; @@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool) } kfree(pool->size_class); + kfree(pool->name); kfree(pool); } EXPORT_SYMBOL_GPL(zs_destroy_pool); @@ -1250,17 +1460,30 @@ static int __init zs_init(void) { int ret = zs_register_cpu_notifier(); - if (ret) { - zs_unregister_cpu_notifier(); - return ret; - } + if (ret) + goto notifier_fail; init_zs_size_classes(); #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif + + ret = zs_stat_init(); + if (ret) { + pr_err("zs stat initialization failed\n"); + goto stat_fail; + } return 0; + +stat_fail: +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif +notifier_fail: + zs_unregister_cpu_notifier(); + + return ret; } static void __exit zs_exit(void) @@ -1269,6 +1492,8 @@ static void __exit zs_exit(void) zpool_unregister_driver(&zs_zpool_driver); #endif zs_unregister_cpu_notifier(); + + zs_stat_exit(); } module_init(zs_init); diff --git a/mm/zswap.c b/mm/zswap.c index 0cfce9bc51e4..4249e82ff934 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -906,11 +906,12 @@ static int __init init_zswap(void) pr_info("loading zswap\n"); - zswap_pool = 
zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { pr_info("%s zpool not available\n", zswap_zpool_type); zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, &zswap_zpool_ops); } if (!zswap_pool) { |