author	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-15 12:53:37 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-15 12:53:37 -0800
commit	ac73e3dc8acd0a3be292755db30388c3580f5674 (patch)
tree	5abef6cb82b205b5dbbb69dca950b8a5aae716de /mm
parent	148842c98a24e508aecb929718818fbf4c2a6ff3 (diff)
parent	dfefd226b0bf7c435a58d75a0ce2f9273b9825f6 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
- a few random little subsystems
- almost all of the MM patches which are staged ahead of linux-next
material. I'll trickle the post-linux-next work in as the dependents
get merged up.
Subsystems affected by this patch series: kthread, kbuild, ide, ntfs,
ocfs2, arch, and mm (slab-generic, slab, slub, dax, debug, pagecache,
gup, swap, shmem, memcg, pagemap, mremap, hmm, vmalloc, documentation,
kasan, pagealloc, memory-failure, hugetlb, vmscan, z3fold, compaction,
oom-kill, migration, cma, page-poison, userfaultfd, zswap, zsmalloc,
uaccess, zram, and cleanups).
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (200 commits)
mm: cleanup kstrto*() usage
mm: fix fall-through warnings for Clang
mm: slub: convert sysfs sprintf family to sysfs_emit/sysfs_emit_at
mm: shmem: convert shmem_enabled_show to use sysfs_emit_at
mm:backing-dev: use sysfs_emit in macro defining functions
mm: huge_memory: convert remaining use of sprintf to sysfs_emit and neatening
mm: use sysfs_emit for struct kobject * uses
mm: fix kernel-doc markups
zram: break the strict dependency from lzo
zram: add stat to gather incompressible pages since zram set up
zram: support page writeback
mm/process_vm_access: remove redundant initialization of iov_r
mm/zsmalloc.c: rework the list_add code in insert_zspage()
mm/zswap: move to use crypto_acomp API for hardware acceleration
mm/zswap: fix passing zero to 'PTR_ERR' warning
mm/zswap: make struct kernel_param_ops definitions const
userfaultfd/selftests: hint the test runner on required privilege
userfaultfd/selftests: fix retval check for userfaultfd_open()
userfaultfd/selftests: always dump something in modes
userfaultfd: selftests: make __{s,u}64 format specifiers portable
...
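
Several of the commits above ("mm: slub: convert sysfs sprintf family to sysfs_emit/sysfs_emit_at", "mm: use sysfs_emit for struct kobject * uses", and the related shmem/backing-dev/huge_memory patches) apply the same mechanical conversion that recurs throughout the diff below: sysfs ->show() handlers stop formatting with sprintf()/snprintf() and call sysfs_emit(), which knows the buffer is a full PAGE_SIZE sysfs page and guards against overruns. A minimal sketch of the pattern, using a hypothetical attribute that is not part of this series:

	#include <linux/kobject.h>
	#include <linux/sysfs.h>

	/* Hypothetical attribute; only the sysfs_emit() call mirrors the
	 * conversion performed by the patches above. */
	static unsigned int example_value;

	static ssize_t example_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
	{
		/* before: return sprintf(buf, "%u\n", example_value); */
		return sysfs_emit(buf, "%u\n", example_value);
	}
	static struct kobj_attribute example_attr = __ATTR_RO(example);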
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 25
-rw-r--r-- | mm/Kconfig.debug | 28
-rw-r--r-- | mm/Makefile | 4
-rw-r--r-- | mm/backing-dev.c | 8
-rw-r--r-- | mm/cma.c | 6
-rw-r--r-- | mm/compaction.c | 29
-rw-r--r-- | mm/filemap.c | 609
-rw-r--r-- | mm/gup.c | 326
-rw-r--r-- | mm/gup_test.c (renamed from mm/gup_benchmark.c) | 111
-rw-r--r-- | mm/gup_test.h | 32
-rw-r--r-- | mm/highmem.c | 52
-rw-r--r-- | mm/huge_memory.c | 62
-rw-r--r-- | mm/hugetlb.c | 28
-rw-r--r-- | mm/init-mm.c | 1
-rw-r--r-- | mm/internal.h | 5
-rw-r--r-- | mm/kasan/generic.c | 3
-rw-r--r-- | mm/kasan/report.c | 4
-rw-r--r-- | mm/khugepaged.c | 58
-rw-r--r-- | mm/ksm.c | 50
-rw-r--r-- | mm/madvise.c | 14
-rw-r--r-- | mm/mapping_dirty_helpers.c | 6
-rw-r--r-- | mm/memblock.c | 80
-rw-r--r-- | mm/memcontrol.c | 168
-rw-r--r-- | mm/memory-failure.c | 226
-rw-r--r-- | mm/memory.c | 24
-rw-r--r-- | mm/memory_hotplug.c | 32
-rw-r--r-- | mm/mempolicy.c | 8
-rw-r--r-- | mm/migrate.c | 183
-rw-r--r-- | mm/mm_init.c | 1
-rw-r--r-- | mm/mmap.c | 22
-rw-r--r-- | mm/mmap_lock.c | 230
-rw-r--r-- | mm/mmu_notifier.c | 7
-rw-r--r-- | mm/mmzone.c | 14
-rw-r--r-- | mm/mremap.c | 280
-rw-r--r-- | mm/nommu.c | 8
-rw-r--r-- | mm/oom_kill.c | 14
-rw-r--r-- | mm/page_alloc.c | 469
-rw-r--r-- | mm/page_counter.c | 4
-rw-r--r-- | mm/page_ext.c | 10
-rw-r--r-- | mm/page_isolation.c | 12
-rw-r--r-- | mm/page_owner.c | 17
-rw-r--r-- | mm/page_poison.c | 56
-rw-r--r-- | mm/page_vma_mapped.c | 9
-rw-r--r-- | mm/process_vm_access.c | 2
-rw-r--r-- | mm/rmap.c | 9
-rw-r--r-- | mm/shmem.c | 39
-rw-r--r-- | mm/slab.c | 10
-rw-r--r-- | mm/slab.h | 9
-rw-r--r-- | mm/slab_common.c | 10
-rw-r--r-- | mm/slob.c | 6
-rw-r--r-- | mm/slub.c | 160
-rw-r--r-- | mm/swap.c | 12
-rw-r--r-- | mm/swap_state.c | 7
-rw-r--r-- | mm/swapfile.c | 14
-rw-r--r-- | mm/truncate.c | 12
-rw-r--r-- | mm/vmalloc.c | 105
-rw-r--r-- | mm/vmscan.c | 21
-rw-r--r-- | mm/vmstat.c | 6
-rw-r--r-- | mm/workingset.c | 8
-rw-r--r-- | mm/z3fold.c | 191
-rw-r--r-- | mm/zsmalloc.c | 11
-rw-r--r-- | mm/zswap.c | 189
62 files changed, 2434 insertions(+), 1722 deletions(-)
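
The diff below replaces the old gup_benchmark infrastructure with a more general gup_test facility driven through ioctls on /sys/kernel/debug/gup_test (see the new mm/gup_test.h and the GUP_TEST Kconfig help text). As a rough illustration only -- not the in-tree selftest, which lives at tools/testing/selftests/vm/gup_test.c -- a user-space sketch of driving the GUP_BASIC_TEST command might look like the following. The ioctl number and struct layout are copied from mm/gup_test.h in this diff; the region size and batch size are arbitrary examples, and running it requires CONFIG_GUP_TEST, a mounted debugfs, and sufficient privilege:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/ioctl.h>
	#include <linux/types.h>

	#define GUP_TEST_MAX_PAGES_TO_DUMP	8
	#define GUP_BASIC_TEST			_IOWR('g', 4, struct gup_test)

	struct gup_test {			/* layout as in mm/gup_test.h */
		__u64 get_delta_usec;
		__u64 put_delta_usec;
		__u64 addr;
		__u64 size;
		__u32 nr_pages_per_call;
		__u32 flags;
		__u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
	};

	int main(void)
	{
		size_t size = 128UL * 4096;	/* arbitrary test region */
		void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		struct gup_test gup = {
			.addr = (unsigned long)buf,
			.size = size,
			.nr_pages_per_call = 32,	/* arbitrary batch size */
		};
		int fd = open("/sys/kernel/debug/gup_test", O_RDWR);

		if (buf == MAP_FAILED || fd < 0) {
			perror("setup");
			return 1;
		}
		memset(buf, 0, size);		/* fault the pages in before pinning */

		if (ioctl(fd, GUP_BASIC_TEST, &gup) < 0) {
			perror("ioctl(GUP_BASIC_TEST)");
			return 1;
		}
		printf("get_user_pages: %llu us, put: %llu us\n",
		       (unsigned long long)gup.get_delta_usec,
		       (unsigned long long)gup.put_delta_usec);
		return 0;
	}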
diff --git a/mm/Kconfig b/mm/Kconfig index 8c49d09da214..cf04bc3c866c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -821,13 +821,28 @@ config PERCPU_STATS information includes global and per chunk statistics, which can be used to help understand percpu memory usage. -config GUP_BENCHMARK - bool "Enable infrastructure for get_user_pages() and related calls benchmarking" +config GUP_TEST + bool "Enable infrastructure for get_user_pages()-related unit tests" + depends on DEBUG_FS help - Provides /sys/kernel/debug/gup_benchmark that helps with testing - performance of get_user_pages() and related calls. + Provides /sys/kernel/debug/gup_test, which in turn provides a way + to make ioctl calls that can launch kernel-based unit tests for + the get_user_pages*() and pin_user_pages*() family of API calls. - See tools/testing/selftests/vm/gup_benchmark.c + These tests include benchmark testing of the _fast variants of + get_user_pages*() and pin_user_pages*(), as well as smoke tests of + the non-_fast variants. + + There is also a sub-test that allows running dump_page() on any + of up to eight pages (selected by command line args) within the + range of user-space addresses. These pages are either pinned via + pin_user_pages*(), or pinned via get_user_pages*(), as specified + by other command line arguments. + + See tools/testing/selftests/vm/gup_test.c + +comment "GUP_TEST needs to have DEBUG_FS enabled" + depends on !GUP_TEST && !DEBUG_FS config GUP_GET_PTE_LOW_HIGH bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 864f129f1937..1e73717802f8 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,7 +64,6 @@ config PAGE_OWNER config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_POISONING_NO_SANITY if HIBERNATION help Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps @@ -75,30 +74,11 @@ config PAGE_POISONING Note that "poison" here is not the same thing as the "HWPoison" for CONFIG_MEMORY_FAILURE. This is software poisoning only. - If unsure, say N - -config PAGE_POISONING_NO_SANITY - depends on PAGE_POISONING - bool "Only poison, don't sanity check" - help - Skip the sanity checking on alloc, only fill the pages with - poison on free. This reduces some of the overhead of the - poisoning feature. - - If you are only interested in sanitization, say Y. Otherwise - say N. + If you are only interested in sanitization of freed pages without + checking the poison pattern on alloc, you can boot the kernel with + "init_on_free=1" instead of enabling this. -config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of debugging value" - depends on PAGE_POISONING - help - Instead of using the existing poison value, fill the pages with - zeros. This makes it harder to detect when errors are occurring - due to sanitization but the zeroing at free means that it is - no longer necessary to write zeros when GFP_ZERO is used on - allocation. 
- - If unsure, say N + If unsure, say N config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" diff --git a/mm/Makefile b/mm/Makefile index d73aed0fc99c..b6cd2fffa492 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o $(mmu-y) + debug.o gup.o mmap_lock.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o @@ -90,7 +90,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o -obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o +obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 408d5051d05b..e33797579338 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -150,11 +150,11 @@ static ssize_t read_ahead_kb_store(struct device *dev, #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ - struct device_attribute *attr, char *page) \ + struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ - return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ + return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); @@ -200,11 +200,11 @@ BDI_SHOW(max_ratio, bdi->max_ratio) static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, - char *page) + char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); - return snprintf(page, PAGE_SIZE-1, "%d\n", 0); + return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); @@ -38,7 +38,6 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; -static DEFINE_MUTEX(cma_mutex); phys_addr_t cma_get_base(const struct cma *cma) { @@ -454,10 +453,9 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, mutex_unlock(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); - mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); - mutex_unlock(&cma_mutex); + if (ret == 0) { page = pfn_to_page(pfn); break; @@ -512,7 +510,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) if (!cma || !pages) return false; - pr_debug("%s(page %p)\n", __func__, (void *)pages); + pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); diff --git a/mm/compaction.c b/mm/compaction.c index 13cb7a961b31..dbcfdfce1b82 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(__ClearPageMovable); * allocation success. 
1 << compact_defer_shift, compactions are skipped up * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT */ -void defer_compaction(struct zone *zone, int order) +static void defer_compaction(struct zone *zone, int order) { zone->compact_considered = 0; zone->compact_defer_shift++; @@ -172,7 +172,7 @@ void defer_compaction(struct zone *zone, int order) } /* Returns true if compaction should be skipped this time */ -bool compaction_deferred(struct zone *zone, int order) +static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; @@ -209,7 +209,7 @@ void compaction_defer_reset(struct zone *zone, int order, } /* Returns true if restarting compaction after many failures */ -bool compaction_restarting(struct zone *zone, int order) +static bool compaction_restarting(struct zone *zone, int order) { if (order < zone->compact_order_failed) return false; @@ -237,7 +237,7 @@ static void reset_cached_positions(struct zone *zone) } /* - * Compound pages of >= pageblock_order should consistenly be skipped until + * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are * migratable), and the pageblocks they occupy cannot contain any free pages. */ @@ -2070,13 +2070,6 @@ static enum compact_result compact_finished(struct compact_control *cc) return ret; } -/* - * compaction_suitable: Is this suitable to run compaction on this zone now? - * Returns - * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_SUCCESS - If the allocation would succeed without compaction - * COMPACT_CONTINUE - If compaction should run now - */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx, @@ -2120,6 +2113,13 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? 
+ * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx) @@ -2275,7 +2275,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; - unsigned long start_pfn = cc->migrate_pfn; + unsigned long iteration_start_pfn = cc->migrate_pfn; /* * Avoid multiple rescans which can happen if a page cannot be @@ -2287,7 +2287,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) */ cc->rescan = false; if (pageblock_start_pfn(last_migrated_pfn) == - pageblock_start_pfn(start_pfn)) { + pageblock_start_pfn(iteration_start_pfn)) { cc->rescan = true; } @@ -2311,8 +2311,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto check_drain; case ISOLATE_SUCCESS: update_cached = false; - last_migrated_pfn = start_pfn; - ; + last_migrated_pfn = iteration_start_pfn; } err = migrate_pages(&cc->migratepages, compaction_alloc, diff --git a/mm/filemap.c b/mm/filemap.c index 0b2067b3c328..39bb88140680 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -204,9 +204,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_node_page_state(page, NR_SHMEM_THPS); + __dec_lruvec_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { - __dec_node_page_state(page, NR_FILE_THPS); + __dec_lruvec_page_state(page, NR_FILE_THPS); filemap_nr_thps_dec(mapping); } @@ -1583,19 +1583,20 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, else wait_on_page_locked(page); return 0; - } else { - if (flags & FAULT_FLAG_KILLABLE) { - int ret; + } + if (flags & FAULT_FLAG_KILLABLE) { + int ret; - ret = __lock_page_killable(page); - if (ret) { - mmap_read_unlock(mm); - return 0; - } - } else - __lock_page(page); - return 1; + ret = __lock_page_killable(page); + if (ret) { + mmap_read_unlock(mm); + return 0; + } + } else { + __lock_page(page); } + return 1; + } /** @@ -2166,6 +2167,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } +static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) +{ + if (iocb->ki_flags & IOCB_WAITQ) + return lock_page_async(page, iocb->ki_waitq); + else if (iocb->ki_flags & IOCB_NOWAIT) + return trylock_page(page) ? 0 : -EAGAIN; + else + return lock_page_killable(page); +} + +static struct page * +generic_file_buffered_read_readpage(struct kiocb *iocb, + struct file *filp, + struct address_space *mapping, + struct page *page) +{ + struct file_ra_state *ra = &filp->f_ra; + int error; + + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EAGAIN); + } + + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) { + put_page(page); + return error != AOP_TRUNCATED_PAGE ? 
ERR_PTR(error) : NULL; + } + + if (!PageUptodate(page)) { + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_mapping_pages got it + */ + unlock_page(page); + put_page(page); + return NULL; + } + unlock_page(page); + shrink_readahead_size_eio(ra); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + } + + return page; +} + +static struct page * +generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, + struct file *filp, + struct iov_iter *iter, + struct page *page, + loff_t pos, loff_t count) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + int error; + + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + if (iocb->ki_flags & IOCB_WAITQ) { + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + error = wait_on_page_locked_killable(page); + } + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (PageUptodate(page)) + return page; + + if (inode->i_blkbits == PAGE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iov_iter_is_pipe(iter))) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, + pos & ~PAGE_MASK, count)) + goto page_not_up_to_date_locked; + unlock_page(page); + return page; + +page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + +page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + put_page(page); + return NULL; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static struct page * +generic_file_buffered_read_no_cached_page(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + struct page *page; + int error; + + if (iocb->ki_flags & IOCB_NOIO) + return ERR_PTR(-EAGAIN); + + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + page = page_cache_alloc(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error) { + put_page(page); + return error != -EEXIST ? 
ERR_PTR(error) : NULL; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static int generic_file_buffered_read_get_pages(struct kiocb *iocb, + struct iov_iter *iter, + struct page **pages, + unsigned int nr) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + int i, j, nr_got, err = 0; + + nr = min_t(unsigned long, last_index - index, nr); +find_page: + if (fatal_signal_pending(current)) + return -EINTR; + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + + page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); + err = PTR_ERR_OR_ZERO(pages[0]); + if (!IS_ERR_OR_NULL(pages[0])) + nr_got = 1; +got_pages: + for (i = 0; i < nr_got; i++) { + struct page *page = pages[i]; + pgoff_t pg_index = index + i; + loff_t pg_pos = max(iocb->ki_pos, + (loff_t) pg_index << PAGE_SHIFT); + loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; + + if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + page_cache_async_readahead(mapping, ra, filp, page, + pg_index, last_index - pg_index); + } + + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_NOWAIT) || + ((iocb->ki_flags & IOCB_WAITQ) && i)) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + + page = generic_file_buffered_read_pagenotuptodate(iocb, + filp, iter, page, pg_pos, pg_count); + if (IS_ERR_OR_NULL(page)) { + for (j = i + 1; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = PTR_ERR_OR_ZERO(page); + break; + } + } + } + + if (likely(nr_got)) + return nr_got; + if (err) + return err; + /* + * No pages and no error means we raced and should retry: + */ + goto find_page; +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2186,294 +2440,117 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; + struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct file_ra_state *ra = &filp->f_ra; - loff_t *ppos = &iocb->ki_pos; - pgoff_t index; - pgoff_t last_index; - pgoff_t prev_index; - unsigned long offset; /* offset into pagecache page */ - unsigned int prev_offset; - int error = 0; - - if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; + unsigned int nr_pages = min_t(unsigned int, 512, + ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (iocb->ki_pos >> PAGE_SHIFT)); + int i, pg_nr, error = 0; + bool writably_mapped; + loff_t isize, end_offset; + + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - index = *ppos >> PAGE_SHIFT; - prev_index = ra->prev_pos >> PAGE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; - - /* - * If we've 
already successfully copied some data, then we - * can no longer safely return -EIOCBQUEUED. Hence mark - * an async read NOWAIT at that point. - */ - if (written && (iocb->ki_flags & IOCB_WAITQ)) - iocb->ki_flags |= IOCB_NOWAIT; + if (nr_pages > ARRAY_SIZE(pages_onstack)) + pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - for (;;) { - struct page *page; - pgoff_t end_index; - loff_t isize; - unsigned long nr, ret; + if (!pages) { + pages = pages_onstack; + nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); + } + do { cond_resched(); -find_page: - if (fatal_signal_pending(current)) { - error = -EINTR; - goto out; - } - page = find_get_page(mapping, index); - if (!page) { - if (iocb->ki_flags & IOCB_NOIO) - goto would_block; - page_cache_sync_readahead(mapping, - ra, filp, - index, last_index - index); - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) - goto no_cached_page; - } - if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { - put_page(page); - goto out; - } - page_cache_async_readahead(mapping, - ra, filp, page, - index, last_index - index); - } - if (!PageUptodate(page)) { - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. - */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = wait_on_page_locked_async(page, - iocb->ki_waitq); - } else { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - error = wait_on_page_locked_killable(page); - } - if (unlikely(error)) - goto readpage_error; - if (PageUptodate(page)) - goto page_ok; - - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? */ - if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - offset, iter->count)) - goto page_not_up_to_date_locked; - unlock_page(page); + /* + * If we've already successfully copied some data, then we + * can no longer safely return -EIOCBQUEUED. Hence mark + * an async read NOWAIT at that point. + */ + if ((iocb->ki_flags & IOCB_WAITQ) && written) + iocb->ki_flags |= IOCB_NOWAIT; + + i = 0; + pg_nr = generic_file_buffered_read_get_pages(iocb, iter, + pages, nr_pages); + if (pg_nr < 0) { + error = pg_nr; + break; } -page_ok: + /* - * i_size must be checked after we know the page is Uptodate. + * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). 
*/ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) { - put_page(page); - goto out; - } + if (unlikely(iocb->ki_pos >= isize)) + goto put_pages; - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_MASK) + 1; - if (nr <= offset) { - put_page(page); - goto out; - } - } - nr = nr - offset; + end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > + (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) + put_page(pages[--pg_nr]); /* - * When a sequential read accesses a page several times, - * only mark it as accessed the first time. + * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: */ - if (prev_index != index || offset != prev_offset) - mark_page_accessed(page); - prev_index = index; + writably_mapped = mapping_writably_mapped(mapping); /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... + * When a sequential read accesses a page several times, only + * mark it as accessed the first time. */ + if (iocb->ki_pos >> PAGE_SHIFT != + ra->prev_pos >> PAGE_SHIFT) + mark_page_accessed(pages[0]); + for (i = 1; i < pg_nr; i++) + mark_page_accessed(pages[i]); + + for (i = 0; i < pg_nr; i++) { + unsigned int offset = iocb->ki_pos & ~PAGE_MASK; + unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, + PAGE_SIZE - offset); + unsigned int copied; - ret = copy_page_to_iter(page, offset, nr, iter); - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - prev_offset = offset; - - put_page(page); - written += ret; - if (!iov_iter_count(iter)) - goto out; - if (ret < nr) { - error = -EFAULT; - goto out; - } - continue; - -page_not_up_to_date: - /* Get exclusive access to the page ... */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } - if (unlikely(error)) - goto readpage_error; - -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - continue; - } - - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - goto page_ok; - } - -readpage: - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { - unlock_page(page); - put_page(page); - goto would_block; - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. - */ - ClearPageError(page); - /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. 
+ */ + if (writably_mapped) + flush_dcache_page(pages[i]); - if (unlikely(error)) { - if (error == AOP_TRUNCATED_PAGE) { - put_page(page); - error = 0; - goto find_page; - } - goto readpage_error; - } + copied = copy_page_to_iter(pages[i], offset, bytes, iter); - if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } + written += copied; + iocb->ki_pos += copied; + ra->prev_pos = iocb->ki_pos; - if (unlikely(error)) - goto readpage_error; - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - goto find_page; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - error = -EIO; - goto readpage_error; + if (copied < bytes) { + error = -EFAULT; + break; } - unlock_page(page); } +put_pages: + for (i = 0; i < pg_nr; i++) + put_page(pages[i]); + } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - goto page_ok; - -readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ - put_page(page); - goto out; - -no_cached_page: - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ - page = page_cache_alloc(mapping); - if (!page) { - error = -ENOMEM; - goto out; - } - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - if (error == -EEXIST) { - error = 0; - goto find_page; - } - goto out; - } - goto readpage; - } + file_accessed(filp); -would_block: - error = -EAGAIN; -out: - ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_SHIFT; - ra->prev_pos |= prev_offset; + if (pages != pages_onstack) + kfree(pages); - *ppos = ((loff_t)index << PAGE_SHIFT) + offset; - file_accessed(filp); return written ? written : error; } EXPORT_SYMBOL_GPL(generic_file_buffered_read); @@ -123,6 +123,28 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page, return NULL; } +static void put_compound_head(struct page *page, int refs, unsigned int flags) +{ + if (flags & FOLL_PIN) { + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, + refs); + + if (hpage_pincount_available(page)) + hpage_pincount_sub(page, refs); + else + refs *= GUP_PIN_COUNTING_BIAS; + } + + VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); + /* + * Calling put_page() for each ref is unnecessarily slow. Only the last + * ref needs a put_page(). + */ + if (refs > 1) + page_ref_sub(page, refs - 1); + put_page(page); +} + /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount * @@ -177,41 +199,6 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) return true; } -#ifdef CONFIG_DEV_PAGEMAP_OPS -static bool __unpin_devmap_managed_user_page(struct page *page) -{ - int count, refs = 1; - - if (!page_is_devmap_managed(page)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - count = page_ref_sub_return(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. 
- */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); - - return true; -} -#else -static bool __unpin_devmap_managed_user_page(struct page *page) -{ - return false; -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released @@ -223,28 +210,7 @@ static bool __unpin_devmap_managed_user_page(struct page *page) */ void unpin_user_page(struct page *page) { - int refs = 1; - - page = compound_head(page); - - /* - * For devmap managed pages we need to catch refcount transition from - * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the - * page is free and we need to inform the device driver through - * callback. See include/linux/memremap.h and HMM for details. - */ - if (__unpin_devmap_managed_user_page(page)) - return; - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - if (page_ref_sub_and_test(page, refs)) - __put_page(page); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); + put_compound_head(compound_head(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); @@ -923,6 +889,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) return -EFAULT; + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -1060,10 +1029,14 @@ static long __get_user_pages(struct mm_struct *mm, goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) { + if (!vma) { ret = -EFAULT; goto out; } + ret = check_vma_flags(vma, gup_flags); + if (ret) + goto out; + if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -1567,26 +1540,6 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) -static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - long i; - struct vm_area_struct *vma_prev = NULL; - - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - return true; - } - return false; -} - #ifdef CONFIG_CMA static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, @@ -1705,63 +1658,23 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - struct vm_area_struct **vmas_tmp = vmas; unsigned long flags = 0; - long rc, i; - - if (gup_flags & FOLL_LONGTERM) { - if (!pages) - return -EINVAL; + long rc; - if (!vmas_tmp) { - vmas_tmp = kcalloc(nr_pages, - sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas_tmp) - return -ENOMEM; - } + if (gup_flags & FOLL_LONGTERM) flags = memalloc_nocma_save(); - } - rc = __get_user_pages_locked(mm, start, nr_pages, pages, - vmas_tmp, NULL, gup_flags); + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, + gup_flags); if (gup_flags & FOLL_LONGTERM) { - if (rc < 0) - goto out; - - if (check_dax_vmas(vmas_tmp, rc)) { - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, rc); - else - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; - } - - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas_tmp, gup_flags); -out: + if (rc > 0) + rc = check_and_migrate_cma_pages(mm, start, rc, pages, + vmas, gup_flags); 
memalloc_nocma_restore(flags); } - - if (vmas_tmp != vmas) - kfree(vmas_tmp); return rc; } -#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ -static __always_inline long __gup_longterm_locked(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int flags) -{ - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, flags); -} -#endif /* CONFIG_FS_DAX || CONFIG_CMA */ static bool is_valid_gup_flags(unsigned int gup_flags) { @@ -1932,7 +1845,19 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages); /** - * get_user_pages_locked() is suitable to replace the form: + * get_user_pages_locked() - variant of get_user_pages() + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * It is suitable to replace the form: * * mmap_read_lock(mm); * do_something() @@ -1948,16 +1873,6 @@ EXPORT_SYMBOL(get_user_pages); * if (locked) * mmap_read_unlock(mm); * - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * * We can leverage the VM_FAULT_RETRY functionality in the page fault * paths better by using either get_user_pages_locked() or * get_user_pages_unlocked(). @@ -2063,28 +1978,6 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ #ifdef CONFIG_HAVE_FAST_GUP -static void put_compound_head(struct page *page, int refs, unsigned int flags) -{ - if (flags & FOLL_PIN) { - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, - refs); - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, refs); - else - refs *= GUP_PIN_COUNTING_BIAS; - } - - VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); - /* - * Calling put_page() for each ref is unnecessarily slow. Only the last - * ref needs a put_page(). - */ - if (refs > 1) - page_ref_sub(page, refs - 1); - put_page(page); -} - static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) @@ -2621,13 +2514,61 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, return ret; } -static int internal_get_user_pages_fast(unsigned long start, int nr_pages, +static unsigned long lockless_pages_from_mm(unsigned long start, + unsigned long end, + unsigned int gup_flags, + struct page **pages) +{ + unsigned long flags; + int nr_pinned = 0; + unsigned seq; + + if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + !gup_fast_permitted(start, end)) + return 0; + + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + + /* + * Disable interrupts. The nested form is used, in order to allow full, + * general purpose use of this routine. 
+ * + * With interrupts disabled, we block page table pages from being freed + * from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock() here as we also want to block IPIs + * that come from THPs splitting. + */ + local_irq_save(flags); + gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); + local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } + return nr_pinned; +} + +static int internal_get_user_pages_fast(unsigned long start, + unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { - unsigned long addr, len, end; - unsigned long flags; - int nr_pinned = 0, ret = 0; + unsigned long len, end; + unsigned long nr_pinned; + int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | @@ -2641,54 +2582,33 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, might_lock_read(¤t->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) + len = nr_pages << PAGE_SHIFT; + if (check_add_overflow(start, len, &end)) return 0; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - /* - * Disable interrupts. The nested form is used, in order to allow - * full, general purpose use of this routine. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. 
- */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { - unsigned long fast_flags = gup_flags; - - local_irq_save(flags); - gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); - local_irq_restore(flags); - ret = nr_pinned; - } - - if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { - /* Try to get the remaining pages with get_user_pages */ - start += nr_pinned << PAGE_SHIFT; - pages += nr_pinned; + nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); + if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) + return nr_pinned; - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, - gup_flags, pages); - - /* Have to be a bit careful with return values */ - if (nr_pinned > 0) { - if (ret < 0) - ret = nr_pinned; - else - ret += nr_pinned; - } + /* Slow path: try to get the remaining pages with get_user_pages */ + start += nr_pinned << PAGE_SHIFT; + pages += nr_pinned; + ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, + pages); + if (ret < 0) { + /* + * The caller has to unpin the pages we already pinned so + * returning -errno is not an option + */ + if (nr_pinned) + return nr_pinned; + return ret; } - - return ret; + return ret + nr_pinned; } + /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address diff --git a/mm/gup_benchmark.c b/mm/gup_test.c index 8b3e5b5cd8fa..e3cf78e5873e 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_test.c @@ -4,40 +4,34 @@ #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/debugfs.h> - -#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) -#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark) -#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark) -#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark) -#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark) - -struct gup_benchmark { - __u64 get_delta_usec; - __u64 put_delta_usec; - __u64 addr; - __u64 size; - __u32 nr_pages_per_call; - __u32 flags; - __u64 expansion[10]; /* For future use */ -}; +#include "gup_test.h" static void put_back_pages(unsigned int cmd, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, unsigned int gup_test_flags) { unsigned long i; switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_BENCHMARK: + case GUP_BASIC_TEST: for (i = 0; i < nr_pages; i++) put_page(pages[i]); break; case PIN_FAST_BENCHMARK: - case PIN_BENCHMARK: + case PIN_BASIC_TEST: case PIN_LONGTERM_BENCHMARK: unpin_user_pages(pages, nr_pages); break; + case DUMP_USER_PAGES_TEST: + if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + + } + break; } } @@ -49,14 +43,14 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, switch (cmd) { case PIN_FAST_BENCHMARK: - case PIN_BENCHMARK: + case PIN_BASIC_TEST: case PIN_LONGTERM_BENCHMARK: for (i = 0; i < nr_pages; i++) { page = pages[i]; if (WARN(!page_maybe_dma_pinned(page), "pages[%lu] is NOT dma-pinned\n", i)) { - dump_page(page, "gup_benchmark failure"); + dump_page(page, "gup_test failure"); break; } } @@ -64,8 +58,39 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, } } -static int __gup_benchmark_ioctl(unsigned int cmd, - struct gup_benchmark *gup) +static void dump_pages_test(struct gup_test *gup, struct page **pages, + unsigned long nr_pages) +{ + unsigned int index_to_dump; + unsigned int i; + + /* + * Zero out any user-supplied page index 
that is out of range. Remember: + * .which_pages[] contains a 1-based set of page indices. + */ + for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) { + if (gup->which_pages[i] > nr_pages) { + pr_warn("ZEROING due to out of range: .which_pages[%u]: %u\n", + i, gup->which_pages[i]); + gup->which_pages[i] = 0; + } + } + + for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) { + index_to_dump = gup->which_pages[i]; + + if (index_to_dump) { + index_to_dump--; // Decode from 1-based, to 0-based + pr_info("---- page #%u, starting from user virt addr: 0x%llx\n", + index_to_dump, gup->addr); + dump_page(pages[index_to_dump], + "gup_test: dump_pages() test"); + } + } +} + +static int __gup_test_ioctl(unsigned int cmd, + struct gup_test *gup) { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; @@ -109,7 +134,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = get_user_pages_fast(addr, nr, gup->flags, pages + i); break; - case GUP_BENCHMARK: + case GUP_BASIC_TEST: nr = get_user_pages(addr, nr, gup->flags, pages + i, NULL); break; @@ -117,7 +142,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = pin_user_pages_fast(addr, nr, gup->flags, pages + i); break; - case PIN_BENCHMARK: + case PIN_BASIC_TEST: nr = pin_user_pages(addr, nr, gup->flags, pages + i, NULL); break; @@ -126,6 +151,14 @@ static int __gup_benchmark_ioctl(unsigned int cmd, gup->flags | FOLL_LONGTERM, pages + i, NULL); break; + case DUMP_USER_PAGES_TEST: + if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->flags, + pages + i, NULL); + else + nr = get_user_pages(addr, nr, gup->flags, + pages + i, NULL); + break; default: ret = -EINVAL; goto unlock; @@ -149,9 +182,12 @@ static int __gup_benchmark_ioctl(unsigned int cmd, */ verify_dma_pinned(cmd, pages, nr_pages); + if (cmd == DUMP_USER_PAGES_TEST) + dump_pages_test(gup, pages, nr_pages); + start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages); + put_back_pages(cmd, pages, nr_pages, gup->flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); @@ -164,18 +200,19 @@ free_pages: return ret; } -static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, +static long gup_test_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { - struct gup_benchmark gup; + struct gup_test gup; int ret; switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_BENCHMARK: case PIN_FAST_BENCHMARK: - case PIN_BENCHMARK: case PIN_LONGTERM_BENCHMARK: + case GUP_BASIC_TEST: + case PIN_BASIC_TEST: + case DUMP_USER_PAGES_TEST: break; default: return -EINVAL; @@ -184,7 +221,7 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) return -EFAULT; - ret = __gup_benchmark_ioctl(cmd, &gup); + ret = __gup_test_ioctl(cmd, &gup); if (ret) return ret; @@ -194,17 +231,17 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, return 0; } -static const struct file_operations gup_benchmark_fops = { +static const struct file_operations gup_test_fops = { .open = nonseekable_open, - .unlocked_ioctl = gup_benchmark_ioctl, + .unlocked_ioctl = gup_test_ioctl, }; -static int gup_benchmark_init(void) +static int __init gup_test_init(void) { - debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, - &gup_benchmark_fops); + debugfs_create_file_unsafe("gup_test", 0600, NULL, NULL, + &gup_test_fops); return 0; } -late_initcall(gup_benchmark_init); +late_initcall(gup_test_init); diff --git a/mm/gup_test.h 
b/mm/gup_test.h new file mode 100644 index 000000000000..90a6713d50eb --- /dev/null +++ b/mm/gup_test.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __GUP_TEST_H +#define __GUP_TEST_H + +#include <linux/types.h> + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test) +#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test) +#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test) +#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test) +#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test) +#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test) + +#define GUP_TEST_MAX_PAGES_TO_DUMP 8 + +#define GUP_TEST_FLAG_DUMP_PAGES_USE_PIN 0x1 + +struct gup_test { + __u64 get_delta_usec; + __u64 put_delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; + /* + * Each non-zero entry is the number of the page (1-based: first page is + * page 1, so that zero entries mean "do nothing") from the .addr base. + */ + __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP]; +}; + +#endif /* __GUP_TEST_H */ diff --git a/mm/highmem.c b/mm/highmem.c index 83f9660f168f..c3a9ea7875ef 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -359,6 +359,58 @@ void kunmap_high(struct page *page) wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +{ + unsigned int i; + + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + + for (i = 0; i < compound_nr(page); i++) { + void *kaddr = NULL; + + if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) + kaddr = kmap_atomic(page + i); + + if (start1 >= PAGE_SIZE) { + start1 -= PAGE_SIZE; + end1 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); + + if (end1 > start1) + memset(kaddr + start1, 0, this_end - start1); + end1 -= this_end; + start1 = 0; + } + + if (start2 >= PAGE_SIZE) { + start2 -= PAGE_SIZE; + end2 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); + + if (end2 > start2) + memset(kaddr + start2, 0, this_end - start2); + end2 -= this_end; + start2 = 0; + } + + if (kaddr) { + kunmap_atomic(kaddr); + flush_dcache_page(page + i); + } + + if (!end1 && !end2) + break; + } + + BUG_ON((start1 | start2 | end1 | end2) != 0); +} +EXPORT_SYMBOL(zero_user_segments); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HIGHMEM */ #ifdef CONFIG_KMAP_LOCAL diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ec2bb93f7431..57d08156acb1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -163,12 +163,17 @@ static struct shrinker huge_zero_page_shrinker = { static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + const char *output; + if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [madvise] never\n"); + output = "[always] madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always [madvise] never"; else - return sprintf(buf, "always madvise [never]\n"); + output = "always madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); } static ssize_t enabled_store(struct kobject *kobj, @@ -200,11 +205,11 @@ static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); ssize_t 
single_hugepage_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag flag) + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) { - return sprintf(buf, "%d\n", - !!test_bit(flag, &transparent_hugepage_flags)); + return sysfs_emit(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); } ssize_t single_hugepage_flag_store(struct kobject *kobj, @@ -232,15 +237,24 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer defer+madvise madvise never\n"); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] defer+madvise madvise never\n"); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [defer+madvise] madvise never\n"); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer defer+madvise [madvise] never\n"); - return sprintf(buf, "always defer defer+madvise madvise [never]\n"); + const char *output; + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + &transparent_hugepage_flags)) + output = "[always] defer defer+madvise madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + &transparent_hugepage_flags)) + output = "always [defer] defer+madvise madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always defer [defer+madvise] madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always defer defer+madvise [madvise] never"; + else + output = "always defer defer+madvise madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); } static ssize_t defrag_store(struct kobject *kobj, @@ -281,10 +295,10 @@ static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); static ssize_t use_zero_page_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static ssize_t use_zero_page_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -296,9 +310,9 @@ static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); static ssize_t hpage_pmd_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); + return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); } static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); @@ -2321,7 +2335,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; bool unmap_success; @@ -2710,9 +2724,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&ds_queue->split_queue_lock); if (mapping) { if (PageSwapBacked(head)) - 
__dec_node_page_state(head, NR_SHMEM_THPS); + __dec_lruvec_page_state(head, NR_SHMEM_THPS); else - __dec_node_page_state(head, NR_FILE_THPS); + __dec_lruvec_page_state(head, NR_FILE_THPS); } __split_huge_page(page, list, end, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d029d938d26d..cbf32d2824fd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1944,13 +1944,14 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(struct hstate *h, int delta) +static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { struct list_head surplus_list; struct page *page, *tmp; - int ret, i; - int needed, allocated; + int ret; + long i; + long needed, allocated; bool alloc_ok = true; needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2014,8 +2015,7 @@ retry: * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. */ - put_page_testzero(page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!put_page_testzero(page), page); enqueue_huge_page(h, page); } free: @@ -2760,7 +2760,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, else nr_huge_pages = h->nr_huge_pages_node[nid]; - return sprintf(buf, "%lu\n", nr_huge_pages); + return sysfs_emit(buf, "%lu\n", nr_huge_pages); } static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, @@ -2833,7 +2833,8 @@ HSTATE_ATTR(nr_hugepages); * huge page alloc/free. */ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, + char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } @@ -2851,7 +2852,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); - return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); + return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, @@ -2889,7 +2890,7 @@ static ssize_t free_hugepages_show(struct kobject *kobj, else free_huge_pages = h->free_huge_pages_node[nid]; - return sprintf(buf, "%lu\n", free_huge_pages); + return sysfs_emit(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); @@ -2897,7 +2898,7 @@ static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); - return sprintf(buf, "%lu\n", h->resv_huge_pages); + return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); @@ -2914,7 +2915,7 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj, else surplus_huge_pages = h->surplus_huge_pages_node[nid]; - return sprintf(buf, "%lu\n", surplus_huge_pages); + return sysfs_emit(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); @@ -3198,8 +3199,6 @@ void __init hugetlb_add_hstate(unsigned int order) h = &hstates[hugetlb_max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - h->nr_huge_pages = 0; - h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); @@ -3673,7 +3672,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, - .split = 
hugetlb_vm_op_split, + .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; @@ -5115,6 +5114,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); + ret = add; goto out_put_pages; } else if (unlikely(chg > add)) { /* diff --git a/mm/init-mm.c b/mm/init-mm.c index 3a613c85f9ed..153162669f80 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,6 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), diff --git a/mm/internal.h b/mm/internal.h index c43ccdddb0f6..25d2b2439f19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -199,8 +199,13 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); + extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +extern void zone_pcp_disable(struct zone *zone); +extern void zone_pcp_enable(struct zone *zone); #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 248264b9cb76..30c0a5038b5c 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -339,9 +339,6 @@ void kasan_record_aux_stack(void *addr) object = nearest_obj(cache, page, addr); alloc_info = get_alloc_info(cache, object); - /* - * record the last two call_rcu() call stacks. - */ alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 00a53f1355ae..5a0102f37171 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -185,12 +185,12 @@ static void describe_object(struct kmem_cache *cache, void *object, #ifdef CONFIG_KASAN_GENERIC if (alloc_info->aux_stack[0]) { - pr_err("Last call_rcu():\n"); + pr_err("Last potentially related work creation:\n"); print_stack(alloc_info->aux_stack[0]); pr_err("\n"); } if (alloc_info->aux_stack[1]) { - pr_err("Second to last call_rcu():\n"); + pr_err("Second to last potentially related work creation:\n"); print_stack(alloc_info->aux_stack[1]); pr_err("\n"); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e3dff13eb70..ad316d2e1fee 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -90,6 +90,8 @@ static struct kmem_cache *mm_slot_cache __read_mostly; * @hash: hash collision list * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head * @mm: the mm that this information is valid for + * @nr_pte_mapped_thp: number of pte mapped THP + * @pte_mapped_thp: address array corresponding pte mapped THP */ struct mm_slot { struct hlist_node hash; @@ -124,18 +126,18 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); + return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; 
khugepaged_scan_sleep_millisecs = msecs; @@ -152,18 +154,18 @@ static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); + return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; khugepaged_alloc_sleep_millisecs = msecs; @@ -180,17 +182,17 @@ static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_pages_to_scan); + return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int pages; int err; - unsigned long pages; - err = kstrtoul(buf, 10, &pages); - if (err || !pages || pages > UINT_MAX) + err = kstrtouint(buf, 10, &pages); + if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; @@ -205,7 +207,7 @@ static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_pages_collapsed); + return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); @@ -214,7 +216,7 @@ static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_full_scans); + return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); @@ -223,7 +225,7 @@ static ssize_t khugepaged_defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t khugepaged_defrag_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -248,7 +250,7 @@ static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_none); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -273,7 +275,7 @@ static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, @@ -297,10 +299,10 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = khugepaged_max_ptes_swap_store); static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) + struct kobj_attribute *attr, + char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, @@ -1414,7 +1416,11 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, } /** - * Try to collapse a pte-mapped THP for mm at address haddr. 
+ * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at + * address haddr. + * + * @mm: process address space where collapse happens + * @addr: THP collapse address + * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with @@ -1605,6 +1611,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * + * @mm: process address space where collapse happens + * @file: file that collapse on + * @start: collapse start address + * @hpage: new allocated huge page for collapse + * @node: appointed node the new huge page allocate from + * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one @@ -1845,9 +1857,9 @@ out_unlock: } if (is_shmem) - __inc_node_page_state(new_page, NR_SHMEM_THPS); + __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); else { - __inc_node_page_state(new_page, NR_FILE_THPS); + __inc_lruvec_page_state(new_page, NR_FILE_THPS); filemap_nr_thps_inc(mapping); } diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2833,18 +2833,18 @@ static void wait_while_offlining(void) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); + return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; @@ -2857,18 +2857,18 @@ KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); + return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int nr_pages; int err; - unsigned long nr_pages; - err = kstrtoul(buf, 10, &nr_pages); - if (err || nr_pages > UINT_MAX) + err = kstrtouint(buf, 10, &nr_pages); + if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; @@ -2880,17 +2880,17 @@ KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_run); + return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int flags; int err; - unsigned long flags; - err = kstrtoul(buf, 10, &flags); - if (err || flags > UINT_MAX) + err = kstrtouint(buf, 10, &flags); + if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; @@ -2927,9 +2927,9 @@ KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_merge_across_nodes); + return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, @@ -2984,9 +2984,9 @@ KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute
*attr, char *buf) { - return sprintf(buf, "%u\n", ksm_use_zero_pages); + return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -3008,7 +3008,7 @@ KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_max_page_sharing); + return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, @@ -3049,21 +3049,21 @@ KSM_ATTR(max_page_sharing); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_pages_shared); + return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_pages_sharing); + return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_pages_unshared); + return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); @@ -3080,21 +3080,21 @@ static ssize_t pages_volatile_show(struct kobject *kobj, */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; - return sprintf(buf, "%ld\n", ksm_pages_volatile); + return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_stable_node_dups); + return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_stable_node_chains); + return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); @@ -3103,7 +3103,7 @@ stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); + return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t @@ -3127,7 +3127,7 @@ KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_scan.seqnr); + return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); diff --git a/mm/madvise.c b/mm/madvise.c index 13f5677b9322..6a660858784b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -877,7 +877,6 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { - struct zone *zone; unsigned long size; if (!capable(CAP_SYS_ADMIN)) @@ -908,24 +907,13 @@ static int madvise_inject_error(int behavior, } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); - /* - * Drop the page reference taken by get_user_pages_fast(). In - * the absence of MF_COUNT_INCREASED the memory_failure() - * routine is responsible for pinning the page to prevent it - * from being released back to the page allocator. 
- */ - put_page(page); - ret = memory_failure(pfn, 0); + ret = memory_failure(pfn, MF_COUNT_INCREASED); } if (ret) return ret; } - /* Ensure that all poisoned pages are removed from per-cpu lists */ - for_each_populated_zone(zone) - drain_all_pages(zone); - return 0; } #endif diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 2c7d03675903..b59054ef2e10 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -23,7 +23,8 @@ struct wp_walk { /** * wp_pte - Write-protect a pte * @pte: Pointer to the pte - * @addr: The virtual page address + * @addr: The start of protecting virtual address + * @end: The end of protecting virtual address * @walk: pagetable walk callback argument * * The function write-protects a pte and records the range in @@ -74,7 +75,8 @@ struct clean_walk { * clean_record_pte - Clean a pte and record its address space offset in a * bitmap * @pte: Pointer to the pte - * @addr: The virtual page address + * @addr: The start of virtual address to be clean + * @end: The end of virtual address to be clean * @walk: pagetable walk callback argument * * The function cleans a pte and records the range in diff --git a/mm/memblock.c b/mm/memblock.c index b68ee86788af..049df4163a97 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1926,6 +1926,85 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); +static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *start_pg, *end_pg; + phys_addr_t pg, pgend; + + /* + * Convert start_pfn/end_pfn to a struct page pointer. + */ + start_pg = pfn_to_page(start_pfn - 1) + 1; + end_pg = pfn_to_page(end_pfn - 1) + 1; + + /* + * Convert to physical addresses, and round start upwards and end + * downwards. + */ + pg = PAGE_ALIGN(__pa(start_pg)); + pgend = __pa(end_pg) & PAGE_MASK; + + /* + * If there are free pages between these, free the section of the + * memmap array. + */ + if (pg < pgend) + memblock_free(pg, pgend - pg); +} + +/* + * The mem_map array can get very big. Free the unused area of the memory map. + */ +static void __init free_unused_memmap(void) +{ + unsigned long start, end, prev_end = 0; + int i; + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) || + IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; + + /* + * This relies on each bank being in address order. + * The banks are sorted previously in bootmem_init(). + */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { +#ifdef CONFIG_SPARSEMEM + /* + * Take care not to free memmap entries that don't exist + * due to SPARSEMEM sections which aren't present. + */ + start = min(start, ALIGN(prev_end, PAGES_PER_SECTION)); +#else + /* + * Align down here since the VM subsystem insists that the + * memmap entries are valid from the bank start aligned to + * MAX_ORDER_NR_PAGES. + */ + start = round_down(start, MAX_ORDER_NR_PAGES); +#endif + + /* + * If we had a previous bank, and there is a space + * between the current bank and the previous, free it. + */ + if (prev_end && prev_end < start) + free_memmap(prev_end, start); + + /* + * Align up here since the VM subsystem insists that the + * memmap entries are valid from the bank end aligned to + * MAX_ORDER_NR_PAGES. 
+ */ + prev_end = ALIGN(end, MAX_ORDER_NR_PAGES); + } + +#ifdef CONFIG_SPARSEMEM + if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) + free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); +#endif +} + static void __init __free_pages_memory(unsigned long start, unsigned long end) { int order; @@ -2012,6 +2091,7 @@ unsigned long __init memblock_free_all(void) { unsigned long pages; + free_unused_memmap(); reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29459a6ce1c7..b9419a3605eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -623,14 +623,9 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, if (mz->usage_in_excess < mz_node->usage_in_excess) { p = &(*p)->rb_left; rightmost = false; - } - - /* - * We can't avoid mem cgroups that are over their soft - * limit by the same amount - */ - else if (mz->usage_in_excess >= mz_node->usage_in_excess) + } else { p = &(*p)->rb_right; + } } if (rightmost) @@ -858,7 +853,25 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, + int val) +{ + struct page *head = compound_head(page); /* rmap on tail pages */ + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!head->mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + return; + } + + lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + __mod_lruvec_state(lruvec, idx, val); +} +EXPORT_SYMBOL(__mod_lruvec_page_state); + +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; @@ -882,17 +895,6 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } -void mod_memcg_obj_state(void *p, int idx, int val) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_obj(p); - if (memcg) - mod_memcg_state(memcg, idx, val); - rcu_read_unlock(); -} - /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -1157,12 +1159,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) pos = prev; - if (!root->use_hierarchy && root != root_mem_cgroup) { - if (prev) - goto out; - return root; - } - rcu_read_lock(); if (reclaim) { @@ -1242,7 +1238,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, out_unlock: rcu_read_unlock(); -out: if (prev && prev != root) css_put(&prev->css); @@ -1340,7 +1335,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, * @page: the page * @pgdat: pgdat of the page * - * This function relies on page->mem_cgroup being stable - see the + * This function relies on page's memcg being stable - see the * access rules in commit_charge(). */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) @@ -1499,6 +1494,7 @@ static struct memory_stat memory_stats[] = { { "anon", PAGE_SIZE, NR_ANON_MAPPED }, { "file", PAGE_SIZE, NR_FILE_PAGES }, { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, + { "pagetables", PAGE_SIZE, NR_PAGETABLE }, { "percpu", 1, MEMCG_PERCPU_B }, { "sock", PAGE_SIZE, MEMCG_SOCK }, { "shmem", PAGE_SIZE, NR_SHMEM }, @@ -1512,6 +1508,8 @@ static struct memory_stat memory_stats[] = { * constant(e.g. powerpc). 
*/ { "anon_thp", 0, NR_ANON_THPS }, + { "file_thp", 0, NR_FILE_THPS }, + { "shmem_thp", 0, NR_SHMEM_THPS }, #endif { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, @@ -1542,7 +1540,9 @@ static int __init memory_stats_init(void) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_ANON_THPS) + if (memory_stats[i].idx == NR_ANON_THPS || + memory_stats[i].idx == NR_FILE_THPS || + memory_stats[i].idx == NR_SHMEM_THPS) memory_stats[i].ratio = HPAGE_PMD_SIZE; #endif VM_BUG_ON(!memory_stats[i].ratio); @@ -2891,7 +2891,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) { VM_BUG_ON_PAGE(page->mem_cgroup, page); /* - * Any of the following ensures page->mem_cgroup stability: + * Any of the following ensures page's memcg stability: * * - the page lock * - LRU isolation @@ -2987,6 +2987,7 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) objcg = rcu_dereference(memcg->objcg); if (objcg && obj_cgroup_tryget(objcg)) break; + objcg = NULL; } rcu_read_unlock(); @@ -3246,8 +3247,10 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) * independently later. */ rcu_read_lock(); +retry: memcg = obj_cgroup_memcg(objcg); - css_get(&memcg->css); + if (unlikely(!css_tryget(&memcg->css))) + goto retry; rcu_read_unlock(); nr_pages = size >> PAGE_SHIFT; @@ -3470,22 +3473,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, } /* - * Test whether @memcg has children, dead or alive. Note that this - * function doesn't care whether @memcg has use_hierarchy enabled and - * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsibility. - */ -static inline bool memcg_has_children(struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = css_next_child(NULL, &memcg->css); - rcu_read_unlock(); - return ret; -} - -/* * Reclaims as many pages from the given memcg as possible. * * Caller is responsible for holding css reference for memcg. @@ -3533,37 +3520,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->use_hierarchy; + return 1; } static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - - if (memcg->use_hierarchy == val) + if (val == 1) return 0; - /* - * If parent's use_hierarchy is set, we can't make any modifications - * in the child subtrees. If it is unset, then the change can - * occur, provided the current cgroup has no children. - * - * For the root cgroup, parent_mem is NULL, we allow value to be - * set if there are no children. - */ - if ((!parent_memcg || !parent_memcg->use_hierarchy) && - (val == 1 || val == 0)) { - if (!memcg_has_children(memcg)) - memcg->use_hierarchy = val; - else - retval = -EBUSY; - } else - retval = -EINVAL; + pr_warn_once("Non-hierarchical mode is deprecated. 
" + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); - return retval; + return -EINVAL; } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3712,12 +3682,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); - /* - * A memory cgroup is considered kmem-online as soon as it gets - * kmemcg_id. Setting the id after enabling static branching will - * guarantee no one starts accounting before all call sites are - * patched. - */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; @@ -3757,8 +3721,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; } rcu_read_unlock(); @@ -5349,38 +5311,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; - } - if (!parent) { - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); - } else if (parent->use_hierarchy) { - memcg->use_hierarchy = true; + page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { - page_counter_init(&memcg->memory, &root_mem_cgroup->memory); - page_counter_init(&memcg->swap, &root_mem_cgroup->swap); - page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); - page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); - /* - * Deeper hierachy with use_hierarchy == false doesn't make - * much sense so let cgroup subsystem know about this - * unfortunate state in our controller. - */ - if (parent != root_mem_cgroup) - memory_cgrp_subsys.broken_hierarchy = true; - } + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); - /* The following stuff does not apply to the root */ - if (!parent) { root_mem_cgroup = memcg; return &memcg->css; } + /* The following stuff does not apply to the root */ error = memcg_online_kmem(memcg); if (error) goto fail; @@ -6217,24 +6163,6 @@ static void mem_cgroup_move_task(void) } #endif -/* - * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify whether we're attached to the default hierarchy on each mount - * attempt. - */ -static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) -{ - /* - * use_hierarchy is forced on the default hierarchy. cgroup core - * guarantees that @root doesn't have any children, so turning it - * on for the root memcg is enough. 
- */ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - root_mem_cgroup->use_hierarchy = true; - else - root_mem_cgroup->use_hierarchy = false; -} - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6572,7 +6500,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, - .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, @@ -6995,7 +6922,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) if (newpage->mem_cgroup) return; - /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; @@ -7354,9 +7280,9 @@ bool mem_cgroup_swap_full(struct page *page) static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - cgroup_memory_noswap = 0; + cgroup_memory_noswap = false; else if (!strcmp(s, "0")) - cgroup_memory_noswap = 1; + cgroup_memory_noswap = true; return 1; } __setup("swapaccount=", setup_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5d880d4eb9a2..5a38e9eade94 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -263,8 +263,8 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) } /* - * When a unknown page type is encountered drain as many buffers as possible - * in the hope to turn the page into a LRU or free page, which we can handle. + * Unknown page type encountered. Try to check whether it can turn PageLRU by + * lru_add_drain_all, or a free page by reclaiming slabs when possible. */ void shake_page(struct page *p, int access) { @@ -273,9 +273,6 @@ void shake_page(struct page *p, int access) if (!PageSlab(p)) { lru_add_drain_all(); - if (PageLRU(p)) - return; - drain_all_pages(page_zone(p)); if (PageLRU(p) || is_free_buddy_page(p)) return; } @@ -809,7 +806,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) */ static int me_huge_page(struct page *p, unsigned long pfn) { - int res = 0; + int res; struct page *hpage = compound_head(p); struct address_space *mapping; @@ -820,6 +817,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) if (mapping) { res = truncate_error_page(hpage, pfn, mapping); } else { + res = MF_FAILED; unlock_page(hpage); /* * migration entry prevents later access on error anonymous @@ -828,8 +826,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) */ if (PageAnon(hpage)) put_page(hpage); - dissolve_free_huge_page(p); - res = MF_RECOVERED; + if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } lock_page(hpage); } @@ -946,13 +946,13 @@ static int page_action(struct page_state *ps, struct page *p, } /** - * get_hwpoison_page() - Get refcount for memory error handling: + * __get_hwpoison_page() - Get refcount for memory error handling: * @page: raw error page (hit by memory error) * * Return: return 0 if failed to grab the refcount, otherwise true (some * non-zero value.) */ -static int get_hwpoison_page(struct page *page) +static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); @@ -983,13 +983,80 @@ static int get_hwpoison_page(struct page *page) } /* + * Safely get reference count of an arbitrary page. + * + * Returns 0 for a free page, 1 for an in-use page, + * -EIO for a page-type we cannot handle and -EBUSY if we raced with an + * allocation. 
+ * We only incremented refcount in case the page was already in-use and it + * is a known type we can handle. + */ +static int get_any_page(struct page *p, unsigned long flags) +{ + int ret = 0, pass = 0; + bool count_increased = false; + + if (flags & MF_COUNT_INCREASED) + count_increased = true; + +try_again: + if (!count_increased && !__get_hwpoison_page(p)) { + if (page_count(p)) { + /* We raced with an allocation, retry. */ + if (pass++ < 3) + goto try_again; + ret = -EBUSY; + } else if (!PageHuge(p) && !is_free_buddy_page(p)) { + /* We raced with put_page, retry. */ + if (pass++ < 3) + goto try_again; + ret = -EIO; + } + } else { + if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { + ret = 1; + } else { + /* + * A page we cannot handle. Check whether we can turn + * it into something we can handle. + */ + if (pass++ < 3) { + put_page(p); + shake_page(p, 1); + count_increased = false; + goto try_again; + } + put_page(p); + ret = -EIO; + } + } + + return ret; +} + +static int get_hwpoison_page(struct page *p, unsigned long flags, + enum mf_flags ctxt) +{ + int ret; + + zone_pcp_disable(page_zone(p)); + if (ctxt == MF_SOFT_OFFLINE) + ret = get_any_page(p, flags); + else + ret = __get_hwpoison_page(p); + zone_pcp_enable(page_zone(p)); + + return ret; +} + +/* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success = true; @@ -1162,7 +1229,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) { /* * Check "filter hit" and "race with other subpage." */ @@ -1176,9 +1243,13 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) } } unlock_page(head); - dissolve_free_huge_page(p); - action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED); - return 0; + res = MF_FAILED; + if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } lock_page(head); @@ -1231,6 +1302,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, loff_t start; dax_entry_t cookie; + if (flags & MF_COUNT_INCREASED) + /* + * Drop the extra refcount in case we come from madvise(). + */ + put_page(page); + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by @@ -1319,6 +1396,7 @@ int memory_failure(unsigned long pfn, int flags) struct dev_pagemap *pgmap; int res; unsigned long page_flags; + bool retry = true; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1336,6 +1414,7 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } +try_again: if (PageHuge(p)) return memory_failure_hugetlb(pfn, flags); if (TestSetPageHWPoison(p)) { @@ -1358,10 +1437,23 @@ int memory_failure(unsigned long pfn, int flags) * In fact it's dangerous to directly bump up page count from 0, * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. 
*/ - if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) { if (is_free_buddy_page(p)) { - action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); - return 0; + if (take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } else { + /* We lost the race, try again */ + if (retry) { + ClearPageHWPoison(p); + num_poisoned_pages_dec(); + retry = false; + goto try_again; + } + res = MF_FAILED; + } + action_result(pfn, MF_MSG_BUDDY, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; @@ -1385,14 +1477,6 @@ int memory_failure(unsigned long pfn, int flags) * walked by the page reclaim code, however that's not a big loss. */ shake_page(p, 0); - /* shake_page could have turned it free. */ - if (!PageLRU(p) && is_free_buddy_page(p)) { - if (flags & MF_COUNT_INCREASED) - action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); - else - action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); - return 0; - } lock_page(p); @@ -1596,6 +1680,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + unsigned long flags = 0; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1640,7 +1725,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - if (!get_hwpoison_page(p)) { + if (!get_hwpoison_page(p, flags, 0)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", @@ -1671,75 +1756,6 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -/* - * Safely get reference count of an arbitrary page. - * Returns 0 for a free page, -EIO for a zero refcount page - * that is not free, and 1 for any other page type. - * For 1 the page is returned with increased page count, otherwise not. - */ -static int __get_any_page(struct page *p, unsigned long pfn, int flags) -{ - int ret; - - if (flags & MF_COUNT_INCREASED) - return 1; - - /* - * When the target page is a free hugepage, just remove it - * from free hugepage list. - */ - if (!get_hwpoison_page(p)) { - if (PageHuge(p)) { - pr_info("%s: %#lx free huge page\n", __func__, pfn); - ret = 0; - } else if (is_free_buddy_page(p)) { - pr_info("%s: %#lx free buddy page\n", __func__, pfn); - ret = 0; - } else if (page_count(p)) { - /* raced with allocation */ - ret = -EBUSY; - } else { - pr_info("%s: %#lx: unknown zero refcount page type %lx\n", - __func__, pfn, p->flags); - ret = -EIO; - } - } else { - /* Not a free page */ - ret = 1; - } - return ret; -} - -static int get_any_page(struct page *page, unsigned long pfn, int flags) -{ - int ret = __get_any_page(page, pfn, flags); - - if (ret == -EBUSY) - ret = __get_any_page(page, pfn, flags); - - if (ret == 1 && !PageHuge(page) && - !PageLRU(page) && !__PageMovable(page)) { - /* - * Try to free it. - */ - put_page(page); - shake_page(page, 1); - - /* - * Did it turn free? 
- */ - ret = __get_any_page(page, pfn, 0); - if (ret == 1 && !PageLRU(page)) { - /* Drop page reference which is from __get_any_page() */ - put_page(page); - pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", - pfn, page->flags, &page->flags); - return -EIO; - } - } - return ret; -} - static bool isolate_page(struct page *page, struct list_head *pagelist) { bool isolated = false; @@ -1839,11 +1855,11 @@ static int __soft_offline_page(struct page *page) pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n", pfn, msg_page[huge], ret, page->flags, &page->flags); if (ret > 0) - ret = -EIO; + ret = -EBUSY; } } else { - pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n", - pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags); + pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n", + pfn, msg_page[huge], page_count(page), page->flags, &page->flags); ret = -EBUSY; } return ret; @@ -1905,7 +1921,7 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; if (PageHWPoison(page)) { - pr_info("soft offline: %#lx page already poisoned\n", pfn); + pr_info("%s: %#lx page already poisoned\n", __func__, pfn); if (flags & MF_COUNT_INCREASED) put_page(page); return 0; @@ -1913,16 +1929,20 @@ int soft_offline_page(unsigned long pfn, int flags) retry: get_online_mems(); - ret = get_any_page(page, pfn, flags); + ret = get_hwpoison_page(page, flags, MF_SOFT_OFFLINE); put_online_mems(); - if (ret > 0) + if (ret > 0) { ret = soft_offline_in_use_page(page); - else if (ret == 0) + } else if (ret == 0) { if (soft_offline_free_page(page) && try_again) { try_again = false; goto retry; } + } else if (ret == -EIO) { + pr_info("%s: %#lx: unknown page type: %lx (%pGP)\n", + __func__, pfn, page->flags, &page->flags); + } return ret; } diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..4a42a74a2240 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. + * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. + */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; @@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow) + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } return ret; } @@ -4874,11 +4885,10 @@ EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* - * Access another process' address space as given in mm. If non-NULL, use the - * given task for page fault accounting. + * Access another process' address space as given in mm. 
*/ -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; @@ -4955,7 +4965,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -4973,7 +4983,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, if (!mm) return 0; - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 63b2e46b6555..e0a561c550b3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -596,8 +596,7 @@ void generic_online_page(struct page *page, unsigned int order) * so we should map it first. This is better than introducing a special * case in page freeing fast path. */ - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); __free_pages_core(page, order); totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM @@ -1304,7 +1303,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (WARN_ON(PageLRU(page))) isolate_lru_page(page); if (page_mapped(page)) - try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_IGNORE_MLOCK); continue; } @@ -1492,13 +1491,19 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) } node = zone_to_nid(zone); + /* + * Disable pcplists so that page isolation cannot race with freeing + * in a way that pages from isolated pageblock are left on pcplists. + */ + zone_pcp_disable(zone); + /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, MEMORY_OFFLINE | REPORT_FAILURE); if (ret) { reason = "failure to isolate range"; - goto failed_removal; + goto failed_removal_pcplists_disabled; } arg.start_pfn = start_pfn; @@ -1550,21 +1555,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) goto failed_removal_isolated; } - /* - * per-cpu pages are drained in start_isolate_page_range, but if - * there are still pages that are not free, make sure that we - * drain again, because when we isolated range we might - * have raced with another thread that was adding pages to pcp - * list. - * - * Forward progress should be still guaranteed because - * pages on the pcp list can only belong to MOVABLE_ZONE - * because has_unmovable_pages explicitly checks for - * PageBuddy on freed pages on other zones. - */ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); - if (ret) - drain_all_pages(zone); + } while (ret); /* Mark all sections offline and remove free pages from the buddy. 
*/ @@ -1580,6 +1572,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); + zone_pcp_enable(zone); + /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); zone->present_pages -= nr_pages; @@ -1612,6 +1606,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); +failed_removal_pcplists_disabled: + zone_pcp_enable(zone); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3ca4898f3f24..8cf96bd21341 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1114,9 +1114,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err; nodemask_t tmp; - err = migrate_prep(); - if (err) - return err; + migrate_prep(); mmap_read_lock(mm); @@ -1315,9 +1313,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - err = migrate_prep(); - if (err) - goto mpol_out; + migrate_prep(); } { NODEMASK_SCRATCH(scratch); diff --git a/mm/migrate.c b/mm/migrate.c index 5795cb82e27c..ee802cb509a3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -62,7 +62,7 @@ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is * undesirable, use migrate_prep_local() */ -int migrate_prep(void) +void migrate_prep(void) { /* * Clear the LRU lists so pages can be isolated. @@ -71,16 +71,12 @@ int migrate_prep(void) * pages that may be busy. */ lru_add_drain_all(); - - return 0; } /* Do the necessary work of migrate_prep but not if it involves other CPUs */ -int migrate_prep_local(void) +void migrate_prep_local(void) { lru_add_drain(); - - return 0; } int isolate_movable_page(struct page *page, isolate_mode_t mode) @@ -1106,7 +1102,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * and treated as swapcache but it has no rmap yet. * Calling try_to_unmap() against a page->mapping==NULL page will * trigger a BUG. So handle it here. - * 2. An orphaned page (see truncate_complete_page) might have + * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory * offlining. Everywhere else except page reclaim, the page is * invisible to the vm, so the page can not be migrated. So try to @@ -1122,8 +1118,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK); page_was_mapped = 1; } @@ -1169,13 +1164,14 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, - enum migrate_reason reason) + enum migrate_reason reason, + struct list_head *ret) { int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; if (!thp_migration_supported() && PageTransHuge(page)) - return -ENOMEM; + return -ENOSYS; if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ @@ -1206,7 +1202,14 @@ out: * migrated will have kept its references and be restored. 
*/ list_del(&page->lru); + } + /* + * If migration is successful, releases reference grabbed during + * isolation. Otherwise, restore the page to right list unless + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { /* * Compaction can migrate also non-LRU pages which are * not accounted to NR_ISOLATED_*. They can be recognized @@ -1215,35 +1218,16 @@ out: if (likely(!__PageMovable(page))) mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), -thp_nr_pages(page)); - } - /* - * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the page to right list unless - * we want to retry. - */ - if (rc == MIGRATEPAGE_SUCCESS) { if (reason != MR_MEMORY_FAILURE) /* * We release the page in page_handle_poison. */ put_page(page); } else { - if (rc != -EAGAIN) { - if (likely(!__PageMovable(page))) { - putback_lru_page(page); - goto put_new; - } + if (rc != -EAGAIN) + list_add_tail(&page->lru, ret); - lock_page(page); - if (PageMovable(page)) - putback_movable_page(page); - else - __ClearPageIsolated(page); - unlock_page(page); - put_page(page); - } -put_new: if (put_new_page) put_new_page(newpage, private); else @@ -1274,7 +1258,8 @@ put_new: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, + struct list_head *ret) { int rc = -EAGAIN; int page_was_mapped = 0; @@ -1290,7 +1275,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * kicking migration. */ if (!hugepage_migration_supported(page_hstate(hpage))) { - putback_active_hugepage(hpage); + list_move_tail(&hpage->lru, ret); return -ENOSYS; } @@ -1329,8 +1314,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_mapped(hpage)) { bool mapping_locked = false; - enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK| - TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; if (!PageAnon(hpage)) { /* @@ -1376,8 +1360,10 @@ put_anon: out_unlock: unlock_page(hpage); out: - if (rc != -EAGAIN) + if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); + else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS) + list_move_tail(&hpage->lru, ret); /* * If migration was not successful and there's a freeing callback, use @@ -1392,6 +1378,20 @@ out: return rc; } +static inline int try_split_thp(struct page *page, struct page **page2, + struct list_head *from) +{ + int rc = 0; + + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) + list_safe_reset_next(page, *page2, lru); + + return rc; +} + /* * migrate_pages - migrate the pages specified in a list, to the free pages * supplied as the target for the page migration @@ -1408,8 +1408,8 @@ out: * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. - * The caller should call putback_movable_pages() to return pages to the LRU - * or free list only if ret != 0. + * It is caller's responsibility to call putback_movable_pages() to return pages + * to the LRU or free list only if ret != 0. * * Returns the number of pages that were not migrated, or an error code. 
*/ @@ -1430,6 +1430,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; + LIST_HEAD(ret_pages); if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -1452,31 +1453,56 @@ retry: if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode, reason); + pass > 2, mode, reason, + &ret_pages); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, - reason); - + reason, &ret_pages); + /* + * The rules are: + * Success: non hugetlb page will be freed, hugetlb + * page will be put back + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * Other errno: put on ret_pages list then splice to + * from list + */ switch(rc) { + /* + * THP migration might be unsupported or the + * allocation could've failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + case -ENOSYS: + /* THP migration is unsupported */ + if (is_thp) { + if (!try_split_thp(page, &page2, from)) { + nr_thp_split++; + goto retry; + } + + nr_thp_failed++; + nr_failed += nr_subpages; + break; + } + + /* Hugetlb migration is unsupported */ + nr_failed++; + break; case -ENOMEM: /* - * THP migration might be unsupported or the - * allocation could've failed so we should - * retry on the same page with the THP split - * to base pages. - * - * Head page is retried immediately and tail - * pages are added to the tail of the list so - * we encounter them after the rest of the list - * is processed. + * When memory is low, don't bother to try to migrate + * other pages, just exit. */ if (is_thp) { - lock_page(page); - rc = split_huge_page_to_list(page, from); - unlock_page(page); - if (!rc) { - list_safe_reset_next(page, page2, lru); + if (!try_split_thp(page, &page2, from)) { nr_thp_split++; goto retry; } @@ -1504,7 +1530,7 @@ retry: break; default: /* - * Permanent failure (-EBUSY, -ENOSYS, etc.): + * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed page is * removed from migration page list and not * retried in the next outer loop. @@ -1523,6 +1549,12 @@ retry: nr_thp_failed += thp_retry; rc = nr_failed; out: + /* + * Put the permanent failure page back to migration list, they + * will be put back to the right list by the caller. + */ + list_splice(&ret_pages, from); + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); @@ -1698,7 +1730,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node, * Positive err means the number of failed * pages to migrate. Since we are going to * abort and return the number of non-migrated - * pages, so need to incude the rest of the + * pages, so need to include the rest of the * nr_pages that have not been attempted as * well. */ @@ -2065,6 +2097,17 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } +static inline bool is_shared_exec_page(struct vm_area_struct *vma, + struct page *page) +{ + if (page_mapcount(page) != 1 && + (page_is_file_lru(page) || vma_is_shmem(vma)) && + (vma->vm_flags & VM_EXEC)) + return true; + + return false; +} + /* * Attempt to migrate a misplaced page to the specified destination * node. 
Caller is expected to have an elevated reference count on @@ -2082,8 +2125,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1 && page_is_file_lru(page) && - (vma->vm_flags & VM_EXEC)) + if (is_shared_exec_page(vma, page)) goto out; /* @@ -2138,6 +2180,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; + if (is_shared_exec_page(vma, page)) + goto out; + new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2249,6 +2294,7 @@ out_fail: out_unlock: unlock_page(page); +out: put_page(page); return 0; } @@ -2688,7 +2734,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) */ static void migrate_vma_unmap(struct migrate_vma *migrate) { - int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; unsigned long addr, i, restore = 0; @@ -2848,8 +2894,7 @@ EXPORT_SYMBOL(migrate_vma_setup); static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, struct page *page, - unsigned long *src, - unsigned long *dst) + unsigned long *src) { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; @@ -3003,16 +3048,14 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (!notified) { notified = true; - mmu_notifier_range_init(&range, - MMU_NOTIFY_CLEAR, 0, - NULL, - migrate->vma->vm_mm, - addr, migrate->end); + mmu_notifier_range_init_migrate(&range, 0, + migrate->vma, migrate->vma->vm_mm, + addr, migrate->end, + migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i], - &migrate->dst[i]); + &migrate->src[i]); continue; } diff --git a/mm/mm_init.c b/mm/mm_init.c index b06a30fbedff..8e02e865cc65 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -173,6 +173,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, case MEM_ONLINE: case MEM_OFFLINE: mm_compute_batch(sysctl_overcommit_memory); + break; default: break; } diff --git a/mm/mmap.c b/mm/mmap.c index 5c8b4485860d..10598e5d4757 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2731,8 +2731,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (vma->vm_ops && vma->vm_ops->split) { - err = vma->vm_ops->split(vma, addr); + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); if (err) return err; } @@ -3405,10 +3405,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma) +static int special_mapping_mremap(struct vm_area_struct *new_vma, + unsigned long flags) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; @@ -3418,6 +3422,17 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma) return 0; } +static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting special mappings - kernel has expectations over + 
* the number of pages in mapping. Together with VM_DONTEXPAND + * the size of vma should stay the same over the special mapping's + * lifetime. + */ + return -EINVAL; +} + static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, @@ -3425,6 +3440,7 @@ static const struct vm_operations_struct special_mapping_vmops = { .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, + .may_split = special_mapping_split, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c new file mode 100644 index 000000000000..dcdde4f722a4 --- /dev/null +++ b/mm/mmap_lock.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include <trace/events/mmap_lock.h> + +#include <linux/mm.h> +#include <linux/cgroup.h> +#include <linux/memcontrol.h> +#include <linux/mmap_lock.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/smp.h> +#include <linux/trace_events.h> + +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); + +#ifdef CONFIG_MEMCG + +/* + * Our various events all share the same buffer (because we don't want or need + * to allocate a set of buffers *per event type*), so we need to protect against + * concurrent _reg() and _unreg() calls, and count how many _reg() calls have + * been made. + */ +static DEFINE_MUTEX(reg_lock); +static int reg_refcount; /* Protected by reg_lock. */ + +/* + * Size of the buffer for memcg path names. Ignoring stack trace support, + * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. + */ +#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL + +/* + * How many contexts our trace events might be called in: normal, softirq, irq, + * and NMI. + */ +#define CONTEXT_COUNT 4 + +static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +static char **tmp_bufs; +static DEFINE_PER_CPU(int, memcg_path_buf_idx); + +/* Called with reg_lock held. */ +static void free_memcg_path_bufs(void) +{ + int cpu; + char **old = tmp_bufs; + + for_each_possible_cpu(cpu) { + *(old++) = rcu_dereference_protected( + per_cpu(memcg_path_buf, cpu), + lockdep_is_held(®_lock)); + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + } + + /* Wait for inflight memcg_path_buf users to finish. */ + synchronize_rcu(); + + old = tmp_bufs; + for_each_possible_cpu(cpu) { + kfree(*(old++)); + } + + kfree(tmp_bufs); + tmp_bufs = NULL; +} + +int trace_mmap_lock_reg(void) +{ + int cpu; + char *new; + + mutex_lock(®_lock); + + /* If the refcount is going 0->1, proceed with allocating buffers. */ + if (reg_refcount++) + goto out; + + tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), + GFP_KERNEL); + if (tmp_bufs == NULL) + goto out_fail; + + for_each_possible_cpu(cpu) { + new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); + if (new == NULL) + goto out_fail_free; + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + /* Don't need to wait for inflights, they'd have gotten NULL. */ + } + +out: + mutex_unlock(®_lock); + return 0; + +out_fail_free: + free_memcg_path_bufs(); +out_fail: + /* Since we failed, undo the earlier ref increment. 
*/ + --reg_refcount; + + mutex_unlock(®_lock); + return -ENOMEM; +} + +void trace_mmap_lock_unreg(void) +{ + mutex_lock(®_lock); + + /* If the refcount is going 1->0, proceed with freeing buffers. */ + if (--reg_refcount) + goto out; + + free_memcg_path_bufs(); + +out: + mutex_unlock(®_lock); +} + +static inline char *get_memcg_path_buf(void) +{ + char *buf; + int idx; + + rcu_read_lock(); + buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + if (buf == NULL) { + rcu_read_unlock(); + return NULL; + } + idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + MEMCG_PATH_BUF_SIZE; + return &buf[idx]; +} + +static inline void put_memcg_path_buf(void) +{ + this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + rcu_read_unlock(); +} + +/* + * Write the given mm_struct's memcg path to a percpu buffer, and return a + * pointer to it. If the path cannot be determined, or no buffer was available + * (because the trace event is being unregistered), NULL is returned. + * + * Note: buffers are allocated per-cpu to avoid locking, so preemption must be + * disabled by the caller before calling us, and re-enabled only after the + * caller is done with the pointer. + * + * The caller must call put_memcg_path_buf() once the buffer is no longer + * needed. This must be done while preemption is still disabled. + */ +static const char *get_mm_memcg_path(struct mm_struct *mm) +{ + char *buf = NULL; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + + if (memcg == NULL) + goto out; + if (unlikely(memcg->css.cgroup == NULL)) + goto out_put; + + buf = get_memcg_path_buf(); + if (buf == NULL) + goto out_put; + + cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); + +out_put: + css_put(&memcg->css); +out: + return buf; +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + preempt_disable(); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + preempt_enable(); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +/* + * Trace calls must be in a separate file, as otherwise there's a circular + * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. 
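get_memcg_path_buf()/put_memcg_path_buf() above carve each CPU's single allocation into CONTEXT_COUNT stacked slices, so an event fired from (soft)irq or NMI context while a lower context is still formatting its path gets a fresh slice instead of clobbering it. A minimal user-space model of that LIFO index discipline, assuming a single CPU; all names below are invented for illustration:

#include <stdio.h>

#define PATH_BUF_SIZE	64	/* stands in for MEMCG_PATH_BUF_SIZE */
#define CONTEXT_COUNT	4	/* normal, softirq, irq, NMI */

static char buf[PATH_BUF_SIZE * CONTEXT_COUNT];	/* one "per-CPU" allocation */
static int buf_idx;				/* the "per-CPU" index */

static char *get_path_buf(void)
{
	/* claim the next slice, like this_cpu_add_return() in the patch */
	buf_idx += PATH_BUF_SIZE;
	return &buf[buf_idx - PATH_BUF_SIZE];
}

static void put_path_buf(void)
{
	buf_idx -= PATH_BUF_SIZE;	/* release the slice, strictly LIFO */
}

int main(void)
{
	char *outer = get_path_buf();	/* e.g. the task-context event */
	char *inner = get_path_buf();	/* e.g. an interrupt nesting on top */

	snprintf(inner, PATH_BUF_SIZE, "/irq-level-memcg");
	snprintf(outer, PATH_BUF_SIZE, "/task-level-memcg");
	printf("outer=%s inner=%s (distinct slices)\n", outer, inner);

	put_path_buf();
	put_path_buf();
	return 0;
}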
+ */ + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); + +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success) +{ + TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); + +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(released, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_released); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5654dd19addc..61ee40ed804e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -612,13 +612,6 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); - if (IS_ENABLED(CONFIG_LOCKDEP)) { - fs_reclaim_acquire(GFP_KERNEL); - lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); - lock_map_release(&__mmu_notifier_invalidate_range_start_map); - fs_reclaim_release(GFP_KERNEL); - } - if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..f337831affc2 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,20 +72,6 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, return z; } -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - if (page_to_pfn(page) != pfn) - return false; - - if (page_zone(page) != zone) - return false; - - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - void lruvec_init(struct lruvec *lruvec) { enum lru_list lru; diff --git a/mm/mremap.c b/mm/mremap.c index 138abbae4f75..c5590afe7165 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -30,12 +30,11 @@ #include "internal.h" -static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) @@ -49,6 +48,18 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) if (pud_none_or_clear_bad(pud)) return NULL; + return pud; +} + +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return NULL; + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; @@ -56,19 +67,27 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, +static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; - pud = pud_alloc(mm, p4d, addr); + + return pud_alloc(mm, p4d, addr); +} + +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = alloc_new_pud(mm, vma, addr); if (!pud) return NULL; @@ -249,14 +268,148 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, return true; } +#else +static inline bool move_normal_pmd(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, + pmd_t *new_pmd) +{ + return false; +} #endif +#ifdef 
CONFIG_HAVE_MOVE_PUD +static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pud_t pud; + + /* + * The destination pud shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON_ONCE(!pud_none(*new_pud))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_lock prevents deadlock. + */ + old_ptl = pud_lock(vma->vm_mm, old_pud); + new_ptl = pud_lockptr(mm, new_pud); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pud */ + pud = *old_pud; + pud_clear(old_pud); + + VM_BUG_ON(!pud_none(*new_pud)); + + /* Set the new pud */ + set_pud_at(mm, new_addr, new_pud, pud); + flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#else +static inline bool move_normal_pud(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, + pud_t *new_pud) +{ + return false; +} +#endif + +enum pgt_entry { + NORMAL_PMD, + HPAGE_PMD, + NORMAL_PUD, +}; + +/* + * Returns an extent of the corresponding size for the pgt_entry specified if + * valid. Else returns a smaller extent bounded by the end of the source and + * destination pgt_entry. + */ +static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, + unsigned long old_end, unsigned long new_addr) +{ + unsigned long next, extent, mask, size; + + switch (entry) { + case HPAGE_PMD: + case NORMAL_PMD: + mask = PMD_MASK; + size = PMD_SIZE; + break; + case NORMAL_PUD: + mask = PUD_MASK; + size = PUD_SIZE; + break; + default: + BUILD_BUG(); + break; + } + + next = (old_addr + size) & mask; + /* even if next overflowed, extent below will be ok */ + extent = (next > old_end) ? old_end - old_addr : next - old_addr; + next = (new_addr + size) & mask; + if (extent > next - new_addr) + extent = next - new_addr; + return extent; +} + +/* + * Attempts to speedup the move by moving entry at the level corresponding to + * pgt_entry. Returns true if the move was successful, else false. 
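get_extent() above clamps each step so it never crosses a PMD/PUD boundary on either the source or the destination side, nor the end of the range. The same arithmetic as a stand-alone sketch, assuming the usual x86-64 2 MiB PMD_SIZE, with one worked case:

#include <stdio.h>

#define PMD_SIZE	(2UL << 20)	/* illustrative x86-64 value */
#define PMD_MASK	(~(PMD_SIZE - 1))

static unsigned long get_extent(unsigned long old_addr, unsigned long old_end,
				unsigned long new_addr)
{
	unsigned long next, extent;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* next src boundary */
	extent = (next > old_end) ? old_end - old_addr : next - old_addr;
	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* next dst boundary */
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}

int main(void)
{
	/* src is 1 MiB below a PMD boundary, dst only 256 KiB below one */
	unsigned long old_addr = 0x700000, old_end = 0x1000000;
	unsigned long new_addr = 0x1fc0000;

	printf("extent = 0x%lx\n", get_extent(old_addr, old_end, new_addr));
	/* prints 0x40000: the destination boundary is the tightest limit */
	return 0;
}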
+ */ +static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, + void *old_entry, void *new_entry, bool need_rmap_locks) +{ + bool moved = false; + + /* See comment in move_ptes() */ + if (need_rmap_locks) + take_rmap_locks(vma); + + switch (entry) { + case NORMAL_PMD: + moved = move_normal_pmd(vma, old_addr, new_addr, old_entry, + new_entry); + break; + case NORMAL_PUD: + moved = move_normal_pud(vma, old_addr, new_addr, old_entry, + new_entry); + break; + case HPAGE_PMD: + moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + move_huge_pmd(vma, old_addr, new_addr, old_entry, + new_entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (need_rmap_locks) + drop_rmap_locks(vma); + + return moved; +} + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks) { - unsigned long extent, next, old_end; + unsigned long extent, old_end; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; @@ -269,53 +422,50 @@ unsigned long move_page_tables(struct vm_area_struct *vma, for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); - next = (old_addr + PMD_SIZE) & PMD_MASK; - /* even if next overflowed, extent below will be ok */ - extent = next - old_addr; - if (extent > old_end - old_addr) - extent = old_end - old_addr; - next = (new_addr + PMD_SIZE) & PMD_MASK; - if (extent > next - new_addr) - extent = next - new_addr; + /* + * If extent is PUD-sized try to speed up the move by moving at the + * PUD level if possible. + */ + extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); + if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { + pud_t *old_pud, *new_pud; + + old_pud = get_old_pud(vma->vm_mm, old_addr); + if (!old_pud) + continue; + new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); + if (!new_pud) + break; + if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, + old_pud, new_pud, need_rmap_locks)) + continue; + } + + extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr); old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { - if (extent == HPAGE_PMD_SIZE) { - bool moved; - /* See comment in move_ptes() */ - if (need_rmap_locks) - take_rmap_locks(vma); - moved = move_huge_pmd(vma, old_addr, new_addr, - old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); - if (moved) - continue; - } + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || + pmd_devmap(*old_pmd)) { + if (extent == HPAGE_PMD_SIZE && + move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks)) + continue; split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; - } else if (extent == PMD_SIZE) { -#ifdef CONFIG_HAVE_MOVE_PMD + } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && + extent == PMD_SIZE) { /* * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. 
*/ - bool moved; - - if (need_rmap_locks) - take_rmap_locks(vma); - moved = move_normal_pmd(vma, old_addr, new_addr, - old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); - if (moved) + if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks)) continue; -#endif } if (pte_alloc(new_vma->vm_mm, new_pmd)) @@ -343,7 +493,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; - int err; + int err = 0; bool need_rmap_locks; /* @@ -353,6 +503,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (vma->vm_ops && vma->vm_ops->may_split) { + if (vma->vm_start != old_addr) + err = vma->vm_ops->may_split(vma, old_addr); + if (!err && vma->vm_end != old_addr + old_len) + err = vma->vm_ops->may_split(vma, old_addr + old_len); + if (err) + return err; + } + /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new @@ -365,18 +524,26 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { + if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + return -ENOMEM; + } + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); - if (!new_vma) + if (!new_vma) { + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) + vm_unacct_memory(new_len >> PAGE_SHIFT); return -ENOMEM; + } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { @@ -398,7 +565,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT) { + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; excess = vma->vm_end - vma->vm_start - old_len; if (old_addr > vma->vm_start && @@ -423,34 +590,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, untrack_pfn_moved(vma); if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { - if (vm_flags & VM_ACCOUNT) { - /* Always put back VM_ACCOUNT since we won't unmap */ - vma->vm_flags |= VM_ACCOUNT; - - vm_acct_memory(new_len >> PAGE_SHIFT); - } - - /* - * VMAs can actually be merged back together in copy_vma - * calling merge_vma. This can happen with anonymous vmas - * which have not yet been faulted, so if we were to consider - * this VMA split we'll end up adding VM_ACCOUNT on the - * next VMA, which is completely unrelated if this VMA - * was re-merged. 
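For context, the move_vma() changes above (the extra memory accounting, the mremap(new_vma, flags) callback, and the VM_ACCOUNT handling) service the user-visible MREMAP_DONTUNMAP mode, where the pages move but the old VMA stays mapped and is simply left empty. A minimal user-space caller, as a sketch (error handling trimmed; requires Linux 5.7+):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* uapi value, for older libc headers */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	unsigned char *old, *new;

	old = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(old, 0xaa, len);

	/* move the pages, but keep the (now empty) old VMA mapped */
	new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (new == MAP_FAILED) {
		perror("mremap");	/* pre-5.7 kernels reject the flag */
		return 1;
	}

	/* new[0] holds the data; old[0] now faults in a fresh zero page */
	printf("old=%p new=%p old[0]=%#x new[0]=%#x\n",
	       (void *)old, (void *)new, (unsigned)old[0], (unsigned)new[0]);
	return 0;
}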
- */ - if (split && new_vma == vma) - split = 0; - /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; /* Because we won't unmap we don't need to touch locked_vm */ - goto out; + return new_addr; } if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - vm_unacct_memory(excess >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) + vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } @@ -458,7 +608,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->locked_vm += new_len >> PAGE_SHIFT; *locked = true; } -out: + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ diff --git a/mm/nommu.c b/mm/nommu.c index 0faf39b32cdb..870fea12823e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1675,8 +1675,8 @@ void filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; @@ -1722,7 +1722,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -1741,7 +1741,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + len = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8b84661a6410..04b19b7b5435 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -170,11 +170,13 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/* - * Print out unreclaimble slabs info when unreclaimable slabs amount is greater - * than all user memory (LRU pages) - */ -static bool is_dump_unreclaim_slabs(void) +/** + * Check whether unreclaimable slab amount is greater than + * all user memory(LRU pages). + * dump_unreclaimable_slab() could help in the case that + * oom due to too much unreclaimable slab used by kernel. 
+*/ +static bool should_dump_unreclaim_slab(void) { unsigned long nr_lru; @@ -463,7 +465,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) mem_cgroup_print_oom_meminfo(oc->memcg); else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); - if (is_dump_unreclaim_slabs()) + if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa227a479e4..b63294517e04 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> @@ -70,6 +71,7 @@ #include <linux/psi.h> #include <linux/padata.h> #include <linux/khugepaged.h> +#include <linux/buffer_head.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -165,53 +167,26 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_alloc); -#else DEFINE_STATIC_KEY_FALSE(init_on_alloc); -#endif EXPORT_SYMBOL(init_on_alloc); -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_free); -#else DEFINE_STATIC_KEY_FALSE(init_on_free); -#endif EXPORT_SYMBOL(init_on_free); +static bool _init_on_alloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) { - int ret; - bool bool_result; - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); - if (bool_result) - static_branch_enable(&init_on_alloc); - else - static_branch_disable(&init_on_alloc); - return 0; + return kstrtobool(buf, &_init_on_alloc_enabled_early); } early_param("init_on_alloc", early_init_on_alloc); +static bool _init_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); static int __init early_init_on_free(char *buf) { - int ret; - bool bool_result; - - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); - if (bool_result) - static_branch_enable(&init_on_free); - else - static_branch_disable(&init_on_free); - return 0; + return kstrtobool(buf, &_init_on_free_enabled_early); } early_param("init_on_free", early_init_on_free); @@ -495,14 +470,6 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, unsigned long pfn, @@ -521,6 +488,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page, return (word >> bitidx) & mask; } +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @mask: mask of bits that the caller is 
interested in + * + * Return: pageblock_bits flags + */ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long mask) { @@ -728,19 +703,6 @@ static int __init early_debug_pagealloc(char *buf) } early_param("debug_pagealloc", early_debug_pagealloc); -void init_debug_pagealloc(void) -{ - if (!debug_pagealloc_enabled()) - return; - - static_branch_enable(&_debug_pagealloc_enabled); - - if (!debug_guardpage_minorder()) - return; - - static_branch_enable(&_debug_guardpage_enabled); -} - static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; @@ -792,6 +754,53 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif +/* + * Enable static keys related to various memory debugging and hardening options. + * Some override others, and depend on early params that are evaluated in the + * order of appearance. So we need to first gather the full picture of what was + * enabled, and then make decisions. + */ +void init_mem_debugging_and_hardening(void) +{ + if (_init_on_alloc_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc\n"); + else + static_branch_enable(&init_on_alloc); + } + if (_init_on_free_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_free\n"); + else + static_branch_enable(&init_on_free); + } + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) + static_branch_enable(&_page_poisoning_enabled); +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return; + + static_branch_enable(&_debug_pagealloc_enabled); + + if (!debug_guardpage_minorder()) + return; + + static_branch_enable(&_debug_guardpage_enabled); +#endif +} + static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -994,7 +1003,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -1007,7 +1016,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: - while (order < max_order - 1) { + while (order < max_order) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -1033,7 +1042,7 @@ continue_merging: pfn = combined_pfn; order++; } - if (max_order < MAX_ORDER) { + if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. * We want to prevent merge between freepages on isolate * pageblock and normal pageblock. 
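The merge loop above climbs the orders by repeatedly combining a block with its buddy; the buddy's PFN differs from the page's only in bit "order" (which is essentially what __find_buddy_pfn() computes), and the merged block starts at the lower of the two PFNs. A toy model of that walk, with the buddy assumed free at every step:

#include <stdio.h>

#define MAX_ORDER 11	/* typical value; order 10 = 1024 pages = 4 MiB */

/* buddy of @pfn at @order: the PFN with bit @order flipped */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1234c0;	/* arbitrary order-0 page */
	unsigned int order = 0, max_order = MAX_ORDER - 1;

	while (order < max_order) {
		unsigned long buddy_pfn = find_buddy_pfn(pfn, order);
		unsigned long combined_pfn = buddy_pfn & pfn;

		printf("order %2u: pfn %#lx + buddy %#lx -> %#lx\n",
		       order, pfn, buddy_pfn, combined_pfn);
		pfn = combined_pfn;
		order++;
	}
	return 0;
}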
Without this, pageblock @@ -1054,7 +1063,7 @@ continue_merging: is_migrate_isolate(buddy_mt))) goto done_merging; } - max_order++; + max_order = order + 1; goto continue_merging; } @@ -1264,7 +1273,8 @@ static __always_inline bool free_pages_prepare(struct page *page, if (want_init_on_free()) kernel_init_free_pages(page, 1 << order); - kernel_poison_pages(page, 1 << order, 0); + kernel_poison_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should @@ -1272,8 +1282,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 0); + debug_pagealloc_unmap_pages(page, 1 << order); kasan_free_nondeferred_pages(page, order); @@ -1344,7 +1353,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int prefetch_nr = 0; + int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; LIST_HEAD(head); @@ -1395,8 +1404,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, * avoid excessive prefetching due to large count, only * prefetch buddy for the first pcp->batch nr of pages. */ - if (prefetch_nr++ < pcp->batch) + if (prefetch_nr) { prefetch_buddy(page); + prefetch_nr--; + } } while (--count && --batch_free && !list_empty(list)); } @@ -1558,14 +1569,23 @@ void __free_pages_core(struct page *page, unsigned int order) #ifdef CONFIG_NEED_MULTIPLE_NODES -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * treats start/end as pfns. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
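The now file-local mminit_pfnnid_cache simply remembers the last memblock region that answered a lookup, so the common case of querying PFNs in ascending order skips the region search entirely. A user-space sketch of the same caching pattern (the region table below is invented):

#include <stdio.h>

struct region { unsigned long start, end; int nid; };

/* a pretend memblock map: two nodes */
static const struct region regions[] = {
	{ 0x00000, 0x80000, 0 },
	{ 0x80000, 0x100000, 1 },
};

struct pfnnid_cache { unsigned long last_start, last_end; int last_nid; };

static int early_pfn_to_nid(unsigned long pfn, struct pfnnid_cache *cache)
{
	unsigned int i;

	if (cache->last_start <= pfn && pfn < cache->last_end)
		return cache->last_nid;			/* hot path: cache hit */

	for (i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
		if (regions[i].start <= pfn && pfn < regions[i].end) {
			cache->last_start = regions[i].start;
			cache->last_end = regions[i].end;
			cache->last_nid = regions[i].nid;
			return cache->last_nid;
		}
	}
	return -1;					/* not memory */
}

int main(void)
{
	struct pfnnid_cache cache = { 0, 0, -1 };
	unsigned long pfn;

	for (pfn = 0x7fffe; pfn < 0x80002; pfn++)
		printf("pfn %#lx -> nid %d\n", pfn, early_pfn_to_nid(pfn, &cache));
	return 0;
}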
*/ -int __meminit __early_pfn_to_nid(unsigned long pfn, +static int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; @@ -1583,7 +1603,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return nid; } -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { @@ -2103,6 +2122,8 @@ void __init page_alloc_init_late(void) files_maxfiles_init(); #endif + buffer_init(); + /* Discard memblock private memory */ memblock_discard(); @@ -2207,12 +2228,6 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(void) -{ - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled()) || want_init_on_free(); -} - #ifdef CONFIG_DEBUG_VM /* * With DEBUG_VM enabled, order-0 pages are checked for expected state when @@ -2270,11 +2285,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); kasan_alloc_pages(page, order); - kernel_poison_pages(page, 1 << order, 1); + kernel_unpoison_pages(page, 1 << order); set_page_owner(page, order, gfp_flags); + + if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2282,9 +2299,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags { post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); - if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -2470,12 +2484,12 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } -static inline void boost_watermark(struct zone *zone) +static inline bool boost_watermark(struct zone *zone) { unsigned long max_boost; if (!watermark_boost_factor) - return; + return false; /* * Don't bother in zones that are unlikely to produce results. * On small machines, including kdump capture kernels running @@ -2483,7 +2497,7 @@ static inline void boost_watermark(struct zone *zone) * memory situation immediately. */ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) - return; + return false; max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); @@ -2497,12 +2511,14 @@ static inline void boost_watermark(struct zone *zone) * boosted watermark resulting in a hang. */ if (!max_boost) - return; + return false; max_boost = max(pageblock_nr_pages, max_boost); zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, max_boost); + + return true; } /* @@ -2540,8 +2556,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * likelihood of future fallbacks. Wake kswapd now as the node * may be balanced overall and kswapd will not wake naturally. */ - boost_watermark(zone); - if (alloc_flags & ALLOC_KSWAPD) + if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); /* We are not allowed to try stealing from the whole block */ @@ -3017,13 +3032,16 @@ static void drain_local_pages_wq(struct work_struct *work) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 
- * - * When zone parameter is non-NULL, spill just the single zone's pages. + * The implementation of drain_all_pages(), exposing an extra parameter to + * drain on all cpus. * - * Note that this can be extremely slow as the draining happens in a workqueue. + * drain_all_pages() is optimized to only execute on cpus where pcplists are + * not empty. The check for non-emptiness can however race with a free to + * pcplist that has not yet increased the pcp->count from 0 to 1. Callers + * that need the guarantee that every CPU has drained can disable the + * optimizing racy check. */ -void drain_all_pages(struct zone *zone) +static void __drain_all_pages(struct zone *zone, bool force_all_cpus) { int cpu; @@ -3062,7 +3080,13 @@ void drain_all_pages(struct zone *zone) struct zone *z; bool has_pcps = false; - if (zone) { + if (force_all_cpus) { + /* + * The pcp.count check is racy, some callers need a + * guarantee that no cpu is missed. + */ + has_pcps = true; + } else if (zone) { pcp = per_cpu_ptr(zone->pageset, cpu); if (pcp->pcp.count) has_pcps = true; @@ -3095,6 +3119,18 @@ void drain_all_pages(struct zone *zone) mutex_unlock(&pcpu_drain_mutex); } +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * When zone parameter is non-NULL, spill just the single zone's pages. + * + * Note that this can be extremely slow as the draining happens in a workqueue. + */ +void drain_all_pages(struct zone *zone) +{ + __drain_all_pages(zone, false); +} + #ifdef CONFIG_HIBERNATION /* @@ -3190,10 +3226,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) pcp = &this_cpu_ptr(zone->pageset)->pcp; list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= pcp->high) { - unsigned long batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, batch, pcp); - } + if (pcp->count >= READ_ONCE(pcp->high)) + free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); } /* @@ -3378,7 +3412,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, + READ_ONCE(pcp->batch), list, migratetype, alloc_flags); if (unlikely(list_empty(list))) return NULL; @@ -4264,10 +4298,8 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); -static bool __need_fs_reclaim(gfp_t gfp_mask) +static bool __need_reclaim(gfp_t gfp_mask) { - gfp_mask = current_gfp_context(gfp_mask); - /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return false; @@ -4276,10 +4308,6 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) if (current->flags & PF_MEMALLOC) return false; - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) - return false; - if (gfp_mask & __GFP_NOLOCKDEP) return false; @@ -4298,15 +4326,29 @@ void __fs_reclaim_release(void) void fs_reclaim_acquire(gfp_t gfp_mask) { - if (__need_fs_reclaim(gfp_mask)) - __fs_reclaim_acquire(); + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_acquire(); + +#ifdef CONFIG_MMU_NOTIFIER + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); +#endif + + } } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { - if (__need_fs_reclaim(gfp_mask)) - __fs_reclaim_release(); + 
gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_release(); + } } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -5007,6 +5049,26 @@ static inline void free_the_page(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_NONE); } +/** + * __free_pages - Free pages allocated with alloc_pages(). + * @page: The page pointer returned from alloc_pages(). + * @order: The order of the allocation. + * + * This function can free multi-page allocations that are not compound + * pages. It does not check that the @order passed in matches that of + * the allocation, so it is easy to leak memory. Freeing more memory + * than was allocated will probably emit a warning. + * + * If the last reference to this page is speculative, it will be released + * by put_page() which only frees the first page of a non-compound + * allocation. To prevent the remaining pages from being leaked, we free + * the subsequent pages here. If you want to use the page's reference + * count to decide when to free the allocation, you should allocate a + * compound page, and use put_page() instead of __free_pages(). + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. + */ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) @@ -5465,7 +5527,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_zone_page_state(NR_PAGETABLE), + global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -5497,6 +5559,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif + " pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5522,6 +5585,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif + K(node_page_state(pgdat, NR_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5553,7 +5617,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" @@ -5574,7 +5637,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), @@ -5904,7 +5966,10 @@ static void build_zonelists(pg_data_t *pgdat) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static void pageset_init(struct per_cpu_pageset *p); +/* These effectively disable the pcplists in the boot pageset completely */ +#define BOOT_PAGESET_HIGH 0 +#define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); @@ -5972,7 +6037,7 @@ build_all_zonelists_init(void) * (a chicken-egg dilemma). 
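The new __free_pages() kernel-doc above boils down to this: the allocator does not record the order of a non-compound allocation, so the caller must hand back exactly what it asked for. A sketched pairing (the helper names are illustrative, not kernel code):

/* illustrative only: pairing alloc_pages() with __free_pages() */
static void *example_get_buffer(struct page **pagep)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* order 2 = 4 pages */

	if (!page)
		return NULL;
	*pagep = page;
	return page_address(page);
}

static void example_put_buffer(struct page *page)
{
	__free_pages(page, 2);		/* order must match the allocation */
}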
*/ for_each_possible_cpu(cpu) - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + pageset_init(&per_cpu(boot_pageset, cpu)); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); @@ -6255,13 +6320,16 @@ static int zone_batchsize(struct zone *zone) } /* - * pcp->high and pcp->batch values are related and dependent on one another: - * ->batch must never be higher then ->high. - * The following function updates them in a safe manner without read side - * locking. + * pcp->high and pcp->batch values are related and generally batch is lower + * than high. They are also related to pcp->count such that count is lower + * than high, and as soon as it reaches high, the pcplist is flushed. * - * Any new users of pcp->batch and pcp->high should ensure they can cope with - * those fields changing asynchronously (acording to the above rule). + * However, guaranteeing these relations at all times would require e.g. write + * barriers here but also careful usage of read barriers at the read side, and + * thus be prone to error and bad for performance. Thus the update only prevents + * store tearing. Any new users of pcp->batch and pcp->high should ensure they + * can cope with those fields changing asynchronously, and fully trust only the + * pcp->count field on the local CPU with interrupts disabled. * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters @@ -6270,21 +6338,8 @@ static int zone_batchsize(struct zone *zone) static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, unsigned long batch) { - /* start with a fail safe value for batch */ - pcp->batch = 1; - smp_wmb(); - - /* Update high, then batch, in order */ - pcp->high = high; - smp_wmb(); - - pcp->batch = batch; -} - -/* a companion to pageset_set_high() */ -static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) -{ - pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); + WRITE_ONCE(pcp->batch, batch); + WRITE_ONCE(pcp->high, high); } static void pageset_init(struct per_cpu_pageset *p) @@ -6297,53 +6352,70 @@ static void pageset_init(struct per_cpu_pageset *p) pcp = &p->pcp; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); + + /* + * Set batch and high values safe for a boot pageset. A true percpu + * pageset's initialization will update them subsequently. Here we don't + * need to be as careful as pageset_update() as nobody can access the + * pageset yet. + */ + pcp->high = BOOT_PAGESET_HIGH; + pcp->batch = BOOT_PAGESET_BATCH; } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, + unsigned long batch) { - pageset_init(p); - pageset_set_batch(p, batch); + struct per_cpu_pageset *p; + int cpu; + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_update(&p->pcp, high, batch); + } } /* - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist - * to the value high for the pageset p. + * Calculate and set new high and batch values for all per-cpu pagesets of a + * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. 
*/ -static void pageset_set_high(struct per_cpu_pageset *p, - unsigned long high) +static void zone_set_pageset_high_and_batch(struct zone *zone) { - unsigned long batch = max(1UL, high / 4); - if ((high / 4) > (PAGE_SHIFT * 8)) - batch = PAGE_SHIFT * 8; + unsigned long new_high, new_batch; - pageset_update(&p->pcp, high, batch); -} + if (percpu_pagelist_fraction) { + new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; + new_batch = max(1UL, new_high / 4); + if ((new_high / 4) > (PAGE_SHIFT * 8)) + new_batch = PAGE_SHIFT * 8; + } else { + new_batch = zone_batchsize(zone); + new_high = 6 * new_batch; + new_batch = max(1UL, 1 * new_batch); + } -static void pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) -{ - if (percpu_pagelist_fraction) - pageset_set_high(pcp, - (zone_managed_pages(zone) / - percpu_pagelist_fraction)); - else - pageset_set_batch(pcp, zone_batchsize(zone)); -} + if (zone->pageset_high == new_high && + zone->pageset_batch == new_batch) + return; -static void __meminit zone_pageset_init(struct zone *zone, int cpu) -{ - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + zone->pageset_high = new_high; + zone->pageset_batch = new_batch; - pageset_init(pcp); - pageset_set_high_and_batch(zone, pcp); + __zone_set_pageset_high_and_batch(zone, new_high, new_batch); } void __meminit setup_zone_pageset(struct zone *zone) { + struct per_cpu_pageset *p; int cpu; + zone->pageset = alloc_percpu(struct per_cpu_pageset); - for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_init(p); + } + + zone_set_pageset_high_and_batch(zone); } /* @@ -6386,6 +6458,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * offset of a (static) per cpu variable into the per cpu area. 
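After this rework every pageset of a zone shares one (high, batch) pair, recomputed from the zone size and percpu_pagelist_fraction. The same arithmetic as a stand-alone sketch with a worked example (4 KiB pages assumed, so PAGE_SHIFT * 8 = 96):

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, for illustration */

static void high_and_batch(unsigned long managed_pages,
			   unsigned long fraction, unsigned long zone_batch,
			   unsigned long *high, unsigned long *batch)
{
	if (fraction) {
		unsigned long quarter;

		*high = managed_pages / fraction;
		quarter = *high / 4;
		*batch = quarter > PAGE_SHIFT * 8 ? PAGE_SHIFT * 8
						  : (quarter ? quarter : 1);
	} else {
		*high = 6 * zone_batch;
		*batch = zone_batch ? zone_batch : 1;
	}
}

int main(void)
{
	unsigned long high, batch;

	/* 4 GiB zone (1M pages), percpu_pagelist_fraction = 8 */
	high_and_batch(1 << 20, 8, 0, &high, &batch);
	printf("fraction=8: high=%lu batch=%lu\n", high, batch);	/* 131072, 96 */

	/* same zone with the sysctl unset: batch comes from zone_batchsize() */
	high_and_batch(1 << 20, 0, 63, &high, &batch);
	printf("fraction=0: high=%lu batch=%lu\n", high, batch);	/* 378, 63 */
	return 0;
}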
*/ zone->pageset = &boot_pageset; + zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", @@ -7791,31 +7865,24 @@ static void calculate_totalreserve_pages(void) static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; - enum zone_type j, idx; + enum zone_type i, j; for_each_online_pgdat(pgdat) { - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone_managed_pages(zone); - - zone->lowmem_reserve[j] = 0; - - idx = j; - while (idx) { - struct zone *lower_zone; - - idx--; - lower_zone = pgdat->node_zones + idx; - - if (!sysctl_lowmem_reserve_ratio[idx] || - !zone_managed_pages(lower_zone)) { - lower_zone->lowmem_reserve[j] = 0; - continue; + for (i = 0; i < MAX_NR_ZONES - 1; i++) { + struct zone *zone = &pgdat->node_zones[i]; + int ratio = sysctl_lowmem_reserve_ratio[i]; + bool clear = !ratio || !zone_managed_pages(zone); + unsigned long managed_pages = 0; + + for (j = i + 1; j < MAX_NR_ZONES; j++) { + if (clear) { + zone->lowmem_reserve[j] = 0; } else { - lower_zone->lowmem_reserve[j] = - managed_pages / sysctl_lowmem_reserve_ratio[idx]; + struct zone *upper_zone = &pgdat->node_zones[j]; + + managed_pages += zone_managed_pages(upper_zone); + zone->lowmem_reserve[j] = managed_pages / ratio; } - managed_pages += zone_managed_pages(lower_zone); } } } @@ -8077,15 +8144,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } -static void __zone_pcp_update(struct zone *zone) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); -} - /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each * cpu. It is the fraction of total pages in each zone that a hot per cpu @@ -8118,7 +8176,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, goto out; for_each_populated_zone(zone) - __zone_pcp_update(zone); + zone_set_pageset_high_and_batch(zone); out: mutex_unlock(&pcp_batch_high_lock); return ret; @@ -8517,6 +8575,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + drain_all_pages(cc.zone); + /* * In case of -EBUSY, we'd like to know which page causes problem. * So, just fall through. test_pages_isolated() has a tracepoint @@ -8725,7 +8785,28 @@ EXPORT_SYMBOL(free_contig_range); void __meminit zone_pcp_update(struct zone *zone) { mutex_lock(&pcp_batch_high_lock); - __zone_pcp_update(zone); + zone_set_pageset_high_and_batch(zone); + mutex_unlock(&pcp_batch_high_lock); +} + +/* + * Effectively disable pcplists for the zone by setting the high limit to 0 + * and draining all cpus. A concurrent page freeing on another CPU that's about + * to put the page on pcplist will either finish before the drain and the page + * will be drained, or observe the new high limit and skip the pcplist. + * + * Must be paired with a call to zone_pcp_enable(). 
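Written this way, setup_per_zone_lowmem_reserve() makes zone i's reserve against a higher zone j the running total of managed pages in zones i+1..j divided by zone i's ratio, while a zero ratio or an empty zone clears that row. A compact user-space rendering of that loop (zone sizes and ratios below are invented):

#include <stdio.h>

#define MAX_NR_ZONES 3	/* pretend: DMA32, Normal, Movable */

int main(void)
{
	/* managed pages per zone and sysctl_lowmem_reserve_ratio, made up */
	unsigned long managed[MAX_NR_ZONES] = { 1 << 18, 1 << 22, 1 << 20 };
	int ratio[MAX_NR_ZONES] = { 256, 32, 0 };
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	int i, j;

	for (i = 0; i < MAX_NR_ZONES - 1; i++) {
		int clear = !ratio[i] || !managed[i];
		unsigned long pages = 0;

		for (j = i + 1; j < MAX_NR_ZONES; j++) {
			if (clear) {
				reserve[i][j] = 0;
			} else {
				pages += managed[j];	/* everything above zone i */
				reserve[i][j] = pages / ratio[i];
			}
		}
	}

	for (i = 0; i < MAX_NR_ZONES; i++)
		for (j = i + 1; j < MAX_NR_ZONES; j++)
			printf("zone %d reserve against zone %d: %lu pages\n",
			       i, j, reserve[i][j]);
	return 0;
}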
+ */ +void zone_pcp_disable(struct zone *zone) +{ + mutex_lock(&pcp_batch_high_lock); + __zone_set_pageset_high_and_batch(zone, 0, 1); + __drain_all_pages(zone, true); +} + +void zone_pcp_enable(struct zone *zone) +{ + __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); mutex_unlock(&pcp_batch_high_lock); } diff --git a/mm/page_counter.c b/mm/page_counter.c index b24a60b28bb0..c6860f51b6c6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -183,14 +183,14 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) * the limit, so if it sees the old limit, we see the * modified counter and retry. */ - usage = atomic_long_read(&counter->usage); + usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->usage) <= usage) + if (page_counter_read(counter) <= usage) return 0; counter->max = old; diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e..16b161f28a31 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void) } } +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) +{ + invoke_init_callbacks(); +} +#endif + static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -#if !defined(CONFIG_SPARSEMEM) +#ifndef CONFIG_SPARSEMEM void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) @@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); - invoke_init_callbacks(); return; fail: diff --git a/mm/page_isolation.c b/mm/page_isolation.c index abbf42214485..bddf788f45bf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -49,7 +49,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ __mod_zone_freepage_state(zone, -nr_pages, mt); spin_unlock_irqrestore(&zone->lock, flags); - drain_all_pages(zone); return 0; } @@ -89,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) */ if (PageBuddy(page)) { order = buddy_order(page); - if (order >= pageblock_order) { + if (order >= pageblock_order && order < MAX_ORDER - 1) { pfn = page_to_pfn(page); buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); @@ -172,11 +171,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * * Please note that there is no strong synchronization with the page allocator * either. Pages might be freed while their page blocks are marked ISOLATED. - * In some cases pages might still end up on pcp lists and that would allow + * A call to drain_all_pages() after isolation can flush most of them. However + * in some cases pages might still end up on pcp lists and that would allow * for their allocation even when they are in fact isolated already. Depending - * on how strong of a guarantee the caller needs drain_all_pages might be needed - * (e.g. __offline_pages will need to call it after check for isolated range for - * a next retry). + * on how strong of a guarantee the caller needs, zone_pcp_disable/enable() + * might be used to flush and disable pcplist before isolation and enable after + * unisolation. * * Return: 0 on success and -EBUSY if any part of range cannot be isolated. 
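As the reworked page_isolation.c comment nearby describes, callers that cannot tolerate stale pcplist pages are expected to bracket the whole isolation sequence with these helpers. A rough sketch of that intended usage, not taken from the patch (the function below and its error handling are simplified for illustration):

/* illustrative caller, in the spirit of alloc_contig_range()/offlining */
static int isolate_range_strict(struct zone *zone,
				unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	/* high=0, batch=1: frees now bypass the pcplists; also drains them.
	 * pcp_batch_high_lock stays held until zone_pcp_enable(). */
	zone_pcp_disable(zone);

	ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 0);
	if (!ret) {
		/* ... migrate or offline the now-isolated range ... */
		undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	}

	zone_pcp_enable(zone);
	return ret;
}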
*/ diff --git a/mm/page_owner.c b/mm/page_owner.c index b735a8eafcdb..af464bb7fbe7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include <linux/migrate.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> +#include <linux/sched/clock.h> #include "internal.h" @@ -25,6 +26,8 @@ struct page_owner { gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; + u64 ts_nsec; + pid_t pid; }; static bool page_owner_enabled = false; @@ -172,6 +175,8 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; + page_owner->pid = current->pid; + page_owner->ts_nsec = local_clock(); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -236,6 +241,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->last_migrate_reason = old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; + new_page_owner->pid = old_page_owner->pid; + new_page_owner->ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -349,9 +356,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg)\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", page_owner->order, page_owner->gfp_mask, - &page_owner->gfp_mask); + &page_owner->gfp_mask, page_owner->pid, + page_owner->ts_nsec); if (ret >= count) goto err; @@ -427,8 +435,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, + page_owner->pid, page_owner->ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { diff --git a/mm/page_poison.c b/mm/page_poison.c index ae0482cded87..06ec518b2089 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -8,45 +8,17 @@ #include <linux/ratelimit.h> #include <linux/kasan.h> -static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning); +bool _page_poisoning_enabled_early; +EXPORT_SYMBOL(_page_poisoning_enabled_early); +DEFINE_STATIC_KEY_FALSE(_page_poisoning_enabled); +EXPORT_SYMBOL(_page_poisoning_enabled); static int __init early_page_poison_param(char *buf) { - int ret; - bool tmp; - - ret = strtobool(buf, &tmp); - if (ret) - return ret; - - if (tmp) - static_branch_enable(&want_page_poisoning); - else - static_branch_disable(&want_page_poisoning); - - return 0; + return kstrtobool(buf, &_page_poisoning_enabled_early); } early_param("page_poison", early_page_poison_param); -/** - * page_poisoning_enabled - check if page poisoning is enabled - * - * Return true if page poisoning is enabled, or false if not. - */ -bool page_poisoning_enabled(void) -{ - /* - * Assumes that debug_pagealloc_enabled is set before - * memblock_free_all. - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. 
- */ - return (static_branch_unlikely(&want_page_poisoning) || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())); -} -EXPORT_SYMBOL_GPL(page_poisoning_enabled); - static void poison_page(struct page *page) { void *addr = kmap_atomic(page); @@ -58,7 +30,7 @@ static void poison_page(struct page *page) kunmap_atomic(addr); } -static void poison_pages(struct page *page, int n) +void __kernel_poison_pages(struct page *page, int n) { int i; @@ -79,9 +51,6 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) unsigned char *start; unsigned char *end; - if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) - return; - start = memchr_inv(mem, PAGE_POISON, bytes); if (!start) return; @@ -117,7 +86,7 @@ static void unpoison_page(struct page *page) kunmap_atomic(addr); } -static void unpoison_pages(struct page *page, int n) +void __kernel_unpoison_pages(struct page *page, int n) { int i; @@ -125,17 +94,6 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void kernel_poison_pages(struct page *page, int numpages, int enable) -{ - if (!page_poisoning_enabled()) - return; - - if (enable) - unpoison_pages(page, numpages); - else - poison_pages(page, numpages); -} - #ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 5e77b269c330..86e3a3688d59 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -66,18 +66,19 @@ static inline bool pfn_is_match(struct page *page, unsigned long pfn) /** * check_pte - check if @pvmw->page is mapped at the @pvmw->pte + * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking * * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* * mapped. check_pte() has to validate this. * - * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary - * page. + * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to + * arbitrary page. * * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration * entry that points to @pvmw->page or any subpage in case of THP. * - * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to - * @pvmw->page or any subpage in case of THP. + * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to + * pvmw->page or any subpage in case of THP. * * Otherwise, return false. * diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 702250f148e7..4bcc11958089 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -260,7 +260,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_l[UIO_FASTIOV]; struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; - struct iovec *iov_r = iovstack_r; + struct iovec *iov_r; struct iov_iter iter; ssize_t rc; int dir = vm_write ? WRITE : READ; diff --git a/mm/rmap.c b/mm/rmap.c index 31b29321adfe..6657000b18d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1533,15 +1533,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, goto discard; } - if (!(flags & TTU_IGNORE_ACCESS)) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - ret = false; - page_vma_mapped_walk_done(&pvmw); - break; - } - } - /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); if (should_defer_flush(mm, flags)) { diff --git a/mm/shmem.c b/mm/shmem.c index 537c137698f8..7c6b6d8f6c39 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -246,7 +246,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -static const struct address_space_operations shmem_aops; +const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -713,7 +713,7 @@ next: } if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); - __inc_node_page_state(page, NR_SHMEM_THPS); + __inc_lruvec_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); @@ -1152,7 +1152,7 @@ static void shmem_evict_inode(struct inode *inode) struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (inode->i_mapping->a_ops == &shmem_aops) { + if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); @@ -1858,7 +1858,7 @@ repeat: } /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) + if (!shmem_mapping(mapping)) goto alloc_nohuge; if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) goto alloc_nohuge; @@ -2352,11 +2352,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } -bool shmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &shmem_aops; -} - static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -3865,7 +3860,7 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } -static const struct address_space_operations shmem_aops = { +const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS @@ -3877,6 +3872,7 @@ static const struct address_space_operations shmem_aops = { #endif .error_remove_page = generic_error_remove_page, }; +EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, @@ -4024,7 +4020,7 @@ out2: #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, @@ -4034,16 +4030,19 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; - int i, count; - - for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { - const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; + int len = 0; + int i; - count += sprintf(buf + count, fmt, - shmem_format_huge(values[i])); + for (i = 0; i < ARRAY_SIZE(values); i++) { + len += sysfs_emit_at(buf, len, + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? 
" " : "", + shmem_format_huge(values[i])); } - buf[count - 1] = '\n'; - return count; + + len += sysfs_emit_at(buf, len, "\n"); + + return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, @@ -4312,7 +4311,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, struct page *page; int error; - BUG_ON(mapping->a_ops != &shmem_aops); + BUG_ON(!shmem_mapping(mapping)); error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) diff --git a/mm/slab.c b/mm/slab.c index b1113561b98b..d7c8da9319c7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1399,7 +1399,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page_mapcount_reset(page); - page->mapping = NULL; + /* In union with page->mapping where page allocator expects NULL */ + page->slab_cache = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; @@ -1434,7 +1435,7 @@ static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) if (!is_debug_pagealloc_cache(cachep)) return; - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); + __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else @@ -3416,6 +3417,9 @@ free_done: static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + if (unlikely(slab_want_init_on_free(cachep))) + memset(objp, 0, cachep->object_size); + /* Put the object into the quarantine, don't touch it for now. */ if (kasan_slab_free(cachep, objp, _RET_IP_)) return; @@ -3434,8 +3438,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); - if (unlikely(slab_want_init_on_free(cachep))) - memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); memcg_slab_free_hook(cachep, &objp, 1); diff --git a/mm/slab.h b/mm/slab.h index f9977d6613d6..54faf9a623f9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -204,7 +204,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -static inline int cache_vmstat_idx(struct kmem_cache *s) +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; @@ -304,7 +304,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, static inline void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - int idx, int nr) + enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; struct lruvec *lruvec; @@ -510,10 +510,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; - fs_reclaim_acquire(flags); - fs_reclaim_release(flags); - - might_sleep_if(gfpflags_allow_blocking(flags)); + might_alloc(flags); if (should_failslab(s, flags)) return NULL; diff --git a/mm/slab_common.c b/mm/slab_common.c index f9ccd5dc13f3..2f2b55c2798e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -978,7 +978,7 @@ static int slab_show(struct seq_file *m, void *p) void dump_unreclaimable_slab(void) { - struct kmem_cache *s, *s2; + struct kmem_cache *s; struct slabinfo sinfo; /* @@ -996,7 +996,7 @@ void dump_unreclaimable_slab(void) pr_info("Unreclaimable slab info:\n"); pr_info("Name Used Total\n"); - list_for_each_entry_safe(s, s2, &slab_caches, list) { + list_for_each_entry(s, &slab_caches, list) { if (s->flags & SLAB_RECLAIM_ACCOUNT) continue; @@ -1091,9 +1091,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, * @flags: the type of memory to allocate. * * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored). + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. 
* * Return: pointer to the allocated memory or %NULL in case of error */ diff --git a/mm/slob.c b/mm/slob.c index 7cc9805c8091..8d4bfa46247f 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -474,8 +474,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - fs_reclaim_acquire(gfp); - fs_reclaim_release(gfp); + might_alloc(gfp); if (size < PAGE_SIZE - minalign) { int align = minalign; @@ -597,8 +596,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - fs_reclaim_acquire(flags); - fs_reclaim_release(flags); + might_alloc(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..4552319148f6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1836,8 +1836,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - - page->mapping = NULL; + /* In union with page->mapping where page allocator expects NULL */ + page->slab_cache = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab_page(page, order, s); @@ -2245,8 +2245,7 @@ redo: } } else { m = M_FULL; -#ifdef CONFIG_SLUB_DEBUG - if ((s->flags & SLAB_STORE_USER) && !lock) { + if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) { lock = 1; /* * This also ensures that the scanning of full @@ -2255,7 +2254,6 @@ redo: */ spin_lock(&n->list_lock); } -#endif } if (l != m) { @@ -3433,7 +3431,7 @@ static inline int calculate_order(unsigned int size) */ min_objects = slub_min_objects; if (!min_objects) - min_objects = 4 * (fls(nr_cpu_ids) + 1); + min_objects = 4 * (fls(num_online_cpus()) + 1); max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); @@ -4726,7 +4724,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, } static int list_locations(struct kmem_cache *s, char *buf, - enum track_item alloc) + enum track_item alloc) { int len = 0; unsigned long i; @@ -4736,7 +4734,7 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { - return sprintf(buf, "Out of memory\n"); + return sysfs_emit(buf, "Out of memory\n"); } /* Push back cpu slabs */ flush_all(s); @@ -4759,50 +4757,45 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) - break; - len += sprintf(buf + len, "%7ld ", l->count); + len += sysfs_emit_at(buf, len, "%7ld ", l->count); if (l->addr) - len += sprintf(buf + len, "%pS", (void *)l->addr); + len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr); else - len += sprintf(buf + len, "<not-available>"); - - if (l->sum_time != l->min_time) { - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - (long)div_u64(l->sum_time, l->count), - l->max_time); - } else - len += sprintf(buf + len, " age=%ld", - l->min_time); + len += sysfs_emit_at(buf, len, "<not-available>"); + + if (l->sum_time != l->min_time) + len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld", + l->min_time, + (long)div_u64(l->sum_time, + l->count), + l->max_time); + else + len += sysfs_emit_at(buf, len, " age=%ld", l->min_time); if (l->min_pid != l->max_pid) - len += sprintf(buf + len, " pid=%ld-%ld", - l->min_pid, l->max_pid); + len += sysfs_emit_at(buf, len, " pid=%ld-%ld", + l->min_pid, l->max_pid); else - len += sprintf(buf + len, 
" pid=%ld", - l->min_pid); + len += sysfs_emit_at(buf, len, " pid=%ld", + l->min_pid); if (num_online_cpus() > 1 && - !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) - len += scnprintf(buf + len, PAGE_SIZE - len - 50, - " cpus=%*pbl", - cpumask_pr_args(to_cpumask(l->cpus))); - - if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) - len += scnprintf(buf + len, PAGE_SIZE - len - 50, - " nodes=%*pbl", - nodemask_pr_args(&l->nodes)); - - len += sprintf(buf + len, "\n"); + !cpumask_empty(to_cpumask(l->cpus))) + len += sysfs_emit_at(buf, len, " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) + len += sysfs_emit_at(buf, len, " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); + + len += sysfs_emit_at(buf, len, "\n"); } free_loc_track(&t); if (!t.count) - len += sprintf(buf, "No data\n"); + len += sysfs_emit_at(buf, len, "No data\n"); + return len; } #endif /* CONFIG_SLUB_DEBUG */ @@ -4899,12 +4892,13 @@ __setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); #endif static ssize_t show_slab_objects(struct kmem_cache *s, - char *buf, unsigned long flags) + char *buf, unsigned long flags) { unsigned long total = 0; int node; int x; unsigned long *nodes; + int len = 0; nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!nodes) @@ -4993,15 +4987,19 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } } - x = sprintf(buf, "%lu", total); + + len += sysfs_emit_at(buf, len, "%lu", total); #ifdef CONFIG_NUMA - for (node = 0; node < nr_node_ids; node++) + for (node = 0; node < nr_node_ids; node++) { if (nodes[node]) - x += sprintf(buf + x, " N%d=%lu", - node, nodes[node]); + len += sysfs_emit_at(buf, len, " N%d=%lu", + node, nodes[node]); + } #endif + len += sysfs_emit_at(buf, len, "\n"); kfree(nodes); - return x + sprintf(buf + x, "\n"); + + return len; } #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) @@ -5023,37 +5021,37 @@ struct slab_attribute { static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->size); + return sysfs_emit(buf, "%u\n", s->size); } SLAB_ATTR_RO(slab_size); static ssize_t align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->align); + return sysfs_emit(buf, "%u\n", s->align); } SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->object_size); + return sysfs_emit(buf, "%u\n", s->object_size); } SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", oo_objects(s->oo)); + return sysfs_emit(buf, "%u\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", oo_order(s->oo)); + return sysfs_emit(buf, "%u\n", oo_order(s->oo)); } SLAB_ATTR_RO(order); static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%lu\n", s->min_partial); + return sysfs_emit(buf, "%lu\n", s->min_partial); } static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, @@ -5073,7 +5071,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", slub_cpu_partial(s)); + return sysfs_emit(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -5098,13 +5096,13 @@ static ssize_t ctor_show(struct kmem_cache *s, 
char *buf) { if (!s->ctor) return 0; - return sprintf(buf, "%pS\n", s->ctor); + return sysfs_emit(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); + return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -5137,7 +5135,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int objects = 0; int pages = 0; int cpu; - int len; + int len = 0; for_each_online_cpu(cpu) { struct page *page; @@ -5150,52 +5148,53 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) } } - len = sprintf(buf, "%d(%d)", objects, pages); + len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages); #ifdef CONFIG_SMP for_each_online_cpu(cpu) { struct page *page; page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - - if (page && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " C%d=%d(%d)", cpu, - page->pobjects, page->pages); + if (page) + len += sysfs_emit_at(buf, len, " C%d=%d(%d)", + cpu, page->pobjects, page->pages); } #endif - return len + sprintf(buf + len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); + + return len; } SLAB_ATTR_RO(slabs_cpu_partial); static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } SLAB_ATTR_RO(reclaim_account); static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } SLAB_ATTR_RO(hwcache_align); #ifdef CONFIG_ZONE_DMA static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); } SLAB_ATTR_RO(cache_dma); #endif static ssize_t usersize_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->usersize); + return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); @@ -5214,33 +5213,33 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } SLAB_ATTR_RO(sanity_checks); static ssize_t trace_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } SLAB_ATTR_RO(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); } SLAB_ATTR_RO(red_zone); static ssize_t poison_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON)); } SLAB_ATTR_RO(poison); static ssize_t store_user_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); } 
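The slub sysfs conversions above all follow one pattern: show() callbacks build their output with sysfs_emit()/sysfs_emit_at() and return the accumulated length, instead of raw sprintf() arithmetic plus hand-rolled "len < PAGE_SIZE - 60" guards. Both helpers know the sysfs buffer is exactly one page and clamp the write accordingly. A minimal sketch of that pattern, using a made-up attribute and counters rather than anything from this patch:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long widget_total;	/* illustrative state, not kernel data */
static unsigned long widget_errors;

static ssize_t widget_stats_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int len;

	/* sysfs_emit() writes at offset 0, bounded to PAGE_SIZE */
	len = sysfs_emit(buf, "total=%lu", widget_total);

	/* sysfs_emit_at() appends at offset 'len' with the same bound */
	len += sysfs_emit_at(buf, len, " errors=%lu", widget_errors);
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

This is why the open-coded bounds checks in list_locations() and slabs_cpu_partial_show() could simply be dropped in the hunks above.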
SLAB_ATTR_RO(store_user); @@ -5284,7 +5283,7 @@ SLAB_ATTR_RO(free_calls); #ifdef CONFIG_FAILSLAB static ssize_t failslab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } SLAB_ATTR_RO(failslab); #endif @@ -5308,7 +5307,7 @@ SLAB_ATTR(shrink); #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10); + return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10); } static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, @@ -5335,7 +5334,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) { unsigned long sum = 0; int cpu; - int len; + int len = 0; int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); if (!data) @@ -5348,16 +5347,19 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) sum += x; } - len = sprintf(buf, "%lu", sum); + len += sysfs_emit_at(buf, len, "%lu", sum); #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - if (data[cpu] && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); + if (data[cpu]) + len += sysfs_emit_at(buf, len, " C%d=%u", + cpu, data[cpu]); } #endif kfree(data); - return len + sprintf(buf + len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); + + return len; } static void clear_stat(struct kmem_cache *s, enum stat_item si) diff --git a/mm/swap.c b/mm/swap.c index 47a47681c86b..16a525296960 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -909,6 +909,9 @@ void release_pages(struct page **pages, int nr) put_devmap_managed_page(page); continue; } + if (put_page_testzero(page)) + put_dev_pagemap(page->pgmap); + continue; } if (!put_page_testzero(page)) @@ -1164,15 +1167,6 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range_tag); -unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag, unsigned max_pages) -{ - pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ diff --git a/mm/swap_state.c b/mm/swap_state.c index ee465827420e..751c1ef2fe0e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -839,7 +839,9 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, swp_entry_t entry; unsigned int i; bool page_allocated; - struct vma_swap_readahead ra_info = {0,}; + struct vma_swap_readahead ra_info = { + .win = 1, + }; swap_ra_info(vmf, &ra_info); if (ra_info.win == 1) @@ -900,7 +902,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false"); + return sysfs_emit(buf, "%s\n", + enable_vma_readahead ? 
"true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, diff --git a/mm/swapfile.c b/mm/swapfile.c index d58361109066..1c0a829f7311 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -975,8 +975,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; struct swap_cluster_info *ci; - unsigned long offset, i; - unsigned char *map; + unsigned long offset; /* * Should not even be attempting cluster allocations when huge @@ -996,9 +995,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) alloc_cluster(si, idx); cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] = SWAP_HAS_CACHE; + memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); unlock_cluster(ci); swap_range_alloc(si, offset, SWAPFILE_CLUSTER); *slot = swp_entry(si->type, offset); @@ -3445,11 +3442,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned long offset; unsigned char count; unsigned char has_cache; - int err = -EINVAL; + int err; p = get_swap_device(entry); if (!p) - goto out; + return -EINVAL; offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); @@ -3496,7 +3493,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); -out: if (p) put_swap_device(p); return err; @@ -3613,7 +3609,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) ci = lock_cluster(si, offset); - count = si->swap_map[offset] & ~SWAP_HAS_CACHE; + count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* diff --git a/mm/truncate.c b/mm/truncate.c index 960edf5803ca..8aa4907e06e0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -637,9 +637,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, EXPORT_SYMBOL(invalidate_mapping_pages); /** - * This helper is similar with the above one, except that it accounts for pages - * that are likely on a pagevec and count them in @nr_pagevec, which will used by - * the caller. + * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * @nr_pagevec: invalidate failed page number for caller + * + * This helper is similar to invalidate_mapping_pages(), except that it accounts + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which + * will be used by the caller. */ void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6ae491a8b210..4d88fe5a277a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -413,10 +413,13 @@ static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; +static struct rb_root purge_vmap_area_root = RB_ROOT; +static LIST_HEAD(purge_vmap_area_list); +static DEFINE_SPINLOCK(purge_vmap_area_lock); + /* * This kmem_cache is used for vmap_area objects. 
Instead of * allocating from slab we reuse an object from this cache to @@ -820,10 +823,17 @@ insert: if (!merged) link_va(va, root, parent, link, head); - /* - * Last step is to check and update the tree. - */ - augment_tree_propagate_from(va); + return va; +} + +static __always_inline struct vmap_area * +merge_or_add_vmap_area_augment(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + va = merge_or_add_vmap_area(va, root, head); + if (va) + augment_tree_propagate_from(va); + return va; } @@ -1138,7 +1148,7 @@ static void free_vmap_area(struct vmap_area *va) * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); - merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } @@ -1326,32 +1336,32 @@ void set_iounmap_nonlazy(void) static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; - struct llist_node *valist; - struct vmap_area *va; - struct vmap_area *n_va; + struct list_head local_pure_list; + struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); - valist = llist_del_all(&vmap_purge_list); - if (unlikely(valist == NULL)) + spin_lock(&purge_vmap_area_lock); + purge_vmap_area_root = RB_ROOT; + list_replace_init(&purge_vmap_area_list, &local_pure_list); + spin_unlock(&purge_vmap_area_lock); + + if (unlikely(list_empty(&local_pure_list))) return false; - /* - * TODO: to calculate a flush range without looping. - * The list can be up to lazy_max_pages() elements. - */ - llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < start) - start = va->va_start; - if (va->va_end > end) - end = va->va_end; - } + start = min(start, + list_first_entry(&local_pure_list, + struct vmap_area, list)->va_start); + + end = max(end, + list_last_entry(&local_pure_list, + struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) { + list_for_each_entry_safe(va, n_va, &local_pure_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; @@ -1361,8 +1371,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) * detached and there is no need to "unlink" it from * anything. */ - va = merge_or_add_vmap_area(va, &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, + &free_vmap_area_list); if (!va) continue; @@ -1419,9 +1429,15 @@ static void free_vmap_area_noflush(struct vmap_area *va) nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - /* After this point, we may free va at any time */ - llist_add(&va->purge_list, &vmap_purge_list); + /* + * Merge or place it to the purge tree/list. 
+ */ + spin_lock(&purge_vmap_area_lock); + merge_or_add_vmap_area(va, + &purge_vmap_area_root, &purge_vmap_area_list); + spin_unlock(&purge_vmap_area_lock); + /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } @@ -2256,7 +2272,7 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - kasan_poison_vmalloc(area->addr, area->size); + kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); vm_remove_mappings(area, deallocate_pages); @@ -2275,7 +2291,6 @@ static void __vunmap(const void *addr, int deallocate_pages) } kfree(area); - return; } static inline void __vfree_deferred(const void *addr) @@ -2461,9 +2476,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; - unsigned int array_size = nr_pages * sizeof(struct page *), i; + unsigned long array_size; + unsigned int i; struct page **pages; + array_size = (unsigned long)nr_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2477,8 +2494,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!pages) { - remove_vm_area(area->addr); - kfree(area); + free_vm_area(area); return NULL; } @@ -3134,6 +3150,7 @@ pvm_find_va_enclose_addr(unsigned long addr) * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. + * @align: alignment for required highest address * * Returns: determined end address within vmap_area */ @@ -3350,8 +3367,8 @@ recovery: while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; - va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, + &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -3400,8 +3417,8 @@ err_free_shadow: for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; - va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, + &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -3448,11 +3465,11 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) - __releases(&vmap_purge_lock) __releases(&vmap_area_lock) + __releases(&vmap_purge_lock) { - mutex_unlock(&vmap_purge_lock); spin_unlock(&vmap_area_lock); + mutex_unlock(&vmap_purge_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) @@ -3481,18 +3498,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { - struct llist_node *head; struct vmap_area *va; - head = READ_ONCE(vmap_purge_list.first); - if (head == NULL) - return; - - llist_for_each_entry(va, head, purge_list) { + spin_lock(&purge_vmap_area_lock); + list_for_each_entry(va, &purge_vmap_area_list, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); } + spin_unlock(&purge_vmap_area_lock); } 
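In the vmalloc rework above, lazily freed areas now go into an address-sorted rb-tree/list (purge_vmap_area_root/purge_vmap_area_list) through merge_or_add_vmap_area(), while returns to the free tree use the new merge_or_add_vmap_area_augment() wrapper so the augmented subtree size metadata used by the allocator stays up to date. Keeping the purge list sorted is also what lets __purge_vmap_area_lazy() take its TLB flush range from the first and last entries instead of walking every element, the loop the deleted TODO comment referred to. A small, self-contained sketch of that range trick; struct lazy_area and its fields are illustrative names, not kernel code:

#include <linux/kernel.h>
#include <linux/list.h>

struct lazy_area {
	unsigned long start;
	unsigned long end;
	struct list_head list;		/* kept sorted by ->start */
};

/* Widen [*start, *end) just enough to cover every entry of a sorted list. */
static void lazy_list_flush_range(struct list_head *head,
				  unsigned long *start, unsigned long *end)
{
	struct lazy_area *first, *last;

	if (list_empty(head))
		return;

	first = list_first_entry(head, struct lazy_area, list);
	last  = list_last_entry(head, struct lazy_area, list);

	*start = min(*start, first->start);
	*end   = max(*end, last->end);
}

The same sortedness means the unpurged entries in /proc/vmallocinfo now come out in address order, which is why the old caveat comment in s_show() could be shortened.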
static int s_show(struct seq_file *m, void *p) @@ -3550,10 +3564,7 @@ static int s_show(struct seq_file *m, void *p) seq_putc(m, '\n'); /* - * As a final step, dump "unpurged" areas. Note, - * that entire "/proc/vmallocinfo" output will not - * be address sorted, because the purge list is not - * sorted. + * As a final step, dump "unpurged" areas. */ if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7b4e31eac2cf..242368592ea7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/vmscan.c - * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. @@ -1072,7 +1070,6 @@ static void page_check_dirty_writeback(struct page *page, static unsigned int shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, - enum ttu_flags ttu_flags, struct reclaim_stat *stat, bool ignore_references) { @@ -1297,7 +1294,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page)) { - enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) @@ -1372,6 +1369,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, if (PageDirty(page) || PageWriteback(page)) goto keep_locked; mapping = page_mapping(page); + fallthrough; case PAGE_CLEAN: ; /* try to free the page below */ } @@ -1393,7 +1391,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * * Rarely, pages can have buffers and no ->mapping. These are * the pages which were not successfully invalidated in - * truncate_complete_page(). We try to drop those buffers here + * truncate_cleanup_page(). We try to drop those buffers here * and if that worked, and the page is no longer mapped into * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. 
@@ -1514,7 +1512,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, } nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &stat, true); + &stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); @@ -1958,8 +1956,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, - &stat, false); + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -2131,8 +2128,7 @@ unsigned long reclaim_pages(struct list_head *page_list) nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -2145,8 +2141,7 @@ unsigned long reclaim_pages(struct list_head *page_list) if (!list_empty(&node_page_list)) { nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -3899,7 +3894,7 @@ kswapd_try_sleep: highest_zoneidx); /* Read the new order and highest_zoneidx */ - alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); + alloc_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d1..f8942160fc95 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1157,7 +1157,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_page_table_pages", "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1215,6 +1214,7 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) "nr_shadow_call_stack", #endif + "nr_page_table_pages", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1503,10 +1503,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!page) continue; - /* Watch for unexpected holes punched in the memmap */ - if (!memmap_valid_within(pfn, page, zone)) - continue; - if (page_zone(page) != zone) continue; diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..25f75bbe80e0 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -445,12 +445,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_slab_state(node, WORKINGSET_NODES); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } @@ -544,7 +544,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -559,7 +559,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); - __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); + 
__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); diff --git a/mm/z3fold.c b/mm/z3fold.c index 18feaa0bc537..dacb0d70fa61 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -90,7 +90,7 @@ struct z3fold_buddy_slots { * be enough slots to hold all possible variants */ unsigned long slot[BUDDY_MASK + 1]; - unsigned long pool; /* back link + flags */ + unsigned long pool; /* back link */ rwlock_t lock; }; #define HANDLE_FLAG_MASK (0x03) @@ -185,7 +185,7 @@ enum z3fold_page_flags { * handle flags, go under HANDLE_FLAG_MASK */ enum z3fold_handle_flags { - HANDLES_ORPHANED = 0, + HANDLES_NOFREE = 0, }; /* @@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr) z3fold_page_unlock(zhdr); } -static inline void free_handle(unsigned long handle) +static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) { struct z3fold_buddy_slots *slots; - struct z3fold_header *zhdr; int i; bool is_free; @@ -316,22 +315,19 @@ static inline void free_handle(unsigned long handle) if (WARN_ON(*(unsigned long *)handle == 0)) return; - zhdr = handle_to_z3fold_header(handle); slots = handle_to_slots(handle); write_lock(&slots->lock); *(unsigned long *)handle = 0; - if (zhdr->slots == slots) { + + if (test_bit(HANDLES_NOFREE, &slots->pool)) { write_unlock(&slots->lock); return; /* simple case, nothing else to do */ } - /* we are freeing a foreign handle if we are here */ - zhdr->foreign_handles--; + if (zhdr->slots != slots) + zhdr->foreign_handles--; + is_free = true; - if (!test_bit(HANDLES_ORPHANED, &slots->pool)) { - write_unlock(&slots->lock); - return; - } for (i = 0; i <= BUDDY_MASK; i++) { if (slots->slot[i]) { is_free = false; @@ -343,6 +339,8 @@ static inline void free_handle(unsigned long handle) if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); + if (zhdr->slots == slots) + zhdr->slots = NULL; kmem_cache_free(pool->c_handle, slots); } } @@ -525,8 +523,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); struct z3fold_pool *pool = zhdr_to_pool(zhdr); - bool is_free = true; - int i; WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); @@ -536,21 +532,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) list_del_init(&page->lru); spin_unlock(&pool->lock); - /* If there are no foreign handles, free the handles array */ - read_lock(&zhdr->slots->lock); - for (i = 0; i <= BUDDY_MASK; i++) { - if (zhdr->slots->slot[i]) { - is_free = false; - break; - } - } - if (!is_free) - set_bit(HANDLES_ORPHANED, &zhdr->slots->pool); - read_unlock(&zhdr->slots->lock); - - if (is_free) - kmem_cache_free(pool->c_handle, zhdr->slots); - if (locked) z3fold_page_unlock(zhdr); @@ -642,15 +623,39 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool, { if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { - struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); - + struct list_head *unbuddied; int freechunks = num_free_chunks(zhdr); + + migrate_disable(); + unbuddied = this_cpu_ptr(pool->unbuddied); spin_lock(&pool->lock); list_add(&zhdr->buddy, &unbuddied[freechunks]); spin_unlock(&pool->lock); zhdr->cpu = smp_processor_id(); - put_cpu_ptr(pool->unbuddied); + migrate_enable(); + } +} + +static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) +{ + enum buddy bud = HEADLESS; + + if (zhdr->middle_chunks) { + if (!zhdr->first_chunks && + chunks <= 
zhdr->start_middle - ZHDR_CHUNKS) + bud = FIRST; + else if (!zhdr->last_chunks) + bud = LAST; + } else { + if (!zhdr->first_chunks) + bud = FIRST; + else if (!zhdr->last_chunks) + bud = LAST; + else + bud = MIDDLE; } + + return bud; } static inline void *mchunk_memmove(struct z3fold_header *zhdr, @@ -714,18 +719,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) if (WARN_ON(new_zhdr == zhdr)) goto out_fail; - if (new_zhdr->first_chunks == 0) { - if (new_zhdr->middle_chunks != 0 && - chunks >= new_zhdr->start_middle) { - new_bud = LAST; - } else { - new_bud = FIRST; - } - } else if (new_zhdr->last_chunks == 0) { - new_bud = LAST; - } else if (new_zhdr->middle_chunks == 0) { - new_bud = MIDDLE; - } + new_bud = get_free_buddy(new_zhdr, chunks); q = new_zhdr; switch (new_bud) { case FIRST: @@ -847,9 +841,8 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } - if (unlikely(PageIsolated(page) || - test_bit(PAGE_CLAIMED, &page->private) || - test_bit(PAGE_STALE, &page->private))) { + if (test_bit(PAGE_STALE, &page->private) || + test_and_set_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -858,13 +851,16 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); - else + else { + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); + } return; } z3fold_compact_page(zhdr); add_to_unbuddied(pool, zhdr); + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } @@ -886,8 +882,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, int chunks = size_to_chunks(size), i; lookup: + migrate_disable(); /* First, try to find an unbuddied z3fold page. */ - unbuddied = get_cpu_ptr(pool->unbuddied); + unbuddied = this_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { struct list_head *l = &unbuddied[i]; @@ -905,7 +902,7 @@ lookup: !z3fold_page_trylock(zhdr)) { spin_unlock(&pool->lock); zhdr = NULL; - put_cpu_ptr(pool->unbuddied); + migrate_enable(); if (can_sleep) cond_resched(); goto lookup; @@ -919,7 +916,7 @@ lookup: test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; - put_cpu_ptr(pool->unbuddied); + migrate_enable(); if (can_sleep) cond_resched(); goto lookup; @@ -934,7 +931,7 @@ lookup: kref_get(&zhdr->refcount); break; } - put_cpu_ptr(pool->unbuddied); + migrate_enable(); if (!zhdr) { int cpu; @@ -973,6 +970,9 @@ lookup: } } + if (zhdr && !zhdr->slots) + zhdr->slots = alloc_slots(pool, + can_sleep ? 
GFP_NOIO : GFP_ATOMIC); return zhdr; } @@ -1109,17 +1109,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, retry: zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { - if (zhdr->first_chunks == 0) { - if (zhdr->middle_chunks != 0 && - chunks >= zhdr->start_middle) - bud = LAST; - else - bud = FIRST; - } else if (zhdr->last_chunks == 0) - bud = LAST; - else if (zhdr->middle_chunks == 0) - bud = MIDDLE; - else { + bud = get_free_buddy(zhdr, chunks); + if (bud == HEADLESS) { if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); @@ -1265,12 +1256,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); put_z3fold_header(zhdr); - clear_bit(PAGE_CLAIMED, &page->private); return; } if (!page_claimed) - free_handle(handle); + free_handle(handle, zhdr); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; @@ -1280,8 +1270,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) z3fold_page_unlock(zhdr); return; } - if (unlikely(PageIsolated(page)) || - test_and_set_bit(NEEDS_COMPACTING, &page->private)) { + if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); return; @@ -1345,6 +1334,10 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) struct page *page = NULL; struct list_head *pos; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; + struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN))); + + rwlock_init(&slots.lock); + slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); spin_lock(&pool->lock); if (!pool->ops || !pool->ops->evict || retries == 0) { @@ -1359,35 +1352,36 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) list_for_each_prev(pos, &pool->lru) { page = list_entry(pos, struct page, lru); - /* this bit could have been set by free, in which case - * we pass over to the next page in the pool. 
- */ - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { - page = NULL; - continue; - } - - if (unlikely(PageIsolated(page))) { - clear_bit(PAGE_CLAIMED, &page->private); - page = NULL; - continue; - } zhdr = page_address(page); if (test_bit(PAGE_HEADLESS, &page->private)) break; + if (kref_get_unless_zero(&zhdr->refcount) == 0) { + zhdr = NULL; + break; + } if (!z3fold_page_trylock(zhdr)) { - clear_bit(PAGE_CLAIMED, &page->private); + if (kref_put(&zhdr->refcount, + release_z3fold_page)) + atomic64_dec(&pool->pages_nr); zhdr = NULL; continue; /* can't evict at this point */ } - if (zhdr->foreign_handles) { - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); + + /* test_and_set_bit is of course atomic, but we still + * need to do it under page lock, otherwise checking + * that bit in __z3fold_alloc wouldn't make sense + */ + if (zhdr->foreign_handles || + test_and_set_bit(PAGE_CLAIMED, &page->private)) { + if (kref_put(&zhdr->refcount, + release_z3fold_page)) + atomic64_dec(&pool->pages_nr); + else + z3fold_page_unlock(zhdr); zhdr = NULL; continue; /* can't evict such page */ } - kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; break; @@ -1409,12 +1403,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) first_handle = 0; last_handle = 0; middle_handle = 0; + memset(slots.slot, 0, sizeof(slots.slot)); if (zhdr->first_chunks) - first_handle = encode_handle(zhdr, FIRST); + first_handle = __encode_handle(zhdr, &slots, + FIRST); if (zhdr->middle_chunks) - middle_handle = encode_handle(zhdr, MIDDLE); + middle_handle = __encode_handle(zhdr, &slots, + MIDDLE); if (zhdr->last_chunks) - last_handle = encode_handle(zhdr, LAST); + last_handle = __encode_handle(zhdr, &slots, + LAST); /* * it's safe to unlock here because we hold a * reference to this page @@ -1429,19 +1427,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) ret = pool->ops->evict(pool, middle_handle); if (ret) goto next; - free_handle(middle_handle); } if (first_handle) { ret = pool->ops->evict(pool, first_handle); if (ret) goto next; - free_handle(first_handle); } if (last_handle) { ret = pool->ops->evict(pool, last_handle); if (ret) goto next; - free_handle(last_handle); } next: if (test_bit(PAGE_HEADLESS, &page->private)) { @@ -1455,9 +1450,11 @@ next: spin_unlock(&pool->lock); clear_bit(PAGE_CLAIMED, &page->private); } else { + struct z3fold_buddy_slots *slots = zhdr->slots; z3fold_page_lock(zhdr); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + kmem_cache_free(pool->c_handle, slots); atomic64_dec(&pool->pages_nr); return 0; } @@ -1573,8 +1570,7 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); - if (test_bit(PAGE_HEADLESS, &page->private) || - test_bit(PAGE_CLAIMED, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private)) return false; zhdr = page_address(page); @@ -1586,6 +1582,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) goto out; + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + goto out; pool = zhdr_to_pool(zhdr); spin_lock(&pool->lock); if (!list_empty(&zhdr->buddy)) @@ -1612,16 +1610,17 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); + 
VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); zhdr = page_address(page); pool = zhdr_to_pool(zhdr); - if (!z3fold_page_trylock(zhdr)) { + if (!z3fold_page_trylock(zhdr)) return -EAGAIN; - } if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { z3fold_page_unlock(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); return -EBUSY; } if (work_pending(&zhdr->work)) { @@ -1663,6 +1662,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); page_mapcount_reset(page); + clear_bit(PAGE_CLAIMED, &page->private); put_page(page); return 0; } @@ -1686,6 +1686,7 @@ static void z3fold_page_putback(struct page *page) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cdfaaadea8ff..7289f502ffac 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -726,13 +726,10 @@ static void insert_zspage(struct size_class *class, * We want to see more ZS_FULL pages and less almost empty/full. * Put pages with higher ->inuse first. */ - if (head) { - if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { - list_add(&zspage->list, &head->list); - return; - } - } - list_add(&zspage->list, &class->fullness_list[fullness]); + if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head)) + list_add(&zspage->list, &head->list); + else + list_add(&zspage->list, &class->fullness_list[fullness]); } /* diff --git a/mm/zswap.c b/mm/zswap.c index fbb782924ccc..182f6ad5aa69 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,8 +24,10 @@ #include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> +#include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/zpool.h> +#include <crypto/acompress.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -81,7 +83,7 @@ static bool zswap_pool_reached_full; static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); -static struct kernel_param_ops zswap_enabled_param_ops = { +static const struct kernel_param_ops zswap_enabled_param_ops = { .set = zswap_enabled_param_set, .get = param_get_bool, }; @@ -91,7 +93,7 @@ module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); -static struct kernel_param_ops zswap_compressor_param_ops = { +static const struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, .get = param_get_charp, .free = param_free_charp, @@ -102,7 +104,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops, /* Compressed storage zpool to use */ static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); -static struct kernel_param_ops zswap_zpool_param_ops = { +static const struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, .get = param_get_charp, .free = param_free_charp, @@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, * data structures **********************************/ +struct crypto_acomp_ctx { + struct crypto_acomp *acomp; + struct acomp_req *req; + struct crypto_wait wait; + u8 *dstmem; + 
struct mutex *mutex; +}; + struct zswap_pool { struct zpool *zpool; - struct crypto_comp * __percpu *tfm; + struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; struct work_struct release_work; @@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, * per-cpu code **********************************/ static DEFINE_PER_CPU(u8 *, zswap_dstmem); +/* + * If users dynamically change the zpool type and compressor at runtime, i.e. + * zswap is running, zswap can have more than one zpool on one cpu, but they + * are sharing dtsmem. So we need this mutex to be per-cpu. + */ +static DEFINE_PER_CPU(struct mutex *, zswap_mutex); static int zswap_dstmem_prepare(unsigned int cpu) { + struct mutex *mutex; u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) return -ENOMEM; + mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); + if (!mutex) { + kfree(dst); + return -ENOMEM; + } + + mutex_init(mutex); per_cpu(zswap_dstmem, cpu) = dst; + per_cpu(zswap_mutex, cpu) = mutex; return 0; } static int zswap_dstmem_dead(unsigned int cpu) { + struct mutex *mutex; u8 *dst; + mutex = per_cpu(zswap_mutex, cpu); + kfree(mutex); + per_cpu(zswap_mutex, cpu) = NULL; + dst = per_cpu(zswap_dstmem, cpu); kfree(dst); per_cpu(zswap_dstmem, cpu) = NULL; @@ -415,30 +445,54 @@ static int zswap_dstmem_dead(unsigned int cpu) static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_comp *tfm; - - if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) - return 0; + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + return PTR_ERR(acomp); + } + acomp_ctx->acomp = acomp; - tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - pr_err("could not alloc crypto comp %s : %ld\n", - pool->tfm_name, PTR_ERR(tfm)); + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + crypto_free_acomp(acomp_ctx->acomp); return -ENOMEM; } - *per_cpu_ptr(pool->tfm, cpu) = tfm; + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. 
+ */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + acomp_ctx->mutex = per_cpu(zswap_mutex, cpu); + acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu); + return 0; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_comp *tfm; + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + } - tfm = *per_cpu_ptr(pool->tfm, cpu); - if (!IS_ERR_OR_NULL(tfm)) - crypto_free_comp(tfm); - *per_cpu_ptr(pool->tfm, cpu) = NULL; return 0; } @@ -561,8 +615,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - pool->tfm = alloc_percpu(struct crypto_comp *); - if (!pool->tfm) { + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } @@ -585,7 +640,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) return pool; error: - free_percpu(pool->tfm); + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); if (pool->zpool) zpool_destroy_pool(pool->zpool); kfree(pool); @@ -596,14 +652,14 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void) { bool has_comp, has_zpool; - has_comp = crypto_has_comp(zswap_compressor, 0, 0); + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); if (!has_comp && strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_comp(zswap_compressor, 0, 0); + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); } if (!has_comp) { pr_err("default compressor %s not available\n", @@ -639,7 +695,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->tfm); + free_percpu(pool->acomp_ctx); zpool_destroy_pool(pool->zpool); kfree(pool); } @@ -723,7 +779,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } type = s; } else if (!compressor) { - if (!crypto_has_comp(s, 0, 0)) { + if (!crypto_has_acomp(s, 0, 0)) { pr_err("compressor %s not available\n", s); return -ENOENT; } @@ -774,7 +830,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * failed, maybe both compressor and zpool params were bad. * Allow changing this param, so pool creation will succeed * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_comp() + * param is ok in the zpool_has_pool() or crypto_has_acomp() * checks above. 
	 */
 	ret = param_set_charp(s, kp);
@@ -876,8 +932,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	pgoff_t offset;
 	struct zswap_entry *entry;
 	struct page *page;
-	struct crypto_comp *tfm;
-	u8 *src, *dst;
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	u8 *src;
 	unsigned int dlen;
 	int ret;
 	struct writeback_control wbc = {
@@ -916,14 +974,20 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 		/* decompress */
+		acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
 		dlen = PAGE_SIZE;
 		src = (u8 *)zhdr + sizeof(struct zswap_header);
-		dst = kmap_atomic(page);
-		tfm = *get_cpu_ptr(entry->pool->tfm);
-		ret = crypto_comp_decompress(tfm, src, entry->length,
-					     dst, &dlen);
-		put_cpu_ptr(entry->pool->tfm);
-		kunmap_atomic(dst);
+
+		mutex_lock(acomp_ctx->mutex);
+		sg_init_one(&input, src, entry->length);
+		sg_init_table(&output, 1);
+		sg_set_page(&output, page, PAGE_SIZE, 0);
+		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+		dlen = acomp_ctx->req->dlen;
+		mutex_unlock(acomp_ctx->mutex);
+
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
@@ -1004,7 +1068,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 {
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry, *dupentry;
-	struct crypto_comp *tfm;
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
 	int ret;
 	unsigned int hlen, dlen = PAGE_SIZE;
 	unsigned long handle, value;
@@ -1074,12 +1139,32 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* compress */
-	dst = get_cpu_var(zswap_dstmem);
-	tfm = *get_cpu_ptr(entry->pool->tfm);
-	src = kmap_atomic(page);
-	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
-	kunmap_atomic(src);
-	put_cpu_ptr(entry->pool->tfm);
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+	mutex_lock(acomp_ctx->mutex);
+
+	dst = acomp_ctx->dstmem;
+	sg_init_table(&input, 1);
+	sg_set_page(&input, page, PAGE_SIZE, 0);
+
+	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect the same in the sg_list */
+	sg_init_one(&output, dst, PAGE_SIZE * 2);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+	/*
+	 * It may look a little odd that we submit an asynchronous request and
+	 * then wait for its completion synchronously; the operation is in
+	 * fact synchronous overall.
+	 * In principle, acomp lets users queue multiple requests on one acomp
+	 * instance and have them completed concurrently. Here, however,
+	 * frontswap stores and loads pages one at a time, so a single
+	 * frontswap thread has no way to submit a second page before the
+	 * first one is finished.
+	 * Threads running on different CPUs use different acomp instances,
+	 * though, so multiple threads can still (de)compress in parallel.
+	 */
+	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+	dlen = acomp_ctx->req->dlen;
+
 	if (ret) {
 		ret = -EINVAL;
 		goto put_dstmem;
@@ -1103,7 +1188,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	memcpy(buf, &zhdr, hlen);
 	memcpy(buf + hlen, dst, dlen);
 	zpool_unmap_handle(entry->pool->zpool, handle);
-	put_cpu_var(zswap_dstmem);
+	mutex_unlock(acomp_ctx->mutex);
 
 	/* populate entry */
 	entry->offset = offset;
@@ -1131,7 +1216,7 @@ insert_entry:
 	return 0;
 
 put_dstmem:
-	put_cpu_var(zswap_dstmem);
+	mutex_unlock(acomp_ctx->mutex);
 	zswap_pool_put(entry->pool);
 freepage:
 	zswap_entry_cache_free(entry);
@@ -1148,7 +1233,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 {
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry;
-	struct crypto_comp *tfm;
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src, *dst;
 	unsigned int dlen;
 	int ret;
@@ -1175,11 +1261,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
 	if (zpool_evictable(entry->pool->zpool))
 		src += sizeof(struct zswap_header);
-	dst = kmap_atomic(page);
-	tfm = *get_cpu_ptr(entry->pool->tfm);
-	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
-	put_cpu_ptr(entry->pool->tfm);
-	kunmap_atomic(dst);
+
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+	mutex_lock(acomp_ctx->mutex);
+	sg_init_one(&input, src, entry->length);
+	sg_init_table(&output, 1);
+	sg_set_page(&output, page, PAGE_SIZE, 0);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+	mutex_unlock(acomp_ctx->mutex);
+
 	zpool_unmap_handle(entry->pool->zpool, entry->handle);
 	BUG_ON(ret);
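/*
 * Illustrative sketch only, not part of this patch: the submit-then-wait
 * pattern used above, reduced to a stand-alone helper. The function name,
 * the "alg" string and the caller-supplied buffers are hypothetical; the
 * buffers are assumed to be kmalloc()'d (physically contiguous) so they can
 * be wrapped with sg_init_one(). Only existing crypto acomp APIs are used.
 */
#include <crypto/acompress.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

static int example_acomp_compress_sync(const char *alg, void *src_buf,
				       unsigned int slen, void *dst_buf,
				       unsigned int *dlen)
{
	struct scatterlist input, output;
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	DECLARE_CRYPTO_WAIT(wait);
	int ret;

	/* One transform per user; zswap keeps one per CPU in acomp_ctx. */
	acomp = crypto_alloc_acomp(alg, 0, 0);
	if (IS_ERR(acomp))
		return PTR_ERR(acomp);

	req = acomp_request_alloc(acomp);
	if (!req) {
		crypto_free_acomp(acomp);
		return -ENOMEM;
	}

	/*
	 * crypto_req_done() wakes the waiter when an async backend completes;
	 * a synchronous (scomp) backend finishes inline, and crypto_wait_req()
	 * then just returns the result without sleeping.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &wait);

	sg_init_one(&input, src_buf, slen);
	sg_init_one(&output, dst_buf, *dlen);
	acomp_request_set_params(req, &input, &output, slen, *dlen);

	/* Submit the request, then block until the callback fires (if async). */
	ret = crypto_wait_req(crypto_acomp_compress(req), &wait);
	if (!ret)
		*dlen = req->dlen;

	acomp_request_free(req);
	crypto_free_acomp(acomp);
	return ret;
}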