summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 8
-rw-r--r-- mm/Makefile 2
-rw-r--r-- mm/balloon_compaction.c 94
-rw-r--r-- mm/compaction.c 123
-rw-r--r-- mm/filemap.c 217
-rw-r--r-- mm/frontswap.c 35
-rw-r--r-- mm/gup.c 9
-rw-r--r-- mm/huge_memory.c 1909
-rw-r--r-- mm/hugetlb.c 54
-rw-r--r-- mm/internal.h 4
-rw-r--r-- mm/khugepaged.c 1922
-rw-r--r-- mm/ksm.c 9
-rw-r--r-- mm/memblock.c 3
-rw-r--r-- mm/memcontrol.c 134
-rw-r--r-- mm/memory.c 885
-rw-r--r-- mm/memory_hotplug.c 70
-rw-r--r-- mm/mempolicy.c 4
-rw-r--r-- mm/migrate.c 262
-rw-r--r-- mm/mmap.c 26
-rw-r--r-- mm/mprotect.c 2
-rw-r--r-- mm/mremap.c 3
-rw-r--r-- mm/nommu.c 3
-rw-r--r-- mm/oom_kill.c 65
-rw-r--r-- mm/page-writeback.c 19
-rw-r--r-- mm/page_alloc.c 164
-rw-r--r-- mm/page_isolation.c 13
-rw-r--r-- mm/page_owner.c 157
-rw-r--r-- mm/readahead.c 13
-rw-r--r-- mm/rmap.c 78
-rw-r--r-- mm/shmem.c 918
-rw-r--r-- mm/slab.c 90
-rw-r--r-- mm/slab.h 30
-rw-r--r-- mm/slab_common.c 49
-rw-r--r-- mm/slub.c 145
-rw-r--r-- mm/swap.c 2
-rw-r--r-- mm/swapfile.c 2
-rw-r--r-- mm/truncate.c 28
-rw-r--r-- mm/util.c 12
-rw-r--r-- mm/vmalloc.c 6
-rw-r--r-- mm/vmscan.c 8
-rw-r--r-- mm/vmstat.c 8
-rw-r--r-- mm/zsmalloc.c 1350
42 files changed, 5817 insertions, 3118 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e2daef3c946..3c81803b00a3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -440,6 +440,14 @@ choice
endchoice
#
+# We don't deposit page tables on file THP mapping,
+# but Power makes use of them to address MMU quirk.
+#
+config TRANSPARENT_HUGE_PAGECACHE
+ def_bool y
+ depends on TRANSPARENT_HUGEPAGE && !PPC
+
+#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
diff --git a/mm/Makefile b/mm/Makefile
index 78c6f7dedb83..fc059666c760 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -74,7 +74,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 57b3e9bd6bc5..da91df50ba31 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
*/
if (trylock_page(page)) {
#ifdef CONFIG_BALLOON_COMPACTION
- if (!PagePrivate(page)) {
+ if (PageIsolated(page)) {
/* raced with isolation */
unlock_page(page);
continue;
@@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
-static inline void __isolate_balloon_page(struct page *page)
+bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
+
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- ClearPagePrivate(page);
list_del(&page->lru);
b_dev_info->isolated_pages++;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+ return true;
}
-static inline void __putback_balloon_page(struct page *page)
+void balloon_page_putback(struct page *page)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- SetPagePrivate(page);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
-/* __isolate_lru_page() counterpart for a ballooned page */
-bool balloon_page_isolate(struct page *page)
-{
- /*
- * Avoid burning cycles with pages that are yet under __free_pages(),
- * or just got freed under us.
- *
- * In case we 'win' a race for a balloon page being freed under us and
- * raise its refcount preventing __free_pages() from doing its job
- * the put_page() at the end of this block will take care of
- * release this page, thus avoiding a nasty leakage.
- */
- if (likely(get_page_unless_zero(page))) {
- /*
- * As balloon pages are not isolated from LRU lists, concurrent
- * compaction threads can race against page migration functions
- * as well as race against the balloon driver releasing a page.
- *
- * In order to avoid having an already isolated balloon page
- * being (wrongly) re-isolated while it is under migration,
- * or to avoid attempting to isolate pages being released by
- * the balloon driver, lets be sure we have the page lock
- * before proceeding with the balloon page isolation steps.
- */
- if (likely(trylock_page(page))) {
- /*
- * A ballooned page, by default, has PagePrivate set.
- * Prevent concurrent compaction threads from isolating
- * an already isolated balloon page by clearing it.
- */
- if (balloon_page_movable(page)) {
- __isolate_balloon_page(page);
- unlock_page(page);
- return true;
- }
- unlock_page(page);
- }
- put_page(page);
- }
- return false;
-}
-
-/* putback_lru_page() counterpart for a ballooned page */
-void balloon_page_putback(struct page *page)
-{
- /*
- * 'lock_page()' stabilizes the page and prevents races against
- * concurrent isolation threads attempting to re-isolate it.
- */
- lock_page(page);
-
- if (__is_movable_balloon_page(page)) {
- __putback_balloon_page(page);
- /* drop the extra ref count taken for page isolation */
- put_page(page);
- } else {
- WARN_ON(1);
- dump_page(page, "not movable balloon page");
- }
- unlock_page(page);
-}
/* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct page *newpage,
- struct page *page, enum migrate_mode mode)
+int balloon_page_migrate(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
struct balloon_dev_info *balloon = balloon_page_device(page);
- int rc = -EAGAIN;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- if (WARN_ON(!__is_movable_balloon_page(page))) {
- dump_page(page, "not movable balloon page");
- return rc;
- }
+ return balloon->migratepage(balloon, newpage, page, mode);
+}
- if (balloon && balloon->migratepage)
- rc = balloon->migratepage(balloon, newpage, page, mode);
+const struct address_space_operations balloon_aops = {
+ .migratepage = balloon_page_migrate,
+ .isolate_page = balloon_page_isolate,
+ .putback_page = balloon_page_putback,
+};
+EXPORT_SYMBOL_GPL(balloon_aops);
- return rc;
-}
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/compaction.c b/mm/compaction.c
index 7bc04778f84d..64df5fe052db 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,11 +15,11 @@
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
-#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/page_owner.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -65,13 +65,27 @@ static unsigned long release_freepages(struct list_head *freelist)
static void map_pages(struct list_head *list)
{
- struct page *page;
+ unsigned int i, order, nr_pages;
+ struct page *page, *next;
+ LIST_HEAD(tmp_list);
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
- list_for_each_entry(page, list, lru) {
- arch_alloc_page(page, 0);
- kernel_map_pages(page, 1, 1);
- kasan_alloc_pages(page, 0);
+ order = page_private(page);
+ nr_pages = 1 << order;
+
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ if (order)
+ split_page(page, order);
+
+ for (i = 0; i < nr_pages; i++) {
+ list_add(&page->lru, &tmp_list);
+ page++;
+ }
}
+
+ list_splice(&tmp_list, list);
}
static inline bool migrate_async_suitable(int migratetype)
@@ -81,6 +95,44 @@ static inline bool migrate_async_suitable(int migratetype)
#ifdef CONFIG_COMPACTION
+int PageMovable(struct page *page)
+{
+ struct address_space *mapping;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (!__PageMovable(page))
+ return 0;
+
+ mapping = page_mapping(page);
+ if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(PageMovable);
+
+void __SetPageMovable(struct page *page, struct address_space *mapping)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
+ page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__SetPageMovable);
+
+void __ClearPageMovable(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ /*
+ * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
+ * flag so that VM can catch up released page by driver after isolation.
+ * With it, VM migration doesn't try to put it back.
+ */
+ page->mapping = (void *)((unsigned long)page->mapping &
+ PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__ClearPageMovable);
+
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -368,12 +420,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
+ unsigned int order;
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
- int isolated, i;
+ int isolated;
struct page *page = cursor;
/*
@@ -439,17 +492,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
goto isolate_fail;
}
- /* Found a free page, break it into order-0 pages */
- isolated = split_free_page(page);
+ /* Found a free page, will break it into order-0 pages */
+ order = page_order(page);
+ isolated = __isolate_free_page(page, order);
if (!isolated)
break;
+ set_page_private(page, order);
total_isolated += isolated;
cc->nr_freepages += isolated;
- for (i = 0; i < isolated; i++) {
- list_add(&page->lru, freelist);
- page++;
- }
+ list_add_tail(&page->lru, freelist);
+
if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
blockpfn += isolated;
break;
@@ -568,7 +621,7 @@ isolate_freepages_range(struct compact_control *cc,
*/
}
- /* split_free_page does not map the pages */
+ /* __isolate_free_page() does not map the pages */
map_pages(&freelist);
if (pfn < end_pfn) {
@@ -670,7 +723,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
- bool is_lru;
if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
@@ -733,21 +785,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
- * Check may be lockless but that's ok as we recheck later.
- * It's possible to migrate LRU pages and balloon pages
- * Skip any other type of page
- */
- is_lru = PageLRU(page);
- if (!is_lru) {
- if (unlikely(balloon_page_movable(page))) {
- if (balloon_page_isolate(page)) {
- /* Successfully isolated */
- goto isolate_success;
- }
- }
- }
-
- /*
* Regardless of being on LRU, compound pages such as THP and
* hugetlbfs are not to be compacted. We can potentially save
* a lot of iterations if we skip them at once. The check is
@@ -763,8 +800,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail;
}
- if (!is_lru)
+ /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU and non-lru movable pages.
+ * Skip any other type of page
+ */
+ if (!PageLRU(page)) {
+ /*
+ * __PageMovable can return false positive so we need
+ * to verify it under page_lock.
+ */
+ if (unlikely(__PageMovable(page)) &&
+ !PageIsolated(page)) {
+ if (locked) {
+ spin_unlock_irqrestore(&zone->lru_lock,
+ flags);
+ locked = false;
+ }
+
+ if (isolate_movable_page(page, isolate_mode))
+ goto isolate_success;
+ }
+
goto isolate_fail;
+ }
/*
* Migration will fail if an anonymous page is pinned in memory,
@@ -1059,7 +1118,7 @@ static void isolate_freepages(struct compact_control *cc)
}
}
- /* split_free_page does not map the pages */
+ /* __isolate_free_page() does not map the pages */
map_pages(freelist);
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 20f3b1f33f0e..e90c1543ec2d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -114,14 +114,14 @@ static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
struct radix_tree_node *node;
+ int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
- VM_BUG_ON(!PageLocked(page));
-
- node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index,
- shadow);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(nr != 1 && shadow, page);
if (shadow) {
- mapping->nrexceptional++;
+ mapping->nrexceptional += nr;
/*
* Make sure the nrexceptional update is committed before
* the nrpages update so that final truncate racing
@@ -130,31 +130,38 @@ static void page_cache_tree_delete(struct address_space *mapping,
*/
smp_wmb();
}
- mapping->nrpages--;
+ mapping->nrpages -= nr;
- if (!node)
- return;
-
- workingset_node_pages_dec(node);
- if (shadow)
- workingset_node_shadows_inc(node);
- else
- if (__radix_tree_delete_node(&mapping->page_tree, node))
+ for (i = 0; i < nr; i++) {
+ node = radix_tree_replace_clear_tags(&mapping->page_tree,
+ page->index + i, shadow);
+ if (!node) {
+ VM_BUG_ON_PAGE(nr != 1, page);
return;
+ }
- /*
- * Track node that only contains shadow entries. DAX mappings contain
- * no shadow entries and may contain other exceptional entries so skip
- * those.
- *
- * Avoid acquiring the list_lru lock if already tracked. The
- * list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
- list_empty(&node->private_list)) {
- node->private_data = mapping;
- list_lru_add(&workingset_shadow_nodes, &node->private_list);
+ workingset_node_pages_dec(node);
+ if (shadow)
+ workingset_node_shadows_inc(node);
+ else
+ if (__radix_tree_delete_node(&mapping->page_tree, node))
+ continue;
+
+ /*
+ * Track node that only contains shadow entries. DAX mappings
+ * contain no shadow entries and may contain other exceptional
+ * entries so skip those.
+ *
+ * Avoid acquiring the list_lru lock if already tracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
+ list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes,
+ &node->private_list);
+ }
}
}
@@ -166,6 +173,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
+ int nr = hpage_nr_pages(page);
trace_mm_filemap_delete_from_page_cache(page);
/*
@@ -178,6 +186,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
else
cleancache_invalidate_page(mapping, page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
int mapcount;
@@ -209,9 +218,14 @@ void __delete_from_page_cache(struct page *page, void *shadow)
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(page))
- __dec_zone_page_state(page, NR_FILE_PAGES);
- if (PageSwapBacked(page))
- __dec_zone_page_state(page, NR_SHMEM);
+ __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr);
+ if (PageSwapBacked(page)) {
+ __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr);
+ if (PageTransHuge(page))
+ __dec_zone_page_state(page, NR_SHMEM_THPS);
+ } else {
+ VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
+ }
/*
* At this point page must be either written or cleaned by truncate.
@@ -235,9 +249,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
*/
void delete_from_page_cache(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
unsigned long flags;
-
void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
@@ -250,7 +263,13 @@ void delete_from_page_cache(struct page *page)
if (freepage)
freepage(page);
- put_page(page);
+
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page_ref_sub(page, HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+ } else {
+ put_page(page);
+ }
}
EXPORT_SYMBOL(delete_from_page_cache);
@@ -1053,7 +1072,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
void **pagep;
- struct page *page;
+ struct page *head, *page;
rcu_read_lock();
repeat:
@@ -1073,16 +1092,24 @@ repeat:
*/
goto out;
}
- if (!page_cache_get_speculative(page))
+
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/*
* Has the page moved?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != *pagep)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
}
@@ -1118,12 +1145,12 @@ repeat:
if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(page_mapping(page) != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
}
return page;
}
@@ -1255,7 +1282,7 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1272,12 +1299,20 @@ repeat:
*/
goto export;
}
- if (!page_cache_get_speculative(page))
+
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
export:
@@ -1318,7 +1353,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1337,12 +1372,19 @@ repeat:
continue;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
@@ -1379,7 +1421,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
/* The hole, there no reason to continue */
@@ -1399,12 +1441,19 @@ repeat:
break;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
@@ -1413,7 +1462,7 @@ repeat:
* otherwise we can get both false positives and false
* negatives, which is just confusing to the caller.
*/
- if (page->mapping == NULL || page->index != iter.index) {
+ if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
put_page(page);
break;
}
@@ -1451,7 +1500,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1476,12 +1525,19 @@ repeat:
continue;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
@@ -1525,7 +1581,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1543,12 +1599,20 @@ repeat:
*/
goto export;
}
- if (!page_cache_get_speculative(page))
+
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
export:
@@ -2128,21 +2192,21 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+void filemap_map_pages(struct fault_env *fe,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = vma->vm_file;
+ struct file *file = fe->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ pgoff_t last_pgoff = start_pgoff;
loff_t size;
- struct page *page;
- unsigned long address = (unsigned long) vmf->virtual_address;
- unsigned long addr;
- pte_t *pte;
+ struct page *head, *page;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
- if (iter.index > vmf->max_pgoff)
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+ start_pgoff) {
+ if (iter.index > end_pgoff)
break;
repeat:
page = radix_tree_deref_slot(slot);
@@ -2156,12 +2220,19 @@ repeat:
goto next;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(page);
+ put_page(head);
goto repeat;
}
@@ -2179,14 +2250,15 @@ repeat:
if (page->index >= size >> PAGE_SHIFT)
goto unlock;
- pte = vmf->pte + page->index - vmf->pgoff;
- if (!pte_none(*pte))
- goto unlock;
-
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
- do_set_pte(vma, addr, page, pte, false, false);
+
+ fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (fe->pte)
+ fe->pte += iter.index - last_pgoff;
+ last_pgoff = iter.index;
+ if (alloc_set_pte(fe, NULL, page))
+ goto unlock;
unlock_page(page);
goto next;
unlock:
@@ -2194,7 +2266,10 @@ unlock:
skip:
put_page(page);
next:
- if (iter.index == vmf->max_pgoff)
+ /* Huge page is mapped? No need to proceed. */
+ if (pmd_trans_huge(*fe->pmd))
+ break;
+ if (iter.index == end_pgoff)
break;
}
rcu_read_unlock();
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 27a9924caf61..fec8b5044040 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -20,6 +20,8 @@
#include <linux/frontswap.h>
#include <linux/swapfile.h>
+DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
+
/*
* frontswap_ops are added by frontswap_register_ops, and provide the
* frontswap "backend" implementation functions. Multiple implementations
@@ -139,6 +141,8 @@ void frontswap_register_ops(struct frontswap_ops *ops)
ops->next = frontswap_ops;
} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+ static_branch_inc(&frontswap_enabled_key);
+
spin_lock(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list) {
if (si->frontswap_map)
@@ -189,7 +193,7 @@ void __frontswap_init(unsigned type, unsigned long *map)
struct swap_info_struct *sis = swap_info[type];
struct frontswap_ops *ops;
- BUG_ON(sis == NULL);
+ VM_BUG_ON(sis == NULL);
/*
* p->frontswap is a bitmap that we MUST have to figure out which page
@@ -248,15 +252,9 @@ int __frontswap_store(struct page *page)
pgoff_t offset = swp_offset(entry);
struct frontswap_ops *ops;
- /*
- * Return if no backend registed.
- * Don't need to inc frontswap_failed_stores here.
- */
- if (!frontswap_ops)
- return -1;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(sis == NULL);
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(sis == NULL);
/*
* If a dup, we must remove the old page first; we can't leave the
@@ -303,11 +301,10 @@ int __frontswap_load(struct page *page)
pgoff_t offset = swp_offset(entry);
struct frontswap_ops *ops;
- if (!frontswap_ops)
- return -1;
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(sis == NULL);
- BUG_ON(!PageLocked(page));
- BUG_ON(sis == NULL);
if (!__frontswap_test(sis, offset))
return -1;
@@ -337,10 +334,9 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
struct swap_info_struct *sis = swap_info[type];
struct frontswap_ops *ops;
- if (!frontswap_ops)
- return;
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(sis == NULL);
- BUG_ON(sis == NULL);
if (!__frontswap_test(sis, offset))
return;
@@ -360,10 +356,9 @@ void __frontswap_invalidate_area(unsigned type)
struct swap_info_struct *sis = swap_info[type];
struct frontswap_ops *ops;
- if (!frontswap_ops)
- return;
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(sis == NULL);
- BUG_ON(sis == NULL);
if (sis->frontswap_map == NULL)
return;
diff --git a/mm/gup.c b/mm/gup.c
index c057784c8444..547741f5f7a7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -279,6 +279,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
spin_unlock(ptl);
ret = 0;
split_huge_pmd(vma, pmd, address);
+ if (pmd_trans_unstable(pmd))
+ ret = -EBUSY;
} else {
get_page(page);
spin_unlock(ptl);
@@ -286,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
+ if (pmd_none(*pmd))
+ return no_page_table(vma, flags);
}
return ret ? ERR_PTR(ret) :
@@ -350,7 +354,6 @@ unmap:
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *nonblocking)
{
- struct mm_struct *mm = vma->vm_mm;
unsigned int fault_flags = 0;
int ret;
@@ -375,7 +378,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_TRIED;
}
- ret = handle_mm_fault(mm, vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return -ENOMEM;
@@ -690,7 +693,7 @@ retry:
if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
- ret = handle_mm_fault(mm, vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 343a2b7e57aa..3647334c2ef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,7 +18,6 @@
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
-#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
@@ -30,39 +29,12 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
-enum scan_result {
- SCAN_FAIL,
- SCAN_SUCCEED,
- SCAN_PMD_NULL,
- SCAN_EXCEED_NONE_PTE,
- SCAN_PTE_NON_PRESENT,
- SCAN_PAGE_RO,
- SCAN_NO_REFERENCED_PAGE,
- SCAN_PAGE_NULL,
- SCAN_SCAN_ABORT,
- SCAN_PAGE_COUNT,
- SCAN_PAGE_LRU,
- SCAN_PAGE_LOCK,
- SCAN_PAGE_ANON,
- SCAN_PAGE_COMPOUND,
- SCAN_ANY_PROCESS,
- SCAN_VMA_NULL,
- SCAN_VMA_CHECK,
- SCAN_ADDRESS_RANGE,
- SCAN_SWAP_CACHE_PAGE,
- SCAN_DEL_PAGE_LRU,
- SCAN_ALLOC_HUGE_PAGE_FAIL,
- SCAN_CGROUP_CHARGE_FAIL
-};
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/huge_memory.h>
-
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -82,127 +54,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
-/* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly;
-static unsigned int khugepaged_pages_collapsed;
-static unsigned int khugepaged_full_scans;
-static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
-/* during fragmentation poll the hugepage allocator once every minute */
-static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
-static unsigned long khugepaged_sleep_expire;
-static struct task_struct *khugepaged_thread __read_mostly;
-static DEFINE_MUTEX(khugepaged_mutex);
-static DEFINE_SPINLOCK(khugepaged_mm_lock);
-static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
-/*
- * default collapse hugepages if there is at least one pte mapped like
- * it would have happened if the vma was large enough during page
- * fault.
- */
-static unsigned int khugepaged_max_ptes_none __read_mostly;
-
-static int khugepaged(void *none);
-static int khugepaged_slab_init(void);
-static void khugepaged_slab_exit(void);
-
-#define MM_SLOTS_HASH_BITS 10
-static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
-
-static struct kmem_cache *mm_slot_cache __read_mostly;
-
-/**
- * struct mm_slot - hash lookup from mm to mm_slot
- * @hash: hash collision list
- * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
- * @mm: the mm that this information is valid for
- */
-struct mm_slot {
- struct hlist_node hash;
- struct list_head mm_node;
- struct mm_struct *mm;
-};
-
-/**
- * struct khugepaged_scan - cursor for scanning
- * @mm_head: the head of the mm list to scan
- * @mm_slot: the current mm_slot we are scanning
- * @address: the next address inside that to be scanned
- *
- * There is only the one khugepaged_scan instance of this cursor structure.
- */
-struct khugepaged_scan {
- struct list_head mm_head;
- struct mm_slot *mm_slot;
- unsigned long address;
-};
-static struct khugepaged_scan khugepaged_scan = {
- .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
-};
-
static struct shrinker deferred_split_shrinker;
-static void set_recommended_min_free_kbytes(void)
-{
- struct zone *zone;
- int nr_zones = 0;
- unsigned long recommended_min;
-
- for_each_populated_zone(zone)
- nr_zones++;
-
- /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
- recommended_min = pageblock_nr_pages * nr_zones * 2;
-
- /*
- * Make sure that on average at least two pageblocks are almost free
- * of another type, one for a migratetype to fall back to and a
- * second to avoid subsequent fallbacks of other types There are 3
- * MIGRATE_TYPES we care about.
- */
- recommended_min += pageblock_nr_pages * nr_zones *
- MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
-
- /* don't ever allow to reserve more than 5% of the lowmem */
- recommended_min = min(recommended_min,
- (unsigned long) nr_free_buffer_pages() / 20);
- recommended_min <<= (PAGE_SHIFT-10);
-
- if (recommended_min > min_free_kbytes) {
- if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
- min_free_kbytes, recommended_min);
-
- min_free_kbytes = recommended_min;
- }
- setup_per_zone_wmarks();
-}
-
-static int start_stop_khugepaged(void)
-{
- int err = 0;
- if (khugepaged_enabled()) {
- if (!khugepaged_thread)
- khugepaged_thread = kthread_run(khugepaged, NULL,
- "khugepaged");
- if (IS_ERR(khugepaged_thread)) {
- pr_err("khugepaged: kthread_run(khugepaged) failed\n");
- err = PTR_ERR(khugepaged_thread);
- khugepaged_thread = NULL;
- goto fail;
- }
-
- if (!list_empty(&khugepaged_scan.mm_head))
- wake_up_interruptible(&khugepaged_wait);
-
- set_recommended_min_free_kbytes();
- } else if (khugepaged_thread) {
- kthread_stop(khugepaged_thread);
- khugepaged_thread = NULL;
- }
-fail:
- return err;
-}
-
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
@@ -328,12 +181,7 @@ static ssize_t enabled_store(struct kobject *kobj,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
if (ret > 0) {
- int err;
-
- mutex_lock(&khugepaged_mutex);
- err = start_stop_khugepaged();
- mutex_unlock(&khugepaged_mutex);
-
+ int err = start_stop_khugepaged();
if (err)
ret = err;
}
@@ -343,7 +191,7 @@ static ssize_t enabled_store(struct kobject *kobj,
static struct kobj_attribute enabled_attr =
__ATTR(enabled, 0644, enabled_show, enabled_store);
-static ssize_t single_flag_show(struct kobject *kobj,
+ssize_t single_hugepage_flag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf,
enum transparent_hugepage_flag flag)
{
@@ -351,7 +199,7 @@ static ssize_t single_flag_show(struct kobject *kobj,
!!test_bit(flag, &transparent_hugepage_flags));
}
-static ssize_t single_flag_store(struct kobject *kobj,
+ssize_t single_hugepage_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag flag)
@@ -406,13 +254,13 @@ static struct kobj_attribute defrag_attr =
static ssize_t use_zero_page_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return single_flag_show(kobj, attr, buf,
+ return single_hugepage_flag_show(kobj, attr, buf,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- return single_flag_store(kobj, attr, buf, count,
+ return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
@@ -421,14 +269,14 @@ static struct kobj_attribute use_zero_page_attr =
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return single_flag_show(kobj, attr, buf,
+ return single_hugepage_flag_show(kobj, attr, buf,
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return single_flag_store(kobj, attr, buf, count,
+ return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
@@ -439,6 +287,9 @@ static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
&use_zero_page_attr.attr,
+#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+ &shmem_enabled_attr.attr,
+#endif
#ifdef CONFIG_DEBUG_VM
&debug_cow_attr.attr,
#endif
@@ -449,171 +300,6 @@ static struct attribute_group hugepage_attr_group = {
.attrs = hugepage_attr,
};
-static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
-}
-
-static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long msecs;
- int err;
-
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
- return -EINVAL;
-
- khugepaged_scan_sleep_millisecs = msecs;
- khugepaged_sleep_expire = 0;
- wake_up_interruptible(&khugepaged_wait);
-
- return count;
-}
-static struct kobj_attribute scan_sleep_millisecs_attr =
- __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
- scan_sleep_millisecs_store);
-
-static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
-}
-
-static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long msecs;
- int err;
-
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
- return -EINVAL;
-
- khugepaged_alloc_sleep_millisecs = msecs;
- khugepaged_sleep_expire = 0;
- wake_up_interruptible(&khugepaged_wait);
-
- return count;
-}
-static struct kobj_attribute alloc_sleep_millisecs_attr =
- __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
- alloc_sleep_millisecs_store);
-
-static ssize_t pages_to_scan_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
-}
-static ssize_t pages_to_scan_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- int err;
- unsigned long pages;
-
- err = kstrtoul(buf, 10, &pages);
- if (err || !pages || pages > UINT_MAX)
- return -EINVAL;
-
- khugepaged_pages_to_scan = pages;
-
- return count;
-}
-static struct kobj_attribute pages_to_scan_attr =
- __ATTR(pages_to_scan, 0644, pages_to_scan_show,
- pages_to_scan_store);
-
-static ssize_t pages_collapsed_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
-}
-static struct kobj_attribute pages_collapsed_attr =
- __ATTR_RO(pages_collapsed);
-
-static ssize_t full_scans_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", khugepaged_full_scans);
-}
-static struct kobj_attribute full_scans_attr =
- __ATTR_RO(full_scans);
-
-static ssize_t khugepaged_defrag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return single_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
-}
-static ssize_t khugepaged_defrag_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- return single_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
-}
-static struct kobj_attribute khugepaged_defrag_attr =
- __ATTR(defrag, 0644, khugepaged_defrag_show,
- khugepaged_defrag_store);
-
-/*
- * max_ptes_none controls if khugepaged should collapse hugepages over
- * any unmapped ptes in turn potentially increasing the memory
- * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
- * reduce the available free memory in the system as it
- * runs. Increasing max_ptes_none will instead potentially reduce the
- * free memory in the system during the khugepaged scan.
- */
-static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
-}
-static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- int err;
- unsigned long max_ptes_none;
-
- err = kstrtoul(buf, 10, &max_ptes_none);
- if (err || max_ptes_none > HPAGE_PMD_NR-1)
- return -EINVAL;
-
- khugepaged_max_ptes_none = max_ptes_none;
-
- return count;
-}
-static struct kobj_attribute khugepaged_max_ptes_none_attr =
- __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
- khugepaged_max_ptes_none_store);
-
-static struct attribute *khugepaged_attr[] = {
- &khugepaged_defrag_attr.attr,
- &khugepaged_max_ptes_none_attr.attr,
- &pages_to_scan_attr.attr,
- &pages_collapsed_attr.attr,
- &full_scans_attr.attr,
- &scan_sleep_millisecs_attr.attr,
- &alloc_sleep_millisecs_attr.attr,
- NULL,
-};
-
-static struct attribute_group khugepaged_attr_group = {
- .attrs = khugepaged_attr,
- .name = "khugepaged",
-};
-
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
int err;
@@ -672,8 +358,6 @@ static int __init hugepage_init(void)
return -EINVAL;
}
- khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
- khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
/*
* hugepages can't be allocated by the buddy allocator
*/
@@ -688,7 +372,7 @@ static int __init hugepage_init(void)
if (err)
goto err_sysfs;
- err = khugepaged_slab_init();
+ err = khugepaged_init();
if (err)
goto err_slab;
@@ -719,7 +403,7 @@ err_khugepaged:
err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
- khugepaged_slab_exit();
+ khugepaged_destroy();
err_slab:
hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
@@ -765,11 +449,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
-{
- return pmd_mkhuge(mk_pmd(page, prot));
-}
-
static inline struct list_head *page_deferred_list(struct page *page)
{
/*
@@ -790,26 +469,23 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- struct page *page, gfp_t gfp,
- unsigned int flags)
+static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+ gfp_t gfp)
{
+ struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- spinlock_t *ptl;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- pgtable = pte_alloc_one(mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
@@ -824,12 +500,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(ptl);
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_none(*fe->pmd))) {
+ spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
- pte_free(mm, pgtable);
+ pte_free(vma->vm_mm, pgtable);
} else {
pmd_t entry;
@@ -837,12 +513,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
- pte_free(mm, pgtable);
- ret = handle_userfault(vma, address, flags,
- VM_UFFD_MISSING);
+ pte_free(vma->vm_mm, pgtable);
+ ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -852,11 +527,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- set_pmd_at(mm, haddr, pmd, entry);
- add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&mm->nr_ptes);
- spin_unlock(ptl);
+ pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&vma->vm_mm->nr_ptes);
+ spin_unlock(fe->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -883,12 +558,6 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
return GFP_TRANSHUGE | reclaim_flags;
}
-/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
-static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
-{
- return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
-}
-
/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -906,13 +575,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- unsigned int flags)
+int do_huge_pmd_anonymous_page(struct fault_env *fe)
{
+ struct vm_area_struct *vma = fe->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -920,42 +588,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
+ if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ !mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
- spinlock_t *ptl;
pgtable_t pgtable;
struct page *zero_page;
bool set;
int ret;
- pgtable = pte_alloc_one(mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = get_huge_zero_page();
if (unlikely(!zero_page)) {
- pte_free(mm, pgtable);
+ pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- ptl = pmd_lock(mm, pmd);
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
ret = 0;
set = false;
- if (pmd_none(*pmd)) {
+ if (pmd_none(*fe->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(ptl);
- ret = handle_userfault(vma, address, flags,
- VM_UFFD_MISSING);
+ spin_unlock(fe->ptl);
+ ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
- set_huge_zero_page(pgtable, mm, vma,
- haddr, pmd,
- zero_page);
- spin_unlock(ptl);
+ set_huge_zero_page(pgtable, vma->vm_mm, vma,
+ haddr, fe->pmd, zero_page);
+ spin_unlock(fe->ptl);
set = true;
}
} else
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
if (!set) {
- pte_free(mm, pgtable);
+ pte_free(vma->vm_mm, pgtable);
put_huge_zero_page();
}
return ret;
@@ -967,8 +633,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
- flags);
+ return __do_huge_pmd_anonymous_page(fe, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1080,14 +745,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *src_page;
pmd_t pmd;
pgtable_t pgtable = NULL;
- int ret;
+ int ret = -ENOMEM;
- if (!vma_is_dax(vma)) {
- ret = -ENOMEM;
- pgtable = pte_alloc_one(dst_mm, addr);
- if (unlikely(!pgtable))
- goto out;
- }
+ /* Skip if it can be re-filled on fault */
+ if (!vma_is_anonymous(vma))
+ return 0;
+
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1095,7 +761,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
+ if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable);
goto out_unlock;
}
@@ -1118,16 +784,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (!vma_is_dax(vma)) {
- /* thp accounting separate from pmd_devmap accounting */
- src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- get_page(src_page);
- page_dup_rmap(src_page, true);
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&dst_mm->nr_ptes);
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
- }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&dst_mm->nr_ptes);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
@@ -1141,38 +804,31 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmd, pmd_t orig_pmd,
- int dirty)
+void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
{
- spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
- update_mmu_cache_pmd(vma, address, pmd);
+ haddr = fe->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
+ fe->flags & FAULT_FLAG_WRITE))
+ update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
unlock:
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmd, pmd_t orig_pmd,
- struct page *page,
- unsigned long haddr)
+static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+ struct page *page)
{
+ struct vm_area_struct *vma = fe->vma;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
- spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
@@ -1189,11 +845,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
- __GFP_OTHER_NODE,
- vma, address, page_to_nid(page));
+ __GFP_OTHER_NODE, vma,
+ fe->address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg, false))) {
+ mem_cgroup_try_charge(pages[i], vma->vm_mm,
+ GFP_KERNEL, &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
@@ -1219,41 +875,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
+ pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
+ fe->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*fe->pte));
+ set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
+ pte_unmap(fe->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(mm, pmd, pgtable);
+ pmd_populate(vma->vm_mm, fe->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -1262,8 +918,8 @@ out:
return ret;
out_free_pages:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ spin_unlock(fe->ptl);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
@@ -1274,25 +930,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
{
- spinlock_t *ptl;
- int ret = 0;
+ struct vm_area_struct *vma = fe->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
+ int ret = 0;
- ptl = pmd_lockptr(mm, pmd);
+ fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ spin_lock(fe->ptl);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1305,13 +959,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
- update_mmu_cache_pmd(vma, address, pmd);
+ if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, fe->address, fe->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1324,13 +978,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
- pmd, orig_pmd, page, haddr);
+ ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1339,14 +992,12 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
- true))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+ huge_gfp, &memcg, true))) {
put_page(new_page);
- if (page) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, fe->pmd, fe->address);
+ if (page)
put_page(page);
- } else
- split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1362,13 +1013,13 @@ alloc:
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(ptl);
+ spin_lock(fe->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(ptl);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
+ spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1376,14 +1027,14 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(mm, haddr, pmd, entry);
- update_mmu_cache_pmd(vma, address, pmd);
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ update_mmu_cache_pmd(vma, fe->address, fe->pmd);
if (!page) {
- add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1392,13 +1043,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
return ret;
}
@@ -1432,6 +1083,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
* We don't mlock() pte-mapped THPs. This way we can avoid
* leaking mlocked pages into non-VM_LOCKED VMAs.
*
+ * For anon THP:
+ *
* In most cases the pmd is the only mapping of the page as we
* break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
* writable private mappings in populate_vma_page_range().
@@ -1439,15 +1092,26 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
* The only scenario when we have the page shared here is if we
* mlocking read-only mapping shared over fork(). We skip
* mlocking such pages.
+ *
+ * For file THP:
+ *
+ * We can expect PageDoubleMap() to be stable under page lock:
+ * for file pages we set it in page_add_file_rmap(), which
+ * requires page to be locked.
*/
- if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
- page->mapping && trylock_page(page)) {
- lru_add_drain();
- if (page->mapping)
- mlock_vma_page(page);
- unlock_page(page);
- }
+
+ if (PageAnon(page) && compound_mapcount(page) != 1)
+ goto skip_mlock;
+ if (PageDoubleMap(page) || !page->mapping)
+ goto skip_mlock;
+ if (!trylock_page(page))
+ goto skip_mlock;
+ lru_add_drain();
+ if (page->mapping && !PageDoubleMap(page))
+ mlock_vma_page(page);
+ unlock_page(page);
}
+skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
@@ -1458,13 +1122,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
{
- spinlock_t *ptl;
+ struct vm_area_struct *vma = fe->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1475,8 +1138,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* A PROT_NONE fault should not end up here */
BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_same(pmd, *pmdp)))
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_same(pmd, *fe->pmd)))
goto out_unlock;
/*
@@ -1484,9 +1147,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*pmdp))) {
- page = pmd_page(*pmdp);
- spin_unlock(ptl);
+ if (unlikely(pmd_trans_migrating(*fe->pmd))) {
+ page = pmd_page(*fe->pmd);
+ spin_unlock(fe->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1519,7 +1182,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1530,12 +1193,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* to serialises splits
*/
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(ptl);
- if (unlikely(!pmd_same(pmd, *pmdp))) {
+ spin_lock(fe->ptl);
+ if (unlikely(!pmd_same(pmd, *fe->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1553,9 +1216,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(ptl);
- migrated = migrate_misplaced_transhuge_page(mm, vma,
- pmdp, pmd, addr, page, target_nid);
+ spin_unlock(fe->ptl);
+ migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
+ fe->pmd, pmd, fe->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1570,18 +1233,18 @@ clear_pmdnuma:
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(mm, haddr, pmdp, pmd);
- update_mmu_cache_pmd(vma, addr, pmdp);
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
+ update_mmu_cache_pmd(vma, fe->address, fe->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
return 0;
}
@@ -1684,12 +1347,18 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
- pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
- atomic_long_dec(&tlb->mm->nr_ptes);
+ if (PageAnon(page)) {
+ pgtable_t pgtable;
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+ pte_free(tlb->mm, pgtable);
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ } else {
+ add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ }
spin_unlock(ptl);
- tlb_remove_page(tlb, page);
+ tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
}
return 1;
}
@@ -1779,7 +1448,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_mkwrite(entry);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(!preserve_write && pmd_write(entry));
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
+ pmd_write(entry));
}
spin_unlock(ptl);
}
@@ -1788,10 +1458,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
/*
- * Returns true if a given pmd maps a thp, false otherwise.
+ * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
*
- * Note that if it returns true, this routine returns without unlocking page
- * table lock. So callers must unlock it.
+ * Note that if it returns page table lock pointer, this routine returns without
+ * unlocking page table lock. So callers must unlock it.
*/
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
@@ -1803,1040 +1473,6 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return NULL;
}
-#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
-
-int hugepage_madvise(struct vm_area_struct *vma,
- unsigned long *vm_flags, int advice)
-{
- switch (advice) {
- case MADV_HUGEPAGE:
-#ifdef CONFIG_S390
- /*
- * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
- * can't handle this properly after s390_enable_sie, so we simply
- * ignore the madvise to prevent qemu from causing a SIGSEGV.
- */
- if (mm_has_pgste(vma->vm_mm))
- return 0;
-#endif
- /*
- * Be somewhat over-protective like KSM for now!
- */
- if (*vm_flags & VM_NO_THP)
- return -EINVAL;
- *vm_flags &= ~VM_NOHUGEPAGE;
- *vm_flags |= VM_HUGEPAGE;
- /*
- * If the vma become good for khugepaged to scan,
- * register it here without waiting a page fault that
- * may not happen any time soon.
- */
- if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
- return -ENOMEM;
- break;
- case MADV_NOHUGEPAGE:
- /*
- * Be somewhat over-protective like KSM for now!
- */
- if (*vm_flags & VM_NO_THP)
- return -EINVAL;
- *vm_flags &= ~VM_HUGEPAGE;
- *vm_flags |= VM_NOHUGEPAGE;
- /*
- * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
- * this vma even if we leave the mm registered in khugepaged if
- * it got registered before VM_NOHUGEPAGE was set.
- */
- break;
- }
-
- return 0;
-}
-
-static int __init khugepaged_slab_init(void)
-{
- mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
- sizeof(struct mm_slot),
- __alignof__(struct mm_slot), 0, NULL);
- if (!mm_slot_cache)
- return -ENOMEM;
-
- return 0;
-}
-
-static void __init khugepaged_slab_exit(void)
-{
- kmem_cache_destroy(mm_slot_cache);
-}
-
-static inline struct mm_slot *alloc_mm_slot(void)
-{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *mm_slot;
-
- hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
- if (mm == mm_slot->mm)
- return mm_slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
-}
-
-static inline int khugepaged_test_exit(struct mm_struct *mm)
-{
- return atomic_read(&mm->mm_users) == 0;
-}
-
-int __khugepaged_enter(struct mm_struct *mm)
-{
- struct mm_slot *mm_slot;
- int wakeup;
-
- mm_slot = alloc_mm_slot();
- if (!mm_slot)
- return -ENOMEM;
-
- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
- free_mm_slot(mm_slot);
- return 0;
- }
-
- spin_lock(&khugepaged_mm_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
- /*
- * Insert just behind the scanning cursor, to let the area settle
- * down a little.
- */
- wakeup = list_empty(&khugepaged_scan.mm_head);
- list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
- spin_unlock(&khugepaged_mm_lock);
-
- atomic_inc(&mm->mm_count);
- if (wakeup)
- wake_up_interruptible(&khugepaged_wait);
-
- return 0;
-}
-
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- unsigned long hstart, hend;
- if (!vma->anon_vma)
- /*
- * Not yet faulted in so we will register later in the
- * page fault if needed.
- */
- return 0;
- if (vma->vm_ops || (vm_flags & VM_NO_THP))
- /* khugepaged not yet working on file or special mappings */
- return 0;
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart < hend)
- return khugepaged_enter(vma, vm_flags);
- return 0;
-}
-
-void __khugepaged_exit(struct mm_struct *mm)
-{
- struct mm_slot *mm_slot;
- int free = 0;
-
- spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
- free = 1;
- }
- spin_unlock(&khugepaged_mm_lock);
-
- if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
- mmdrop(mm);
- } else if (mm_slot) {
- /*
- * This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_sem.
- */
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
-}
-
-static void release_pte_page(struct page *page)
-{
- /* 0 stands for page_is_file_cache(page) == false */
- dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
- unlock_page(page);
- putback_lru_page(page);
-}
-
-static void release_pte_pages(pte_t *pte, pte_t *_pte)
-{
- while (--_pte >= pte) {
- pte_t pteval = *_pte;
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
- release_pte_page(pte_page(pteval));
- }
-}
-
-static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long address,
- pte_t *pte)
-{
- struct page *page = NULL;
- pte_t *_pte;
- int none_or_zero = 0, result = 0;
- bool referenced = false, writable = false;
-
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
- pte_t pteval = *_pte;
- if (pte_none(pteval) || (pte_present(pteval) &&
- is_zero_pfn(pte_pfn(pteval)))) {
- if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
- continue;
- } else {
- result = SCAN_EXCEED_NONE_PTE;
- goto out;
- }
- }
- if (!pte_present(pteval)) {
- result = SCAN_PTE_NON_PRESENT;
- goto out;
- }
- page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
- result = SCAN_PAGE_NULL;
- goto out;
- }
-
- VM_BUG_ON_PAGE(PageCompound(page), page);
- VM_BUG_ON_PAGE(!PageAnon(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-
- /*
- * We can do it before isolate_lru_page because the
- * page can't be freed from under us. NOTE: PG_lock
- * is needed to serialize against split_huge_page
- * when invoked from the VM.
- */
- if (!trylock_page(page)) {
- result = SCAN_PAGE_LOCK;
- goto out;
- }
-
- /*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
- */
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
- unlock_page(page);
- result = SCAN_PAGE_COUNT;
- goto out;
- }
- if (pte_write(pteval)) {
- writable = true;
- } else {
- if (PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
- /*
- * Page is not in the swap cache. It can be collapsed
- * into a THP.
- */
- }
-
- /*
- * Isolate the page to avoid collapsing an hugepage
- * currently in use by the VM.
- */
- if (isolate_lru_page(page)) {
- unlock_page(page);
- result = SCAN_DEL_PAGE_LRU;
- goto out;
- }
- /* 0 stands for page_is_file_cache(page) == false */
- inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
- /* If there is no mapped pte young don't collapse the page */
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
- referenced = true;
- }
- if (likely(writable)) {
- if (likely(referenced)) {
- result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 1;
- }
- } else {
- result = SCAN_PAGE_RO;
- }
-
-out:
- release_pte_pages(pte, _pte);
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 0;
-}
-
-static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl)
-{
- pte_t *_pte;
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
- pte_t pteval = *_pte;
- struct page *src_page;
-
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- clear_user_highpage(page, address);
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
- if (is_zero_pfn(pte_pfn(pteval))) {
- /*
- * ptl mostly unnecessary.
- */
- spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
- spin_unlock(ptl);
- }
- } else {
- src_page = pte_page(pteval);
- copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
- release_pte_page(src_page);
- /*
- * ptl mostly unnecessary, but preempt has to
- * be disabled to update the per-cpu stats
- * inside page_remove_rmap().
- */
- spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page, false);
- spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
- }
-
- address += PAGE_SIZE;
- page++;
- }
-}
-
-static void khugepaged_alloc_sleep(void)
-{
- DEFINE_WAIT(wait);
-
- add_wait_queue(&khugepaged_wait, &wait);
- freezable_schedule_timeout_interruptible(
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
- remove_wait_queue(&khugepaged_wait, &wait);
-}
-
-static int khugepaged_node_load[MAX_NUMNODES];
-
-static bool khugepaged_scan_abort(int nid)
-{
- int i;
-
- /*
- * If zone_reclaim_mode is disabled, then no extra effort is made to
- * allocate memory locally.
- */
- if (!zone_reclaim_mode)
- return false;
-
- /* If there is a count for this node already, it must be acceptable */
- if (khugepaged_node_load[nid])
- return false;
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- if (!khugepaged_node_load[i])
- continue;
- if (node_distance(nid, i) > RECLAIM_DISTANCE)
- return true;
- }
- return false;
-}
-
-#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
-{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
- int nid, target_node = 0, max_value = 0;
-
- /* find first node with max normal pages hit */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
- target_node = nid;
- }
-
- /* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
- target_node = nid;
- break;
- }
-
- last_khugepaged_target_node = target_node;
- return target_node;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
- if (IS_ERR(*hpage)) {
- if (!*wait)
- return false;
-
- *wait = false;
- *hpage = NULL;
- khugepaged_alloc_sleep();
- } else if (*hpage) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- return true;
-}
-
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
- unsigned long address, int node)
-{
- VM_BUG_ON_PAGE(*hpage, *hpage);
-
- /*
- * Before allocating the hugepage, release the mmap_sem read lock.
- * The allocation can take potentially a long time if it involves
- * sync compaction, and we do not need to hold the mmap_sem during
- * that. We will recheck the vma after taking it again in write mode.
- */
- up_read(&mm->mmap_sem);
-
- *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
- if (unlikely(!*hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- *hpage = ERR_PTR(-ENOMEM);
- return NULL;
- }
-
- prep_transhuge_page(*hpage);
- count_vm_event(THP_COLLAPSE_ALLOC);
- return *hpage;
-}
-#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
- struct page *page;
-
- page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
- HPAGE_PMD_ORDER);
- if (page)
- prep_transhuge_page(page);
- return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
- struct page *hpage;
-
- do {
- hpage = alloc_khugepaged_hugepage();
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- if (!*wait)
- return NULL;
-
- *wait = false;
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
- return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
- if (!*hpage)
- *hpage = khugepaged_alloc_hugepage(wait);
-
- if (unlikely(!*hpage))
- return false;
-
- return true;
-}
-
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
- unsigned long address, int node)
-{
- up_read(&mm->mmap_sem);
- VM_BUG_ON(!*hpage);
-
- return *hpage;
-}
-#endif
-
-static bool hugepage_vma_check(struct vm_area_struct *vma)
-{
- if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE))
- return false;
- if (!vma->anon_vma || vma->vm_ops)
- return false;
- if (is_vma_temporary_stack(vma))
- return false;
- return !(vma->vm_flags & VM_NO_THP);
-}
-
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- struct vm_area_struct *vma,
- int node)
-{
- pmd_t *pmd, _pmd;
- pte_t *pte;
- pgtable_t pgtable;
- struct page *new_page;
- spinlock_t *pmd_ptl, *pte_ptl;
- int isolated = 0, result = 0;
- unsigned long hstart, hend;
- struct mem_cgroup *memcg;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
- gfp_t gfp;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
-
- /* release the mmap_sem read lock. */
- new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out_nolock;
- }
-
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
- goto out_nolock;
- }
-
- /*
- * Prevent all access to pagetables with the exception of
- * gup_fast later hanlded by the ptep_clear_flush and the VM
- * handled by the anon_vma lock + PG_lock.
- */
- down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm))) {
- result = SCAN_ANY_PROCESS;
- goto out;
- }
-
- vma = find_vma(mm, address);
- if (!vma) {
- result = SCAN_VMA_NULL;
- goto out;
- }
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
- result = SCAN_ADDRESS_RANGE;
- goto out;
- }
- if (!hugepage_vma_check(vma)) {
- result = SCAN_VMA_CHECK;
- goto out;
- }
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
- goto out;
- }
-
- anon_vma_lock_write(vma->anon_vma);
-
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
- mmun_start = address;
- mmun_end = address + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
- /*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
- */
- _pmd = pmdp_collapse_flush(vma, address, pmd);
- spin_unlock(pmd_ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte);
- spin_unlock(pte_ptl);
-
- if (unlikely(!isolated)) {
- pte_unmap(pte);
- spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- /*
- * We can only use set_pmd_at when establishing
- * hugepmds and never for establishing regular pmds that
- * points to regular pagetables. Use pmd_populate for that
- */
- pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
- result = SCAN_FAIL;
- goto out;
- }
-
- /*
- * All pages are isolated and locked so anon_vma rmap
- * can't run anymore.
- */
- anon_vma_unlock_write(vma->anon_vma);
-
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
- pte_unmap(pte);
- __SetPageUptodate(new_page);
- pgtable = pmd_pgtable(_pmd);
-
- _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
- /*
- * spin_lock() below is not the equivalent of smp_wmb(), so
- * this is needed to avoid the copy_huge_page writes to become
- * visible after the set_pmd_at() write.
- */
- smp_wmb();
-
- spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(pmd_ptl);
-
- *hpage = NULL;
-
- khugepaged_pages_collapsed++;
- result = SCAN_SUCCEED;
-out_up_write:
- up_write(&mm->mmap_sem);
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
-
-out_nolock:
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
-out:
- mem_cgroup_cancel_charge(new_page, memcg, true);
- goto out_up_write;
-}
-
-static int khugepaged_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- struct page **hpage)
-{
- pmd_t *pmd;
- pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0, result = 0;
- struct page *page = NULL;
- unsigned long _address;
- spinlock_t *ptl;
- int node = NUMA_NO_NODE;
- bool writable = false, referenced = false;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
- goto out;
- }
-
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
- _pte++, _address += PAGE_SIZE) {
- pte_t pteval = *_pte;
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
- continue;
- } else {
- result = SCAN_EXCEED_NONE_PTE;
- goto out_unmap;
- }
- }
- if (!pte_present(pteval)) {
- result = SCAN_PTE_NON_PRESENT;
- goto out_unmap;
- }
- if (pte_write(pteval))
- writable = true;
-
- page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page)) {
- result = SCAN_PAGE_NULL;
- goto out_unmap;
- }
-
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
- goto out_unmap;
- }
-
- /*
- * Record which node the original page is from and save this
- * information to khugepaged_node_load[].
- * Khupaged will allocate hugepage from the node has the max
- * hit record.
- */
- node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
- result = SCAN_SCAN_ABORT;
- goto out_unmap;
- }
- khugepaged_node_load[node]++;
- if (!PageLRU(page)) {
- result = SCAN_PAGE_LRU;
- goto out_unmap;
- }
- if (PageLocked(page)) {
- result = SCAN_PAGE_LOCK;
- goto out_unmap;
- }
- if (!PageAnon(page)) {
- result = SCAN_PAGE_ANON;
- goto out_unmap;
- }
-
- /*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
- */
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
- result = SCAN_PAGE_COUNT;
- goto out_unmap;
- }
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
- referenced = true;
- }
- if (writable) {
- if (referenced) {
- result = SCAN_SUCCEED;
- ret = 1;
- } else {
- result = SCAN_NO_REFERENCED_PAGE;
- }
- } else {
- result = SCAN_PAGE_RO;
- }
-out_unmap:
- pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
- /* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, vma, node);
- }
-out:
- trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
- none_or_zero, result);
- return ret;
-}
-
-static void collect_mm_slot(struct mm_slot *mm_slot)
-{
- struct mm_struct *mm = mm_slot->mm;
-
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
-
- if (khugepaged_test_exit(mm)) {
- /* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
-
- /*
- * Not strictly needed because the mm exited already.
- *
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- */
-
- /* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
- mmdrop(mm);
- }
-}
-
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
- struct page **hpage)
- __releases(&khugepaged_mm_lock)
- __acquires(&khugepaged_mm_lock)
-{
- struct mm_slot *mm_slot;
- struct mm_struct *mm;
- struct vm_area_struct *vma;
- int progress = 0;
-
- VM_BUG_ON(!pages);
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
-
- if (khugepaged_scan.mm_slot)
- mm_slot = khugepaged_scan.mm_slot;
- else {
- mm_slot = list_entry(khugepaged_scan.mm_head.next,
- struct mm_slot, mm_node);
- khugepaged_scan.address = 0;
- khugepaged_scan.mm_slot = mm_slot;
- }
- spin_unlock(&khugepaged_mm_lock);
-
- mm = mm_slot->mm;
- down_read(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
- vma = NULL;
- else
- vma = find_vma(mm, khugepaged_scan.address);
-
- progress++;
- for (; vma; vma = vma->vm_next) {
- unsigned long hstart, hend;
-
- cond_resched();
- if (unlikely(khugepaged_test_exit(mm))) {
- progress++;
- break;
- }
- if (!hugepage_vma_check(vma)) {
-skip:
- progress++;
- continue;
- }
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart >= hend)
- goto skip;
- if (khugepaged_scan.address > hend)
- goto skip;
- if (khugepaged_scan.address < hstart)
- khugepaged_scan.address = hstart;
- VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
-
- while (khugepaged_scan.address < hend) {
- int ret;
- cond_resched();
- if (unlikely(khugepaged_test_exit(mm)))
- goto breakouterloop;
-
- VM_BUG_ON(khugepaged_scan.address < hstart ||
- khugepaged_scan.address + HPAGE_PMD_SIZE >
- hend);
- ret = khugepaged_scan_pmd(mm, vma,
- khugepaged_scan.address,
- hpage);
- /* move to next address */
- khugepaged_scan.address += HPAGE_PMD_SIZE;
- progress += HPAGE_PMD_NR;
- if (ret)
- /* we released mmap_sem so break loop */
- goto breakouterloop_mmap_sem;
- if (progress >= pages)
- goto breakouterloop;
- }
- }
-breakouterloop:
- up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
-breakouterloop_mmap_sem:
-
- spin_lock(&khugepaged_mm_lock);
- VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
- /*
- * Release the current mm_slot if this mm is about to die, or
- * if we scanned all vmas of this mm.
- */
- if (khugepaged_test_exit(mm) || !vma) {
- /*
- * Make sure that if mm_users is reaching zero while
- * khugepaged runs here, khugepaged_exit will find
- * mm_slot not pointing to the exiting mm.
- */
- if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
- khugepaged_scan.mm_slot = list_entry(
- mm_slot->mm_node.next,
- struct mm_slot, mm_node);
- khugepaged_scan.address = 0;
- } else {
- khugepaged_scan.mm_slot = NULL;
- khugepaged_full_scans++;
- }
-
- collect_mm_slot(mm_slot);
- }
-
- return progress;
-}
-
-static int khugepaged_has_work(void)
-{
- return !list_empty(&khugepaged_scan.mm_head) &&
- khugepaged_enabled();
-}
-
-static int khugepaged_wait_event(void)
-{
- return !list_empty(&khugepaged_scan.mm_head) ||
- kthread_should_stop();
-}
-
-static void khugepaged_do_scan(void)
-{
- struct page *hpage = NULL;
- unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
- bool wait = true;
-
- barrier(); /* write khugepaged_pages_to_scan to local stack */
-
- while (progress < pages) {
- if (!khugepaged_prealloc_page(&hpage, &wait))
- break;
-
- cond_resched();
-
- if (unlikely(kthread_should_stop() || try_to_freeze()))
- break;
-
- spin_lock(&khugepaged_mm_lock);
- if (!khugepaged_scan.mm_slot)
- pass_through_head++;
- if (khugepaged_has_work() &&
- pass_through_head < 2)
- progress += khugepaged_scan_mm_slot(pages - progress,
- &hpage);
- else
- progress = pages;
- spin_unlock(&khugepaged_mm_lock);
- }
-
- if (!IS_ERR_OR_NULL(hpage))
- put_page(hpage);
-}
-
-static bool khugepaged_should_wakeup(void)
-{
- return kthread_should_stop() ||
- time_after_eq(jiffies, khugepaged_sleep_expire);
-}
-
-static void khugepaged_wait_work(void)
-{
- if (khugepaged_has_work()) {
- const unsigned long scan_sleep_jiffies =
- msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
-
- if (!scan_sleep_jiffies)
- return;
-
- khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
- wait_event_freezable_timeout(khugepaged_wait,
- khugepaged_should_wakeup(),
- scan_sleep_jiffies);
- return;
- }
-
- if (khugepaged_enabled())
- wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
-}
-
-static int khugepaged(void *none)
-{
- struct mm_slot *mm_slot;
-
- set_freezable();
- set_user_nice(current, MAX_NICE);
-
- while (!kthread_should_stop()) {
- khugepaged_do_scan();
- khugepaged_wait_work();
- }
-
- spin_lock(&khugepaged_mm_lock);
- mm_slot = khugepaged_scan.mm_slot;
- khugepaged_scan.mm_slot = NULL;
- if (mm_slot)
- collect_mm_slot(mm_slot);
- spin_unlock(&khugepaged_mm_lock);
- return 0;
-}
-
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd)
{
@@ -2883,10 +1519,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
- if (vma_is_dax(vma)) {
- pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ if (!vma_is_anonymous(vma)) {
+ _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
if (is_huge_zero_pmd(_pmd))
put_huge_zero_page();
+ if (vma_is_dax(vma))
+ return;
+ page = pmd_page(_pmd);
+ if (!PageReferenced(page) && pmd_young(_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
return;
} else if (is_huge_zero_pmd(*pmd)) {
return __split_huge_zero_page_pmd(vma, haddr, pmd);
@@ -2942,7 +1586,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __dec_zone_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -3076,12 +1720,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void freeze_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
- TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+ TTU_RMAP_LOCKED;
int i, ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
+ if (PageAnon(page))
+ ttu_flags |= TTU_MIGRATION;
+
/* We only need TTU_SPLIT_HUGE_PMD once */
ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
@@ -3091,7 +1738,7 @@ static void freeze_page(struct page *page)
ret = try_to_unmap(page + i, ttu_flags);
}
- VM_BUG_ON(ret);
+ VM_BUG_ON_PAGE(ret, page + i - 1);
}
static void unfreeze_page(struct page *page)
@@ -3113,15 +1760,20 @@ static void __split_huge_page_tail(struct page *head, int tail,
/*
* tail_page->_refcount is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_inc(), we
- * would then run atomic_set() concurrently with
+ * tail_page. If we used atomic_set() below instead of atomic_inc() or
+ * atomic_add(), we would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is implemented in C not
* using locked ops. spin_unlock on x86 sometime uses locked ops
* because of PPro errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_inc().
+ * it's safer to use atomic_inc()/atomic_add().
*/
- page_ref_inc(page_tail);
+ if (PageAnon(head)) {
+ page_ref_inc(page_tail);
+ } else {
+ /* Additional pin to radix tree */
+ page_ref_add(page_tail, 2);
+ }
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3157,25 +1809,46 @@ static void __split_huge_page_tail(struct page *head, int tail,
lru_add_page_tail(head, page_tail, lruvec, list);
}
-static void __split_huge_page(struct page *page, struct list_head *list)
+static void __split_huge_page(struct page *page, struct list_head *list,
+ unsigned long flags)
{
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
+ pgoff_t end = -1;
int i;
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
lruvec = mem_cgroup_page_lruvec(head, zone);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+ if (!PageAnon(page))
+ end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
+ /* Some pages can be beyond i_size: drop them from page cache */
+ if (head[i].index >= end) {
+ __ClearPageDirty(head + i);
+ __delete_from_page_cache(head + i, NULL);
+ if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
+ shmem_uncharge(head->mapping->host, 1);
+ put_page(head + i);
+ }
+ }
ClearPageCompound(head);
- spin_unlock_irq(&zone->lru_lock);
+ /* See comment in __split_huge_page_tail() */
+ if (PageAnon(head)) {
+ page_ref_inc(head);
+ } else {
+ /* Additional pin to radix tree */
+ page_ref_add(head, 2);
+ spin_unlock(&head->mapping->tree_lock);
+ }
+
+ spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
unfreeze_page(head);
@@ -3198,18 +1871,22 @@ static void __split_huge_page(struct page *page, struct list_head *list)
int total_mapcount(struct page *page)
{
- int i, ret;
+ int i, compound, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) + 1;
- ret = compound_mapcount(page);
+ compound = compound_mapcount(page);
if (PageHuge(page))
- return ret;
+ return compound;
+ ret = compound;
for (i = 0; i < HPAGE_PMD_NR; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
+ /* File pages have compound_mapcount included in _mapcount */
+ if (!PageAnon(page))
+ return ret - compound * HPAGE_PMD_NR;
if (PageDoubleMap(page))
ret -= HPAGE_PMD_NR;
return ret;
@@ -3296,36 +1973,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
- struct anon_vma *anon_vma;
- int count, mapcount, ret;
+ struct anon_vma *anon_vma = NULL;
+ struct address_space *mapping = NULL;
+ int count, mapcount, extra_pins, ret;
bool mlocked;
unsigned long flags;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
- VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
- /*
- * The caller does not necessarily hold an mmap_sem that would prevent
- * the anon_vma disappearing so we first we take a reference to it
- * and then lock the anon_vma for write. This is similar to
- * page_lock_anon_vma_read except the write lock is taken to serialise
- * against parallel split or collapse operations.
- */
- anon_vma = page_get_anon_vma(head);
- if (!anon_vma) {
- ret = -EBUSY;
- goto out;
+ if (PageAnon(head)) {
+ /*
+ * The caller does not necessarily hold an mmap_sem that would
+ * prevent the anon_vma disappearing so we first take a
+ * reference to it and then lock the anon_vma for write. This
+ * is similar to page_lock_anon_vma_read except the write lock
+ * is taken to serialise against parallel split or collapse
+ * operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ extra_pins = 0;
+ mapping = NULL;
+ anon_vma_lock_write(anon_vma);
+ } else {
+ mapping = head->mapping;
+
+ /* Truncated ? */
+ if (!mapping) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* Additional pins from radix tree */
+ extra_pins = HPAGE_PMD_NR;
+ anon_vma = NULL;
+ i_mmap_lock_read(mapping);
}
- anon_vma_lock_write(anon_vma);
/*
* Racy check if we can split the page, before freeze_page() will
* split PMDs
*/
- if (total_mapcount(head) != page_count(head) - 1) {
+ if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
ret = -EBUSY;
goto out_unlock;
}
@@ -3338,35 +2033,62 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mlocked)
lru_add_drain();
+ /* prevent PageLRU from going away from under us, and freeze lru stats */
+ spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+
+ if (mapping) {
+ void **pslot;
+
+ spin_lock(&mapping->tree_lock);
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(head));
+ /*
+ * Check if the head page is present in radix tree.
+ * We assume all tails are present too, if head is there.
+ */
+ if (radix_tree_deref_slot_protected(pslot,
+ &mapping->tree_lock) != head)
+ goto fail;
+ }
+
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ spin_lock(&pgdata->split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
- if (!mapcount && count == 1) {
+ if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
pgdata->split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- __split_huge_page(page, list);
+ if (mapping)
+ __dec_zone_page_state(page, NR_SHMEM_THPS);
+ spin_unlock(&pgdata->split_queue_lock);
+ __split_huge_page(page, list, flags);
ret = 0;
- } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
} else {
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+ pr_alert("total_mapcount: %u, page_count(): %u\n",
+ mapcount, count);
+ if (PageTail(page))
+ dump_page(head, NULL);
+ dump_page(page, "total_mapcount(head) > 0");
+ BUG();
+ }
+ spin_unlock(&pgdata->split_queue_lock);
+fail: if (mapping)
+ spin_unlock(&mapping->tree_lock);
+ spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
unfreeze_page(head);
ret = -EBUSY;
}
out_unlock:
- anon_vma_unlock_write(anon_vma);
- put_anon_vma(anon_vma);
+ if (anon_vma) {
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+ }
+ if (mapping)
+ i_mmap_unlock_read(mapping);
out:
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
@@ -3489,8 +2211,7 @@ static int split_huge_pages_set(void *data, u64 val)
if (zone != page_zone(page))
goto next;
- if (!PageHead(page) || !PageAnon(page) ||
- PageHuge(page))
+ if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
goto next;
total++;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc2a99e9cbc8..abc1c5fb7222 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3179,7 +3179,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
{
- int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
@@ -3198,19 +3197,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
-again:
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep))
- goto unlock;
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ spin_unlock(ptl);
+ continue;
+ }
pte = huge_ptep_get(ptep);
- if (huge_pte_none(pte))
- goto unlock;
+ if (huge_pte_none(pte)) {
+ spin_unlock(ptl);
+ continue;
+ }
/*
* Migrating hugepage or HWPoisoned hugepage is already
@@ -3218,7 +3220,8 @@ again:
*/
if (unlikely(!pte_present(pte))) {
huge_pte_clear(mm, address, ptep);
- goto unlock;
+ spin_unlock(ptl);
+ continue;
}
page = pte_page(pte);
@@ -3228,9 +3231,10 @@ again:
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
- if (page != ref_page)
- goto unlock;
-
+ if (page != ref_page) {
+ spin_unlock(ptl);
+ continue;
+ }
/*
* Mark the VMA as having unmapped its page so that
* future faults in this VMA will fail rather than
@@ -3246,30 +3250,14 @@ again:
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, true);
- force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush) {
- address += sz;
- spin_unlock(ptl);
- break;
- }
- /* Bail out after unmapping reference page if supplied */
- if (ref_page) {
- spin_unlock(ptl);
- break;
- }
-unlock:
+
spin_unlock(ptl);
- }
- /*
- * mmu_gather ran out of room to batch pages, we break out of
- * the PTE lock to avoid doing the potential expensive TLB invalidate
- * and page-free while holding it.
- */
- if (force_flush) {
- force_flush = 0;
- tlb_flush_mmu(tlb);
- if (address < end && !ref_page)
- goto again;
+ tlb_remove_page_size(tlb, page, huge_page_size(h));
+ /*
+ * Bail out after unmapping reference page if supplied
+ */
+ if (ref_page)
+ break;
}
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
tlb_end_vma(tlb, vma);
diff --git a/mm/internal.h b/mm/internal.h
index 2524ec880e24..9b6a6c43ac39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,6 +36,8 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -150,6 +152,8 @@ extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
+extern void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags);
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
new file mode 100644
index 000000000000..7dbee698d6aa
--- /dev/null
+++ b/mm/khugepaged.c
@@ -0,0 +1,1922 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+enum scan_result {
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PAGE_RO,
+ SCAN_LACK_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_EXCEED_SWAP_PTE,
+ SCAN_TRUNCATED,
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
+/* default scan 8*512 pte (or vmas) every 10 seconds */
+static unsigned int khugepaged_pages_to_scan __read_mostly;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static unsigned long khugepaged_sleep_expire;
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly;
+static unsigned int khugepaged_max_ptes_swap __read_mostly;
+
+#define MM_SLOTS_HASH_BITS 10
+static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+};
+
+static struct khugepaged_scan khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_scan_sleep_millisecs = msecs;
+ khugepaged_sleep_expire = 0;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+ __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+ scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_alloc_sleep_millisecs = msecs;
+ khugepaged_sleep_expire = 0;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+ __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+ alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long pages;
+
+ err = kstrtoul(buf, 10, &pages);
+ if (err || !pages || pages > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_pages_to_scan = pages;
+
+ return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+ __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+ pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+ __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+ __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_hugepage_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_hugepage_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+ __ATTR(defrag, 0644, khugepaged_defrag_show,
+ khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_none;
+
+ err = kstrtoul(buf, 10, &max_ptes_none);
+ if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_none = max_ptes_none;
+
+ return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+ __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+ khugepaged_max_ptes_none_store);
+
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
+ NULL,
+};
+
+struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+
+#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
+
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+ /*
+ * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+ * can't handle this properly after s390_enable_sie, so we simply
+ * ignore the madvise to prevent qemu from causing a SIGSEGV.
+ */
+ if (mm_has_pgste(vma->vm_mm))
+ return 0;
+#endif
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma becomes good for khugepaged to scan,
+ * register it here without waiting for a page fault that
+ * may not happen any time soon.
+ */
+ if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
+ khugepaged_enter_vma_merge(vma, *vm_flags))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
+
+ return 0;
+}
+
+int __init khugepaged_init(void)
+{
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot), 0, NULL);
+ if (!mm_slot_cache)
+ return -ENOMEM;
+
+ khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+ khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+
+ return 0;
+}
+
+void __init khugepaged_destroy(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
+ if (mm == mm_slot->mm)
+ return mm_slot;
+
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ mm_slot->mm = mm;
+ hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ atomic_inc(&mm->mm_count);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ unsigned long hstart, hend;
+ if (!vma->anon_vma)
+ /*
+ * Not yet faulted in so we will register later in the
+ * page fault if needed.
+ */
+ return 0;
+ if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED))
+ /* khugepaged not yet working on file or special mappings */
+ return 0;
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma, vm_flags);
+ return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ if (free) {
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+ * return all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_sem.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+}
+
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+ while (--_pte >= pte) {
+ pte_t pteval = *_pte;
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
+ release_pte_page(pte_page(pteval));
+ }
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page = NULL;
+ pte_t *_pte;
+ int none_or_zero = 0, result = 0, referenced = 0;
+ bool writable = false;
+
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval) || (pte_present(pteval) &&
+ is_zero_pfn(pte_pfn(pteval)))) {
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto out;
+ }
+ }
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
+ goto out;
+ }
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out;
+ }
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out;
+ }
+
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ unlock_page(page);
+ result = SCAN_PAGE_COUNT;
+ goto out;
+ }
+ if (pte_write(pteval)) {
+ writable = true;
+ } else {
+ if (PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
+ unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
+ goto out;
+ }
+ /*
+ * Page is not in the swap cache. It can be collapsed
+ * into a THP.
+ */
+ }
+
+ /*
+ * Isolate the page to avoid collapsing a hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ /* There should be enough young pte to collapse the page */
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced++;
+ }
+ if (likely(writable)) {
+ if (likely(referenced)) {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+
+out:
+ release_pte_pages(pte, _pte);
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 0;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ }
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page, false);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ address += PAGE_SIZE;
+ page++;
+ }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+ DEFINE_WAIT(wait);
+
+ add_wait_queue(&khugepaged_wait, &wait);
+ freezable_schedule_timeout_interruptible(
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+static int khugepaged_node_load[MAX_NUMNODES];
+
+static bool khugepaged_scan_abort(int nid)
+{
+ int i;
+
+ /*
+ * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * allocate memory locally.
+ */
+ if (!zone_reclaim_mode)
+ return false;
+
+ /* If there is a count for this node already, it must be acceptable */
+ if (khugepaged_node_load[nid])
+ return false;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (!khugepaged_node_load[i])
+ continue;
+ if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ return true;
+ }
+ return false;
+}
+
+/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
+static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
+{
+ return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
+}
+
+#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (IS_ERR(*hpage)) {
+ if (!*wait)
+ return false;
+
+ *wait = false;
+ *hpage = NULL;
+ khugepaged_alloc_sleep();
+ } else if (*hpage) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ return true;
+}
+
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+{
+ VM_BUG_ON_PAGE(*hpage, *hpage);
+
+ *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ *hpage = ERR_PTR(-ENOMEM);
+ return NULL;
+ }
+
+ prep_transhuge_page(*hpage);
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ return *hpage;
+}
+#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_khugepaged_hugepage(void)
+{
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
+}
+
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_khugepaged_hugepage();
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (!*wait)
+ return NULL;
+
+ *wait = false;
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+ return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (!*hpage)
+ *hpage = khugepaged_alloc_hugepage(wait);
+
+ if (unlikely(!*hpage))
+ return false;
+
+ return true;
+}
+
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+{
+ VM_BUG_ON(!*hpage);
+
+ return *hpage;
+}
+#endif
+
+static bool hugepage_vma_check(struct vm_area_struct *vma)
+{
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ return false;
+ if (shmem_file(vma->vm_file)) {
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return false;
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR);
+ }
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ return !(vma->vm_flags & VM_NO_KHUGEPAGED);
+}
+
+/*
+ * If mmap_sem was temporarily dropped, revalidate the vma
+ * after re-taking mmap_sem.
+ * Returns 0 on success, otherwise returns a non-zero
+ * value (scan code).
+ */
+
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
+{
+ struct vm_area_struct *vma;
+ unsigned long hstart, hend;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ return SCAN_ANY_PROCESS;
+
+ vma = find_vma(mm, address);
+ if (!vma)
+ return SCAN_VMA_NULL;
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ return SCAN_ADDRESS_RANGE;
+ if (!hugepage_vma_check(vma))
+ return SCAN_VMA_CHECK;
+ return 0;
+}
+
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ int referenced)
+{
+ pte_t pteval;
+ int swapped_in = 0, ret = 0;
+ struct fault_env fe = {
+ .vma = vma,
+ .address = address,
+ .flags = FAULT_FLAG_ALLOW_RETRY,
+ .pmd = pmd,
+ };
+
+ fe.pte = pte_offset_map(pmd, address);
+ for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ fe.pte++, fe.address += PAGE_SIZE) {
+ pteval = *fe.pte;
+ if (!is_swap_pte(pteval))
+ continue;
+ swapped_in++;
+ /* we only decide to swapin, if there are enough young ptes */
+ if (referenced < HPAGE_PMD_NR/2) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
+ ret = do_swap_page(&fe, pteval);
+
+ /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
+ if (ret & VM_FAULT_RETRY) {
+ down_read(&mm->mmap_sem);
+ if (hugepage_vma_revalidate(mm, address)) {
+ /* vma is no longer available, don't continue to swapin */
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
+ /* check if the pmd is still valid */
+ if (mm_find_pmd(mm, address) != pmd)
+ return false;
+ }
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
+ /* pte is unmapped now, we need to map it */
+ fe.pte = pte_offset_map(pmd, fe.address);
+ }
+ fe.pte--;
+ pte_unmap(fe.pte);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
+ return true;
+}
+
+/*
+ * collapse_huge_page - try to replace the small pages mapped under the pmd
+ * covering @address with one freshly allocated huge page.
+ *
+ * Entered with mm->mmap_sem held for read; the semaphore is dropped before
+ * the allocation and has always been released by the time this function
+ * returns. The outcome is only reported through the
+ * mm_collapse_huge_page tracepoint; on success *hpage is set to NULL and
+ * khugepaged_pages_collapsed is incremented.
+ *
+ * NOTE(review): @vma was looked up by the caller before mmap_sem was dropped
+ * and hugepage_vma_revalidate() is only given mm and address here - confirm
+ * that the vma dereferenced below is still the one covering @address.
+ */
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node, int referenced)
+{
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *pmd_ptl, *pte_ptl;
+ int isolated = 0, result = 0;
+ struct mem_cgroup *memcg;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t gfp;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ /* Only allocate from the target node */
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
+
+ /*
+ * Before allocating the hugepage, release the mmap_sem read lock.
+ * The allocation can take potentially a long time if it involves
+ * sync compaction, and we do not need to hold the mmap_sem during
+ * that. We will recheck the vma after taking it again in write mode.
+ */
+ up_read(&mm->mmap_sem);
+ new_page = khugepaged_alloc_page(hpage, gfp, node);
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
+
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
+
+ down_read(&mm->mmap_sem);
+ result = hugepage_vma_revalidate(mm, address);
+ if (result) {
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ up_read(&mm->mmap_sem);
+ goto out_nolock;
+ }
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ up_read(&mm->mmap_sem);
+ goto out_nolock;
+ }
+
+ /*
+ * __collapse_huge_page_swapin always returns with mmap_sem locked.
+ * If it fails, we release mmap_sem and jump out_nolock.
+ * Continuing to collapse causes inconsistency.
+ */
+ if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ up_read(&mm->mmap_sem);
+ goto out_nolock;
+ }
+
+ up_read(&mm->mmap_sem);
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ result = hugepage_vma_revalidate(mm, address);
+ if (result)
+ goto out;
+ /* check if the pmd is still valid */
+ if (mm_find_pmd(mm, address) != pmd)
+ goto out;
+
+ anon_vma_lock_write(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, address);
+ pte_ptl = pte_lockptr(mm, pmd);
+
+ mmun_start = address;
+ mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
+ spin_unlock(pmd_ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ spin_lock(pte_ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte);
+ spin_unlock(pte_ptl);
+
+ if (unlikely(!isolated)) {
+ pte_unmap(pte);
+ spin_lock(pmd_ptl);
+ BUG_ON(!pmd_none(*pmd));
+ /*
+ * We can only use set_pmd_at when establishing
+ * hugepmds and never for establishing regular pmds that
+ * points to regular pagetables. Use pmd_populate for that
+ */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ spin_unlock(pmd_ptl);
+ anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock_write(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ pte_unmap(pte);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to avoid the copy_huge_page writes to become
+ * visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(pmd_ptl);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, address, pmd, _pmd);
+ update_mmu_cache_pmd(vma, address, pmd);
+ spin_unlock(pmd_ptl);
+
+ /* The huge page is mapped now; clear *hpage so the caller does not reuse it */
+ *hpage = NULL;
+
+ khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
+out_up_write:
+ up_write(&mm->mmap_sem);
+out_nolock:
+ trace_mm_collapse_huge_page(mm, isolated, result);
+ return;
+out:
+ /* Failure with mmap_sem held for write: undo the memcg charge first */
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ goto out_up_write;
+}
+
+/*
+ * khugepaged_scan_pmd - check whether the HPAGE_PMD_NR ptes under the pmd
+ * mapping @address can be collapsed into one huge page and, if so, hand the
+ * range to collapse_huge_page().
+ *
+ * Returns 1 if a collapse was attempted (collapse_huge_page() has then
+ * released mmap_sem), 0 otherwise. The scan outcome is reported through
+ * the mm_khugepaged_scan_pmd tracepoint.
+ */
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+ struct page *page = NULL;
+ unsigned long _address;
+ spinlock_t *ptl;
+ int node = NUMA_NO_NODE, unmapped = 0;
+ bool writable = false;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
+
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto out_unmap;
+ }
+ }
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
+ goto out_unmap;
+ }
+ if (pte_write(pteval))
+ writable = true;
+
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out_unmap;
+ }
+
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unmap;
+ }
+
+ /*
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * Khugepaged will allocate hugepage from the node has the max
+ * hit record.
+ */
+ node = page_to_nid(page);
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
+ goto out_unmap;
+ }
+ khugepaged_node_load[node]++;
+ if (!PageLRU(page)) {
+ result = SCAN_PAGE_LRU;
+ goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
+ goto out_unmap;
+ }
+
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ result = SCAN_PAGE_COUNT;
+ goto out_unmap;
+ }
+ /*
+ * Test the pte currently being scanned: _address advances with
+ * _pte, while address stays at the start of the huge page range.
+ */
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, _address))
+ referenced++;
+ }
+ /* Collapse only if at least one pte was writable and one was referenced */
+ if (writable) {
+ if (referenced) {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ } else {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret) {
+ node = khugepaged_find_target_node();
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma, node, referenced);
+ }
+out:
+ trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ none_or_zero, result, unmapped);
+ return ret;
+}
+
+/*
+ * collect_mm_slot - release @mm_slot if its mm has exited.
+ *
+ * When khugepaged_test_exit() reports that the mm is gone, the slot is
+ * removed from the hash and from the mm list, freed, and the mm reference
+ * is dropped with mmdrop(). Otherwise nothing happens.
+ * The VM_BUG_ON below expects khugepaged_mm_lock to be held by the caller.
+ */
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * retract_page_tables - remove the pte page tables that map the huge-page
+ * sized range starting at @pgoff of @mapping.
+ *
+ * Walks every VMA in mapping->i_mmap that maps @pgoff, under
+ * i_mmap_lock_write(). A VMA is skipped when it has an anon_vma, when the
+ * resulting virtual address is not HPAGE_PMD aligned, when the VMA ends
+ * before the full huge-page range, or when no pmd is found. For the rest
+ * the pmd is cleared and its page table freed, but only if mmap_sem can be
+ * taken for write without blocking.
+ */
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr;
+ pmd_t *pmd, _pmd;
+
+ i_mmap_lock_write(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ /* probably overkill */
+ if (vma->anon_vma)
+ continue;
+ /* virtual address at which this vma maps file offset pgoff */
+ addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (addr & ~HPAGE_PMD_MASK)
+ continue;
+ if (vma->vm_end < addr + HPAGE_PMD_SIZE)
+ continue;
+ pmd = mm_find_pmd(vma->vm_mm, addr);
+ if (!pmd)
+ continue;
+ /*
+ * We need exclusive mmap_sem to retract page table.
+ * If trylock fails we would end up with pte-mapped THP after
+ * re-fault. Not ideal, but it's more important to not disturb
+ * the system too much.
+ */
+ if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
+ spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ up_write(&vma->vm_mm->mmap_sem);
+ /* the detached page table is no longer accounted to the mm */
+ atomic_long_dec(&vma->vm_mm->nr_ptes);
+ pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ }
+ }
+ i_mmap_unlock_write(mapping);
+}
+
+/**
+ * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ *
+ * Basic scheme is simple, details are more complex:
+ * - allocate and freeze a new huge page;
+ * - scan over radix tree replacing old pages with the new one
+ * + swap in pages if necessary;
+ * + fill in gaps;
+ * + keep old pages around in case if rollback is required;
+ * - if replacing succeeded:
+ * + copy data over;
+ * + free old pages;
+ * + unfreeze huge page;
+ * - if replacing failed:
+ * + put all pages back and unfreeze them;
+ * + restore gaps in the radix-tree;
+ * + free huge page;
+ *
+ * On success *hpage is set to NULL. On failure the new page is unfrozen and
+ * left for the caller to free. No result is returned to the caller.
+ */
+static void collapse_shmem(struct mm_struct *mm,
+ struct address_space *mapping, pgoff_t start,
+ struct page **hpage, int node)
+{
+ gfp_t gfp;
+ struct page *page, *new_page, *tmp;
+ struct mem_cgroup *memcg;
+ pgoff_t index, end = start + HPAGE_PMD_NR;
+ LIST_HEAD(pagelist);
+ struct radix_tree_iter iter;
+ void **slot;
+ int nr_none = 0, result = SCAN_SUCCEED;
+
+ VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+
+ /* Only allocate from the target node */
+ gfp = alloc_hugepage_khugepaged_gfpmask() |
+ __GFP_OTHER_NODE | __GFP_THISNODE;
+
+ new_page = khugepaged_alloc_page(hpage, gfp, node);
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out;
+ }
+
+ new_page->index = start;
+ new_page->mapping = mapping;
+ __SetPageSwapBacked(new_page);
+ __SetPageLocked(new_page);
+ BUG_ON(!page_ref_freeze(new_page, 1));
+
+
+ /*
+ * At this point the new_page is 'frozen' (page_count() is zero), locked
+ * and not up-to-date. It's safe to insert it into radix tree, because
+ * nobody would be able to map it or use it in other way until we
+ * unfreeze it.
+ */
+
+ index = start;
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ /* number of empty slots between the last handled index and this slot */
+ int n = min(iter.index, end) - index;
+
+ /*
+ * Handle holes in the radix tree: charge it from shmem and
+ * insert relevant subpage of new_page into the radix-tree.
+ */
+ if (n && !shmem_charge(mapping->host, n)) {
+ result = SCAN_FAIL;
+ break;
+ }
+ nr_none += n;
+ for (; index < min(iter.index, end); index++) {
+ radix_tree_insert(&mapping->page_tree, index,
+ new_page + (index % HPAGE_PMD_NR));
+ }
+
+ /* We are done. */
+ if (index >= end)
+ break;
+
+ page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ /* swap in or instantiate fallocated page */
+ if (shmem_getpage(mapping->host, index, &page,
+ SGP_NOHUGE)) {
+ result = SCAN_FAIL;
+ goto tree_unlocked;
+ }
+ spin_lock_irq(&mapping->tree_lock);
+ } else if (trylock_page(page)) {
+ get_page(page);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ break;
+ }
+
+ /*
+ * The page must be locked, so we can drop the tree_lock
+ * without racing with truncate.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (page_mapping(page) != mapping) {
+ result = SCAN_TRUNCATED;
+ goto out_unlock;
+ }
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (isolate_lru_page(page)) {
+ result = SCAN_DEL_PAGE_LRU;
+ goto out_isolate_failed;
+ }
+
+ if (page_mapped(page))
+ unmap_mapping_range(mapping, index << PAGE_SHIFT,
+ PAGE_SIZE, 0);
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+
+ /*
+ * The page is expected to have page_count() == 3:
+ * - we hold a pin on it;
+ * - one reference from radix tree;
+ * - one from isolate_lru_page;
+ */
+ if (!page_ref_freeze(page, 3)) {
+ result = SCAN_PAGE_COUNT;
+ goto out_lru;
+ }
+
+ /*
+ * Add the page to the list to be able to undo the collapse if
+ * something goes wrong.
+ */
+ list_add_tail(&page->lru, &pagelist);
+
+ /* Finally, replace with the new page. */
+ radix_tree_replace_slot(slot,
+ new_page + (index % HPAGE_PMD_NR));
+
+ index++;
+ continue;
+ /* Error exits for the current page; tree_lock is held at out_lru and out_unlock */
+out_lru:
+ spin_unlock_irq(&mapping->tree_lock);
+ putback_lru_page(page);
+out_isolate_failed:
+ unlock_page(page);
+ put_page(page);
+ goto tree_unlocked;
+out_unlock:
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
+ /*
+ * Handle hole in radix tree at the end of the range.
+ * This code only triggers if there's nothing in radix tree
+ * beyond 'end'.
+ */
+ if (result == SCAN_SUCCEED && index < end) {
+ int n = end - index;
+
+ if (!shmem_charge(mapping->host, n)) {
+ result = SCAN_FAIL;
+ goto tree_locked;
+ }
+
+ for (; index < end; index++) {
+ radix_tree_insert(&mapping->page_tree, index,
+ new_page + (index % HPAGE_PMD_NR));
+ }
+ nr_none += n;
+ }
+
+tree_locked:
+ spin_unlock_irq(&mapping->tree_lock);
+tree_unlocked:
+
+ if (result == SCAN_SUCCEED) {
+ unsigned long flags;
+ struct zone *zone = page_zone(new_page);
+
+ /*
+ * Replacing old pages with new one has succeeded, now we need to
+ * copy the content and free old pages.
+ */
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+ page);
+ list_del(&page->lru);
+ unlock_page(page);
+ page_ref_unfreeze(page, 1);
+ page->mapping = NULL;
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ put_page(page);
+ }
+
+ local_irq_save(flags);
+ __inc_zone_page_state(new_page, NR_SHMEM_THPS);
+ if (nr_none) {
+ __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none);
+ __mod_zone_page_state(zone, NR_SHMEM, nr_none);
+ }
+ local_irq_restore(flags);
+
+ /*
+ * Remove pte page tables, so we can re-fault
+ * the page as huge.
+ */
+ retract_page_tables(mapping, start);
+
+ /* Everything is ready, let's unfreeze the new_page */
+ set_page_dirty(new_page);
+ SetPageUptodate(new_page);
+ page_ref_unfreeze(new_page, HPAGE_PMD_NR);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
+ lru_cache_add_anon(new_page);
+ unlock_page(new_page);
+
+ *hpage = NULL;
+ } else {
+ /* Something went wrong: rollback changes to the radix-tree */
+ shmem_uncharge(mapping->host, nr_none);
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+ start) {
+ if (iter.index >= end)
+ break;
+ page = list_first_entry_or_null(&pagelist,
+ struct page, lru);
+ if (!page || iter.index < page->index) {
+ if (!nr_none)
+ break;
+ /* Put holes back where they were */
+ radix_tree_replace_slot(slot, NULL);
+ nr_none--;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(page->index != iter.index, page);
+
+ /* Unfreeze the page. */
+ list_del(&page->lru);
+ page_ref_unfreeze(page, 2);
+ radix_tree_replace_slot(slot, page);
+ spin_unlock_irq(&mapping->tree_lock);
+ putback_lru_page(page);
+ unlock_page(page);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+ VM_BUG_ON(nr_none);
+ spin_unlock_irq(&mapping->tree_lock);
+
+ /* Unfreeze new_page, caller would take care about freeing it */
+ page_ref_unfreeze(new_page, 1);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ unlock_page(new_page);
+ new_page->mapping = NULL;
+ }
+out:
+ VM_BUG_ON(!list_empty(&pagelist));
+ /* TODO: tracepoints */
+}
+
+/*
+ * khugepaged_scan_shmem - decide whether the HPAGE_PMD_NR page cache slots
+ * of @mapping starting at @start are worth collapsing.
+ *
+ * Walks the radix tree under rcu_read_lock(), counting present pages and
+ * exceptional (swap) entries and recording per-node page counts in
+ * khugepaged_node_load[]. If no check failed and enough pages are present,
+ * collapse_shmem() is called for the range. Nothing is returned; the local
+ * result code is currently unused beyond this function (see the TODO).
+ */
+static void khugepaged_scan_shmem(struct mm_struct *mm,
+ struct address_space *mapping,
+ pgoff_t start, struct page **hpage)
+{
+ struct page *page = NULL;
+ struct radix_tree_iter iter;
+ void **slot;
+ int present, swap;
+ int node = NUMA_NO_NODE;
+ int result = SCAN_SUCCEED;
+
+ present = 0;
+ swap = 0;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (iter.index >= start + HPAGE_PMD_NR)
+ break;
+
+ page = radix_tree_deref_slot(slot);
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ /* exceptional entries are counted against the swap limit */
+ if (radix_tree_exception(page)) {
+ if (++swap > khugepaged_max_ptes_swap) {
+ result = SCAN_EXCEED_SWAP_PTE;
+ break;
+ }
+ continue;
+ }
+
+ if (PageTransCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ break;
+ }
+
+ node = page_to_nid(page);
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
+ break;
+ }
+ khugepaged_node_load[node]++;
+
+ if (!PageLRU(page)) {
+ result = SCAN_PAGE_LRU;
+ break;
+ }
+
+ if (page_count(page) != 1 + page_mapcount(page)) {
+ result = SCAN_PAGE_COUNT;
+ break;
+ }
+
+ /*
+ * We probably should check if the page is referenced here, but
+ * nobody would transfer pte_young() to PageReferenced() for us.
+ * And rmap walk here is just too costly...
+ */
+
+ present++;
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ slot = radix_tree_iter_next(&iter);
+ }
+ }
+ rcu_read_unlock();
+
+ if (result == SCAN_SUCCEED) {
+ /* too many missing pages: fewer present than the max_ptes_none limit allows */
+ if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ result = SCAN_EXCEED_NONE_PTE;
+ } else {
+ node = khugepaged_find_target_node();
+ collapse_shmem(mm, mapping, start, hpage, node);
+ }
+ }
+
+ /* TODO: tracepoints */
+}
+#else
+/*
+ * Stub used when CONFIG_SHMEM or CONFIG_TRANSPARENT_HUGE_PAGECACHE is not
+ * set. BUILD_BUG() makes the build fail if a call to this stub survives;
+ * presumably the only call site is expected to be compiled out in that
+ * configuration - confirm against shmem_file().
+ */
+static void khugepaged_scan_shmem(struct mm_struct *mm,
+ struct address_space *mapping,
+ pgoff_t start, struct page **hpage)
+{
+ BUILD_BUG();
+}
+#endif
+
+/*
+ * khugepaged_scan_mm_slot - scan part of one mm for collapse candidates.
+ *
+ * Resumes at khugepaged_scan.mm_slot / khugepaged_scan.address (or starts
+ * with the first slot on the list at address 0) and walks the mm's VMAs in
+ * huge-page sized steps. Shmem-backed ranges go to khugepaged_scan_shmem(),
+ * everything else to khugepaged_scan_pmd().
+ *
+ * Entered and left with khugepaged_mm_lock held; the lock is dropped while
+ * the mm is being scanned. Returns the amount of progress made, which the
+ * caller compares against @pages.
+ */
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+ if (!hugepage_vma_check(vma)) {
+skip:
+ progress++;
+ continue;
+ }
+ /* hstart/hend: the huge-page aligned part of the vma */
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ if (shmem_file(vma->vm_file)) {
+ struct file *file;
+ pgoff_t pgoff = linear_page_index(vma,
+ khugepaged_scan.address);
+ if (!shmem_huge_enabled(vma))
+ goto skip;
+ /* pin the file so its mapping can be used after mmap_sem is dropped */
+ file = get_file(vma->vm_file);
+ up_read(&mm->mmap_sem);
+ ret = 1;
+ khugepaged_scan_shmem(mm, file->f_mapping,
+ pgoff, hpage);
+ fput(file);
+ } else {
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ }
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+/*
+ * Non-zero when at least one mm is queued on khugepaged_scan.mm_head and
+ * khugepaged_enabled() reports that khugepaged is enabled.
+ */
+static int khugepaged_has_work(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+/*
+ * Wake-up condition for the idle wait in khugepaged_wait_work(): an mm has
+ * been queued on khugepaged_scan.mm_head or the kthread was asked to stop.
+ */
+static int khugepaged_wait_event(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ kthread_should_stop();
+}
+
+/*
+ * khugepaged_do_scan - run one scan pass.
+ *
+ * Repeatedly calls khugepaged_scan_mm_slot() until the accumulated progress
+ * reaches khugepaged_pages_to_scan. The pass ends early if preallocating a
+ * huge page fails, if the kthread should stop or was frozen, if there is no
+ * work, or once the scan cursor has been found unset for the second time in
+ * this pass. A huge page left over in hpage is released before returning.
+ */
+static void khugepaged_do_scan(void)
+{
+ struct page *hpage = NULL;
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+ bool wait = true;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ if (!khugepaged_prealloc_page(&hpage, &wait))
+ break;
+
+ cond_resched();
+
+ if (unlikely(kthread_should_stop() || try_to_freeze()))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ /* a NULL mm_slot means the next scan starts from the list head */
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ &hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+
+ /* hpage may be NULL or an ERR_PTR value; only a real page is released */
+ if (!IS_ERR_OR_NULL(hpage))
+ put_page(hpage);
+}
+
+/*
+ * Wake-up condition for the timed sleep in khugepaged_wait_work(): the
+ * kthread was asked to stop or jiffies has reached khugepaged_sleep_expire.
+ */
+static bool khugepaged_should_wakeup(void)
+{
+ return kthread_should_stop() ||
+ time_after_eq(jiffies, khugepaged_sleep_expire);
+}
+
+/*
+ * khugepaged_wait_work - sleep between scan passes.
+ *
+ * With work pending, sleep for khugepaged_scan_sleep_millisecs (not at all
+ * if that converts to zero jiffies) or until khugepaged_should_wakeup()
+ * holds. Without work, and only if khugepaged is enabled, wait until
+ * khugepaged_wait_event() holds. Both waits are freezable.
+ */
+static void khugepaged_wait_work(void)
+{
+ if (khugepaged_has_work()) {
+ const unsigned long scan_sleep_jiffies =
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
+
+ if (!scan_sleep_jiffies)
+ return;
+
+ khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
+ wait_event_freezable_timeout(khugepaged_wait,
+ khugepaged_should_wakeup(),
+ scan_sleep_jiffies);
+ return;
+ }
+
+ if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
+}
+
+/*
+ * khugepaged - main function of the khugepaged kernel thread.
+ *
+ * Runs at MAX_NICE and alternates scan passes with waits until asked to
+ * stop. On exit the scan cursor is cleared and the slot it pointed to is
+ * handed to collect_mm_slot(). @none is unused. Always returns 0.
+ */
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, MAX_NICE);
+
+ while (!kthread_should_stop()) {
+ khugepaged_do_scan();
+ khugepaged_wait_work();
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+/*
+ * set_recommended_min_free_kbytes - raise min_free_kbytes if it is below the
+ * value computed here from the number of populated zones and the pageblock
+ * size, then recompute the per-zone watermarks. min_free_kbytes is never
+ * lowered by this function.
+ */
+static void set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types. There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ /* convert from pages to kilobytes */
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes) {
+ if (user_min_free_kbytes >= 0)
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
+ min_free_kbytes, recommended_min);
+
+ min_free_kbytes = recommended_min;
+ }
+ setup_per_zone_wmarks();
+}
+
+/*
+ * start_stop_khugepaged - bring the khugepaged thread in line with the
+ * current khugepaged_enabled() state.
+ *
+ * When enabled: start the thread if it is not running, wake it if there are
+ * mms queued, and update min_free_kbytes. When disabled: stop a running
+ * thread. Calls are serialised by a function-local mutex.
+ *
+ * Returns 0 on success or the kthread_run() error code.
+ */
+int start_stop_khugepaged(void)
+{
+ static struct task_struct *khugepaged_thread __read_mostly;
+ static DEFINE_MUTEX(khugepaged_mutex);
+ int err = 0;
+
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled()) {
+ if (!khugepaged_thread)
+ khugepaged_thread = kthread_run(khugepaged, NULL,
+ "khugepaged");
+ if (IS_ERR(khugepaged_thread)) {
+ pr_err("khugepaged: kthread_run(khugepaged) failed\n");
+ err = PTR_ERR(khugepaged_thread);
+ /* forget the error pointer so a later call can retry */
+ khugepaged_thread = NULL;
+ goto fail;
+ }
+
+ if (!list_empty(&khugepaged_scan.mm_head))
+ wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
+ } else if (khugepaged_thread) {
+ kthread_stop(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+fail:
+ mutex_unlock(&khugepaged_mutex);
+ return err;
+}
diff --git a/mm/ksm.c b/mm/ksm.c
index 4786b4150f62..73d43bafd9fb 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -376,9 +376,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
- ret = handle_mm_fault(vma->vm_mm, vma, addr,
- FAULT_FLAG_WRITE |
- FAULT_FLAG_REMOTE);
+ ret = handle_mm_fault(vma, addr,
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
else
ret = VM_FAULT_WRITE;
put_page(page);
@@ -532,8 +531,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
void *expected_mapping;
unsigned long kpfn;
- expected_mapping = (void *)stable_node +
- (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+ expected_mapping = (void *)((unsigned long)stable_node |
+ PAGE_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn);
page = pfn_to_page(kpfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index ac1248933b31..ca099159b45a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -584,6 +584,9 @@ repeat:
nid, flags);
}
+ if (!nr_new)
+ return 0;
+
/*
* If this was the first round, resize array and repeat for actual
* insertions; otherwise, merge and return.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5339c89dff63..f3a84c64f35c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct oom_control oc = {
.zonelist = NULL,
.nodemask = NULL,
+ .memcg = memcg,
.gfp_mask = gfp_mask,
.order = order,
};
@@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}
- check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1289,7 +1290,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(&oc, task, totalpages)) {
+ switch (oom_scan_process_thread(&oc, task)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ oom_kill_process(&oc, chosen, points, totalpages,
"Memory cgroup out of memory");
}
unlock:
@@ -2272,20 +2273,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
current->memcg_kmem_skip_account = 0;
}
-/*
+static inline bool memcg_kmem_bypass(void)
+{
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+ return false;
+}
+
+/**
+ * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
+ * @cachep: the original global kmem cache
+ *
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
*
- * If the cache does not exist yet, if we are the first user of it,
- * we either create it immediately, if possible, or create it asynchronously
- * in a workqueue.
- * In the latter case, we will let the current allocation go through with
- * the original cache.
+ * If the cache does not exist yet, if we are the first user of it, we
+ * create it asynchronously in a workqueue and let the current allocation
+ * go through with the original cache.
*
- * Can't be called in interrupt context or from kernel threads.
- * This function needs to be called with rcu_read_lock() held.
+ * This function takes a reference to the cache it returns to assure it
+ * won't get destroyed while we are working with it. Once the caller is
+ * done with it, memcg_kmem_put_cache() must be called to release the
+ * reference.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -2293,10 +2304,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
VM_BUG_ON(!is_root_cache(cachep));
- if (cachep->flags & SLAB_ACCOUNT)
- gfp |= __GFP_ACCOUNT;
-
- if (!(gfp & __GFP_ACCOUNT))
+ if (memcg_kmem_bypass())
return cachep;
if (current->memcg_kmem_skip_account)
@@ -2329,14 +2337,27 @@ out:
return cachep;
}
-void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+/**
+ * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
+ * @cachep: the cache returned by memcg_kmem_get_cache
+ */
+void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
css_put(&cachep->memcg_params.memcg->css);
}
-int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg)
+/**
+ * memcg_kmem_charge_memcg: charge a kmem page
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ * @memcg: memory cgroup to charge
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
struct page_counter *counter;
@@ -2357,19 +2378,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
return 0;
}
-int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+/**
+ * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
+ if (memcg_kmem_bypass())
+ return 0;
+
memcg = get_mem_cgroup_from_mm(current->mm);
if (!mem_cgroup_is_root(memcg))
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
-
-void __memcg_kmem_uncharge(struct page *page, int order)
+/**
+ * memcg_kmem_uncharge: uncharge a kmem page
+ * @page: page to uncharge
+ * @order: allocation order
+ */
+void memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -4409,7 +4445,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
#ifdef CONFIG_SWAP
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
@@ -4428,7 +4464,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
}
#else
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ pte_t ptent, swp_entry_t *entry)
{
return NULL;
}
@@ -4471,7 +4507,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
+ * @compound: charge the page as compound or small page
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
@@ -4593,7 +4629,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
- page = mc_handle_swap_pte(vma, addr, ptent, &ent);
+ page = mc_handle_swap_pte(vma, ptent, &ent);
else if (pte_none(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
@@ -5333,6 +5369,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
* @memcgp: charged memcg return
+ * @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
@@ -5395,6 +5432,7 @@ out:
* @page: page to charge
* @memcg: memcg to charge the page to
* @lrucare: page might be on LRU already
+ * @compound: charge the page as compound or small page
*
* Finalize a charge transaction started by mem_cgroup_try_charge(),
* after page->mapping has been set up. This must happen atomically
@@ -5446,6 +5484,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
* mem_cgroup_cancel_charge - cancel a page charge
* @page: page to charge
* @memcg: memcg to charge the page to
+ * @compound: charge the page as compound or small page
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
@@ -5469,15 +5508,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_huge, struct page *dummy_page)
+ unsigned long nr_huge, unsigned long nr_kmem,
+ struct page *dummy_page)
{
- unsigned long nr_pages = nr_anon + nr_file;
+ unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
unsigned long flags;
if (!mem_cgroup_is_root(memcg)) {
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
+ page_counter_uncharge(&memcg->kmem, nr_kmem);
memcg_oom_recover(memcg);
}
@@ -5500,6 +5542,7 @@ static void uncharge_list(struct list_head *page_list)
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
+ unsigned long nr_kmem = 0;
unsigned long pgpgout = 0;
struct list_head *next;
struct page *page;
@@ -5510,8 +5553,6 @@ static void uncharge_list(struct list_head *page_list)
*/
next = page_list->next;
do {
- unsigned int nr_pages = 1;
-
page = list_entry(next, struct page, lru);
next = page->lru.next;
@@ -5530,31 +5571,34 @@ static void uncharge_list(struct list_head *page_list)
if (memcg != page->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, page);
- pgpgout = nr_anon = nr_file = nr_huge = 0;
+ nr_huge, nr_kmem, page);
+ pgpgout = nr_anon = nr_file =
+ nr_huge = nr_kmem = 0;
}
memcg = page->mem_cgroup;
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- nr_huge += nr_pages;
- }
+ if (!PageKmemcg(page)) {
+ unsigned int nr_pages = 1;
- if (PageAnon(page))
- nr_anon += nr_pages;
- else
- nr_file += nr_pages;
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ nr_huge += nr_pages;
+ }
+ if (PageAnon(page))
+ nr_anon += nr_pages;
+ else
+ nr_file += nr_pages;
+ pgpgout++;
+ } else
+ nr_kmem += 1 << compound_order(page);
page->mem_cgroup = NULL;
-
- pgpgout++;
} while (next != page_list);
if (memcg)
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, page);
+ nr_huge, nr_kmem, page);
}
/**
diff --git a/mm/memory.c b/mm/memory.c
index 9e046819e619..4425b6059339 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -233,6 +233,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
#endif
+ tlb->page_size = 0;
__tlb_reset_range(tlb);
}
@@ -292,23 +293,31 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
* handling the additional races in SMP caused by other CPUs caching valid
* mappings in their TLBs. Returns the number of free page slots left.
* When out of page slots we must call tlb_flush_mmu().
+ * Returns true if the caller should flush.
*/
-int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
+ if (!tlb->page_size)
+ tlb->page_size = page_size;
+ else {
+ if (page_size != tlb->page_size)
+ return true;
+ }
+
batch = tlb->active;
- batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
- return 0;
+ return true;
batch = tlb->active;
}
VM_BUG_ON_PAGE(batch->nr > batch->max, page);
- return batch->max - batch->nr;
+ batch->pages[batch->nr++] = page;
+ return false;
}
#endif /* HAVE_GENERIC_MMU_GATHER */
@@ -1109,6 +1118,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
+ struct page *pending_page = NULL;
again:
init_rss_vec(rss);
@@ -1132,7 +1142,7 @@ again:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
- details->check_mapping != page->mapping)
+ details->check_mapping != page_rmapping(page))
continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -1160,8 +1170,9 @@ again:
page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- if (unlikely(!__tlb_remove_page(tlb, page))) {
+ if (unlikely(__tlb_remove_page(tlb, page))) {
force_flush = 1;
+ pending_page = page;
addr += PAGE_SIZE;
break;
}
@@ -1202,7 +1213,11 @@ again:
if (force_flush) {
force_flush = 0;
tlb_flush_mmu_free(tlb);
-
+ if (pending_page) {
+ /* remove the page with new size */
+ __tlb_remove_pte_page(tlb, pending_page);
+ pending_page = NULL;
+ }
if (addr != end)
goto again;
}
@@ -1479,7 +1494,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page);
+ page_add_file_rmap(page, false);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
retval = 0;
@@ -2055,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
- struct page *page, int page_mkwrite,
- int dirty_shared)
- __releases(ptl)
+static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
+ struct page *page, int page_mkwrite, int dirty_shared)
+ __releases(fe->ptl)
{
+ struct vm_area_struct *vma = fe->vma;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2071,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, address, pte_pfn(orig_pte));
+ flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry, 1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
+ if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
+ update_mmu_cache(vma, fe->address, fe->pte);
+ pte_unmap_unlock(fe->pte, fe->ptl);
if (dirty_shared) {
struct address_space *mapping;
@@ -2122,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- pte_t orig_pte, struct page *old_page)
+static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
+ struct page *old_page)
{
+ struct vm_area_struct *vma = fe->vma;
+ struct mm_struct *mm = vma->vm_mm;
struct page *new_page = NULL;
- spinlock_t *ptl = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
- const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
+ const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, address);
+ new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
if (!new_page)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ fe->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, address, vma);
+ cow_user_page(new_page, old_page, fe->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2158,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Re-check the pte - we dropped the lock
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (likely(pte_same(*page_table, orig_pte))) {
+ fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
+ if (likely(pte_same(*fe->pte, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2169,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, address, pte_pfn(orig_pte));
+ flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2178,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address, false);
+ ptep_clear_flush_notify(vma, fe->address, fe->pte);
+ page_add_new_anon_rmap(new_page, vma, fe->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2187,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, address, page_table, entry);
- update_mmu_cache(vma, address, page_table);
+ set_pte_at_notify(mm, fe->address, fe->pte, entry);
+ update_mmu_cache(vma, fe->address, fe->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2225,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2253,44 +2267,43 @@ oom:
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
- pmd_t *pmd)
+static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
{
+ struct vm_area_struct *vma = fe->vma;
+
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
struct vm_fault vmf = {
.page = NULL,
- .pgoff = linear_page_index(vma, address),
- .virtual_address = (void __user *)(address & PAGE_MASK),
+ .pgoff = linear_page_index(vma, fe->address),
+ .virtual_address =
+ (void __user *)(fe->address & PAGE_MASK),
.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
};
int ret;
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
if (ret & VM_FAULT_ERROR)
return ret;
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+ &fe->ptl);
/*
* We might have raced with another page fault while we
* released the pte_offset_map_lock.
*/
- if (!pte_same(*page_table, orig_pte)) {
- pte_unmap_unlock(page_table, ptl);
+ if (!pte_same(*fe->pte, orig_pte)) {
+ pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
}
- return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
- NULL, 0, 0);
+ return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
}
-static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table,
- pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
- struct page *old_page)
- __releases(ptl)
+static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
+ struct page *old_page)
+ __releases(fe->ptl)
{
+ struct vm_area_struct *vma = fe->vma;
int page_mkwrite = 0;
get_page(old_page);
@@ -2298,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
+ pte_unmap_unlock(fe->pte, fe->ptl);
+ tmp = do_page_mkwrite(vma, old_page, fe->address);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(old_page);
@@ -2311,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+ &fe->ptl);
+ if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
put_page(old_page);
return 0;
}
page_mkwrite = 1;
}
- return wp_page_reuse(mm, vma, address, page_table, ptl,
- orig_pte, old_page, page_mkwrite, 1);
+ return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
}
/*
@@ -2344,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
- __releases(ptl)
+static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
+ __releases(fe->ptl)
{
+ struct vm_area_struct *vma = fe->vma;
struct page *old_page;
- old_page = vm_normal_page(vma, address, orig_pte);
+ old_page = vm_normal_page(vma, fe->address, orig_pte);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -2362,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(mm, vma, address, page_table, ptl,
- orig_pte, pmd);
+ return wp_pfn_shared(fe, orig_pte);
- pte_unmap_unlock(page_table, ptl);
- return wp_page_copy(mm, vma, address, page_table, pmd,
- orig_pte, old_page);
+ pte_unmap_unlock(fe->pte, fe->ptl);
+ return wp_page_copy(fe, orig_pte, old_page);
}
/*
@@ -2378,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
int total_mapcount;
if (!trylock_page(old_page)) {
get_page(old_page);
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
+ fe->address, &fe->ptl);
+ if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
put_page(old_page);
return 0;
}
@@ -2402,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_move_anon_rmap(old_page, vma);
}
unlock_page(old_page);
- return wp_page_reuse(mm, vma, address, page_table, ptl,
- orig_pte, old_page, 0, 0);
+ return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
}
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(mm, vma, address, page_table, pmd,
- ptl, orig_pte, old_page);
+ return wp_page_shared(fe, orig_pte, old_page);
}
/*
@@ -2417,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
get_page(old_page);
- pte_unmap_unlock(page_table, ptl);
- return wp_page_copy(mm, vma, address, page_table, pmd,
- orig_pte, old_page);
+ pte_unmap_unlock(fe->pte, fe->ptl);
+ return wp_page_copy(fe, orig_pte, old_page);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2507,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+int do_swap_page(struct fault_env *fe, pte_t orig_pte)
{
- spinlock_t *ptl;
+ struct vm_area_struct *vma = fe->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2520,17 +2524,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
goto out;
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(mm, pmd, address);
+ migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, address, orig_pte, NULL);
+ print_bad_pte(vma, fe->address, orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2539,14 +2543,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = lookup_swap_cache(entry);
if (!page) {
page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, address);
+ GFP_HIGHUSER_MOVABLE, vma, fe->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (likely(pte_same(*page_table, orig_pte)))
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
+ fe->address, &fe->ptl);
+ if (likely(pte_same(*fe->pte, orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2555,7 +2560,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(mm, PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -2568,7 +2573,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
swapcache = page;
- locked = lock_page_or_retry(page, mm, flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2585,14 +2590,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, address);
+ page = ksm_might_need_to_copy(page, vma, fe->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
goto out_page;
}
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -2600,8 +2606,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Back out if somebody else already faulted in this pte.
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*page_table, orig_pte)))
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+ &fe->ptl);
+ if (unlikely(!pte_same(*fe->pte, orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2619,24 +2626,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* must be called after the swap_free(), or it will never succeed.
*/
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- dec_mm_counter_fast(mm, MM_SWAPENTS);
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- flags &= ~FAULT_FLAG_WRITE;
+ fe->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(mm, address, page_table, pte);
+ set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, address, exclusive);
+ do_page_add_anon_rmap(page, vma, fe->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address, false);
+ page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2659,22 +2666,22 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(swapcache);
}
- if (flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ if (fe->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(fe, pte);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, page_table);
+ update_mmu_cache(vma, fe->address, fe->pte);
unlock:
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2725,37 +2732,51 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags)
+static int do_anonymous_page(struct fault_env *fe)
{
+ struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg;
struct page *page;
- spinlock_t *ptl;
pte_t entry;
- pte_unmap(page_table);
-
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, address) < 0)
+ if (check_stack_guard_page(vma, fe->address) < 0)
return VM_FAULT_SIGSEGV;
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have down_read(mmap_sem).
+ */
+ if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ return VM_FAULT_OOM;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(fe->pmd)))
+ return 0;
+
/* Use the zero-page for reads */
- if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
+ if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ !mm_forbids_zeropage(vma->vm_mm)) {
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
vma->vm_page_prot));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+ &fe->ptl);
+ if (!pte_none(*fe->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(page_table, ptl);
- return handle_userfault(vma, address, flags,
- VM_UFFD_MISSING);
+ pte_unmap_unlock(fe->pte, fe->ptl);
+ return handle_userfault(fe, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2763,11 +2784,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, address);
+ page = alloc_zeroed_user_highpage_movable(vma, fe->address);
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
@@ -2781,30 +2802,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+ &fe->ptl);
+ if (!pte_none(*fe->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(vma, address, flags,
- VM_UFFD_MISSING);
+ return handle_userfault(fe, VM_UFFD_MISSING);
}
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address, false);
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(mm, address, page_table, entry);
+ set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, page_table);
+ update_mmu_cache(vma, fe->address, fe->pte);
unlock:
- pte_unmap_unlock(page_table, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
@@ -2821,17 +2842,16 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct vm_area_struct *vma, unsigned long address,
- pgoff_t pgoff, unsigned int flags,
- struct page *cow_page, struct page **page,
- void **entry)
+static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
+ struct page *cow_page, struct page **page, void **entry)
{
+ struct vm_area_struct *vma = fe->vma;
struct vm_fault vmf;
int ret;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
vmf.pgoff = pgoff;
- vmf.flags = flags;
+ vmf.flags = fe->flags;
vmf.page = NULL;
vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.cow_page = cow_page;
@@ -2860,41 +2880,168 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
return ret;
}
+static int pte_alloc_one_map(struct fault_env *fe)
+{
+ struct vm_area_struct *vma = fe->vma;
+
+ if (!pmd_none(*fe->pmd))
+ goto map_pte;
+ if (fe->prealloc_pte) {
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_none(*fe->pmd))) {
+ spin_unlock(fe->ptl);
+ goto map_pte;
+ }
+
+ atomic_long_inc(&vma->vm_mm->nr_ptes);
+ pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+ spin_unlock(fe->ptl);
+ fe->prealloc_pte = 0;
+ } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ return VM_FAULT_OOM;
+ }
+map_pte:
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+ * didn't become pmd_trans_huge under us and then back to pmd_none, as
+ * a result of MADV_DONTNEED running immediately after a huge pmd fault
+ * in a different thread of this mm, in turn leading to a misleading
+ * pmd_trans_huge() retval. All we have to ensure is that it is a
+ * regular pmd that we can walk with pte_offset_map() and we can do that
+ * through an atomic read in C, which is what pmd_trans_unstable()
+ * provides.
+ */
+ if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ return VM_FAULT_NOPAGE;
+
+ fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+ &fe->ptl);
+ return 0;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+
+#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+ unsigned long haddr)
+{
+ if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
+ (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
+ return false;
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ return false;
+ return true;
+}
+
+static int do_set_pmd(struct fault_env *fe, struct page *page)
+{
+ struct vm_area_struct *vma = fe->vma;
+ bool write = fe->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ pmd_t entry;
+ int i, ret;
+
+ if (!transhuge_vma_suitable(vma, haddr))
+ return VM_FAULT_FALLBACK;
+
+ ret = VM_FAULT_FALLBACK;
+ page = compound_head(page);
+
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_none(*fe->pmd)))
+ goto out;
+
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ flush_icache_page(vma, page + i);
+
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ if (write)
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+ add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
+ page_add_file_rmap(page, true);
+
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+
+ update_mmu_cache_pmd(vma, haddr, fe->pmd);
+
+ /* fault is handled */
+ ret = 0;
+ count_vm_event(THP_FILE_MAPPED);
+out:
+ spin_unlock(fe->ptl);
+ return ret;
+}
+#else
+static int do_set_pmd(struct fault_env *fe, struct page *page)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
/**
- * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ * alloc_set_pte - setup new PTE entry for given page and add reverse page
+ * mapping. If needed, the function allocates a page table or uses a pre-allocated one.
*
- * @vma: virtual memory area
- * @address: user virtual address
+ * @fe: fault environment
+ * @memcg: memcg to charge page (only for private mappings)
* @page: page to map
- * @pte: pointer to target page table entry
- * @write: true, if new entry is writable
- * @anon: true, if it's anonymous page
*
- * Caller must hold page table lock relevant for @pte.
+ * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-void do_set_pte(struct vm_area_struct *vma, unsigned long address,
- struct page *page, pte_t *pte, bool write, bool anon)
+int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+ struct page *page)
{
+ struct vm_area_struct *vma = fe->vma;
+ bool write = fe->flags & FAULT_FLAG_WRITE;
pte_t entry;
+ int ret;
+
+ if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ /* THP on COW? */
+ VM_BUG_ON_PAGE(memcg, page);
+
+ ret = do_set_pmd(fe, page);
+ if (ret != VM_FAULT_FALLBACK)
+ return ret;
+ }
+
+ if (!fe->pte) {
+ ret = pte_alloc_one_map(fe);
+ if (ret)
+ return ret;
+ }
+
+ /* Re-check under ptl */
+ if (unlikely(!pte_none(*fe->pte)))
+ return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (anon) {
+ /* copy-on-write page */
+ if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address, false);
+ page_add_new_anon_rmap(page, vma, fe->address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page);
+ page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, address, pte, entry);
+ set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, pte);
+ update_mmu_cache(vma, fe->address, fe->pte);
+
+ return 0;
}
static unsigned long fault_around_bytes __read_mostly =
@@ -2961,57 +3108,66 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pgoff_t pgoff, unsigned int flags)
+static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
{
- unsigned long start_addr, nr_pages, mask;
- pgoff_t max_pgoff;
- struct vm_fault vmf;
- int off;
+ unsigned long address = fe->address, nr_pages, mask;
+ pgoff_t end_pgoff;
+ int off, ret = 0;
+
+ /*
+ * Returns VM_FAULT_NOPAGE once the original fault is resolved (a huge
+ * pmd is mapped, or the pte for the original address is populated) and
+ * 0 when the caller still has to handle the fault. fe->address is
+ * restored and fe->pte is reset to NULL on every return path.
+ */
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- start_addr = max(address & mask, vma->vm_start);
- off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
- pte -= off;
- pgoff -= off;
+ fe->address = max(address & mask, fe->vma->vm_start);
+ off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ start_pgoff -= off;
/*
- * max_pgoff is either end of page table or end of vma
- * or fault_around_pages() from pgoff, depending what is nearest.
+ * end_pgoff is either end of page table or end of vma
+ * or fault_around_pages() from start_pgoff, depending what is nearest.
*/
- max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ end_pgoff = start_pgoff -
+ ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
- pgoff + nr_pages - 1);
+ end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ start_pgoff + nr_pages - 1);
- /* Check if it makes any sense to call ->map_pages */
- while (!pte_none(*pte)) {
- if (++pgoff > max_pgoff)
- return;
- start_addr += PAGE_SIZE;
- if (start_addr >= vma->vm_end)
- return;
- pte++;
+ if (pmd_none(*fe->pmd)) {
+ fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
+ /*
+ * Allocation failed: skip faulting around instead of calling
+ * ->map_pages() without a preallocated page table. The caller
+ * sees 0 and handles the fault the regular way.
+ */
+ if (!fe->prealloc_pte)
+ goto out;
+ smp_wmb(); /* See comment in __pte_alloc() */
}
- vmf.virtual_address = (void __user *) start_addr;
- vmf.pte = pte;
- vmf.pgoff = pgoff;
- vmf.max_pgoff = max_pgoff;
- vmf.flags = flags;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vma->vm_ops->map_pages(vma, &vmf);
+ fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+
+ /* preallocated pagetable is unused: free it */
+ if (fe->prealloc_pte) {
+ pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+ fe->prealloc_pte = 0;
+ }
+ /* Huge page is mapped? Page fault is solved */
+ if (pmd_trans_huge(*fe->pmd)) {
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ /* ->map_pages() hasn't done anything useful. Cold page cache? */
+ if (!fe->pte)
+ goto out;
+
+ /* check if the page fault is solved */
+ fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*fe->pte))
+ ret = VM_FAULT_NOPAGE;
+ pte_unmap_unlock(fe->pte, fe->ptl);
+out:
+ fe->address = address;
+ fe->pte = NULL;
+ return ret;
}
-static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
{
+ struct vm_area_struct *vma = fe->vma;
struct page *fault_page;
- spinlock_t *ptl;
- pte_t *pte;
int ret = 0;
/*
@@ -3020,85 +3176,64 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- do_fault_around(vma, address, pte, pgoff, flags);
- if (!pte_same(*pte, orig_pte))
- goto unlock_out;
- pte_unmap_unlock(pte, ptl);
+ ret = do_fault_around(fe, pgoff);
+ if (ret)
+ return ret;
}
- ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
+ ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*pte, orig_pte))) {
- pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- put_page(fault_page);
- return ret;
- }
- do_set_pte(vma, address, fault_page, pte, false, false);
+ ret |= alloc_set_pte(fe, NULL, fault_page);
+ if (fe->pte)
+ pte_unmap_unlock(fe->pte, fe->ptl);
unlock_page(fault_page);
-unlock_out:
- pte_unmap_unlock(pte, ptl);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ put_page(fault_page);
return ret;
}
-static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
{
+ struct vm_area_struct *vma = fe->vma;
struct page *fault_page, *new_page;
void *fault_entry;
struct mem_cgroup *memcg;
- spinlock_t *ptl;
- pte_t *pte;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
+ if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
put_page(new_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
- &fault_entry);
+ ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, address, vma);
+ copy_user_highpage(new_page, fault_page, fe->address, vma);
__SetPageUptodate(new_page);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*pte, orig_pte))) {
- pte_unmap_unlock(pte, ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping,
- pgoff);
- }
- goto uncharge_out;
- }
- do_set_pte(vma, address, new_page, pte, true, true);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
- pte_unmap_unlock(pte, ptl);
+ ret |= alloc_set_pte(fe, memcg, new_page);
+ if (fe->pte)
+ pte_unmap_unlock(fe->pte, fe->ptl);
if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
} else {
dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
}
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ goto uncharge_out;
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -3106,18 +3241,15 @@ uncharge_out:
return ret;
}
-static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
{
+ struct vm_area_struct *vma = fe->vma;
struct page *fault_page;
struct address_space *mapping;
- spinlock_t *ptl;
- pte_t *pte;
int dirtied = 0;
int ret, tmp;
- ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
+ ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3127,7 +3259,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (vma->vm_ops->page_mkwrite) {
unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, address);
+ tmp = do_page_mkwrite(vma, fault_page, fe->address);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(fault_page);
@@ -3135,15 +3267,15 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
}
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*pte, orig_pte))) {
- pte_unmap_unlock(pte, ptl);
+ ret |= alloc_set_pte(fe, NULL, fault_page);
+ if (fe->pte)
+ pte_unmap_unlock(fe->pte, fe->ptl);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+ VM_FAULT_RETRY))) {
unlock_page(fault_page);
put_page(fault_page);
return ret;
}
- do_set_pte(vma, address, fault_page, pte, true, false);
- pte_unmap_unlock(pte, ptl);
if (set_page_dirty(fault_page))
dirtied = 1;
@@ -3175,23 +3307,19 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+static int do_fault(struct fault_env *fe)
{
- pgoff_t pgoff = linear_page_index(vma, address);
+ struct vm_area_struct *vma = fe->vma;
+ pgoff_t pgoff = linear_page_index(vma, fe->address); /* page offset of the faulting address, handed to the helpers below */
- pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
- if (!(flags & FAULT_FLAG_WRITE))
- return do_read_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
+ if (!(fe->flags & FAULT_FLAG_WRITE))
+ return do_read_fault(fe, pgoff); /* not a write fault */
if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return do_cow_fault(fe, pgoff); /* write fault, mapping without VM_SHARED */
+ return do_shared_fault(fe, pgoff); /* write fault, VM_SHARED mapping */
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3209,11 +3337,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+static int do_numa_page(struct fault_env *fe, pte_t pte)
{
+ struct vm_area_struct *vma = fe->vma;
struct page *page = NULL;
- spinlock_t *ptl;
int page_nid = -1;
int last_cpupid;
int target_nid;
@@ -3233,10 +3360,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- if (unlikely(!pte_same(*ptep, pte))) {
- pte_unmap_unlock(ptep, ptl);
+ fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
+ spin_lock(fe->ptl);
+ if (unlikely(!pte_same(*fe->pte, pte))) {
+ pte_unmap_unlock(fe->pte, fe->ptl);
goto out;
}
@@ -3245,18 +3372,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(mm, addr, ptep, pte);
- update_mmu_cache(vma, addr, ptep);
+ set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ update_mmu_cache(vma, fe->address, fe->pte);
- page = vm_normal_page(vma, addr, pte);
+ page = vm_normal_page(vma, fe->address, pte);
if (!page) {
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
@@ -3280,8 +3407,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
- pte_unmap_unlock(ptep, ptl);
+ target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ &flags);
+ pte_unmap_unlock(fe->pte, fe->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3301,24 +3429,29 @@ out:
return 0;
}
-static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags)
+static int create_huge_pmd(struct fault_env *fe)
{
+ struct vm_area_struct *vma = fe->vma; /* local shorthand; the checks below only look at the vma */
if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+ return do_huge_pmd_anonymous_page(fe); /* anonymous vma: the THP code gets the whole fault_env */
if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
+ fe->flags); /* ->pmd_fault() keeps its unpacked-argument signature */
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
- unsigned int flags)
+static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
{
- if (vma_is_anonymous(vma))
- return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
- if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ if (vma_is_anonymous(fe->vma))
+ return do_huge_pmd_wp_page(fe, orig_pmd);
+ if (fe->vma->vm_ops->pmd_fault)
+ return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
+ fe->flags);
+
+ /*
+ * COW handled on pte level: split pmd.
+ * Reached only for a non-anonymous vma without ->pmd_fault(); a
+ * VM_SHARED vma is not expected here (asserted below). After the
+ * split, VM_FAULT_FALLBACK is returned to the caller.
+ */
+ VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
+ split_huge_pmd(fe->vma, fe->pmd, fe->address);
+
return VM_FAULT_FALLBACK;
}
@@ -3331,59 +3464,79 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * concurrent faults).
*
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * The mmap_sem may have been released depending on flags and our return value.
+ * See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+static int handle_pte_fault(struct fault_env *fe)
{
pte_t entry;
- spinlock_t *ptl;
- /*
- * some architectures can have larger ptes than wordsize,
- * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
- * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
- * The code below just needs a consistent view for the ifs and
- * we later double check anyway with the ptl lock held. So here
- * a barrier will do.
- */
- entry = *pte;
- barrier();
- if (!pte_present(entry)) {
+ if (unlikely(pmd_none(*fe->pmd))) {
+ /*
+ * Leave __pte_alloc() until later: because vm_ops->fault may
+ * want to allocate huge page, and if we expose page table
+ * for an instant, it will be difficult to retract from
+ * concurrent faults and from rmap lookups.
+ */
+ fe->pte = NULL;
+ } else {
+ /* See comment in pte_alloc_one_map() */
+ if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge
+ * pmd from under us anymore at this point because we hold the
+ * mmap_sem read mode and khugepaged takes it in write mode.
+ * So now it's safe to run pte_offset_map().
+ */
+ fe->pte = pte_offset_map(fe->pmd, fe->address);
+
+ entry = *fe->pte;
+
+ /*
+ * some architectures can have larger ptes than wordsize,
+ * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+ * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
+ * atomic accesses. The code below just needs a consistent
+ * view for the ifs and we later double check anyway with the
+ * ptl lock held. So here a barrier will do.
+ */
+ barrier();
if (pte_none(entry)) {
- if (vma_is_anonymous(vma))
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
- else
- return do_fault(mm, vma, address, pte, pmd,
- flags, entry);
+ pte_unmap(fe->pte);
+ fe->pte = NULL;
}
- return do_swap_page(mm, vma, address,
- pte, pmd, flags, entry);
}
+ if (!fe->pte) {
+ if (vma_is_anonymous(fe->vma))
+ return do_anonymous_page(fe);
+ else
+ return do_fault(fe);
+ }
+
+ if (!pte_present(entry))
+ return do_swap_page(fe, entry);
+
if (pte_protnone(entry))
- return do_numa_page(mm, vma, address, entry, pte, pmd);
+ return do_numa_page(fe, entry);
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- if (unlikely(!pte_same(*pte, entry)))
+ fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
+ spin_lock(fe->ptl);
+ if (unlikely(!pte_same(*fe->pte, entry)))
goto unlock;
- if (flags & FAULT_FLAG_WRITE) {
+ if (fe->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(mm, vma, address,
- pte, pmd, ptl, entry);
+ return do_wp_page(fe, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(vma, address, pte);
+ if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
+ fe->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(fe->vma, fe->address, fe->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3391,11 +3544,11 @@ static int handle_pte_fault(struct mm_struct *mm,
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vma, address);
+ if (fe->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(fe->vma, fe->address);
}
unlock:
- pte_unmap_unlock(pte, ptl);
+ pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
@@ -3405,87 +3558,51 @@ unlock:
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
{
+ struct fault_env fe = {
+ .vma = vma,
+ .address = address,
+ .flags = flags,
+ }; /* members not named here start out zeroed; fe.pmd is filled in below */
+ struct mm_struct *mm = vma->vm_mm; /* the mm is now derived from the vma instead of being passed in */
pgd_t *pgd;
pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
- flags & FAULT_FLAG_INSTRUCTION,
- flags & FAULT_FLAG_REMOTE))
- return VM_FAULT_SIGSEGV;
-
- if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- pmd = pmd_alloc(mm, pud, address);
- if (!pmd)
+ fe.pmd = pmd_alloc(mm, pud, address);
+ if (!fe.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(mm, vma, address, pmd, flags);
+ if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = create_huge_pmd(&fe); /* on VM_FAULT_FALLBACK, continue with the pte-level path */
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *pmd;
+ pmd_t orig_pmd = *fe.pmd; /* snapshot; the helpers below receive this value */
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
-
if (pmd_protnone(orig_pmd))
- return do_huge_pmd_numa_page(mm, vma, address,
- orig_pmd, pmd);
+ return do_huge_pmd_numa_page(&fe, orig_pmd);
- if (dirty && !pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(mm, vma, address, pmd,
- orig_pmd, flags);
+ if ((fe.flags & FAULT_FLAG_WRITE) &&
+ !pmd_write(orig_pmd)) {
+ ret = wp_huge_pmd(&fe, orig_pmd); /* write fault on a read-only huge pmd */
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(mm, vma, address, pmd,
- orig_pmd, dirty);
+ huge_pmd_set_accessed(&fe, orig_pmd);
return 0;
}
}
}
- /*
- * Use pte_alloc() instead of pte_alloc_map, because we can't
- * run pte_offset_map on the pmd, if an huge pmd could
- * materialize from under us from a different thread.
- */
- if (unlikely(pte_alloc(mm, pmd, address)))
- return VM_FAULT_OOM;
- /*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
- * didn't become pmd_trans_huge under us and then back to pmd_none, as
- * a result of MADV_DONTNEED running immediately after a huge pmd fault
- * in a different thread of this mm, in turn leading to a misleading
- * pmd_trans_huge() retval. All we have to ensure is that it is a
- * regular pmd that we can walk with pte_offset_map() and we can do that
- * through an atomic read in C, which is what pmd_trans_unstable()
- * provides.
- */
- if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
- return 0;
- /*
- * A regular pmd is established and it can't morph into a huge pmd
- * from under us anymore at this point because we hold the mmap_sem
- * read mode and khugepaged takes it in write mode. So now it's
- * safe to run pte_offset_map().
- */
- pte = pte_offset_map(pmd, address);
-
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ return handle_pte_fault(&fe); /* the pte is no longer allocated or mapped here; handle_pte_fault() works from fe.pmd */
}
/*
@@ -3494,15 +3611,15 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
{
int ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
- mem_cgroup_count_vm_event(mm, PGFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
@@ -3514,7 +3631,15 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
- ret = __handle_mm_fault(mm, vma, address, flags);
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
+ else
+ ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e3cbdcaff2a5..82d0b98d27f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -449,6 +449,25 @@ out_fail:
return -1;
}
+static struct zone * __meminit move_pfn_range(int zone_shift,
+ unsigned long start_pfn, unsigned long end_pfn)
+{ /* Move the pfn range zone_shift zones away from its current zone; returns the resulting zone or NULL. */
+ struct zone *zone = page_zone(pfn_to_page(start_pfn)); /* zone currently holding start_pfn */
+ int ret = 0; /* stays 0 when zone_shift == 0: nothing is moved */
+
+ if (zone_shift < 0) /* target zone sits before the current one */
+ ret = move_pfn_range_left(zone + zone_shift, zone,
+ start_pfn, end_pfn);
+ else if (zone_shift) /* target zone sits after the current one */
+ ret = move_pfn_range_right(zone, zone + zone_shift,
+ start_pfn, end_pfn);
+
+ if (ret) /* the move helper returned non-zero */
+ return NULL;
+
+ return zone + zone_shift; /* equals the original zone when zone_shift == 0 */
+}
+
static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -1028,6 +1047,37 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
+int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+ enum zone_type target)
+{ /* Returns the zone index delta (target - current) if the range may be moved to @target, else 0. */
+ struct zone *zone = page_zone(pfn_to_page(pfn));
+ enum zone_type idx = zone_idx(zone); /* index of the zone currently holding pfn */
+ int i;
+
+ if (idx < target) {
+ /* pages must be at end of current zone */
+ if (pfn + nr_pages != zone_end_pfn(zone))
+ return 0;
+
+ /* no zones in use between current zone and target */
+ for (i = idx + 1; i < target; i++)
+ if (zone_is_initialized(zone - idx + i)) /* zone - idx + i addresses the zone with index i */
+ return 0;
+ }
+
+ if (target < idx) {
+ /* pages must be at beginning of current zone */
+ if (pfn != zone->zone_start_pfn)
+ return 0;
+
+ /* no zones in use between current zone and target */
+ for (i = target + 1; i < idx; i++)
+ if (zone_is_initialized(zone - idx + i))
+ return 0;
+ }
+
+ return target - idx; /* also 0 when idx == target, so 0 does not distinguish "no shift needed" from "cannot shift" */
+}
/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
@@ -1039,6 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int nid;
int ret;
struct memory_notify arg;
+ int zone_shift = 0;
/*
* This doesn't need a lock to do pfn_to_page().
@@ -1052,19 +1103,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
!can_online_high_movable(zone))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL &&
- zone_idx(zone) == ZONE_MOVABLE) {
- if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- return -EINVAL;
- }
- if (online_type == MMOP_ONLINE_MOVABLE &&
- zone_idx(zone) == ZONE_MOVABLE - 1) {
- if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- return -EINVAL;
- }
+ if (online_type == MMOP_ONLINE_KERNEL)
+ zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
+ else if (online_type == MMOP_ONLINE_MOVABLE)
+ zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
- /* Previous code may changed the zone of the pfn range */
- zone = page_zone(pfn_to_page(pfn));
+ zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
+ if (!zone)
+ return -EINVAL;
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 297d6854f849..53e40d3f3933 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,6 +512,8 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
}
}
+ if (pmd_trans_unstable(pmd))
+ return 0;
retry:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -529,7 +531,7 @@ retry:
nid = page_to_nid(page);
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
- if (PageTransCompound(page) && PageAnon(page)) {
+ if (PageTransCompound(page)) {
get_page(page);
pte_unmap_unlock(pte, ptl);
lock_page(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index bd3fdc202e8b..2232f6923cc7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -31,6 +31,7 @@
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
+#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
@@ -73,6 +74,81 @@ int migrate_prep_local(void)
return 0;
}
+bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+{ /* Try to isolate a movable page: true with a page reference held and PG_isolated set, false otherwise. */
+ struct address_space *mapping;
+
+ /*
+ * Avoid burning cycles with pages that are yet under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a movable page being freed under us and
+ * raise its refcount preventing __free_pages() from doing its job
+ * the put_page() at the end of this block will take care of
+ * releasing this page, thus avoiding a nasty leakage.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto out;
+
+ /*
+ * Check PageMovable before holding a PG_lock because page's owner
+ * assumes nobody touches PG_lock of a newly allocated page,
+ * so unconditionally grabbing the lock ruins page's owner side.
+ */
+ if (unlikely(!__PageMovable(page)))
+ goto out_putpage;
+ /*
+ * As movable pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the releasing of a page.
+ *
+ * In order to avoid having an already isolated movable page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released,
+ * let's be sure we have the page lock
+ * before proceeding with the movable page isolation steps.
+ */
+ if (unlikely(!trylock_page(page)))
+ goto out_putpage;
+
+ if (!PageMovable(page) || PageIsolated(page)) /* re-checked now that the page lock is held */
+ goto out_no_isolated;
+
+ mapping = page_mapping(page);
+ VM_BUG_ON_PAGE(!mapping, page);
+
+ if (!mapping->a_ops->isolate_page(page, mode)) /* the owner's callback refused the isolation */
+ goto out_no_isolated;
+
+ /* Driver shouldn't use PG_isolated bit of page->flags */
+ WARN_ON_ONCE(PageIsolated(page));
+ __SetPageIsolated(page);
+ unlock_page(page);
+
+ return true; /* the reference taken by get_page_unless_zero() is not dropped on this path */
+
+out_no_isolated:
+ unlock_page(page);
+out_putpage:
+ put_page(page);
+out:
+ return false;
+}
+
+/*
+ * Hand an isolated movable page back to its owner via
+ * mapping->a_ops->putback_page() and clear PG_isolated.
+ *
+ * It should be called on a page which is PG_movable; the page must also be
+ * locked and isolated (all three are asserted below).
+ */
+void putback_movable_page(struct page *page)
+{
+ struct address_space *mapping;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ mapping = page_mapping(page);
+ mapping->a_ops->putback_page(page);
+ __ClearPageIsolated(page); /* cleared only after the owner's callback has run */
+}
+
/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
@@ -94,10 +170,23 @@ void putback_movable_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(isolated_balloon_page(page)))
- balloon_page_putback(page);
- else
+ /*
+ * We isolated non-lru movable page so here we can use
+ * __PageMovable because LRU page's mapping cannot have
+ * PAGE_MAPPING_MOVABLE.
+ */
+ if (unlikely(__PageMovable(page))) {
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ lock_page(page);
+ if (PageMovable(page))
+ putback_movable_page(page);
+ else
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ put_page(page);
+ } else {
putback_lru_page(page);
+ }
}
}
@@ -170,7 +259,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
} else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr, false);
else
- page_add_file_rmap(new);
+ page_add_file_rmap(new, false);
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
@@ -594,7 +683,7 @@ EXPORT_SYMBOL(migrate_page_copy);
***********************************************************/
/*
- * Common logic to directly migrate a single page suitable for
+ * Common logic to directly migrate a single LRU page suitable for
* pages that do not use PagePrivate/PagePrivate2.
*
* Pages are locked upon entry and exit.
@@ -757,33 +846,72 @@ static int move_to_new_page(struct page *newpage, struct page *page,
enum migrate_mode mode)
{
struct address_space *mapping;
- int rc;
+ int rc = -EAGAIN;
+ bool is_lru = !__PageMovable(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
mapping = page_mapping(page);
- if (!mapping)
- rc = migrate_page(mapping, newpage, page, mode);
- else if (mapping->a_ops->migratepage)
+
+ if (likely(is_lru)) {
+ if (!mapping)
+ rc = migrate_page(mapping, newpage, page, mode);
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * provide a migratepage callback. Anonymous pages
+ * are part of swap space which also has its own
+ * migratepage callback. This is the most common path
+ * for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping, newpage,
+ page, mode);
+ else
+ rc = fallback_migrate_page(mapping, newpage,
+ page, mode);
+ } else {
/*
- * Most pages have a mapping and most filesystems provide a
- * migratepage callback. Anonymous pages are part of swap
- * space which also has its own migratepage callback. This
- * is the most common path for page migration.
+ * In case of non-lru page, it could be released after
+ * isolation step. In that case, we shouldn't try migration.
*/
- rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
- else
- rc = fallback_migrate_page(mapping, newpage, page, mode);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ if (!PageMovable(page)) {
+ rc = MIGRATEPAGE_SUCCESS;
+ __ClearPageIsolated(page);
+ goto out;
+ }
+
+ rc = mapping->a_ops->migratepage(mapping, newpage,
+ page, mode);
+ WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
+ !PageIsolated(page));
+ }
/*
* When successful, old pagecache page->mapping must be cleared before
* page is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (!PageAnon(page))
+ if (__PageMovable(page)) {
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ /*
+ * We clear PG_movable under page_lock so any compactor
+ * cannot try to migrate this page.
+ */
+ __ClearPageIsolated(page);
+ }
+
+ /*
+ * Anonymous and movable page->mapping will be cleared by
+ * free_pages_prepare so don't reset it here; keeping it lets
+ * type checks such as PageAnon continue to work.
+ */
+ if (!PageMappingFlags(page))
page->mapping = NULL;
}
+out:
return rc;
}
@@ -793,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
int rc = -EAGAIN;
int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
+ bool is_lru = !__PageMovable(page);
if (!trylock_page(page)) {
if (!force || mode == MIGRATE_ASYNC)
@@ -861,15 +990,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (unlikely(!trylock_page(newpage)))
goto out_unlock;
- if (unlikely(isolated_balloon_page(page))) {
- /*
- * A ballooned page does not need any special attention from
- * physical to virtual reverse mapping procedures.
- * Skip any attempt to unmap PTEs or to remap swap cache,
- * in order to avoid burning cycles at rmap level, and perform
- * the page migration right away (proteced by page lock).
- */
- rc = balloon_page_migrate(newpage, page, mode);
+ if (unlikely(!is_lru)) {
+ rc = move_to_new_page(newpage, page, mode);
goto out_unlock_both;
}
@@ -915,6 +1037,19 @@ out_unlock:
put_anon_vma(anon_vma);
unlock_page(page);
out:
+ /*
+ * If migration is successful, decrease the refcount of newpage;
+ * this will not free the page because the new page owner has
+ * increased the refcount. If it is an LRU page, also add the
+ * page to the LRU list here.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ if (unlikely(__PageMovable(newpage)))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+
return rc;
}
@@ -948,6 +1083,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ if (unlikely(__PageMovable(page))) {
+ lock_page(page);
+ if (!PageMovable(page))
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ }
+ if (put_new_page)
+ put_new_page(newpage, private);
+ else
+ put_page(newpage);
goto out;
}
@@ -960,10 +1107,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
}
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS) {
- put_new_page = NULL;
+ if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
- }
out:
if (rc != -EAGAIN) {
@@ -976,33 +1121,45 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- /* Soft-offlined page shouldn't go through lru cache list */
- if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) {
+ }
+
+ /*
+ * If migration is successful, release the reference grabbed
+ * during isolation. Otherwise, restore the page to the right
+ * list unless we want to retry.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ put_page(page);
+ if (reason == MR_MEMORY_FAILURE) {
/*
- * With this release, we free successfully migrated
- * page and set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird, it's how
- * HWPoison flag works at the moment.
+ * Set PG_HWPoison on just freed page
+ * intentionally. Although it's rather weird,
+ * it's how HWPoison flag works at the moment.
*/
- put_page(page);
if (!test_set_page_hwpoison(page))
num_poisoned_pages_inc();
- } else
- putback_lru_page(page);
- }
+ }
+ } else {
+ if (rc != -EAGAIN) {
+ if (likely(!__PageMovable(page))) {
+ putback_lru_page(page);
+ goto put_new;
+ }
- /*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, putback_lru_page() will drop the reference grabbed
- * during isolation.
- */
- if (put_new_page)
- put_new_page(newpage, private);
- else if (unlikely(__is_movable_balloon_page(newpage))) {
- /* drop our reference, page already in the balloon */
- put_page(newpage);
- } else
- putback_lru_page(newpage);
+ lock_page(page);
+ if (PageMovable(page))
+ putback_movable_page(page);
+ else
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ put_page(page);
+ }
+put_new:
+ if (put_new_page)
+ put_new_page(newpage, private);
+ else
+ put_page(newpage);
+ }
if (result) {
if (rc)
@@ -1829,8 +1986,7 @@ fail_putback:
}
orig_entry = *pmd;
- entry = mk_pmd(new_page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 234edffec1d0..86b18f334f4f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -25,6 +25,7 @@
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
@@ -675,6 +676,8 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ vma_adjust_trans_huge(vma, start, end, adjust_next);
+
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
@@ -695,8 +698,6 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
- vma_adjust_trans_huge(vma, start, end, adjust_next);
-
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
@@ -1897,8 +1898,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
- if (file && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
+ if (file) {
+ if (file->f_op->get_unmapped_area)
+ get_area = file->f_op->get_unmapped_area;
+ } else if (flags & MAP_SHARED) {
+ /*
+ * mmap_region() will call shmem_zero_setup() to create a file,
+ * so use shmem's get_unmapped_area in case it can be huge.
+ * do_mmap_pgoff() will clear pgoff, so match alignment.
+ */
+ pgoff = 0;
+ get_area = shmem_get_unmapped_area;
+ }
+
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
@@ -2591,6 +2603,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
/* drop PG_Mlocked flag for over-mapped range */
for (tmp = vma; tmp->vm_start >= start + size;
tmp = tmp->vm_next) {
+ /*
+ * Split pmd and munlock page on the border
+ * of the range.
+ */
+ vma_adjust_trans_huge(tmp, start, start + size, 0);
+
munlock_vma_pages_range(tmp,
max(tmp->vm_start, start),
min(tmp->vm_end, start + size));
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5019a1ef2848..a4830f0325fe 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -163,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
split_huge_pmd(vma, pmd, addr);
- if (pmd_none(*pmd))
+ if (pmd_trans_unstable(pmd))
continue;
} else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
diff --git a/mm/mremap.c b/mm/mremap.c
index 1f157adfdaf9..da22ad2a5678 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,9 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
}
}
split_huge_pmd(vma, old_pmd, old_addr);
- if (pmd_none(*old_pmd))
+ if (pmd_trans_unstable(old_pmd))
continue;
- VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
break;
diff --git a/mm/nommu.c b/mm/nommu.c
index c2e58880207f..95daf81a4855 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+void filemap_map_pages(struct fault_env *fe,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ddf74487f848..d4a929d79470 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,7 +274,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
#endif
enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
- struct task_struct *task, unsigned long totalpages)
+ struct task_struct *task)
{
if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;
@@ -311,7 +311,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
for_each_process(p) {
unsigned int points;
- switch (oom_scan_process_thread(oc, p, totalpages)) {
+ switch (oom_scan_process_thread(oc, p)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}
-static void dump_header(struct oom_control *oc, struct task_struct *p,
- struct mem_cgroup *memcg)
+static void dump_header(struct oom_control *oc, struct task_struct *p)
{
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
@@ -392,12 +391,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
cpuset_print_current_mems_allowed();
dump_stack();
- if (memcg)
- mem_cgroup_print_oom_info(memcg, p);
+ if (oc->memcg)
+ mem_cgroup_print_oom_info(oc->memcg, p);
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, oc->nodemask);
+ dump_tasks(oc->memcg, oc->nodemask);
}
/*
@@ -453,7 +452,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
* __oom_reap_task exit_mm
- * atomic_inc_not_zero
+ * mmget_not_zero
* mmput
* atomic_dec_and_test
* exit_oom_victim
@@ -475,12 +474,22 @@ static bool __oom_reap_task(struct task_struct *tsk)
if (!p)
goto unlock_oom;
mm = p->mm;
- atomic_inc(&mm->mm_users);
+ atomic_inc(&mm->mm_count);
task_unlock(p);
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
- goto unlock_oom;
+ goto mm_drop;
+ }
+
+ /*
+ * Increase mm_users only after we know we will reap something, so
+ * that mmput_async is called only when we have reaped something
+ * and the delayed __mmput doesn't matter that much.
+ */
+ if (!mmget_not_zero(mm)) {
+ up_read(&mm->mmap_sem);
+ goto mm_drop;
}
tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -522,15 +531,16 @@ static bool __oom_reap_task(struct task_struct *tsk)
* to release its memory.
*/
set_bit(MMF_OOM_REAPED, &mm->flags);
-unlock_oom:
- mutex_unlock(&oom_lock);
/*
* Drop our reference but make sure the mmput slow path is called from a
* different context because we shouldn't risk we get stuck there and
* put the oom_reaper out of the way.
*/
- if (mm)
- mmput_async(mm);
+ mmput_async(mm);
+mm_drop:
+ mmdrop(mm);
+unlock_oom:
+ mutex_unlock(&oom_lock);
return ret;
}
@@ -739,7 +749,7 @@ void oom_killer_enable(void)
*/
void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, const char *message)
+ const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -765,7 +775,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
task_unlock(p);
if (__ratelimit(&oom_rs))
- dump_header(oc, p, memcg);
+ dump_header(oc, p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
@@ -786,8 +796,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, oc->nodemask,
- totalpages);
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
victim = child;
@@ -865,8 +875,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
- struct mem_cgroup *memcg)
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -882,7 +891,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
/* Do not panic for oom kills triggered by sysrq */
if (is_sysrq_oom(oc))
return;
- dump_header(oc, NULL, memcg);
+ dump_header(oc, NULL);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -957,13 +966,13 @@ bool out_of_memory(struct oom_control *oc)
constraint = constrained_alloc(oc, &totalpages);
if (constraint != CONSTRAINT_MEMORY_POLICY)
oc->nodemask = NULL;
- check_panic_on_oom(oc, constraint, NULL);
+ check_panic_on_oom(oc, constraint);
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(oc, current, 0, totalpages, NULL,
+ oom_kill_process(oc, current, 0, totalpages,
"Out of memory (oom_kill_allocating_task)");
return true;
}
@@ -971,12 +980,11 @@ bool out_of_memory(struct oom_control *oc)
p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p && !is_sysrq_oom(oc)) {
- dump_header(oc, NULL, NULL);
+ dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
if (p && p != (void *)-1UL) {
- oom_kill_process(oc, p, points, totalpages, NULL,
- "Out of memory");
+ oom_kill_process(oc, p, points, totalpages, "Out of memory");
/*
* Give the killed process a good chance to exit before trying
* to allocate memory again.
@@ -988,14 +996,15 @@ bool out_of_memory(struct oom_control *oc)
/*
* The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
- * parallel oom killing is already in progress so do nothing.
+ * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
+ * killing is already in progress so do nothing.
*/
void pagefault_out_of_memory(void)
{
struct oom_control oc = {
.zonelist = NULL,
.nodemask = NULL,
+ .memcg = NULL,
.gfp_mask = 0,
.order = 0,
};
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e2481949494c..d578d2a56b19 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2563,6 +2563,7 @@ int set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ page = compound_head(page);
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
/*
@@ -2747,6 +2748,11 @@ int test_clear_page_writeback(struct page *page)
__wb_writeout_inc(wb);
}
}
+
+ if (mapping->host && !mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK))
+ sb_clear_inode_writeback(mapping->host);
+
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
@@ -2774,11 +2780,24 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
if (!ret) {
+ bool on_wblist;
+
+ on_wblist = mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK);
+
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
__inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+
+ /*
+ * We can come through here when swapping anonymous
+ * pages, so we don't necessarily have an inode to track
+ * for sync.
+ */
+ if (mapping->host && !on_wblist)
+ sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8b3e1341b754..452513bf02ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -63,6 +63,7 @@
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
+#include <linux/memcontrol.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -1006,6 +1007,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+ if (compound)
+ ClearPageDoubleMap(page);
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
@@ -1016,8 +1019,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageAnonHead(page))
+ if (PageMappingFlags(page))
page->mapping = NULL;
+ if (memcg_kmem_enabled() && PageKmemcg(page)) {
+ memcg_kmem_uncharge(page, order);
+ __ClearPageKmemcg(page);
+ }
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1724,6 +1731,19 @@ static bool check_new_pages(struct page *page, unsigned int order)
return false;
}
+inline void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
+{
+ set_page_private(page, 0);
+ set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
+ kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
+ kasan_alloc_pages(page, order);
+ set_page_owner(page, order, gfp_flags);
+}
+
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
@@ -1736,13 +1756,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
poisoned &= page_is_poisoned(p);
}
- set_page_private(page, 0);
- set_page_refcounted(page);
-
- arch_alloc_page(page, order);
- kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
- kasan_alloc_pages(page, order);
+ post_alloc_hook(page, order, gfp_flags);
if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
@@ -1751,8 +1765,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
- set_page_owner(page, order, gfp_flags);
-
/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
@@ -2461,7 +2473,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
- gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2475,12 +2486,9 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- gfp_mask = get_page_owner_gfp(page);
- set_page_owner(page, 0, gfp_mask);
- for (i = 1; i < (1 << order); i++) {
+ for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, gfp_mask);
- }
+ split_page_owner(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2509,8 +2517,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
- set_page_owner(page, order, __GFP_MOVABLE);
-
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
@@ -2527,33 +2533,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
- */
-int split_free_page(struct page *page)
-{
- unsigned int order;
- int nr_pages;
-
- order = page_order(page);
-
- nr_pages = __isolate_free_page(page, order);
- if (!nr_pages)
- return 0;
-
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
- return nr_pages;
-}
-
-/*
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
@@ -3105,6 +3084,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct oom_control oc = {
.zonelist = ac->zonelist,
.nodemask = ac->nodemask,
+ .memcg = NULL,
.gfp_mask = gfp_mask,
.order = order,
};
@@ -3868,6 +3848,14 @@ no_zone:
}
out:
+ if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) {
+ if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) {
+ __free_pages(page, order);
+ page = NULL;
+ } else
+ __SetPageKmemcg(page);
+ }
+
if (kmemcheck_enabled && page)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
@@ -4023,56 +4011,6 @@ void __free_page_frag(void *addr)
}
EXPORT_SYMBOL(__free_page_frag);
-/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
- * equivalent to alloc_pages.
- *
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *page;
-
- page = alloc_pages(gfp_mask, order);
- if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
- __free_pages(page, order);
- page = NULL;
- }
- return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
- struct page *page;
-
- page = alloc_pages_node(nid, gfp_mask, order);
- if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
- __free_pages(page, order);
- page = NULL;
- }
- return page;
-}
-
-/*
- * __free_kmem_pages and free_kmem_pages will free pages allocated with
- * alloc_kmem_pages.
- */
-void __free_kmem_pages(struct page *page, unsigned int order)
-{
- memcg_kmem_uncharge(page, order);
- __free_pages(page, order);
-}
-
-void free_kmem_pages(unsigned long addr, unsigned int order)
-{
- if (addr != 0) {
- VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_kmem_pages(virt_to_page((void *)addr), order);
- }
-}
-
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
@@ -4374,6 +4312,9 @@ void show_free_areas(unsigned int filter)
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
+#endif
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
@@ -4391,6 +4332,11 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
+ global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
+ global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
+#endif
global_page_state(NR_FREE_PAGES),
free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
@@ -4425,6 +4371,11 @@ void show_free_areas(unsigned int filter)
" writeback:%lukB"
" mapped:%lukB"
" shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
@@ -4457,6 +4408,12 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_WRITEBACK)),
K(zone_page_state(zone, NR_FILE_MAPPED)),
K(zone_page_state(zone, NR_SHMEM)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+ K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
+ * HPAGE_PMD_NR),
+ K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK) *
@@ -6467,15 +6424,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
- arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
- arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
- for (i = 1; i < MAX_NR_ZONES; i++) {
+
+ start_pfn = find_min_pfn_with_active_regions();
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- arch_zone_lowest_possible_pfn[i] =
- arch_zone_highest_possible_pfn[i-1];
- arch_zone_highest_possible_pfn[i] =
- max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+
+ end_pfn = max(max_zone_pfn[i], start_pfn);
+ arch_zone_lowest_possible_pfn[i] = start_pfn;
+ arch_zone_highest_possible_pfn[i] = end_pfn;
+
+ start_pfn = end_pfn;
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 612122bf6a42..064b7fb6e0b5 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -7,6 +7,7 @@
#include <linux/pageblock-flags.h>
#include <linux/memory.h>
#include <linux/hugetlb.h>
+#include <linux/page_owner.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -80,7 +81,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
- struct page *isolated_page = NULL;
+ bool isolated_page = false;
unsigned int order;
unsigned long page_idx, buddy_idx;
struct page *buddy;
@@ -108,9 +109,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (pfn_valid_within(page_to_pfn(buddy)) &&
!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
- kernel_map_pages(page, (1 << order), 1);
- set_page_refcounted(page);
- isolated_page = page;
+ isolated_page = true;
}
}
}
@@ -128,8 +127,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (isolated_page)
- __free_pages(isolated_page, order);
+ if (isolated_page) {
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ __free_pages(page, order);
+ }
}
static inline struct page *
diff --git a/mm/page_owner.c b/mm/page_owner.c
index fedeba88c9cb..ec6dc1886f71 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -7,11 +7,22 @@
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
+#include <linux/stackdepot.h>
+
#include "internal.h"
+/*
+ * TODO: teach the PAGE_OWNER_STACK_DEPTH users (__dump_page_owner and
+ * save_stack) to use off-stack temporary storage
+ */
+#define PAGE_OWNER_STACK_DEPTH (16)
+
static bool page_owner_disabled = true;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
+static depot_stack_handle_t dummy_handle;
+static depot_stack_handle_t failure_handle;
+
static void init_early_allocated_pages(void);
static int early_page_owner_param(char *buf)
@@ -34,11 +45,41 @@ static bool need_page_owner(void)
return true;
}
+static noinline void register_dummy_stack(void)
+{
+ unsigned long entries[4];
+ struct stack_trace dummy;
+
+ dummy.nr_entries = 0;
+ dummy.max_entries = ARRAY_SIZE(entries);
+ dummy.entries = &entries[0];
+ dummy.skip = 0;
+
+ save_stack_trace(&dummy);
+ dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+}
+
+static noinline void register_failure_stack(void)
+{
+ unsigned long entries[4];
+ struct stack_trace failure;
+
+ failure.nr_entries = 0;
+ failure.max_entries = ARRAY_SIZE(entries);
+ failure.entries = &entries[0];
+ failure.skip = 0;
+
+ save_stack_trace(&failure);
+ failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+}
+
static void init_page_owner(void)
{
if (page_owner_disabled)
return;
+ register_dummy_stack();
+ register_failure_stack();
static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -61,25 +102,66 @@ void __reset_page_owner(struct page *page, unsigned int order)
}
}
-void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
+static inline bool check_recursive_alloc(struct stack_trace *trace,
+ unsigned long ip)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ int i, count;
+
+ if (!trace->nr_entries)
+ return false;
+
+ for (i = 0, count = 0; i < trace->nr_entries; i++) {
+ if (trace->entries[i] == ip && ++count == 2)
+ return true;
+ }
+
+ return false;
+}
+static noinline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
.nr_entries = 0,
- .max_entries = ARRAY_SIZE(page_ext->trace_entries),
- .entries = &page_ext->trace_entries[0],
- .skip = 3,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
};
+ depot_stack_handle_t handle;
+
+ save_stack_trace(&trace);
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ /*
+ * We need to check recursion here because our request to stackdepot
+ * could trigger memory allocation to save new entry. New memory
+ * allocation would reach here and call depot_save_stack() again
+ * if we don't catch it. There is still not enough memory in stackdepot
+ * so it would try to allocate memory again and loop forever.
+ */
+ if (check_recursive_alloc(&trace, _RET_IP_))
+ return dummy_handle;
+
+ handle = depot_save_stack(&trace, flags);
+ if (!handle)
+ handle = failure_handle;
+
+ return handle;
+}
+
+noinline void __set_page_owner(struct page *page, unsigned int order,
+ gfp_t gfp_mask)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
- save_stack_trace(&trace);
-
+ page_ext->handle = save_stack(gfp_mask);
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
- page_ext->nr_entries = trace.nr_entries;
page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
@@ -94,34 +176,31 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_ext->last_migrate_reason = reason;
}
-gfp_t __get_page_owner_gfp(struct page *page)
+void __split_page_owner(struct page *page, unsigned int order)
{
+ int i;
struct page_ext *page_ext = lookup_page_ext(page);
+
if (unlikely(!page_ext))
- /*
- * The caller just returns 0 if no valid gfp
- * So return 0 here too.
- */
- return 0;
+ return;
- return page_ext->gfp_mask;
+ page_ext->order = 0;
+ for (i = 1; i < (1 << order); i++)
+ __copy_page_owner(page, page + i);
}
void __copy_page_owner(struct page *oldpage, struct page *newpage)
{
struct page_ext *old_ext = lookup_page_ext(oldpage);
struct page_ext *new_ext = lookup_page_ext(newpage);
- int i;
if (unlikely(!old_ext || !new_ext))
return;
new_ext->order = old_ext->order;
new_ext->gfp_mask = old_ext->gfp_mask;
- new_ext->nr_entries = old_ext->nr_entries;
-
- for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
- new_ext->trace_entries[i] = old_ext->trace_entries[i];
+ new_ext->last_migrate_reason = old_ext->last_migrate_reason;
+ new_ext->handle = old_ext->handle;
/*
* We don't clear the bit on the oldpage as it's going to be freed
@@ -137,14 +216,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
- struct page *page, struct page_ext *page_ext)
+ struct page *page, struct page_ext *page_ext,
+ depot_stack_handle_t handle)
{
int ret;
int pageblock_mt, page_mt;
char *kbuf;
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
- .nr_entries = page_ext->nr_entries,
- .entries = &page_ext->trace_entries[0],
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
};
kbuf = kmalloc(count, GFP_KERNEL);
@@ -173,6 +256,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
+ depot_fetch_stack(handle, &trace);
ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
if (ret >= count)
goto err;
@@ -203,10 +287,14 @@ err:
void __dump_page_owner(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
- .nr_entries = page_ext->nr_entries,
- .entries = &page_ext->trace_entries[0],
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
};
+ depot_stack_handle_t handle;
gfp_t gfp_mask;
int mt;
@@ -222,6 +310,13 @@ void __dump_page_owner(struct page *page)
return;
}
+ handle = READ_ONCE(page_ext->handle);
+ if (!handle) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ depot_fetch_stack(handle, &trace);
pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
print_stack_trace(&trace, 0);
@@ -237,6 +332,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long pfn;
struct page *page;
struct page_ext *page_ext;
+ depot_stack_handle_t handle;
if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
@@ -285,10 +381,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ /*
+ * Access to page_ext->handle isn't synchronized, so we must
+ * be careful when accessing it.
+ */
+ handle = READ_ONCE(page_ext->handle);
+ if (!handle)
+ continue;
+
/* Record the next PFN to read in the file offset */
*ppos = (pfn - min_low_pfn) + 1;
- return print_page_owner(buf, count, pfn, page, page_ext);
+ return print_page_owner(buf, count, pfn, page,
+ page_ext, handle);
}
return 0;
diff --git a/mm/readahead.c b/mm/readahead.c
index 40be3ae0afe3..65ec288dc057 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -89,7 +89,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
page = lru_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- mapping_gfp_constraint(mapping, GFP_KERNEL))) {
+ readahead_gfp_mask(mapping))) {
read_cache_pages_invalidate_page(mapping, page);
continue;
}
@@ -108,7 +108,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
static int read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned nr_pages)
+ struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
{
struct blk_plug plug;
unsigned page_idx;
@@ -126,10 +126,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = lru_to_page(pages);
list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index,
- mapping_gfp_constraint(mapping, GFP_KERNEL))) {
+ if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
mapping->a_ops->readpage(filp, page);
- }
put_page(page);
}
ret = 0;
@@ -159,6 +157,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
int page_idx;
int ret = 0;
loff_t isize = i_size_read(inode);
+ gfp_t gfp_mask = readahead_gfp_mask(mapping);
if (isize == 0)
goto out;
@@ -180,7 +179,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (page && !radix_tree_exceptional_entry(page))
continue;
- page = page_cache_alloc_readahead(mapping);
+ page = __page_cache_alloc(gfp_mask);
if (!page)
break;
page->index = page_offset;
@@ -196,7 +195,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
* will then handle the error.
*/
if (ret)
- read_pages(mapping, filp, &page_pool, ret);
+ read_pages(mapping, filp, &page_pool, ret, gfp_mask);
BUG_ON(!list_empty(&page_pool));
out:
return ret;
diff --git a/mm/rmap.c b/mm/rmap.c
index 701b93fea2a0..8a13d9f7b566 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1212,10 +1212,8 @@ void do_page_add_anon_rmap(struct page *page,
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
- if (compound) {
- __inc_zone_page_state(page,
- NR_ANON_TRANSPARENT_HUGEPAGES);
- }
+ if (compound)
+ __inc_zone_page_state(page, NR_ANON_THPS);
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
}
if (unlikely(PageKsm(page)))
@@ -1253,7 +1251,7 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __inc_zone_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1270,18 +1268,42 @@ void page_add_new_anon_rmap(struct page *page,
*
* The caller needs to hold the pte lock.
*/
-void page_add_file_rmap(struct page *page)
+void page_add_file_rmap(struct page *page, bool compound)
{
+ int i, nr = 1;
+
+ VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
- if (atomic_inc_and_test(&page->_mapcount)) {
- __inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ if (compound && PageTransHuge(page)) {
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_inc_and_test(&page[i]._mapcount))
+ nr++;
+ }
+ if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
+ goto out;
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+ } else {
+ if (PageTransCompound(page)) {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ SetPageDoubleMap(compound_head(page));
+ if (PageMlocked(page))
+ clear_page_mlock(compound_head(page));
+ }
+ if (!atomic_inc_and_test(&page->_mapcount))
+ goto out;
}
+ __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+out:
unlock_page_memcg(page);
}
-static void page_remove_file_rmap(struct page *page)
+static void page_remove_file_rmap(struct page *page, bool compound)
{
+ int i, nr = 1;
+
+ VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
@@ -1292,15 +1314,26 @@ static void page_remove_file_rmap(struct page *page)
}
/* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ if (compound && PageTransHuge(page)) {
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ goto out;
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+ } else {
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+ }
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __dec_zone_page_state(page, NR_FILE_MAPPED);
+ __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr);
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
@@ -1323,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __dec_zone_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/*
@@ -1356,11 +1389,8 @@ static void page_remove_anon_compound_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page, bool compound)
{
- if (!PageAnon(page)) {
- VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
- page_remove_file_rmap(page);
- return;
- }
+ if (!PageAnon(page))
+ return page_remove_file_rmap(page, compound);
if (compound)
return page_remove_anon_compound_rmap(page);
@@ -1436,8 +1466,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
if (!(flags & TTU_IGNORE_MLOCK)) {
if (vma->vm_flags & VM_LOCKED) {
- /* Holding pte lock, we do *not* need mmap_sem here */
- mlock_vma_page(page);
+ /* PTE-mapped THP are never mlocked */
+ if (!PageTransCompound(page)) {
+ /*
+ * Holding pte lock, we do *not* need
+ * mmap_sem here
+ */
+ mlock_vma_page(page);
+ }
ret = SWAP_MLOCK;
goto out_unmap;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 171dee7a131f..62e42c7d544c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -32,6 +32,7 @@
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
+#include <linux/khugepaged.h>
static struct vfsmount *shm_mnt;
@@ -97,14 +98,6 @@ struct shmem_falloc {
pgoff_t nr_unswapped; /* how often writepage refused to swap out */
};
-/* Flag allocation requirements to shmem_getpage */
-enum sgp_type {
- SGP_READ, /* don't exceed i_size, don't allocate page */
- SGP_CACHE, /* don't exceed i_size, may allocate page */
- SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
- SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
-};
-
#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
@@ -124,7 +117,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
-static inline int shmem_getpage(struct inode *inode, pgoff_t index,
+int shmem_getpage(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp)
{
return shmem_getpage_gfp(inode, index, pagep, sgp,
@@ -173,10 +166,13 @@ static inline int shmem_reacct_size(unsigned long flags,
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
-static inline int shmem_acct_block(unsigned long flags)
+static inline int shmem_acct_block(unsigned long flags, long pages)
{
- return (flags & VM_NORESERVE) ?
- security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0;
+ if (!(flags & VM_NORESERVE))
+ return 0;
+
+ return security_vm_enough_memory_mm(current->mm,
+ pages * VM_ACCT(PAGE_SIZE));
}
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -192,6 +188,7 @@ static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
+static struct file_system_type shmem_fs_type;
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -249,6 +246,53 @@ static void shmem_recalc_inode(struct inode *inode)
}
}
+bool shmem_charge(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ unsigned long flags;
+
+ if (shmem_acct_block(info->flags, pages))
+ return false;
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced += pages;
+ inode->i_blocks += pages * BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+ inode->i_mapping->nrpages += pages;
+
+ if (!sbinfo->max_blocks)
+ return true;
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - pages) > 0) {
+ inode->i_mapping->nrpages -= pages;
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced -= pages;
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+
+ return false;
+ }
+ percpu_counter_add(&sbinfo->used_blocks, pages);
+ return true;
+}
+
+void shmem_uncharge(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ unsigned long flags;
+
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced -= pages;
+ inode->i_blocks -= pages * BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+
+ if (sbinfo->max_blocks)
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
+}
+
/*
* Replace item expected in radix tree by a new item, while holding tree lock.
*/
@@ -289,36 +333,256 @@ static bool shmem_confirm_swap(struct address_space *mapping,
}
/*
+ * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
+ *
+ * SHMEM_HUGE_NEVER:
+ * disables huge pages for the mount;
+ * SHMEM_HUGE_ALWAYS:
+ * enables huge pages for the mount;
+ * SHMEM_HUGE_WITHIN_SIZE:
+ * only allocate huge pages if the page will be fully within i_size,
+ * also respect fadvise()/madvise() hints;
+ * SHMEM_HUGE_ADVISE:
+ * only allocate huge pages if requested with fadvise()/madvise();
+ */
+
+#define SHMEM_HUGE_NEVER 0
+#define SHMEM_HUGE_ALWAYS 1
+#define SHMEM_HUGE_WITHIN_SIZE 2
+#define SHMEM_HUGE_ADVISE 3
+
+/*
+ * Special values.
+ * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
+ *
+ * SHMEM_HUGE_DENY:
+ * disables huge on shm_mnt and all mounts, for emergency use;
+ * SHMEM_HUGE_FORCE:
+ * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
+ *
+ */
+#define SHMEM_HUGE_DENY (-1)
+#define SHMEM_HUGE_FORCE (-2)
+
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+/* ifdef here to avoid bloating shmem.o when not necessary */
+
+int shmem_huge __read_mostly;
+
+static int shmem_parse_huge(const char *str)
+{
+ if (!strcmp(str, "never"))
+ return SHMEM_HUGE_NEVER;
+ if (!strcmp(str, "always"))
+ return SHMEM_HUGE_ALWAYS;
+ if (!strcmp(str, "within_size"))
+ return SHMEM_HUGE_WITHIN_SIZE;
+ if (!strcmp(str, "advise"))
+ return SHMEM_HUGE_ADVISE;
+ if (!strcmp(str, "deny"))
+ return SHMEM_HUGE_DENY;
+ if (!strcmp(str, "force"))
+ return SHMEM_HUGE_FORCE;
+ return -EINVAL;
+}
+
+static const char *shmem_format_huge(int huge)
+{
+ switch (huge) {
+ case SHMEM_HUGE_NEVER:
+ return "never";
+ case SHMEM_HUGE_ALWAYS:
+ return "always";
+ case SHMEM_HUGE_WITHIN_SIZE:
+ return "within_size";
+ case SHMEM_HUGE_ADVISE:
+ return "advise";
+ case SHMEM_HUGE_DENY:
+ return "deny";
+ case SHMEM_HUGE_FORCE:
+ return "force";
+ default:
+ VM_BUG_ON(1);
+ return "bad_val";
+ }
+}
+
+static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
+ struct shrink_control *sc, unsigned long nr_to_split)
+{
+ LIST_HEAD(list), *pos, *next;
+ struct inode *inode;
+ struct shmem_inode_info *info;
+ struct page *page;
+ unsigned long batch = sc ? sc->nr_to_scan : 128;
+ int removed = 0, split = 0;
+
+ if (list_empty(&sbinfo->shrinklist))
+ return SHRINK_STOP;
+
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_for_each_safe(pos, next, &sbinfo->shrinklist) {
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+
+ /* pin the inode */
+ inode = igrab(&info->vfs_inode);
+
+ /* inode is about to be evicted */
+ if (!inode) {
+ list_del_init(&info->shrinklist);
+ removed++;
+ goto next;
+ }
+
+ /* Check if there's anything to gain */
+ if (round_up(inode->i_size, PAGE_SIZE) ==
+ round_up(inode->i_size, HPAGE_PMD_SIZE)) {
+ list_del_init(&info->shrinklist);
+ removed++;
+ iput(inode);
+ goto next;
+ }
+
+ list_move(&info->shrinklist, &list);
+next:
+ if (!--batch)
+ break;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+
+ list_for_each_safe(pos, next, &list) {
+ int ret;
+
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+
+ if (nr_to_split && split >= nr_to_split) {
+ iput(inode);
+ continue;
+ }
+
+ page = find_lock_page(inode->i_mapping,
+ (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
+ if (!page)
+ goto drop;
+
+ if (!PageTransHuge(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto drop;
+ }
+
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (ret) {
+ /* split failed: leave it on the list */
+ iput(inode);
+ continue;
+ }
+
+ split++;
+drop:
+ list_del_init(&info->shrinklist);
+ removed++;
+ iput(inode);
+ }
+
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_splice_tail(&list, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len -= removed;
+ spin_unlock(&sbinfo->shrinklist_lock);
+
+ return split;
+}
+
+static long shmem_unused_huge_scan(struct super_block *sb,
+ struct shrink_control *sc)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ if (!READ_ONCE(sbinfo->shrinklist_len))
+ return SHRINK_STOP;
+
+ return shmem_unused_huge_shrink(sbinfo, sc, 0);
+}
+
+static long shmem_unused_huge_count(struct super_block *sb,
+ struct shrink_control *sc)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ return READ_ONCE(sbinfo->shrinklist_len);
+}
+#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+
+#define shmem_huge SHMEM_HUGE_DENY
+
+static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
+ struct shrink_control *sc, unsigned long nr_to_split)
+{
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+
+/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
pgoff_t index, void *expected)
{
- int error;
+ int error, nr = hpage_nr_pages(page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(index != round_down(index, nr), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON(expected && PageTransHuge(page));
- get_page(page);
+ page_ref_add(page, nr);
page->mapping = mapping;
page->index = index;
spin_lock_irq(&mapping->tree_lock);
- if (!expected)
+ if (PageTransHuge(page)) {
+ void __rcu **results;
+ pgoff_t idx;
+ int i;
+
+ error = 0;
+ if (radix_tree_gang_lookup_slot(&mapping->page_tree,
+ &results, &idx, index, 1) &&
+ idx < index + HPAGE_PMD_NR) {
+ error = -EEXIST;
+ }
+
+ if (!error) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ error = radix_tree_insert(&mapping->page_tree,
+ index + i, page + i);
+ VM_BUG_ON(error);
+ }
+ count_vm_event(THP_FILE_ALLOC);
+ }
+ } else if (!expected) {
error = radix_tree_insert(&mapping->page_tree, index, page);
- else
+ } else {
error = shmem_radix_tree_replace(mapping, index, expected,
page);
+ }
+
if (!error) {
- mapping->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- __inc_zone_page_state(page, NR_SHMEM);
+ mapping->nrpages += nr;
+ if (PageTransHuge(page))
+ __inc_zone_page_state(page, NR_SHMEM_THPS);
+ __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr);
+ __mod_zone_page_state(page_zone(page), NR_SHMEM, nr);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
- put_page(page);
+ page_ref_sub(page, nr);
}
return error;
}
@@ -331,6 +595,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
struct address_space *mapping = page->mapping;
int error;
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+
spin_lock_irq(&mapping->tree_lock);
error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
page->mapping = NULL;
@@ -510,10 +776,33 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
}
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
+
if (!trylock_page(page))
continue;
+
+ if (PageTransTail(page)) {
+ /* Middle of THP: zero out the page */
+ clear_highpage(page);
+ unlock_page(page);
+ continue;
+ } else if (PageTransHuge(page)) {
+ if (index == round_down(end, HPAGE_PMD_NR)) {
+ /*
+ * Range ends in the middle of THP:
+ * zero out the page
+ */
+ clear_highpage(page);
+ unlock_page(page);
+ continue;
+ }
+ index += HPAGE_PMD_NR - 1;
+ i += HPAGE_PMD_NR - 1;
+ }
+
if (!unfalloc || !PageUptodate(page)) {
- if (page->mapping == mapping) {
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ if (page_mapping(page) == mapping) {
VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
}
@@ -589,8 +878,36 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
lock_page(page);
+
+ if (PageTransTail(page)) {
+ /* Middle of THP: zero out the page */
+ clear_highpage(page);
+ unlock_page(page);
+ /*
+ * Partial THP truncate because 'start' is in the
+ * middle of a THP: no need to look at these pages
+ * again on the !pvec.nr restart.
+ */
+ if (index != round_down(end, HPAGE_PMD_NR))
+ start++;
+ continue;
+ } else if (PageTransHuge(page)) {
+ if (index == round_down(end, HPAGE_PMD_NR)) {
+ /*
+ * Range ends in the middle of THP:
+ * zero out the page
+ */
+ clear_highpage(page);
+ unlock_page(page);
+ continue;
+ }
+ index += HPAGE_PMD_NR - 1;
+ i += HPAGE_PMD_NR - 1;
+ }
+
if (!unfalloc || !PageUptodate(page)) {
- if (page->mapping == mapping) {
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ if (page_mapping(page) == mapping) {
VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
} else {
@@ -607,10 +924,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index++;
}
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->swapped -= nr_swaps_freed;
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
}
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
@@ -627,9 +944,9 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct shmem_inode_info *info = SHMEM_I(inode);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
}
generic_fillattr(inode, stat);
return 0;
@@ -639,6 +956,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int error;
error = inode_change_ok(inode, attr);
@@ -674,6 +992,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
if (oldsize > holebegin)
unmap_mapping_range(inode->i_mapping,
holebegin, 0, 1);
+
+ /*
+ * Part of the huge page can be beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ spin_lock(&sbinfo->shrinklist_lock);
+ if (list_empty(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
}
}
@@ -686,11 +1018,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (inode->i_mapping->a_ops == &shmem_aops) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
shmem_truncate_range(inode, 0, (loff_t)-1);
+ if (!list_empty(&info->shrinklist)) {
+ spin_lock(&sbinfo->shrinklist_lock);
+ if (!list_empty(&info->shrinklist)) {
+ list_del_init(&info->shrinklist);
+ sbinfo->shrinklist_len--;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
if (!list_empty(&info->swaplist)) {
mutex_lock(&shmem_swaplist_mutex);
list_del_init(&info->swaplist);
@@ -773,9 +1114,9 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
delete_from_swap_cache(*pagep);
set_page_dirty(*pagep);
if (!error) {
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->swapped--;
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
swap_free(swap);
}
}
@@ -848,6 +1189,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
swp_entry_t swap;
pgoff_t index;
+ VM_BUG_ON_PAGE(PageCompound(page), page);
BUG_ON(!PageLocked(page));
mapping = page->mapping;
index = page->index;
@@ -922,10 +1264,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
list_add_tail(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
swap_shmem_alloc(swap);
shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
@@ -984,24 +1326,63 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
#define vm_policy vm_private_data
#endif
+static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ /* Create a pseudo vma that just contains the policy */
+ vma->vm_start = 0;
+ /* Bias interleave by inode number to distribute better across nodes */
+ vma->vm_pgoff = index + info->vfs_inode.i_ino;
+ vma->vm_ops = NULL;
+ vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+}
+
+static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
+{
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(vma->vm_policy);
+}
+
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct page *page;
- /* Create a pseudo vma that just contains the policy */
- pvma.vm_start = 0;
- /* Bias interleave by inode number to distribute better across nodes */
- pvma.vm_pgoff = index + info->vfs_inode.i_ino;
- pvma.vm_ops = NULL;
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-
+ shmem_pseudo_vma_init(&pvma, info, index);
page = swapin_readahead(swap, gfp, &pvma, 0);
+ shmem_pseudo_vma_destroy(&pvma);
- /* Drop reference taken by mpol_shared_policy_lookup() */
- mpol_cond_put(pvma.vm_policy);
+ return page;
+}
+
+static struct page *shmem_alloc_hugepage(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct vm_area_struct pvma;
+ struct inode *inode = &info->vfs_inode;
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR);
+ void __rcu **results;
+ struct page *page;
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return NULL;
+
+ rcu_read_lock();
+ if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
+ hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ rcu_read_unlock();
+
+ shmem_pseudo_vma_init(&pvma, info, hindex);
+ page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
+ HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ shmem_pseudo_vma_destroy(&pvma);
+ if (page)
+ prep_transhuge_page(page);
return page;
}
@@ -1011,23 +1392,51 @@ static struct page *shmem_alloc_page(gfp_t gfp,
struct vm_area_struct pvma;
struct page *page;
- /* Create a pseudo vma that just contains the policy */
- pvma.vm_start = 0;
- /* Bias interleave by inode number to distribute better across nodes */
- pvma.vm_pgoff = index + info->vfs_inode.i_ino;
- pvma.vm_ops = NULL;
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+ shmem_pseudo_vma_init(&pvma, info, index);
+ page = alloc_page_vma(gfp, &pvma, 0);
+ shmem_pseudo_vma_destroy(&pvma);
+
+ return page;
+}
+
+static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+ struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
+ pgoff_t index, bool huge)
+{
+ struct page *page;
+ int nr;
+ int err = -ENOSPC;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ huge = false;
+ nr = huge ? HPAGE_PMD_NR : 1;
+
+ if (shmem_acct_block(info->flags, nr))
+ goto failed;
+ if (sbinfo->max_blocks) {
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - nr) > 0)
+ goto unacct;
+ percpu_counter_add(&sbinfo->used_blocks, nr);
+ }
- page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+ if (huge)
+ page = shmem_alloc_hugepage(gfp, info, index);
+ else
+ page = shmem_alloc_page(gfp, info, index);
if (page) {
__SetPageLocked(page);
__SetPageSwapBacked(page);
+ return page;
}
- /* Drop reference taken by mpol_shared_policy_lookup() */
- mpol_cond_put(pvma.vm_policy);
-
- return page;
+ err = -ENOMEM;
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -nr);
+unacct:
+ shmem_unacct_blocks(info->flags, nr);
+failed:
+ return ERR_PTR(err);
}
/*
@@ -1132,12 +1541,16 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
+ enum sgp_type sgp_huge = sgp;
+ pgoff_t hindex = index;
int error;
int once = 0;
int alloced = 0;
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
+ if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
+ sgp = SGP_CACHE;
repeat:
swap.val = 0;
page = find_lock_entry(mapping, index);
@@ -1240,10 +1653,10 @@ repeat:
mem_cgroup_commit_charge(page, memcg, true, false);
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
if (sgp == SGP_WRITE)
mark_page_accessed(page);
@@ -1253,51 +1666,111 @@ repeat:
swap_free(swap);
} else {
- if (shmem_acct_block(info->flags)) {
- error = -ENOSPC;
- goto failed;
+ /* shmem_symlink() */
+ if (mapping->a_ops != &shmem_aops)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ goto alloc_huge;
+ switch (sbinfo->huge) {
+ loff_t i_size;
+ pgoff_t off;
+ case SHMEM_HUGE_NEVER:
+ goto alloc_nohuge;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ off = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
+ goto alloc_huge;
+ /* fallthrough */
+ case SHMEM_HUGE_ADVISE:
+ if (sgp_huge == SGP_HUGE)
+ goto alloc_huge;
+ /* TODO: implement fadvise() hints */
+ goto alloc_nohuge;
}
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks) >= 0) {
- error = -ENOSPC;
- goto unacct;
+
+alloc_huge:
+ page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+ index, true);
+ if (IS_ERR(page)) {
+alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+ int retry = 5;
+ error = PTR_ERR(page);
+ page = NULL;
+ if (error != -ENOSPC)
+ goto failed;
+ /*
+ * Try to reclaim some space by splitting a huge page
+ * beyond i_size on the filesystem.
+ */
+ while (retry--) {
+ int ret;
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+ if (ret == SHRINK_STOP)
+ break;
+ if (ret)
+ goto alloc_nohuge;
}
- percpu_counter_inc(&sbinfo->used_blocks);
+ goto failed;
}
- page = shmem_alloc_page(gfp, info, index);
- if (!page) {
- error = -ENOMEM;
- goto decused;
- }
+ if (PageTransHuge(page))
+ hindex = round_down(index, HPAGE_PMD_NR);
+ else
+ hindex = index;
+
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
- false);
+ PageTransHuge(page));
if (error)
- goto decused;
- error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+ goto unacct;
+ error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
+ compound_order(page));
if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
+ error = shmem_add_to_page_cache(page, mapping, hindex,
NULL);
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- goto decused;
+ mem_cgroup_cancel_charge(page, memcg,
+ PageTransHuge(page));
+ goto unacct;
}
- mem_cgroup_commit_charge(page, memcg, false, false);
+ mem_cgroup_commit_charge(page, memcg, false,
+ PageTransHuge(page));
lru_cache_add_anon(page);
- spin_lock(&info->lock);
- info->alloced++;
- inode->i_blocks += BLOCKS_PER_PAGE;
+ spin_lock_irq(&info->lock);
+ info->alloced += 1 << compound_order(page);
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
alloced = true;
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + HPAGE_PMD_NR - 1) {
+ /*
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ if (list_empty(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+
/*
* Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
*/
@@ -1309,10 +1782,15 @@ clear:
* but SGP_FALLOC on a page fallocated earlier must initialize
* it now, lest undo on failure cancel our earlier guarantee.
*/
- if (sgp != SGP_WRITE) {
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
+ struct page *head = compound_head(page);
+ int i;
+
+ for (i = 0; i < (1 << compound_order(head)); i++) {
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
+ }
+ SetPageUptodate(head);
}
}
@@ -1322,24 +1800,30 @@ clear:
if (alloced) {
ClearPageDirty(page);
delete_from_page_cache(page);
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
}
error = -EINVAL;
goto unlock;
}
- *pagep = page;
+ *pagep = page + index - hindex;
return 0;
/*
* Error recovery.
*/
-decused:
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -1);
unacct:
- shmem_unacct_blocks(info->flags, 1);
+ if (sbinfo->max_blocks)
+ percpu_counter_sub(&sbinfo->used_blocks,
+ 1 << compound_order(page));
+ shmem_unacct_blocks(info->flags, 1 << compound_order(page));
+
+ if (PageTransHuge(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto alloc_nohuge;
+ }
failed:
if (swap.val && !shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
@@ -1350,9 +1834,9 @@ unlock:
}
if (error == -ENOSPC && !once++) {
info = SHMEM_I(inode);
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
goto repeat;
}
if (error == -EEXIST) /* from above or from radix_tree_insert */
@@ -1364,6 +1848,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
+ enum sgp_type sgp;
int error;
int ret = VM_FAULT_LOCKED;
@@ -1425,13 +1910,107 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ sgp = SGP_CACHE;
+ if (vma->vm_flags & VM_HUGEPAGE)
+ sgp = SGP_HUGE;
+ else if (vma->vm_flags & VM_NOHUGEPAGE)
+ sgp = SGP_NOHUGE;
+
+ error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
gfp, vma->vm_mm, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
return ret;
}
+/*
+ * shmem_get_unmapped_area - choose a mapping address friendly to huge pages
+ * @file:  shmem file being mapped, or NULL for a shared anonymous object
+ * @uaddr: caller's address hint (0 if none)
+ * @len:   length of the mapping in bytes
+ * @pgoff: page offset of the mapping within the object
+ * @flags: mmap flags
+ *
+ * First obtains an address from the mm's regular get_unmapped_area().  When
+ * huge pages may be used for the object, a second, inflated search is made so
+ * that the returned address can be shifted until its offset within a
+ * HPAGE_PMD_SIZE unit equals that of the file offset.  Every condition that
+ * rules this out simply returns the address found by the regular search.
+ *
+ * Returns the chosen address, or an error value (as tested by IS_ERR_VALUE).
+ */
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long uaddr, unsigned long len,
+				      unsigned long pgoff, unsigned long flags)
+{
+	unsigned long (*get_area)(struct file *,
+		unsigned long, unsigned long, unsigned long, unsigned long);
+	unsigned long addr;
+	unsigned long offset;
+	unsigned long inflated_len;
+	unsigned long inflated_addr;
+	unsigned long inflated_offset;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	get_area = current->mm->get_unmapped_area;
+	addr = get_area(file, uaddr, len, pgoff, flags);
+
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+		return addr;
+	/* Pass errors and any unusual result straight back to the caller */
+	if (IS_ERR_VALUE(addr))
+		return addr;
+	if (addr & ~PAGE_MASK)
+		return addr;
+	if (addr > TASK_SIZE - len)
+		return addr;
+
+	if (shmem_huge == SHMEM_HUGE_DENY)
+		return addr;
+	/* Too short to contain a whole huge page */
+	if (len < HPAGE_PMD_SIZE)
+		return addr;
+	if (flags & MAP_FIXED)
+		return addr;
+	/*
+	 * Our priority is to support MAP_SHARED mapped hugely;
+	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
+	 * But if caller specified an address hint, respect that as before.
+	 */
+	if (uaddr)
+		return addr;
+
+	if (shmem_huge != SHMEM_HUGE_FORCE) {
+		struct super_block *sb;
+
+		if (file) {
+			VM_BUG_ON(file->f_op != &shmem_file_operations);
+			sb = file_inode(file)->i_sb;
+		} else {
+			/*
+			 * Called directly from mm/mmap.c, or drivers/char/mem.c
+			 * for "/dev/zero", to create a shared anonymous object.
+			 */
+			if (IS_ERR(shm_mnt))
+				return addr;
+			sb = shm_mnt->mnt_sb;
+		}
+		/*
+		 * Huge pages are disabled on this mount, so realigning the
+		 * mapping would gain nothing.  (The test was previously
+		 * inverted, which skipped alignment exactly on the mounts
+		 * that had huge pages enabled.)
+		 */
+		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
+			return addr;
+	}
+
+	/* Offset of the mapping's start within a huge page of the file */
+	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
+	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
+		return addr;
+	/* The regular search already produced a suitably aligned address */
+	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
+		return addr;
+
+	/*
+	 * Ask for enough extra room that the start can be moved forward by
+	 * up to HPAGE_PMD_SIZE - PAGE_SIZE and still leave len bytes.
+	 */
+	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
+	if (inflated_len > TASK_SIZE)
+		return addr;
+	if (inflated_len < len)		/* arithmetic wrapped around */
+		return addr;
+
+	inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+	if (IS_ERR_VALUE(inflated_addr))
+		return addr;
+	if (inflated_addr & ~PAGE_MASK)
+		return addr;
+
+	/*
+	 * Move to the first address at or above inflated_addr whose offset
+	 * within a huge page equals that of the file offset.
+	 */
+	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
+	inflated_addr += offset - inflated_offset;
+	if (inflated_offset > offset)
+		inflated_addr += HPAGE_PMD_SIZE;
+
+	if (inflated_addr > TASK_SIZE - len)
+		return addr;
+	return inflated_addr;
+}
+
#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
@@ -1456,7 +2035,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
if (lock && !(info->flags & VM_LOCKED)) {
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
@@ -1471,7 +2050,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
retval = 0;
out_nomem:
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
return retval;
}
@@ -1479,6 +2058,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
+ (vma->vm_end & HPAGE_PMD_MASK)) {
+ khugepaged_enter(vma, vma->vm_flags);
+ }
return 0;
}
@@ -1504,6 +2088,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
spin_lock_init(&info->lock);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
+ INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
@@ -1589,12 +2174,23 @@ shmem_write_end(struct file *file, struct address_space *mapping,
i_size_write(inode, pos + copied);
if (!PageUptodate(page)) {
+ struct page *head = compound_head(page);
+ if (PageTransCompound(page)) {
+ int i;
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ if (head + i == page)
+ continue;
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
+ }
+ }
if (copied < PAGE_SIZE) {
unsigned from = pos & (PAGE_SIZE - 1);
zero_user_segments(page, 0, from,
from + copied, PAGE_SIZE);
}
- SetPageUptodate(page);
+ SetPageUptodate(head);
}
set_page_dirty(page);
unlock_page(page);
@@ -2860,11 +3456,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
sbinfo->gid = make_kgid(current_user_ns(), gid);
if (!gid_valid(sbinfo->gid))
goto bad_val;
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+ } else if (!strcmp(this_char, "huge")) {
+ int huge;
+ huge = shmem_parse_huge(value);
+ if (huge < 0)
+ goto bad_val;
+ if (!has_transparent_hugepage() &&
+ huge != SHMEM_HUGE_NEVER)
+ goto bad_val;
+ sbinfo->huge = huge;
+#endif
+#ifdef CONFIG_NUMA
} else if (!strcmp(this_char,"mpol")) {
mpol_put(mpol);
mpol = NULL;
if (mpol_parse_str(value, &mpol))
goto bad_val;
+#endif
} else {
pr_err("tmpfs: Bad mount option %s\n", this_char);
goto error;
@@ -2910,6 +3519,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
goto out;
error = 0;
+ sbinfo->huge = config.huge;
sbinfo->max_blocks = config.max_blocks;
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
@@ -2943,6 +3553,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+ /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
+ if (sbinfo->huge)
+ seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
+#endif
shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
@@ -3072,6 +3687,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
+ spin_lock_init(&sbinfo->shrinklist_lock);
+ INIT_LIST_HEAD(&sbinfo->shrinklist);
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
@@ -3161,6 +3778,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
+ .get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
.read_iter = shmem_file_read_iter,
@@ -3233,6 +3851,10 @@ static const struct super_operations shmem_ops = {
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+ .nr_cached_objects = shmem_unused_huge_count,
+ .free_cached_objects = shmem_unused_huge_scan,
+#endif
};
static const struct vm_operations_struct shmem_vm_ops = {
@@ -3282,6 +3904,13 @@ int __init shmem_init(void)
pr_err("Could not kern_mount tmpfs\n");
goto out1;
}
+
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+ if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
+ SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
+ else
+ shmem_huge = 0; /* just in case it was patched */
+#endif
return 0;
out1:
@@ -3293,6 +3922,91 @@ out3:
return error;
}
+#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
+/*
+ * sysfs "show" handler: writes the name of every huge page policy into @buf
+ * on one line, separated by spaces, with the name matching the current value
+ * of shmem_huge enclosed in square brackets.  Returns the number of bytes
+ * written, including the trailing newline.
+ */
+static ssize_t shmem_enabled_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	const int modes[] = {
+		SHMEM_HUGE_ALWAYS,
+		SHMEM_HUGE_WITHIN_SIZE,
+		SHMEM_HUGE_ADVISE,
+		SHMEM_HUGE_NEVER,
+		SHMEM_HUGE_DENY,
+		SHMEM_HUGE_FORCE,
+	};
+	int len = 0;
+	int n;
+
+	for (n = 0; n < ARRAY_SIZE(modes); n++) {
+		const char *name = shmem_format_huge(modes[n]);
+
+		if (modes[n] == shmem_huge)
+			len += sprintf(buf + len, "[%s] ", name);
+		else
+			len += sprintf(buf + len, "%s ", name);
+	}
+
+	/* Every entry ends in a space; turn the last one into the newline */
+	buf[len - 1] = '\n';
+	return len;
+}
+
+/*
+ * sysfs "store" handler: parses the policy name in @buf (an optional trailing
+ * newline is stripped) and installs it as the global shmem_huge setting.
+ *
+ * Returns @count on success.  Returns -EINVAL if the input does not fit the
+ * local buffer, is not recognised by shmem_parse_huge(), or requests anything
+ * other than "never"/"deny" when has_transparent_hugepage() is false.
+ */
+static ssize_t shmem_enabled_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char tmp[16];
+	int huge;
+
+	/* Need room for the input plus a terminating NUL */
+	if (count + 1 > sizeof(tmp))
+		return -EINVAL;
+	memcpy(tmp, buf, count);
+	tmp[count] = '\0';
+	if (count && tmp[count - 1] == '\n')
+		tmp[count - 1] = '\0';
+
+	huge = shmem_parse_huge(tmp);
+	if (huge == -EINVAL)
+		return -EINVAL;
+	if (!has_transparent_hugepage() &&
+			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
+		return -EINVAL;
+
+	shmem_huge = huge;
+	/*
+	 * Ordinary policies (never/always/within_size/advise) are also applied
+	 * to the internal shm_mnt mount; the "deny" and "force" overrides stay
+	 * global only.  This relies on the overrides being the values at or
+	 * below SHMEM_HUGE_DENY (definitions are outside this hunk - confirm).
+	 * The comparison used to be '<', which matched only "force" and so
+	 * ignored every ordinary policy written here.
+	 */
+	if (shmem_huge > SHMEM_HUGE_DENY)
+		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
+	return count;
+}
+
+/*
+ * kobject attribute named "shmem_enabled", mode 0644, backed by
+ * shmem_enabled_show() and shmem_enabled_store().  Not static, so it is
+ * presumably registered by code outside this file - verify against users.
+ */
+struct kobj_attribute shmem_enabled_attr =
+ __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+
+/*
+ * shmem_huge_enabled - may huge pages be used for this shmem mapping?
+ * @vma: a mapping of a shmem file
+ *
+ * The global shmem_huge "force"/"deny" overrides are consulted first, then
+ * the policy of the superblock the mapped inode lives on.  For the
+ * "within_size" policy that does not match, and for "advise", the answer is
+ * whether VM_HUGEPAGE is set on the vma.
+ */
+bool shmem_huge_enabled(struct vm_area_struct *vma)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	loff_t size;
+	pgoff_t first_huge;
+
+	if (shmem_huge == SHMEM_HUGE_FORCE)
+		return true;
+	if (shmem_huge == SHMEM_HUGE_DENY)
+		return false;
+
+	switch (sbinfo->huge) {
+	case SHMEM_HUGE_ALWAYS:
+		return true;
+	case SHMEM_HUGE_NEVER:
+		return false;
+	case SHMEM_HUGE_WITHIN_SIZE:
+		/*
+		 * Allowed when the page-rounded file size is at least one
+		 * huge page and reaches the first huge-page-aligned offset
+		 * at or after the start of the mapping.
+		 */
+		first_huge = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
+		size = round_up(i_size_read(inode), PAGE_SIZE);
+		if (size >= HPAGE_PMD_SIZE &&
+				(size >> PAGE_SHIFT) >= first_huge)
+			return true;
+		/* Otherwise treated exactly like SHMEM_HUGE_ADVISE */
+		break;
+	case SHMEM_HUGE_ADVISE:
+		break;
+	default:
+		VM_BUG_ON(1);
+		return false;
+	}
+
+	/* TODO: implement fadvise() hints */
+	return (vma->vm_flags & VM_HUGEPAGE) != 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+
#else /* !CONFIG_SHMEM */
/*
@@ -3335,6 +4049,15 @@ void shmem_unlock_mapping(struct address_space *mapping)
{
}
+#ifdef CONFIG_MMU
+/*
+ * !CONFIG_SHMEM variant: no huge page alignment is attempted, the request is
+ * handed unchanged to the current mm's get_unmapped_area() method.
+ */
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr, unsigned long len,
+				      unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+
+	return mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
+
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
truncate_inode_pages_range(inode->i_mapping, lstart, lend);
@@ -3461,6 +4184,13 @@ int shmem_zero_setup(struct vm_area_struct *vma)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
+ (vma->vm_end & HPAGE_PMD_MASK)) {
+ khugepaged_enter(vma, vma->vm_flags);
+ }
+
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index cc8bbc1e6bc9..09771ed3e693 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1236,61 +1236,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
}
}
-#ifdef CONFIG_SLAB_FREELIST_RANDOM
-static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
- size_t count)
-{
- size_t i;
- unsigned int rand;
-
- for (i = 0; i < count; i++)
- list[i] = i;
-
- /* Fisher-Yates shuffle */
- for (i = count - 1; i > 0; i--) {
- rand = prandom_u32_state(state);
- rand %= (i + 1);
- swap(list[i], list[rand]);
- }
-}
-
-/* Create a random sequence per cache */
-static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
-{
- unsigned int seed, count = cachep->num;
- struct rnd_state state;
-
- if (count < 2)
- return 0;
-
- /* If it fails, we will just use the global lists */
- cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
- if (!cachep->random_seq)
- return -ENOMEM;
-
- /* Get best entropy at this stage */
- get_random_bytes_arch(&seed, sizeof(seed));
- prandom_seed_state(&state, seed);
-
- freelist_randomize(&state, cachep->random_seq, count);
- return 0;
-}
-
-/* Destroy the per-cache random freelist sequence */
-static void cache_random_seq_destroy(struct kmem_cache *cachep)
-{
- kfree(cachep->random_seq);
- cachep->random_seq = NULL;
-}
-#else
-static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
-{
- return 0;
-}
-static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
-#endif /* CONFIG_SLAB_FREELIST_RANDOM */
-
-
/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
@@ -2535,7 +2480,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
union freelist_init_state {
struct {
unsigned int pos;
- freelist_idx_t *list;
+ unsigned int *list;
unsigned int count;
unsigned int rand;
};
@@ -2554,7 +2499,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
unsigned int rand;
/* Use best entropy available to define a random shift */
- get_random_bytes_arch(&rand, sizeof(rand));
+ rand = get_random_int();
/* Use a random state if the pre-computed list is not available */
if (!cachep->random_seq) {
@@ -2576,13 +2521,20 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state)
return (state->list[state->pos++] + state->rand) % state->count;
}
+/*
+ * Exchange entries @a and @b of the freelist_idx_t array that
+ * page->freelist points to.
+ */
+static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
+{
+	freelist_idx_t *slots = (freelist_idx_t *)page->freelist;
+
+	swap(slots[a], slots[b]);
+}
+
/*
* Shuffle the freelist initialization state based on pre-computed lists.
* return true if the list was successfully shuffled, false otherwise.
*/
static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
{
- unsigned int objfreelist = 0, i, count = cachep->num;
+ unsigned int objfreelist = 0, i, rand, count = cachep->num;
union freelist_init_state state;
bool precomputed;
@@ -2607,7 +2559,15 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
* Later use a pre-computed list for speed.
*/
if (!precomputed) {
- freelist_randomize(&state.rnd_state, page->freelist, count);
+ for (i = 0; i < count; i++)
+ set_free_obj(page, i, i);
+
+ /* Fisher-Yates shuffle */
+ for (i = count - 1; i > 0; i--) {
+ rand = prandom_u32_state(&state.rnd_state);
+ rand %= (i + 1);
+ swap_free_obj(page, i, rand);
+ }
} else {
for (i = 0; i < count; i++)
set_free_obj(page, i, next_random_slot(&state));
@@ -2726,8 +2686,11 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* critical path in kmem_cache_alloc().
*/
if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
- BUG();
+ gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+ flags &= ~GFP_SLAB_BUG_MASK;
+ pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
}
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
@@ -3489,8 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
n->free_objects -= cachep->num;
page = list_last_entry(&n->slabs_free, struct page, lru);
- list_del(&page->lru);
- list_add(&page->lru, list);
+ list_move(&page->lru, list);
}
}
@@ -3979,7 +3941,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
int shared = 0;
int batchcount = 0;
- err = cache_random_seq_create(cachep, gfp);
+ err = cache_random_seq_create(cachep, cachep->num, gfp);
if (err)
goto end;
diff --git a/mm/slab.h b/mm/slab.h
index dedb1a920fb8..f33980ab0406 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -42,6 +42,7 @@ struct kmem_cache {
#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/kmemleak.h>
+#include <linux/random.h>
/*
* State of the slab allocator.
@@ -253,8 +254,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
if (is_root_cache(s))
return 0;
- ret = __memcg_kmem_charge_memcg(page, gfp, order,
- s->memcg_params.memcg);
+ ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
if (ret)
return ret;
@@ -268,6 +268,9 @@ static __always_inline int memcg_charge_slab(struct page *page,
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
+ if (!memcg_kmem_enabled())
+ return;
+
memcg_kmem_update_page_stat(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
@@ -390,7 +393,11 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (should_failslab(s, flags))
return NULL;
- return memcg_kmem_get_cache(s, flags);
+ if (memcg_kmem_enabled() &&
+ ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
+ return memcg_kmem_get_cache(s);
+
+ return s;
}
static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
@@ -407,7 +414,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
s->flags, flags);
kasan_slab_alloc(s, object, flags);
}
- memcg_kmem_put_cache(s);
+
+ if (memcg_kmem_enabled())
+ memcg_kmem_put_cache(s);
}
#ifndef CONFIG_SLOB
@@ -464,4 +473,17 @@ int memcg_slab_show(struct seq_file *m, void *p);
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/*
+ * Allocate and fill cachep->random_seq with a shuffled sequence of @count
+ * entries, using @gfp for the allocation.  Returns 0 on success (or when
+ * there is nothing to do) and -ENOMEM if the allocation fails.
+ */
+int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
+ gfp_t gfp);
+/* Free cachep->random_seq and reset the pointer to NULL. */
+void cache_random_seq_destroy(struct kmem_cache *cachep);
+#else
+/* Freelist randomization compiled out: creation trivially succeeds. */
+static inline int cache_random_seq_create(struct kmem_cache *cachep,
+ unsigned int count, gfp_t gfp)
+{
+ return 0;
+}
+/* Freelist randomization compiled out: nothing to release. */
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 82317abb03ed..71f0b28a1bec 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
struct page *page;
flags |= __GFP_COMP;
- page = alloc_kmem_pages(flags, order);
+ page = alloc_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
kasan_kmalloc_large(ret, size, flags);
@@ -1030,6 +1030,53 @@ void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
EXPORT_SYMBOL(kmalloc_order_trace);
#endif
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/*
+ * Fill @list with the values 0 .. @count - 1 and then permute them in place
+ * with a Fisher-Yates shuffle driven by the pseudo-random @state.
+ */
+static void freelist_randomize(struct rnd_state *state, unsigned int *list,
+			size_t count)
+{
+	unsigned int pick;
+	unsigned int tmp;
+	size_t n;
+
+	/* Start from the identity sequence */
+	for (n = 0; n < count; n++)
+		list[n] = n;
+
+	/* Walk down from the last slot, swapping each with a random earlier one */
+	n = count - 1;
+	while (n > 0) {
+		pick = prandom_u32_state(state);
+		pick %= (n + 1);
+
+		tmp = list[n];
+		list[n] = list[pick];
+		list[pick] = tmp;
+
+		n--;
+	}
+}
+
+/*
+ * Create the per-cache random sequence.
+ *
+ * Does nothing (and returns 0) when @count is below 2 or the cache already
+ * has a sequence.  Otherwise allocates @count entries with @gfp, seeds a
+ * local PRNG state from get_random_long() and shuffles the new sequence.
+ * Returns -ENOMEM if the allocation fails.
+ */
+int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
+				    gfp_t gfp)
+{
+	struct rnd_state seed_state;
+
+	if (cachep->random_seq || count < 2)
+		return 0;
+
+	cachep->random_seq = kcalloc(count, sizeof(*cachep->random_seq), gfp);
+	if (cachep->random_seq == NULL)
+		return -ENOMEM;
+
+	/* Get best entropy at this stage of boot */
+	prandom_seed_state(&seed_state, get_random_long());
+	freelist_randomize(&seed_state, cachep->random_seq, count);
+
+	return 0;
+}
+
+/*
+ * Release the per-cache random freelist sequence and leave the cache with a
+ * NULL random_seq pointer.
+ */
+void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+	unsigned int *seq = cachep->random_seq;
+
+	cachep->random_seq = NULL;
+	kfree(seq);
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
#ifdef CONFIG_SLABINFO
#ifdef CONFIG_SLAB
diff --git a/mm/slub.c b/mm/slub.c
index 825ff4505336..f9da8716b8b3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1405,6 +1405,109 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
return page;
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/*
+ * Pre-initialize the random sequence of cache @s.
+ *
+ * Creates a shuffled sequence with one entry per object of a full-order slab
+ * (oo_objects(s->oo)) and converts every entry from an object index into a
+ * byte offset by multiplying it by s->size.
+ *
+ * Returns 0 on success or when nothing needs doing, otherwise the error from
+ * cache_random_seq_create() (which is also logged).
+ */
+static int init_cache_random_seq(struct kmem_cache *s)
+{
+	int err;
+	unsigned long i, count = oo_objects(s->oo);
+
+	/*
+	 * Bail out if already initialised.  cache_random_seq_create() keeps
+	 * an existing sequence untouched and returns 0, so without this
+	 * check the loop below would scale the stored offsets by s->size a
+	 * second time and corrupt them.
+	 */
+	if (s->random_seq)
+		return 0;
+
+	err = cache_random_seq_create(s, count, GFP_KERNEL);
+	if (err) {
+		pr_err("SLUB: Unable to initialize free list for %s\n",
+			s->name);
+		return err;
+	}
+
+	/* Transform to an offset on the set of pages */
+	if (s->random_seq) {
+		for (i = 0; i < count; i++)
+			s->random_seq[i] *= s->size;
+	}
+	return 0;
+}
+
+/*
+ * Build the random freelist sequence of every cache currently on the
+ * slab_caches list, holding slab_mutex for the duration of the walk.
+ */
+static void __init init_freelist_randomization(void)
+{
+	struct kmem_cache *cache;
+
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(cache, &slab_caches, list) {
+		/* Failures are logged by the callee; the result is not used */
+		init_cache_random_seq(cache);
+	}
+	mutex_unlock(&slab_mutex);
+}
+
+/*
+ * Return the address of the next object taken from the pre-computed random
+ * sequence of @s.
+ *
+ * @pos is the cursor into s->random_seq; it is advanced past every entry
+ * consumed and wraps to 0 on reaching @freelist_count.  Entries whose byte
+ * offset is not below @page_limit are skipped.  The result is @start plus
+ * the accepted offset.
+ */
+static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
+				unsigned long *pos, void *start,
+				unsigned long page_limit,
+				unsigned long freelist_count)
+{
+	unsigned int offset;
+
+	/*
+	 * If the target page allocation failed, the number of objects on the
+	 * page might be smaller than the usual size defined by the cache.
+	 */
+	for (;;) {
+		offset = s->random_seq[*pos];
+
+		*pos += 1;
+		if (*pos >= freelist_count)
+			*pos = 0;
+
+		if (likely(offset < page_limit))
+			break;
+	}
+
+	return (char *)start + offset;
+}
+
+/*
+ * Link the objects of @page into a singly linked freelist whose order follows
+ * the pre-computed random sequence of @s, starting from a random position in
+ * that sequence.
+ *
+ * Returns false without touching the page when it holds fewer than two
+ * objects or the cache has no random sequence; returns true once
+ * page->freelist and all free pointers have been set up.
+ */
+static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+{
+	unsigned long cursor, limit, seq_len;
+	unsigned long remaining;
+	void *base;
+	void *obj;
+	void *succ;
+
+	if (!s->random_seq || page->objects < 2)
+		return false;
+
+	seq_len = oo_objects(s->oo);
+	cursor = get_random_int() % seq_len;
+
+	/* Offsets at or beyond this do not belong to an object of this page */
+	limit = page->objects * s->size;
+	base = fixup_red_left(s, page_address(page));
+
+	/* First entry is used as the base of the freelist */
+	obj = next_freelist_entry(s, page, &cursor, base, limit, seq_len);
+	page->freelist = obj;
+
+	/* Chain each object to the next one drawn from the sequence */
+	for (remaining = page->objects - 1; remaining > 0; remaining--) {
+		setup_object(s, page, obj);
+		succ = next_freelist_entry(s, page, &cursor, base, limit,
+					   seq_len);
+		set_freepointer(s, obj, succ);
+		obj = succ;
+	}
+
+	/* The last object terminates the list */
+	setup_object(s, page, obj);
+	set_freepointer(s, obj, NULL);
+
+	return true;
+}
+#else
+/* Freelist randomization compiled out: there is no sequence to create. */
+static inline int init_cache_random_seq(struct kmem_cache *s)
+{
+ return 0;
+}
+/* Freelist randomization compiled out: nothing to initialize. */
+static inline void init_freelist_randomization(void) { }
+/*
+ * Freelist randomization compiled out: always report "not shuffled", which
+ * makes allocate_slab() build the freelist itself.
+ */
+static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
@@ -1412,6 +1515,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
gfp_t alloc_gfp;
void *start, *p;
int idx, order;
+ bool shuffle;
flags &= gfp_allowed_mask;
@@ -1473,15 +1577,19 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
kasan_poison_slab(page);
- for_each_object_idx(p, idx, s, start, page->objects) {
- setup_object(s, page, p);
- if (likely(idx < page->objects))
- set_freepointer(s, p, p + s->size);
- else
- set_freepointer(s, p, NULL);
+ shuffle = shuffle_freelist(s, page);
+
+ if (!shuffle) {
+ for_each_object_idx(p, idx, s, start, page->objects) {
+ setup_object(s, page, p);
+ if (likely(idx < page->objects))
+ set_freepointer(s, p, p + s->size);
+ else
+ set_freepointer(s, p, NULL);
+ }
+ page->freelist = fixup_red_left(s, start);
}
- page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
@@ -1504,8 +1612,10 @@ out:
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
- BUG();
+ gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+ flags &= ~GFP_SLAB_BUG_MASK;
+ pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
}
return allocate_slab(s,
@@ -2867,7 +2977,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_kmem_pages(page, compound_order(page));
+ __free_pages(page, compound_order(page));
p[size] = NULL; /* mark object processed */
return size;
}
@@ -3207,6 +3317,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
void __kmem_cache_release(struct kmem_cache *s)
{
+ cache_random_seq_destroy(s);
free_percpu(s->cpu_slab);
free_kmem_cache_nodes(s);
}
@@ -3431,6 +3542,13 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
+
+ /* Initialize the pre-computed randomized freelist if slab is up */
+ if (slab_state >= UP) {
+ if (init_cache_random_seq(s))
+ goto error;
+ }
+
if (!init_kmem_cache_nodes(s))
goto error;
@@ -3575,7 +3693,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
void *ptr = NULL;
flags |= __GFP_COMP | __GFP_NOTRACK;
- page = alloc_kmem_pages_node(node, flags, get_order(size));
+ page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3656,7 +3774,7 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kfree_hook(x);
- __free_kmem_pages(page, compound_order(page));
+ __free_pages(page, compound_order(page));
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -3947,6 +4065,9 @@ void __init kmem_cache_init(void)
setup_kmalloc_cache_index_table();
create_kmalloc_caches(0);
+ /* Setup random freelists for each cache */
+ init_freelist_randomization();
+
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
#endif
diff --git a/mm/swap.c b/mm/swap.c
index 90530ff8ed16..616df4ddd870 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -292,6 +292,7 @@ static bool need_activate_page_drain(int cpu)
void activate_page(struct page *page)
{
+ page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
@@ -316,6 +317,7 @@ void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
+ page = compound_head(page);
spin_lock_irq(&zone->lru_lock);
__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 031713ab40ce..78cfa292a29a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2493,7 +2493,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
}
/* frontswap enabled? set up bit-per-page map for frontswap */
- if (frontswap_enabled)
+ if (IS_ENABLED(CONFIG_FRONTSWAP))
frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 4064f8f53daa..a01cce450a26 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -155,10 +155,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
int truncate_inode_page(struct address_space *mapping, struct page *page)
{
+ loff_t holelen;
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
if (page_mapped(page)) {
unmap_mapping_range(mapping,
(loff_t)page->index << PAGE_SHIFT,
- PAGE_SIZE, 0);
+ holelen, 0);
}
return truncate_complete_page(mapping, page);
}
@@ -279,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (!trylock_page(page))
continue;
- WARN_ON(page->index != index);
+ WARN_ON(page_to_pgoff(page) != index);
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -367,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
lock_page(page);
- WARN_ON(page->index != index);
+ WARN_ON(page_to_pgoff(page) != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
unlock_page(page);
@@ -487,7 +491,21 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
if (!trylock_page(page))
continue;
- WARN_ON(page->index != index);
+
+ WARN_ON(page_to_pgoff(page) != index);
+
+ /* Middle of THP: skip */
+ if (PageTransTail(page)) {
+ unlock_page(page);
+ continue;
+ } else if (PageTransHuge(page)) {
+ index += HPAGE_PMD_NR - 1;
+ i += HPAGE_PMD_NR - 1;
+ /* 'end' is in the middle of THP */
+ if (index == round_down(end, HPAGE_PMD_NR))
+ continue;
+ }
+
ret = invalidate_inode_page(page);
unlock_page(page);
/*
@@ -594,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
lock_page(page);
- WARN_ON(page->index != index);
+ WARN_ON(page_to_pgoff(page) != index);
if (page->mapping != mapping) {
unlock_page(page);
continue;
diff --git a/mm/util.c b/mm/util.c
index 917e0e3d0f8e..8d010ef2ce1c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -399,10 +399,12 @@ struct address_space *page_mapping(struct page *page)
}
mapping = page->mapping;
- if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+ if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
- return mapping;
+
+ return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
+EXPORT_SYMBOL(page_mapping);
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
@@ -410,6 +412,12 @@ int __page_mapcount(struct page *page)
int ret;
ret = atomic_read(&page->_mapcount) + 1;
+ /*
+ * For file THP page->_mapcount contains total number of mapping
+ * of the page: no need to look into compound_mapcount.
+ */
+ if (!PageAnon(page) && !PageHuge(page))
+ return ret;
page = compound_head(page);
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
if (PageDoubleMap(page))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e11475cdeb7a..91f44e78c516 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_kmem_pages(page, 0);
+ __free_pages(page, 0);
}
kvfree(area->pages);
@@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_kmem_pages(alloc_mask, order);
+ page = alloc_pages(alloc_mask, order);
else
- page = alloc_kmem_pages_node(node, alloc_mask, order);
+ page = alloc_pages_node(node, alloc_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c4a2f4512fca..21d417ccff69 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1055,8 +1055,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Adding to swap updated mapping */
mapping = page_mapping(page);
+ } else if (unlikely(PageTransHuge(page))) {
+ /* Split file THP */
+ if (split_huge_page_to_list(page, page_list))
+ goto keep_locked;
}
+ VM_BUG_ON_PAGE(PageTransHuge(page), page);
+
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
@@ -1254,7 +1260,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !isolated_balloon_page(page)) {
+ !__PageMovable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index cb2a67bb4158..7997f52935c9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -718,7 +718,9 @@ const char * const vmstat_text[] = {
"nr_dirtied",
"nr_written",
"nr_pages_scanned",
-
+#if IS_ENABLED(CONFIG_ZSMALLOC)
+ "nr_zspages",
+#endif
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",
@@ -731,6 +733,8 @@ const char * const vmstat_text[] = {
"workingset_activate",
"workingset_nodereclaim",
"nr_anon_transparent_hugepages",
+ "nr_shmem_hugepages",
+ "nr_shmem_pmdmapped",
"nr_free_cma",
/* enum writeback_stat_item counters */
@@ -815,6 +819,8 @@ const char * const vmstat_text[] = {
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
+ "thp_file_alloc",
+ "thp_file_mapped",
"thp_split_page",
"thp_split_page_failed",
"thp_deferred_split_page",
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b6d4f258cb53..04176de6df70 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -16,32 +16,15 @@
* struct page(s) to form a zspage.
*
* Usage of struct page fields:
- * page->private: points to the first component (0-order) page
- * page->index (union with page->freelist): offset of the first object
- * starting in this page. For the first page, this is
- * always 0, so we use this field (aka freelist) to point
- * to the first free object in zspage.
- * page->lru: links together all component pages (except the first page)
- * of a zspage
- *
- * For _first_ page only:
- *
- * page->private: refers to the component page after the first page
- * If the page is first_page for huge object, it stores handle.
- * Look at size_class->huge.
- * page->freelist: points to the first free object in zspage.
- * Free objects are linked together using in-place
- * metadata.
- * page->objects: maximum number of objects we can store in this
- * zspage (class->zspage_order * PAGE_SIZE / class->size)
- * page->lru: links together first pages of various zspages.
- * Basically forming list of zspages in a fullness group.
- * page->mapping: class index and fullness group of the zspage
- * page->inuse: the number of objects that are used in this zspage
+ * page->private: points to zspage
+ * page->freelist(index): links together all component pages of a zspage
+ * For the huge page, this is always 0, so we use this field
+ * to store handle.
*
* Usage of struct page flags:
* PG_private: identifies the first component page
* PG_private2: identifies the last component page
+ * PG_owner_priv_1: identifies the huge component page
*
*/
@@ -66,6 +49,11 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
+#include <linux/mount.h>
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
+#define ZSPAGE_MAGIC 0x58
/*
* This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -88,9 +76,7 @@
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
*
- * Note that object index <obj_idx> is relative to system
- * page <PFN> it is stored in, so for each sub-page belonging
- * to a zspage, obj_idx starts with 0.
+ * Note that object index <obj_idx> starts from 0.
*
* This is made more complicated by various memory models and PAE.
*/
@@ -149,33 +135,29 @@
* ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
* (reason above)
*/
-#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
/*
* We do not maintain any list for completely empty or full pages
*/
enum fullness_group {
- ZS_ALMOST_FULL,
- ZS_ALMOST_EMPTY,
- _ZS_NR_FULLNESS_GROUPS,
-
ZS_EMPTY,
- ZS_FULL
+ ZS_ALMOST_EMPTY,
+ ZS_ALMOST_FULL,
+ ZS_FULL,
+ NR_ZS_FULLNESS,
};
enum zs_stat_type {
+ CLASS_EMPTY,
+ CLASS_ALMOST_EMPTY,
+ CLASS_ALMOST_FULL,
+ CLASS_FULL,
OBJ_ALLOCATED,
OBJ_USED,
- CLASS_ALMOST_FULL,
- CLASS_ALMOST_EMPTY,
+ NR_ZS_STAT_TYPE,
};
-#ifdef CONFIG_ZSMALLOC_STAT
-#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1)
-#else
-#define NR_ZS_STAT_TYPE (OBJ_USED + 1)
-#endif
-
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
@@ -184,6 +166,10 @@ struct zs_size_stat {
static struct dentry *zs_stat_root;
#endif
+#ifdef CONFIG_COMPACTION
+static struct vfsmount *zsmalloc_mnt;
+#endif
+
/*
* number of size_classes
*/
@@ -207,35 +193,49 @@ static const int fullness_threshold_frac = 4;
struct size_class {
spinlock_t lock;
- struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+ struct list_head fullness_list[NR_ZS_FULLNESS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
*/
int size;
- unsigned int index;
-
- struct zs_size_stat stats;
-
+ int objs_per_zspage;
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
- /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
- bool huge;
+
+ unsigned int index;
+ struct zs_size_stat stats;
};
+/* huge object: pages_per_zspage == 1 && objs_per_zspage == 1 */
+static void SetPageHugeObject(struct page *page)
+{
+ SetPageOwnerPriv1(page);
+}
+
+static void ClearPageHugeObject(struct page *page)
+{
+ ClearPageOwnerPriv1(page);
+}
+
+static int PageHugeObject(struct page *page)
+{
+ return PageOwnerPriv1(page);
+}
+
/*
* Placed within free objects to form a singly linked list.
- * For every zspage, first_page->freelist gives head of this list.
+ * For every zspage, zspage->freeobj gives head of this list.
*
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
union {
/*
- * Position of next free chunk (encodes <PFN, obj_idx>)
+ * Free object index;
* It's valid for non-allocated object
*/
- void *next;
+ unsigned long next;
/*
* Handle of allocated object.
*/
@@ -248,6 +248,7 @@ struct zs_pool {
struct size_class **size_class;
struct kmem_cache *handle_cachep;
+ struct kmem_cache *zspage_cachep;
atomic_long_t pages_allocated;
@@ -263,16 +264,36 @@ struct zs_pool {
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
+#ifdef CONFIG_COMPACTION
+ struct inode *inode;
+ struct work_struct free_work;
+#endif
};
/*
* A zspage's class index and fullness group
* are encoded in its (first)page->mapping
*/
-#define CLASS_IDX_BITS 28
-#define FULLNESS_BITS 4
-#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
-#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
+#define FULLNESS_BITS 2
+#define CLASS_BITS 8
+#define ISOLATED_BITS 3
+#define MAGIC_VAL_BITS 8
+
+struct zspage {
+ struct {
+ unsigned int fullness:FULLNESS_BITS;
+ unsigned int class:CLASS_BITS;
+ unsigned int isolated:ISOLATED_BITS;
+ unsigned int magic:MAGIC_VAL_BITS;
+ };
+ unsigned int inuse;
+ unsigned int freeobj;
+ struct page *first_page;
+ struct list_head list; /* fullness list */
+#ifdef CONFIG_COMPACTION
+ rwlock_t lock;
+#endif
+};
struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
@@ -284,29 +305,74 @@ struct mapping_area {
enum zs_mapmode vm_mm; /* mapping mode */
};
-static int create_handle_cache(struct zs_pool *pool)
+#ifdef CONFIG_COMPACTION
+static int zs_register_migration(struct zs_pool *pool);
+static void zs_unregister_migration(struct zs_pool *pool);
+static void migrate_lock_init(struct zspage *zspage);
+static void migrate_read_lock(struct zspage *zspage);
+static void migrate_read_unlock(struct zspage *zspage);
+static void kick_deferred_free(struct zs_pool *pool);
+static void init_deferred_free(struct zs_pool *pool);
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
+#else
+static int zsmalloc_mount(void) { return 0; }
+static void zsmalloc_unmount(void) {}
+static int zs_register_migration(struct zs_pool *pool) { return 0; }
+static void zs_unregister_migration(struct zs_pool *pool) {}
+static void migrate_lock_init(struct zspage *zspage) {}
+static void migrate_read_lock(struct zspage *zspage) {}
+static void migrate_read_unlock(struct zspage *zspage) {}
+static void kick_deferred_free(struct zs_pool *pool) {}
+static void init_deferred_free(struct zs_pool *pool) {}
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
+#endif
+
+static int create_cache(struct zs_pool *pool)
{
pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
0, 0, NULL);
- return pool->handle_cachep ? 0 : 1;
+ if (!pool->handle_cachep)
+ return 1;
+
+ pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+ if (!pool->zspage_cachep) {
+ kmem_cache_destroy(pool->handle_cachep);
+ pool->handle_cachep = NULL;
+ return 1;
+ }
+
+ return 0;
}
-static void destroy_handle_cache(struct zs_pool *pool)
+static void destroy_cache(struct zs_pool *pool)
{
kmem_cache_destroy(pool->handle_cachep);
+ kmem_cache_destroy(pool->zspage_cachep);
}
-static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
+static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~__GFP_HIGHMEM);
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}
-static void free_handle(struct zs_pool *pool, unsigned long handle)
+static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
kmem_cache_free(pool->handle_cachep, (void *)handle);
}
+static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
+{
+ return kmem_cache_alloc(pool->zspage_cachep,
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+};
+
+static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+{
+ kmem_cache_free(pool->zspage_cachep, zspage);
+}
+
static void record_obj(unsigned long handle, unsigned long obj)
{
/*
@@ -409,38 +475,76 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static bool is_zspage_isolated(struct zspage *zspage)
+{
+ return zspage->isolated;
+}
+
static int is_first_page(struct page *page)
{
return PagePrivate(page);
}
-static int is_last_page(struct page *page)
+/* Protected by class->lock */
+static inline int get_zspage_inuse(struct zspage *zspage)
+{
+ return zspage->inuse;
+}
+
+static inline void set_zspage_inuse(struct zspage *zspage, int val)
+{
+ zspage->inuse = val;
+}
+
+static inline void mod_zspage_inuse(struct zspage *zspage, int val)
+{
+ zspage->inuse += val;
+}
+
+static inline struct page *get_first_page(struct zspage *zspage)
+{
+ struct page *first_page = zspage->first_page;
+
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ return first_page;
+}
+
+static inline int get_first_obj_offset(struct page *page)
+{
+ return page->units;
+}
+
+static inline void set_first_obj_offset(struct page *page, int offset)
+{
+ page->units = offset;
+}
+
+static inline unsigned int get_freeobj(struct zspage *zspage)
+{
+ return zspage->freeobj;
+}
+
+static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
{
- return PagePrivate2(page);
+ zspage->freeobj = obj;
}
-static void get_zspage_mapping(struct page *first_page,
+static void get_zspage_mapping(struct zspage *zspage,
unsigned int *class_idx,
enum fullness_group *fullness)
{
- unsigned long m;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ BUG_ON(zspage->magic != ZSPAGE_MAGIC);
- m = (unsigned long)first_page->mapping;
- *fullness = m & FULLNESS_MASK;
- *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
+ *fullness = zspage->fullness;
+ *class_idx = zspage->class;
}
-static void set_zspage_mapping(struct page *first_page,
+static void set_zspage_mapping(struct zspage *zspage,
unsigned int class_idx,
enum fullness_group fullness)
{
- unsigned long m;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
- m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
- (fullness & FULLNESS_MASK);
- first_page->mapping = (struct address_space *)m;
+ zspage->class = class_idx;
+ zspage->fullness = fullness;
}
/*
@@ -464,23 +568,19 @@ static int get_size_class_index(int size)
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
- if (type < NR_ZS_STAT_TYPE)
- class->stats.objs[type] += cnt;
+ class->stats.objs[type] += cnt;
}
static inline void zs_stat_dec(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
- if (type < NR_ZS_STAT_TYPE)
- class->stats.objs[type] -= cnt;
+ class->stats.objs[type] -= cnt;
}
static inline unsigned long zs_stat_get(struct size_class *class,
enum zs_stat_type type)
{
- if (type < NR_ZS_STAT_TYPE)
- return class->stats.objs[type];
- return 0;
+ return class->stats.objs[type];
}
#ifdef CONFIG_ZSMALLOC_STAT
@@ -624,6 +724,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
}
#endif
+
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -631,21 +732,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
* the pool (not yet implemented). This function returns fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct page *first_page)
+static enum fullness_group get_fullness_group(struct size_class *class,
+ struct zspage *zspage)
{
- int inuse, max_objects;
+ int inuse, objs_per_zspage;
enum fullness_group fg;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
- inuse = first_page->inuse;
- max_objects = first_page->objects;
+ inuse = get_zspage_inuse(zspage);
+ objs_per_zspage = class->objs_per_zspage;
if (inuse == 0)
fg = ZS_EMPTY;
- else if (inuse == max_objects)
+ else if (inuse == objs_per_zspage)
fg = ZS_FULL;
- else if (inuse <= 3 * max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -660,32 +760,25 @@ static enum fullness_group get_fullness_group(struct page *first_page)
* identified by <class, fullness_group>.
*/
static void insert_zspage(struct size_class *class,
- enum fullness_group fullness,
- struct page *first_page)
+ struct zspage *zspage,
+ enum fullness_group fullness)
{
- struct page **head;
-
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
- if (fullness >= _ZS_NR_FULLNESS_GROUPS)
- return;
-
- zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
- CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
-
- head = &class->fullness_list[fullness];
- if (!*head) {
- *head = first_page;
- return;
- }
+ struct zspage *head;
+ zs_stat_inc(class, fullness, 1);
+ head = list_first_entry_or_null(&class->fullness_list[fullness],
+ struct zspage, list);
/*
- * We want to see more ZS_FULL pages and less almost
- * empty/full. Put pages with higher ->inuse first.
+ * We want to see more ZS_FULL pages and less almost empty/full.
+ * Put pages with higher ->inuse first.
*/
- list_add_tail(&first_page->lru, &(*head)->lru);
- if (first_page->inuse >= (*head)->inuse)
- *head = first_page;
+ if (head) {
+ if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
+ list_add(&zspage->list, &head->list);
+ return;
+ }
+ }
+ list_add(&zspage->list, &class->fullness_list[fullness]);
}
/*
@@ -693,27 +786,14 @@ static void insert_zspage(struct size_class *class,
* by <class, fullness_group>.
*/
static void remove_zspage(struct size_class *class,
- enum fullness_group fullness,
- struct page *first_page)
+ struct zspage *zspage,
+ enum fullness_group fullness)
{
- struct page **head;
-
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
- if (fullness >= _ZS_NR_FULLNESS_GROUPS)
- return;
-
- head = &class->fullness_list[fullness];
- VM_BUG_ON_PAGE(!*head, first_page);
- if (list_empty(&(*head)->lru))
- *head = NULL;
- else if (*head == first_page)
- *head = (struct page *)list_entry((*head)->lru.next,
- struct page, lru);
+ VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
+ VM_BUG_ON(is_zspage_isolated(zspage));
- list_del_init(&first_page->lru);
- zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
- CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+ list_del_init(&zspage->list);
+ zs_stat_dec(class, fullness, 1);
}
/*
@@ -726,19 +806,22 @@ static void remove_zspage(struct size_class *class,
* fullness group.
*/
static enum fullness_group fix_fullness_group(struct size_class *class,
- struct page *first_page)
+ struct zspage *zspage)
{
int class_idx;
enum fullness_group currfg, newfg;
- get_zspage_mapping(first_page, &class_idx, &currfg);
- newfg = get_fullness_group(first_page);
+ get_zspage_mapping(zspage, &class_idx, &currfg);
+ newfg = get_fullness_group(class, zspage);
if (newfg == currfg)
goto out;
- remove_zspage(class, currfg, first_page);
- insert_zspage(class, newfg, first_page);
- set_zspage_mapping(first_page, class_idx, newfg);
+ if (!is_zspage_isolated(zspage)) {
+ remove_zspage(class, zspage, currfg);
+ insert_zspage(class, zspage, newfg);
+ }
+
+ set_zspage_mapping(zspage, class_idx, newfg);
out:
return newfg;
@@ -780,64 +863,49 @@ static int get_pages_per_zspage(int class_size)
return max_usedpc_order;
}
-/*
- * A single 'zspage' is composed of many system pages which are
- * linked together using fields in struct page. This function finds
- * the first/head page, given any component page of a zspage.
- */
-static struct page *get_first_page(struct page *page)
+static struct zspage *get_zspage(struct page *page)
{
- if (is_first_page(page))
- return page;
- else
- return (struct page *)page_private(page);
+ struct zspage *zspage = (struct zspage *)page->private;
+
+ BUG_ON(zspage->magic != ZSPAGE_MAGIC);
+ return zspage;
}
static struct page *get_next_page(struct page *page)
{
- struct page *next;
+ if (unlikely(PageHugeObject(page)))
+ return NULL;
- if (is_last_page(page))
- next = NULL;
- else if (is_first_page(page))
- next = (struct page *)page_private(page);
- else
- next = list_entry(page->lru.next, struct page, lru);
+ return page->freelist;
+}
- return next;
+/**
+ * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * @page: page of the zspage in which the object resides
+ * @obj_idx: object index
+ */
+static void obj_to_location(unsigned long obj, struct page **page,
+ unsigned int *obj_idx)
+{
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
}
-/*
- * Encode <page, obj_idx> as a single handle value.
- * We use the least bit of handle for tagging.
+/**
+ * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
+ * @page: page of the zspage in which the object resides
+ * @obj_idx: object index
*/
-static void *location_to_obj(struct page *page, unsigned long obj_idx)
+static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
{
unsigned long obj;
- if (!page) {
- VM_BUG_ON(obj_idx);
- return NULL;
- }
-
obj = page_to_pfn(page) << OBJ_INDEX_BITS;
- obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj |= obj_idx & OBJ_INDEX_MASK;
obj <<= OBJ_TAG_BITS;
- return (void *)obj;
-}
-
-/*
- * Decode <page, obj_idx> pair from the given object handle. We adjust the
- * decoded obj_idx back to its original value since it was adjusted in
- * location_to_obj().
- */
-static void obj_to_location(unsigned long obj, struct page **page,
- unsigned long *obj_idx)
-{
- obj >>= OBJ_TAG_BITS;
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
- *obj_idx = (obj & OBJ_INDEX_MASK);
+ return obj;
}
static unsigned long handle_to_obj(unsigned long handle)
@@ -845,109 +913,147 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static unsigned long obj_to_head(struct size_class *class, struct page *page,
- void *obj)
+static unsigned long obj_to_head(struct page *page, void *obj)
{
- if (class->huge) {
+ if (unlikely(PageHugeObject(page))) {
VM_BUG_ON_PAGE(!is_first_page(page), page);
- return page_private(page);
+ return page->index;
} else
return *(unsigned long *)obj;
}
-static unsigned long obj_idx_to_offset(struct page *page,
- unsigned long obj_idx, int class_size)
+static inline int testpin_tag(unsigned long handle)
{
- unsigned long off = 0;
-
- if (!is_first_page(page))
- off = page->index;
-
- return off + obj_idx * class_size;
+ return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static inline int trypin_tag(unsigned long handle)
{
- unsigned long *ptr = (unsigned long *)handle;
-
- return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+ return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static void pin_tag(unsigned long handle)
{
- while (!trypin_tag(handle));
+ bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static void unpin_tag(unsigned long handle)
{
- unsigned long *ptr = (unsigned long *)handle;
-
- clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+ bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static void reset_page(struct page *page)
{
+ __ClearPageMovable(page);
clear_bit(PG_private, &page->flags);
clear_bit(PG_private_2, &page->flags);
set_page_private(page, 0);
- page->mapping = NULL;
- page->freelist = NULL;
page_mapcount_reset(page);
+ ClearPageHugeObject(page);
+ page->freelist = NULL;
+}
+
+/*
+ * To prevent a zspage from being destroyed during migration, freeing a
+ * zspage must hold the locks of all pages in that zspage.
+ */
+void lock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ lock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
}
-static void free_zspage(struct page *first_page)
+int trylock_zspage(struct zspage *zspage)
{
- struct page *nextp, *tmp, *head_extra;
+ struct page *cursor, *fail;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- VM_BUG_ON_PAGE(first_page->inuse, first_page);
+ for (cursor = get_first_page(zspage); cursor != NULL; cursor =
+ get_next_page(cursor)) {
+ if (!trylock_page(cursor)) {
+ fail = cursor;
+ goto unlock;
+ }
+ }
- head_extra = (struct page *)page_private(first_page);
+ return 1;
+unlock:
+ for (cursor = get_first_page(zspage); cursor != fail; cursor =
+ get_next_page(cursor))
+ unlock_page(cursor);
- reset_page(first_page);
- __free_page(first_page);
+ return 0;
+}
- /* zspage with only 1 system page */
- if (!head_extra)
- return;
+static void __free_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ struct page *page, *next;
+ enum fullness_group fg;
+ unsigned int class_idx;
- list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
- list_del(&nextp->lru);
- reset_page(nextp);
- __free_page(nextp);
+ get_zspage_mapping(zspage, &class_idx, &fg);
+
+ assert_spin_locked(&class->lock);
+
+ VM_BUG_ON(get_zspage_inuse(zspage));
+ VM_BUG_ON(fg != ZS_EMPTY);
+
+ next = page = get_first_page(zspage);
+ do {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ next = get_next_page(page);
+ reset_page(page);
+ unlock_page(page);
+ dec_zone_page_state(page, NR_ZSPAGES);
+ put_page(page);
+ page = next;
+ } while (page != NULL);
+
+ cache_free_zspage(pool, zspage);
+
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+}
+
+static void free_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ VM_BUG_ON(get_zspage_inuse(zspage));
+ VM_BUG_ON(list_empty(&zspage->list));
+
+ if (!trylock_zspage(zspage)) {
+ kick_deferred_free(pool);
+ return;
}
- reset_page(head_extra);
- __free_page(head_extra);
+
+ remove_zspage(class, zspage, ZS_EMPTY);
+ __free_zspage(pool, class, zspage);
}
/* Initialize a newly allocated zspage */
-static void init_zspage(struct size_class *class, struct page *first_page)
+static void init_zspage(struct size_class *class, struct zspage *zspage)
{
+ unsigned int freeobj = 1;
unsigned long off = 0;
- struct page *page = first_page;
-
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ struct page *page = get_first_page(zspage);
while (page) {
struct page *next_page;
struct link_free *link;
- unsigned int i = 1;
void *vaddr;
- /*
- * page->index stores offset of first object starting
- * in the page. For the first page, this is always 0,
- * so we use first_page->index (aka ->freelist) to store
- * head of corresponding zspage's freelist.
- */
- if (page != first_page)
- page->index = off;
+ set_first_obj_offset(page, off);
vaddr = kmap_atomic(page);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = location_to_obj(page, i++);
+ link->next = freeobj++ << OBJ_TAG_BITS;
link += class->size / sizeof(*link);
}
@@ -957,87 +1063,112 @@ static void init_zspage(struct size_class *class, struct page *first_page)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = location_to_obj(next_page, 0);
+ if (next_page) {
+ link->next = freeobj++ << OBJ_TAG_BITS;
+ } else {
+ /*
+ * Clear the OBJ_TAG_BITS low bit(s) of the last link so that
+ * it can be told apart from an allocated object.
+ */
+ link->next = -1 << OBJ_TAG_BITS;
+ }
kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
+
+ set_freeobj(zspage, 0);
}
-/*
- * Allocate a zspage for the given size class
- */
-static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+static void create_page_chain(struct size_class *class, struct zspage *zspage,
+ struct page *pages[])
{
- int i, error;
- struct page *first_page = NULL, *uninitialized_var(prev_page);
+ int i;
+ struct page *page;
+ struct page *prev_page = NULL;
+ int nr_pages = class->pages_per_zspage;
/*
* Allocate individual pages and link them together as:
- * 1. first page->private = first sub-page
- * 2. all sub-pages are linked together using page->lru
- * 3. each sub-page is linked to the first page using page->private
+ * 1. all pages are linked together using page->freelist
+ * 2. each sub-page points to the zspage using page->private
*
- * For each size class, First/Head pages are linked together using
- * page->lru. Also, we set PG_private to identify the first page
- * (i.e. no other sub-page has this flag set) and PG_private_2 to
- * identify the last page.
+ * we set PG_private to identify the first page (i.e. no other sub-page
+ * has this flag set) and PG_private_2 to identify the last page.
*/
- error = -ENOMEM;
- for (i = 0; i < class->pages_per_zspage; i++) {
- struct page *page;
-
- page = alloc_page(flags);
- if (!page)
- goto cleanup;
-
- INIT_LIST_HEAD(&page->lru);
- if (i == 0) { /* first page */
+ for (i = 0; i < nr_pages; i++) {
+ page = pages[i];
+ set_page_private(page, (unsigned long)zspage);
+ page->freelist = NULL;
+ if (i == 0) {
+ zspage->first_page = page;
SetPagePrivate(page);
- set_page_private(page, 0);
- first_page = page;
- first_page->inuse = 0;
+ if (unlikely(class->objs_per_zspage == 1 &&
+ class->pages_per_zspage == 1))
+ SetPageHugeObject(page);
+ } else {
+ prev_page->freelist = page;
}
- if (i == 1)
- set_page_private(first_page, (unsigned long)page);
- if (i >= 1)
- set_page_private(page, (unsigned long)first_page);
- if (i >= 2)
- list_add(&page->lru, &prev_page->lru);
- if (i == class->pages_per_zspage - 1) /* last page */
+ if (i == nr_pages - 1)
SetPagePrivate2(page);
prev_page = page;
}
+}
+
+/*
+ * Allocate a zspage for the given size class
+ */
+static struct zspage *alloc_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ gfp_t gfp)
+{
+ int i;
+ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+ struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+
+ if (!zspage)
+ return NULL;
- init_zspage(class, first_page);
+ memset(zspage, 0, sizeof(struct zspage));
+ zspage->magic = ZSPAGE_MAGIC;
+ migrate_lock_init(zspage);
- first_page->freelist = location_to_obj(first_page, 0);
- /* Maximum number of objects we can store in this zspage */
- first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
+ for (i = 0; i < class->pages_per_zspage; i++) {
+ struct page *page;
- error = 0; /* Success */
+ page = alloc_page(gfp);
+ if (!page) {
+ while (--i >= 0) {
+ dec_zone_page_state(pages[i], NR_ZSPAGES);
+ __free_page(pages[i]);
+ }
+ cache_free_zspage(pool, zspage);
+ return NULL;
+ }
-cleanup:
- if (unlikely(error) && first_page) {
- free_zspage(first_page);
- first_page = NULL;
+ inc_zone_page_state(page, NR_ZSPAGES);
+ pages[i] = page;
}
- return first_page;
+ create_page_chain(class, zspage, pages);
+ init_zspage(class, zspage);
+
+ return zspage;
}
-static struct page *find_get_zspage(struct size_class *class)
+static struct zspage *find_get_zspage(struct size_class *class)
{
int i;
- struct page *page;
+ struct zspage *zspage;
- for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
- page = class->fullness_list[i];
- if (page)
+ for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+ zspage = list_first_entry_or_null(&class->fullness_list[i],
+ struct zspage, list);
+ if (zspage)
break;
}
- return page;
+ return zspage;
}
#ifdef CONFIG_PGTABLE_MAPPING
@@ -1242,11 +1373,9 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-static bool zspage_full(struct page *first_page)
+static bool zspage_full(struct size_class *class, struct zspage *zspage)
{
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
- return first_page->inuse == first_page->objects;
+ return get_zspage_inuse(zspage) == class->objs_per_zspage;
}
unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1272,8 +1401,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{
+ struct zspage *zspage;
struct page *page;
- unsigned long obj, obj_idx, off;
+ unsigned long obj, off;
+ unsigned int obj_idx;
unsigned int class_idx;
enum fullness_group fg;
@@ -1294,9 +1425,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ zspage = get_zspage(page);
+
+ /* migration cannot move any subpage in this zspage */
+ migrate_read_lock(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fg);
class = pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ off = (class->size * obj_idx) & ~PAGE_MASK;
area = &get_cpu_var(zs_map_area);
area->vm_mm = mm;
@@ -1314,7 +1450,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
ret = __zs_map_object(area, pages, off, class->size);
out:
- if (!class->huge)
+ if (likely(!PageHugeObject(page)))
ret += ZS_HANDLE_SIZE;
return ret;
@@ -1323,8 +1459,10 @@ EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
+ struct zspage *zspage;
struct page *page;
- unsigned long obj, obj_idx, off;
+ unsigned long obj, off;
+ unsigned int obj_idx;
unsigned int class_idx;
enum fullness_group fg;
@@ -1333,9 +1471,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ zspage = get_zspage(page);
+ get_zspage_mapping(zspage, &class_idx, &fg);
class = pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ off = (class->size * obj_idx) & ~PAGE_MASK;
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1350,38 +1489,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
put_cpu_var(zs_map_area);
+
+ migrate_read_unlock(zspage);
unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
static unsigned long obj_malloc(struct size_class *class,
- struct page *first_page, unsigned long handle)
+ struct zspage *zspage, unsigned long handle)
{
+ int i, nr_page, offset;
unsigned long obj;
struct link_free *link;
struct page *m_page;
- unsigned long m_objidx, m_offset;
+ unsigned long m_offset;
void *vaddr;
handle |= OBJ_ALLOCATED_TAG;
- obj = (unsigned long)first_page->freelist;
- obj_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+ obj = get_freeobj(zspage);
+
+ offset = obj * class->size;
+ nr_page = offset >> PAGE_SHIFT;
+ m_offset = offset & ~PAGE_MASK;
+ m_page = get_first_page(zspage);
+
+ for (i = 0; i < nr_page; i++)
+ m_page = get_next_page(m_page);
vaddr = kmap_atomic(m_page);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
- first_page->freelist = link->next;
- if (!class->huge)
+ set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
+ if (likely(!PageHugeObject(m_page)))
/* record handle in the header of allocated chunk */
link->handle = handle;
else
- /* record handle in first_page->private */
- set_page_private(first_page, handle);
+ /* record handle in page->index */
+ zspage->first_page->index = handle;
+
kunmap_atomic(vaddr);
- first_page->inuse++;
+ mod_zspage_inuse(zspage, 1);
zs_stat_inc(class, OBJ_USED, 1);
+ obj = location_to_obj(m_page, obj);
+
return obj;
}
@@ -1399,12 +1550,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
- struct page *first_page;
+ enum fullness_group newfg;
+ struct zspage *zspage;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- handle = alloc_handle(pool, gfp);
+ handle = cache_alloc_handle(pool, gfp);
if (!handle)
return 0;
@@ -1413,29 +1565,38 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
- first_page = find_get_zspage(class);
-
- if (!first_page) {
+ zspage = find_get_zspage(class);
+ if (likely(zspage)) {
+ obj = obj_malloc(class, zspage, handle);
+ /* Now move the zspage to another fullness group, if required */
+ fix_fullness_group(class, zspage);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- first_page = alloc_zspage(class, gfp);
- if (unlikely(!first_page)) {
- free_handle(pool, handle);
- return 0;
- }
- set_zspage_mapping(first_page, class->index, ZS_EMPTY);
- atomic_long_add(class->pages_per_zspage,
- &pool->pages_allocated);
+ return handle;
+ }
- spin_lock(&class->lock);
- zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
+ spin_unlock(&class->lock);
+
+ zspage = alloc_zspage(pool, class, gfp);
+ if (!zspage) {
+ cache_free_handle(pool, handle);
+ return 0;
}
- obj = obj_malloc(class, first_page, handle);
- /* Now move the zspage to another fullness group, if required */
- fix_fullness_group(class, first_page);
+ spin_lock(&class->lock);
+ obj = obj_malloc(class, zspage, handle);
+ newfg = get_fullness_group(class, zspage);
+ insert_zspage(class, zspage, newfg);
+ set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
+ atomic_long_add(class->pages_per_zspage,
+ &pool->pages_allocated);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+
+ /* The zspage is completely set up, so mark its pages as movable */
+ SetZsPageMovable(pool, zspage);
spin_unlock(&class->lock);
return handle;
@@ -1445,36 +1606,38 @@ EXPORT_SYMBOL_GPL(zs_malloc);
static void obj_free(struct size_class *class, unsigned long obj)
{
struct link_free *link;
- struct page *first_page, *f_page;
- unsigned long f_objidx, f_offset;
+ struct zspage *zspage;
+ struct page *f_page;
+ unsigned long f_offset;
+ unsigned int f_objidx;
void *vaddr;
obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
- first_page = get_first_page(f_page);
-
- f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+ f_offset = (class->size * f_objidx) & ~PAGE_MASK;
+ zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
- link->next = first_page->freelist;
- if (class->huge)
- set_page_private(first_page, 0);
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
kunmap_atomic(vaddr);
- first_page->freelist = (void *)obj;
- first_page->inuse--;
+ set_freeobj(zspage, f_objidx);
+ mod_zspage_inuse(zspage, -1);
zs_stat_dec(class, OBJ_USED, 1);
}
void zs_free(struct zs_pool *pool, unsigned long handle)
{
- struct page *first_page, *f_page;
- unsigned long obj, f_objidx;
+ struct zspage *zspage;
+ struct page *f_page;
+ unsigned long obj;
+ unsigned int f_objidx;
int class_idx;
struct size_class *class;
enum fullness_group fullness;
+ bool isolated;
if (unlikely(!handle))
return;
@@ -1482,25 +1645,31 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
pin_tag(handle);
obj = handle_to_obj(handle);
obj_to_location(obj, &f_page, &f_objidx);
- first_page = get_first_page(f_page);
+ zspage = get_zspage(f_page);
- get_zspage_mapping(first_page, &class_idx, &fullness);
+ migrate_read_lock(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fullness);
class = pool->size_class[class_idx];
spin_lock(&class->lock);
obj_free(class, obj);
- fullness = fix_fullness_group(class, first_page);
- if (fullness == ZS_EMPTY) {
- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
- free_zspage(first_page);
+ fullness = fix_fullness_group(class, zspage);
+ if (fullness != ZS_EMPTY) {
+ migrate_read_unlock(zspage);
+ goto out;
}
+
+ isolated = is_zspage_isolated(zspage);
+ migrate_read_unlock(zspage);
+ /* If zspage is isolated, zs_page_putback will free the zspage */
+ if (likely(!isolated))
+ free_zspage(pool, class, zspage);
+out:
+
spin_unlock(&class->lock);
unpin_tag(handle);
-
- free_handle(pool, handle);
+ cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1508,7 +1677,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
unsigned long src)
{
struct page *s_page, *d_page;
- unsigned long s_objidx, d_objidx;
+ unsigned int s_objidx, d_objidx;
unsigned long s_off, d_off;
void *s_addr, *d_addr;
int s_size, d_size, size;
@@ -1519,8 +1688,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
obj_to_location(src, &s_page, &s_objidx);
obj_to_location(dst, &d_page, &d_objidx);
- s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
- d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+ s_off = (class->size * s_objidx) & ~PAGE_MASK;
+ d_off = (class->size * d_objidx) & ~PAGE_MASK;
if (s_off + class->size > PAGE_SIZE)
s_size = PAGE_SIZE - s_off;
@@ -1579,12 +1748,11 @@ static unsigned long find_alloced_obj(struct size_class *class,
unsigned long handle = 0;
void *addr = kmap_atomic(page);
- if (!is_first_page(page))
- offset = page->index;
+ offset = get_first_obj_offset(page);
offset += class->size * index;
while (offset < PAGE_SIZE) {
- head = obj_to_head(class, page, addr + offset);
+ head = obj_to_head(page, addr + offset);
if (head & OBJ_ALLOCATED_TAG) {
handle = head & ~OBJ_ALLOCATED_TAG;
if (trypin_tag(handle))
@@ -1601,7 +1769,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
}
struct zs_compact_control {
- /* Source page for migration which could be a subpage of zspage. */
+ /* Source page for migration which could be a subpage of zspage */
struct page *s_page;
/* Destination page for migration which should be a first page
* of zspage. */
@@ -1632,14 +1800,14 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
/* Stop if there is no more space */
- if (zspage_full(d_page)) {
+ if (zspage_full(class, get_zspage(d_page))) {
unpin_tag(handle);
ret = -ENOMEM;
break;
}
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(class, d_page, handle);
+ free_obj = obj_malloc(class, get_zspage(d_page), handle);
zs_object_copy(class, free_obj, used_obj);
index++;
/*
@@ -1661,68 +1829,422 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
return ret;
}
-static struct page *isolate_target_page(struct size_class *class)
+static struct zspage *isolate_zspage(struct size_class *class, bool source)
{
int i;
- struct page *page;
+ struct zspage *zspage;
+ enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
- for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
- page = class->fullness_list[i];
- if (page) {
- remove_zspage(class, i, page);
- break;
+ if (!source) {
+ fg[0] = ZS_ALMOST_FULL;
+ fg[1] = ZS_ALMOST_EMPTY;
+ }
+
+ for (i = 0; i < 2; i++) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
+ struct zspage, list);
+ if (zspage) {
+ VM_BUG_ON(is_zspage_isolated(zspage));
+ remove_zspage(class, zspage, fg[i]);
+ return zspage;
}
}
- return page;
+ return zspage;
}
/*
- * putback_zspage - add @first_page into right class's fullness list
- * @pool: target pool
+ * putback_zspage - add @zspage into right class's fullness list
* @class: destination class
- * @first_page: target page
+ * @zspage: target zspage
*
- * Return @fist_page's fullness_group
+ * Return @zspage's fullness_group
*/
-static enum fullness_group putback_zspage(struct zs_pool *pool,
- struct size_class *class,
- struct page *first_page)
+static enum fullness_group putback_zspage(struct size_class *class,
+ struct zspage *zspage)
{
enum fullness_group fullness;
- fullness = get_fullness_group(first_page);
- insert_zspage(class, fullness, first_page);
- set_zspage_mapping(first_page, class->index, fullness);
+ VM_BUG_ON(is_zspage_isolated(zspage));
- if (fullness == ZS_EMPTY) {
- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
+ fullness = get_fullness_group(class, zspage);
+ insert_zspage(class, zspage, fullness);
+ set_zspage_mapping(zspage, class->index, fullness);
+
+ return fullness;
+}
+
+#ifdef CONFIG_COMPACTION
+static struct dentry *zs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+
+ return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+}
+
+static struct file_system_type zsmalloc_fs = {
+ .name = "zsmalloc",
+ .mount = zs_mount,
+ .kill_sb = kill_anon_super,
+};
+
+static int zsmalloc_mount(void)
+{
+ int ret = 0;
- free_zspage(first_page);
+ zsmalloc_mnt = kern_mount(&zsmalloc_fs);
+ if (IS_ERR(zsmalloc_mnt))
+ ret = PTR_ERR(zsmalloc_mnt);
+
+ return ret;
+}
+
+static void zsmalloc_unmount(void)
+{
+ kern_unmount(zsmalloc_mnt);
+}
+
+static void migrate_lock_init(struct zspage *zspage)
+{
+ rwlock_init(&zspage->lock);
+}
+
+static void migrate_read_lock(struct zspage *zspage)
+{
+ read_lock(&zspage->lock);
+}
+
+static void migrate_read_unlock(struct zspage *zspage)
+{
+ read_unlock(&zspage->lock);
+}
+
+static void migrate_write_lock(struct zspage *zspage)
+{
+ write_lock(&zspage->lock);
+}
+
+static void migrate_write_unlock(struct zspage *zspage)
+{
+ write_unlock(&zspage->lock);
+}
+
+/* Number of isolated subpages for *page migration* in this zspage */
+static void inc_zspage_isolation(struct zspage *zspage)
+{
+ zspage->isolated++;
+}
+
+static void dec_zspage_isolation(struct zspage *zspage)
+{
+ zspage->isolated--;
+}
+
+static void replace_sub_page(struct size_class *class, struct zspage *zspage,
+ struct page *newpage, struct page *oldpage)
+{
+ struct page *page;
+ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ int idx = 0;
+
+ page = get_first_page(zspage);
+ do {
+ if (page == oldpage)
+ pages[idx] = newpage;
+ else
+ pages[idx] = page;
+ idx++;
+ } while ((page = get_next_page(page)) != NULL);
+
+ create_page_chain(class, zspage, pages);
+ set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+ if (unlikely(PageHugeObject(oldpage)))
+ newpage->index = oldpage->index;
+ __SetPageMovable(newpage, page_mapping(oldpage));
+}
+
+bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage;
+ struct address_space *mapping;
+
+ /*
+ * Page is locked so zspage couldn't be destroyed. For detail, look at
+ * lock_zspage in free_zspage.
+ */
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+
+ /*
+ * Without the class lock, fullness may be stale. class_idx is still
+ * valid because it is constant until the page is freed, so fullness
+ * must be read again under the class lock.
+ */
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ mapping = page_mapping(page);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ if (get_zspage_inuse(zspage) == 0) {
+ spin_unlock(&class->lock);
+ return false;
}
- return fullness;
+ /* zspage is isolated for object migration */
+ if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ spin_unlock(&class->lock);
+ return false;
+ }
+
+ /*
+ * If this is the first isolation for the zspage, remove the zspage from
+ * the size_class to prevent further object allocation from it.
+ */
+ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ remove_zspage(class, zspage, fullness);
+ }
+
+ inc_zspage_isolation(zspage);
+ spin_unlock(&class->lock);
+
+ return true;
}
-static struct page *isolate_source_page(struct size_class *class)
+int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage;
+ struct page *dummy;
+ void *s_addr, *d_addr, *addr;
+ int offset, pos;
+ unsigned long handle, head;
+ unsigned long old_obj, new_obj;
+ unsigned int obj_idx;
+ int ret = -EAGAIN;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+
+ /* Concurrent compactor cannot migrate any subpage in zspage */
+ migrate_write_lock(zspage);
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+ offset = get_first_obj_offset(page);
+
+ spin_lock(&class->lock);
+ if (!get_zspage_inuse(zspage)) {
+ ret = -EBUSY;
+ goto unlock_class;
+ }
+
+ pos = offset;
+ s_addr = kmap_atomic(page);
+ while (pos < PAGE_SIZE) {
+ head = obj_to_head(page, s_addr + pos);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!trypin_tag(handle))
+ goto unpin_objects;
+ }
+ pos += class->size;
+ }
+
+ /*
+ * Here, no user can access any object in the zspage, so let's move.
+ */
+ d_addr = kmap_atomic(newpage);
+ memcpy(d_addr, s_addr, PAGE_SIZE);
+ kunmap_atomic(d_addr);
+
+ for (addr = s_addr + offset; addr < s_addr + pos;
+ addr += class->size) {
+ head = obj_to_head(page, addr);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!testpin_tag(handle))
+ BUG();
+
+ old_obj = handle_to_obj(handle);
+ obj_to_location(old_obj, &dummy, &obj_idx);
+ new_obj = (unsigned long)location_to_obj(newpage,
+ obj_idx);
+ new_obj |= BIT(HANDLE_PIN_BIT);
+ record_obj(handle, new_obj);
+ }
+ }
+
+ replace_sub_page(class, zspage, newpage, page);
+ get_page(newpage);
+
+ dec_zspage_isolation(zspage);
+
+ /*
+ * Page migration is done, so put the isolated zspage back on the
+ * list if @page was the last isolated subpage in the zspage.
+ */
+ if (!is_zspage_isolated(zspage))
+ putback_zspage(class, zspage);
+
+ reset_page(page);
+ put_page(page);
+ page = newpage;
+
+ ret = MIGRATEPAGE_SUCCESS;
+unpin_objects:
+ for (addr = s_addr + offset; addr < s_addr + pos;
+ addr += class->size) {
+ head = obj_to_head(page, addr);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!testpin_tag(handle))
+ BUG();
+ unpin_tag(handle);
+ }
+ }
+ kunmap_atomic(s_addr);
+unlock_class:
+ spin_unlock(&class->lock);
+ migrate_write_unlock(zspage);
+
+ return ret;
+}
+
+void zs_page_putback(struct page *page)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fg;
+ struct address_space *mapping;
+ struct zspage *zspage;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+ get_zspage_mapping(zspage, &class_idx, &fg);
+ mapping = page_mapping(page);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ dec_zspage_isolation(zspage);
+ if (!is_zspage_isolated(zspage)) {
+ fg = putback_zspage(class, zspage);
+ /*
+ * Due to page_lock, we cannot free zspage immediately
+ * so let's defer.
+ */
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+ }
+ spin_unlock(&class->lock);
+}
+
+const struct address_space_operations zsmalloc_aops = {
+ .isolate_page = zs_page_isolate,
+ .migratepage = zs_page_migrate,
+ .putback_page = zs_page_putback,
+};
+
+static int zs_register_migration(struct zs_pool *pool)
+{
+ pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &zsmalloc_aops;
+ return 0;
+}
+
+static void zs_unregister_migration(struct zs_pool *pool)
+{
+ flush_work(&pool->free_work);
+ if (pool->inode)
+ iput(pool->inode);
+}
+
+/*
+ * Caller should hold page_lock of all pages in the zspage
+ * In here, we cannot use zspage meta data.
+ */
+static void async_free_zspage(struct work_struct *work)
{
int i;
- struct page *page = NULL;
+ struct size_class *class;
+ unsigned int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage, *tmp;
+ LIST_HEAD(free_pages);
+ struct zs_pool *pool = container_of(work, struct zs_pool,
+ free_work);
- for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
- page = class->fullness_list[i];
- if (!page)
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+ if (class->index != i)
continue;
- remove_zspage(class, i, page);
- break;
+ spin_lock(&class->lock);
+ list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
+ spin_unlock(&class->lock);
+ }
+
+
+ list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
+ list_del(&zspage->list);
+ lock_zspage(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ VM_BUG_ON(fullness != ZS_EMPTY);
+ class = pool->size_class[class_idx];
+ spin_lock(&class->lock);
+ __free_zspage(pool, pool->size_class[class_idx], zspage);
+ spin_unlock(&class->lock);
}
+};
+
+static void kick_deferred_free(struct zs_pool *pool)
+{
+ schedule_work(&pool->free_work);
+}
+
+static void init_deferred_free(struct zs_pool *pool)
+{
+ INIT_WORK(&pool->free_work, async_free_zspage);
+}
+
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
- return page;
+ do {
+ WARN_ON(!trylock_page(page));
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
}
+#endif
/*
*
@@ -1748,20 +2270,20 @@ static unsigned long zs_can_compact(struct size_class *class)
static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
struct zs_compact_control cc;
- struct page *src_page;
- struct page *dst_page = NULL;
+ struct zspage *src_zspage;
+ struct zspage *dst_zspage = NULL;
spin_lock(&class->lock);
- while ((src_page = isolate_source_page(class))) {
+ while ((src_zspage = isolate_zspage(class, true))) {
if (!zs_can_compact(class))
break;
cc.index = 0;
- cc.s_page = src_page;
+ cc.s_page = get_first_page(src_zspage);
- while ((dst_page = isolate_target_page(class))) {
- cc.d_page = dst_page;
+ while ((dst_zspage = isolate_zspage(class, false))) {
+ cc.d_page = get_first_page(dst_zspage);
/*
* If there is no more space in dst_page, resched
* and see if anyone had allocated another zspage.
@@ -1769,23 +2291,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
if (!migrate_zspage(pool, class, &cc))
break;
- putback_zspage(pool, class, dst_page);
+ putback_zspage(class, dst_zspage);
}
/* Stop if we couldn't find slot */
- if (dst_page == NULL)
+ if (dst_zspage == NULL)
break;
- putback_zspage(pool, class, dst_page);
- if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+ putback_zspage(class, dst_zspage);
+ if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
+ free_zspage(pool, class, src_zspage);
pool->stats.pages_compacted += class->pages_per_zspage;
+ }
spin_unlock(&class->lock);
cond_resched();
spin_lock(&class->lock);
}
- if (src_page)
- putback_zspage(pool, class, src_page);
+ if (src_zspage)
+ putback_zspage(class, src_zspage);
spin_unlock(&class->lock);
}
@@ -1892,6 +2416,7 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool)
return NULL;
+ init_deferred_free(pool);
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
@@ -1903,7 +2428,7 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
- if (create_handle_cache(pool))
+ if (create_cache(pool))
goto err;
/*
@@ -1914,6 +2439,7 @@ struct zs_pool *zs_create_pool(const char *name)
int size;
int pages_per_zspage;
struct size_class *class;
+ int fullness = 0;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
@@ -1943,11 +2469,13 @@ struct zs_pool *zs_create_pool(const char *name)
class->size = size;
class->index = i;
class->pages_per_zspage = pages_per_zspage;
- if (pages_per_zspage == 1 &&
- get_maxobj_per_zspage(size, pages_per_zspage) == 1)
- class->huge = true;
+ class->objs_per_zspage = class->pages_per_zspage *
+ PAGE_SIZE / class->size;
spin_lock_init(&class->lock);
pool->size_class[i] = class;
+ for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
+ fullness++)
+ INIT_LIST_HEAD(&class->fullness_list[fullness]);
prev_class = class;
}
@@ -1955,6 +2483,9 @@ struct zs_pool *zs_create_pool(const char *name)
/* debug only, don't abort if it fails */
zs_pool_stat_create(pool, name);
+ if (zs_register_migration(pool))
+ goto err;
+
/*
* Not critical, we still can use the pool
* and user can trigger compaction manually.
@@ -1974,6 +2505,7 @@ void zs_destroy_pool(struct zs_pool *pool)
int i;
zs_unregister_shrinker(pool);
+ zs_unregister_migration(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < zs_size_classes; i++) {
@@ -1986,8 +2518,8 @@ void zs_destroy_pool(struct zs_pool *pool)
if (class->index != i)
continue;
- for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
- if (class->fullness_list[fg]) {
+ for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
+ if (!list_empty(&class->fullness_list[fg])) {
pr_info("Freeing non-empty class with size %db, fullness group %d\n",
class->size, fg);
}
@@ -1995,7 +2527,7 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
- destroy_handle_cache(pool);
+ destroy_cache(pool);
kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
@@ -2004,7 +2536,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
- int ret = zs_register_cpu_notifier();
+ int ret;
+
+ ret = zsmalloc_mount();
+ if (ret)
+ goto out;
+
+ ret = zs_register_cpu_notifier();
if (ret)
goto notifier_fail;
@@ -2021,7 +2559,8 @@ static int __init zs_init(void)
notifier_fail:
zs_unregister_cpu_notifier();
-
+ zsmalloc_unmount();
+out:
return ret;
}
@@ -2030,6 +2569,7 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
+ zsmalloc_unmount();
zs_unregister_cpu_notifier();
zs_stat_exit();