diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 17 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/compaction.c | 142 | ||||
-rw-r--r-- | mm/frontswap.c | 314 | ||||
-rw-r--r-- | mm/internal.h | 9 | ||||
-rw-r--r-- | mm/migrate.c | 5 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 8 | ||||
-rw-r--r-- | mm/page_io.c | 12 | ||||
-rw-r--r-- | mm/shmem.c | 57 | ||||
-rw-r--r-- | mm/swapfile.c | 54 |
12 files changed, 457 insertions, 168 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b2176374b98e..82fed4eb2b6f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -389,3 +389,20 @@ config CLEANCACHE in a negligible performance hit. If unsure, say Y to enable cleancache + +config FRONTSWAP + bool "Enable frontswap to cache swap pages if tmem is present" + depends on SWAP + default n + help + Frontswap is so named because it can be thought of as the opposite + of a "backing" store for a swap device. The data is stored into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. When space in transcendent memory is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit + and swap data is stored as normal on the matching swap device. + + If unsure, say Y to enable frontswap. diff --git a/mm/Makefile b/mm/Makefile index a156285ce88d..2e2fbbefb99f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/compaction.c b/mm/compaction.c index 4ac338af5120..7ea259d82a99 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -304,8 +304,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * satisfies the allocation */ pageblock_nr = low_pfn >> pageblock_order; - if (cc->mode != COMPACT_SYNC && - last_pageblock_nr != pageblock_nr && + if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; @@ -326,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; lruvec = mem_cgroup_page_lruvec(page, zone); @@ -361,90 +360,27 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* - * Returns true if MIGRATE_UNMOVABLE pageblock was successfully - * converted to MIGRATE_MOVABLE type, false otherwise. - */ -static bool rescue_unmovable_pageblock(struct page *page) -{ - unsigned long pfn, start_pfn, end_pfn; - struct page *start_page, *end_page; - - pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages; - - start_page = pfn_to_page(start_pfn); - end_page = pfn_to_page(end_pfn); - - /* Do not deal with pageblocks that overlap zones */ - if (page_zone(start_page) != page_zone(end_page)) - return false; - - for (page = start_page, pfn = start_pfn; page < end_page; pfn++, - page++) { - if (!pfn_valid_within(pfn)) - continue; - - if (PageBuddy(page)) { - int order = page_order(page); - - pfn += (1 << order) - 1; - page += (1 << order) - 1; - - continue; - } else if (page_count(page) == 0 || PageLRU(page)) - continue; - - return false; - } - - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE); - return true; -} -enum smt_result { - GOOD_AS_MIGRATION_TARGET, - FAIL_UNMOVABLE_TARGET, - FAIL_BAD_TARGET, -}; - -/* - * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block - * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page - * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise. - */ -static enum smt_result suitable_migration_target(struct page *page, - struct compact_control *cc) +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) { int migratetype = get_pageblock_migratetype(page); /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return FAIL_BAD_TARGET; + return false; /* If the page is a large free page, then allow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return GOOD_AS_MIGRATION_TARGET; + return true; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (cc->mode != COMPACT_ASYNC_UNMOVABLE && - migrate_async_suitable(migratetype)) - return GOOD_AS_MIGRATION_TARGET; - - if (cc->mode == COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE) - return FAIL_UNMOVABLE_TARGET; - - if (cc->mode != COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE && - rescue_unmovable_pageblock(page)) - return GOOD_AS_MIGRATION_TARGET; + if (migrate_async_suitable(migratetype)) + return true; /* Otherwise skip the block */ - return FAIL_BAD_TARGET; + return false; } /* @@ -478,13 +414,6 @@ static void isolate_freepages(struct zone *zone, zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; /* - * isolate_freepages() may be called more than once during - * compact_zone_order() run and we want only the most recent - * count. - */ - cc->nr_pageblocks_skipped = 0; - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. @@ -492,7 +421,6 @@ static void isolate_freepages(struct zone *zone, for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; - enum smt_result ret; if (!pfn_valid(pfn)) continue; @@ -509,12 +437,9 @@ static void isolate_freepages(struct zone *zone, continue; /* Check the block is suitable for migration */ - ret = suitable_migration_target(page, cc); - if (ret != GOOD_AS_MIGRATION_TARGET) { - if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + if (!suitable_migration_target(page)) continue; - } + /* * Found a block suitable for isolating free pages from. Now * we disabled interrupts, double check things are ok and @@ -523,14 +448,12 @@ static void isolate_freepages(struct zone *zone, */ isolated = 0; spin_lock_irqsave(&zone->lock, flags); - ret = suitable_migration_target(page, cc); - if (ret == GOOD_AS_MIGRATION_TARGET) { + if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, freelist, false); nr_freepages += isolated; - } else if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + } spin_unlock_irqrestore(&zone->lock, flags); /* @@ -762,9 +685,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)&cc->freepages, false, - (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT - : MIGRATE_ASYNC); + (unsigned long)cc, false, + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -793,8 +715,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - enum compact_mode mode, - unsigned long *nr_pageblocks_skipped) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -802,17 +723,12 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .mode = mode, + .sync = sync, }; - unsigned long rc; - INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - rc = compact_zone(zone, &cc); - *nr_pageblocks_skipped = cc.nr_pageblocks_skipped; - - return rc; + return compact_zone(zone, &cc); } int sysctl_extfrag_threshold = 500; @@ -837,8 +753,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; - unsigned long nr_pageblocks_skipped; - enum compact_mode mode; /* * Check whether it is worth even starting compaction. The order check is @@ -855,22 +769,12 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE; -retry: - status = compact_zone_order(zone, order, gfp_mask, mode, - &nr_pageblocks_skipped); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; - - if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) { - if (nr_pageblocks_skipped) { - mode = COMPACT_ASYNC_UNMOVABLE; - goto retry; - } - } } return rc; @@ -904,7 +808,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (ok && cc->order > zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. */ - else if (!ok && cc->mode == COMPACT_SYNC) + else if (!ok && cc->sync) defer_compaction(zone, cc->order); } @@ -919,7 +823,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .mode = COMPACT_ASYNC_MOVABLE, + .sync = false, }; return __compact_pgdat(pgdat, &cc); @@ -929,7 +833,7 @@ static int compact_node(int nid) { struct compact_control cc = { .order = -1, - .mode = COMPACT_SYNC, + .sync = true, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..e25025574a02 --- /dev/null +++ b/mm/frontswap.c @@ -0,0 +1,314 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops frontswap_ops __read_mostly; + +/* + * This global enablement flag reduces overhead on systems where frontswap_ops + * has not been registered, so is preferred to the slower alternative: a + * function call that checks a non-global. + */ +bool frontswap_enabled __read_mostly; +EXPORT_SYMBOL(frontswap_enabled); + +/* + * If enabled, frontswap_store will return failure even on success. As + * a result, the swap subsystem will always write the page to swap, in + * effect converting frontswap into a writethrough cache. In this mode, + * there is no direct reduction in swap writes, but a frontswap backend + * can unilaterally "reclaim" any pages in use with no data loss, thus + * providing increases control over maximum memory usage due to frontswap. + */ +static bool frontswap_writethrough_enabled __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). These are for information only so are not protected + * against increment races. + */ +static u64 frontswap_loads; +static u64 frontswap_succ_stores; +static u64 frontswap_failed_stores; +static u64 frontswap_invalidates; + +static inline void inc_frontswap_loads(void) { + frontswap_loads++; +} +static inline void inc_frontswap_succ_stores(void) { + frontswap_succ_stores++; +} +static inline void inc_frontswap_failed_stores(void) { + frontswap_failed_stores++; +} +static inline void inc_frontswap_invalidates(void) { + frontswap_invalidates++; +} +#else +static inline void inc_frontswap_loads(void) { } +static inline void inc_frontswap_succ_stores(void) { } +static inline void inc_frontswap_failed_stores(void) { } +static inline void inc_frontswap_invalidates(void) { } +#endif +/* + * Register operations for frontswap, returning previous thus allowing + * detection of multiple backends and possible nesting. + */ +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +{ + struct frontswap_ops old = frontswap_ops; + + frontswap_ops = *ops; + frontswap_enabled = true; + return old; +} +EXPORT_SYMBOL(frontswap_register_ops); + +/* + * Enable/disable frontswap writethrough (see above). + */ +void frontswap_writethrough(bool enable) +{ + frontswap_writethrough_enabled = enable; +} +EXPORT_SYMBOL(frontswap_writethrough); + +/* + * Called when a swap device is swapon'd. + */ +void __frontswap_init(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + if (frontswap_enabled) + (*frontswap_ops.init)(type); +} +EXPORT_SYMBOL(__frontswap_init); + +/* + * "Store" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implmentation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. + */ +int __frontswap_store(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + dup = 1; + ret = (*frontswap_ops.store)(type, offset, page); + if (ret == 0) { + frontswap_set(sis, offset); + inc_frontswap_succ_stores(); + if (!dup) + atomic_inc(&sis->frontswap_pages); + } else if (dup) { + /* + failed dup always results in automatic invalidate of + the (older) page from frontswap + */ + frontswap_clear(sis, offset); + atomic_dec(&sis->frontswap_pages); + inc_frontswap_failed_stores(); + } else + inc_frontswap_failed_stores(); + if (frontswap_writethrough_enabled) + /* report failure so swap also writes to swap device */ + ret = -1; + return ret; +} +EXPORT_SYMBOL(__frontswap_store); + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. + */ +int __frontswap_load(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + ret = (*frontswap_ops.load)(type, offset, page); + if (ret == 0) + inc_frontswap_loads(); + return ret; +} +EXPORT_SYMBOL(__frontswap_load); + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) { + (*frontswap_ops.invalidate_page)(type, offset); + atomic_dec(&sis->frontswap_pages); + frontswap_clear(sis, offset); + inc_frontswap_invalidates(); + } +} +EXPORT_SYMBOL(__frontswap_invalidate_page); + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype. + */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + (*frontswap_ops.invalidate_area)(type); + atomic_set(&sis->frontswap_pages, 0); + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); +} +EXPORT_SYMBOL(__frontswap_invalidate_area); + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages = 0, total_pages_to_unuse; + unsigned long pages = 0, pages_to_unuse = 0; + int type; + bool locked = false; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + locked = true; + total_pages = 0; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + total_pages += atomic_read(&si->frontswap_pages); + } + if (total_pages <= target_pages) + goto out; + total_pages_to_unuse = total_pages - target_pages; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) + pages = pages_to_unuse = total_pages_to_unuse; + else { + pages = si_frontswap_pages; + pages_to_unuse = 0; /* unuse all */ + } + /* ensure there is enough RAM to fetch pages from frontswap */ + if (security_vm_enough_memory_mm(current->mm, pages)) + continue; + vm_unacct_memory(pages); + break; + } + if (type < 0) + goto out; + locked = false; + spin_unlock(&swap_lock); + try_to_unuse(type, true, pages_to_unuse); +out: + if (locked) + spin_unlock(&swap_lock); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that backend drivers can + * determine current usage without reading debugfs. + */ +unsigned long frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + spin_lock(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += atomic_read(&si->frontswap_pages); + } + spin_unlock(&swap_lock); + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("frontswap", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); + debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", S_IRUGO, root, + &frontswap_failed_stores); + debugfs_create_u64("invalidates", S_IRUGO, + root, &frontswap_invalidates); +#endif + return 0; +} + +module_init(init_frontswap); diff --git a/mm/internal.h b/mm/internal.h index 5cbb78190041..2ba87fbfb75b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -94,9 +94,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern void set_pageblock_migratetype(struct page *page, int migratetype); -extern int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -104,7 +101,6 @@ extern bool is_free_buddy_page(struct page *page); #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#include <linux/compaction.h> /* * in mm/compaction.c @@ -123,14 +119,11 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - enum compact_mode mode; /* Compaction mode */ + bool sync; /* Synchronous migration */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - /* Number of UNMOVABLE destination pageblocks skipped during scan */ - unsigned long nr_pageblocks_skipped; }; unsigned long diff --git a/mm/migrate.c b/mm/migrate.c index ab81d482ae6f..be26d5cbe56b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) * is actually a signal that all of the page has become dirty. * Whereas only part of our page may be dirty. */ - __set_page_dirty_nobuffers(newpage); + if (PageSwapBacked(page)) + SetPageDirty(newpage); + else + __set_page_dirty_nobuffers(newpage); } mlock_migrate_page(newpage, page); diff --git a/mm/nommu.c b/mm/nommu.c index c4acfbc09972..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e19677360..416637f0e924 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -183,7 +183,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points; + long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -223,7 +223,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ - return points ? points : 1; + return points > 0 ? points : 1; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6092f331b32e..44030096da63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -void set_pageblock_migratetype(struct page *page, int migratetype) +static void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone, return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -5651,7 +5651,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .mode = COMPACT_SYNC, + .sync = true, }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..34f02923744c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/swapops.h> #include <linux/writeback.h> +#include <linux/frontswap.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } + if (frontswap_store(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff --git a/mm/shmem.c b/mm/shmem.c index 585bd220a21e..a15a466d0d1d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -683,10 +683,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, mutex_lock(&shmem_swaplist_mutex); /* * We needed to drop mutex to make that restrictive page - * allocation; but the inode might already be freed by now, - * and we cannot refer to inode or mapping or info to check. - * However, we do hold page lock on the PageSwapCache page, - * so can check if that still has our reference remaining. + * allocation, but the inode might have been freed while we + * dropped it: although a racing shmem_evict_inode() cannot + * complete without emptying the radix_tree, our page lock + * on this swapcache page is not enough to prevent that - + * free_swap_and_cache() of our swap entry will only + * trylock_page(), removing swap from radix_tree whatever. + * + * We must not proceed to shmem_add_to_page_cache() if the + * inode has been freed, but of course we cannot rely on + * inode or mapping or info to check that. However, we can + * safely check if our swap entry is still in use (and here + * it can't have got reused for another page): if it's still + * in use, then the inode cannot have been freed yet, and we + * can safely proceed (if it's no longer in use, that tells + * nothing about the inode, but we don't need to unuse swap). */ if (!page_swapcount(*pagep)) error = -ENOENT; @@ -730,9 +741,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) /* * There's a faint possibility that swap page was replaced before - * caller locked it: it will come back later with the right page. + * caller locked it: caller will come back later with the right page. */ - if (unlikely(!PageSwapCache(page))) + if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) goto out; /* @@ -995,21 +1006,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, newpage = shmem_alloc_page(gfp, info, index); if (!newpage) return -ENOMEM; - VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); - *pagep = newpage; page_cache_get(newpage); copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); - VM_BUG_ON(!PageLocked(oldpage)); __set_page_locked(newpage); - VM_BUG_ON(!PageUptodate(oldpage)); SetPageUptodate(newpage); - VM_BUG_ON(!PageSwapBacked(oldpage)); SetPageSwapBacked(newpage); - VM_BUG_ON(!swap_index); set_page_private(newpage, swap_index); - VM_BUG_ON(!PageSwapCache(oldpage)); SetPageSwapCache(newpage); /* @@ -1019,13 +1024,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, spin_lock_irq(&swap_mapping->tree_lock); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } spin_unlock_irq(&swap_mapping->tree_lock); - BUG_ON(error); - mem_cgroup_replace_page_cache(oldpage, newpage); - lru_cache_add_anon(newpage); + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + oldpage = newpage; + } else { + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + *pagep = newpage; + } ClearPageSwapCache(oldpage); set_page_private(oldpage, 0); @@ -1033,7 +1049,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, unlock_page(oldpage); page_cache_release(oldpage); page_cache_release(oldpage); - return 0; + return error; } /* @@ -1107,7 +1123,8 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); - if (!PageSwapCache(page) || page->mapping) { + if (!PageSwapCache(page) || page_private(page) != swap.val || + page->mapping) { error = -EEXIST; /* try again */ goto failed; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 457b10baef59..de5bc51c4a66 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,6 +31,8 @@ #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); if ((p->flags & SWP_BLKDEV) && disk->fops->swap_slot_free_notify) disk->fops->swap_slot_free_notify(p->bdev, offset); @@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm, } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } count = si->swap_map[i]; if (count && swap_count(count) != SWAP_MAP_BAD) break; @@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. + * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; @@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type) * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type) * interactive performance. */ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } } mmput(start_mm); @@ -1486,7 +1504,8 @@ bad_bmap: } static void enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + unsigned long *frontswap_map) { int i, prev; @@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; nr_swap_pages += p->pages; total_swap_pages += p->pages; @@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; + frontswap_init(p->type); spin_unlock(&swap_lock); } @@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); - err = try_to_unuse(type); + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { @@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) * sys_swapoff for this swap_info_struct at this point. */ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map); + enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); goto out_dput; } @@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + frontswap_invalidate_area(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(frontswap_map_get(p)); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -1988,6 +2011,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2071,6 +2095,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(maxpages / sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2086,14 +2113,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map); + enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); |