diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 17 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/compaction.c | 605 | ||||
-rw-r--r-- | mm/filemap.c | 57 | ||||
-rw-r--r-- | mm/highmem.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 12 | ||||
-rw-r--r-- | mm/ksm.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 689 | ||||
-rw-r--r-- | mm/memory.c | 13 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 36 | ||||
-rw-r--r-- | mm/mempolicy.c | 227 | ||||
-rw-r--r-- | mm/migrate.c | 74 | ||||
-rw-r--r-- | mm/mincore.c | 263 | ||||
-rw-r--r-- | mm/nommu.c | 32 | ||||
-rw-r--r-- | mm/oom_kill.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 317 | ||||
-rw-r--r-- | mm/readahead.c | 2 | ||||
-rw-r--r-- | mm/rmap.c | 40 | ||||
-rw-r--r-- | mm/shmem.c | 111 | ||||
-rw-r--r-- | mm/slab.c | 51 | ||||
-rw-r--r-- | mm/slub.c | 6 | ||||
-rw-r--r-- | mm/sparse.c | 9 | ||||
-rw-r--r-- | mm/swap.c | 1 | ||||
-rw-r--r-- | mm/truncate.c | 10 | ||||
-rw-r--r-- | mm/vmscan.c | 213 | ||||
-rw-r--r-- | mm/vmstat.c | 253 |
26 files changed, 2403 insertions, 647 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 9c61158308dc..527136b22384 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -172,6 +172,15 @@ config SPLIT_PTLOCK_CPUS default "4" # +# support for memory compaction +config COMPACTION + bool "Allow for memory compaction" + select MIGRATION + depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + help + Allows the compaction of memory for the allocation of huge pages. + +# # support for page migration # config MIGRATION @@ -180,9 +189,11 @@ config MIGRATION depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE help Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful for - example on NUMA systems to put pages nearer to the processors accessing - the page. + while the virtual addresses are not changed. This is useful in + two situations. The first is on NUMA systems to put pages nearer + to the processors accessing. The second is when allocating huge + pages as migration can relocate pages to satisfy a huge page + allocation instead of reclaiming. config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT diff --git a/mm/Makefile b/mm/Makefile index 6c2a73a54a43..8982504bd03b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o diff --git a/mm/compaction.c b/mm/compaction.c new file mode 100644 index 000000000000..94cce51b0b35 --- /dev/null +++ b/mm/compaction.c @@ -0,0 +1,605 @@ +/* + * linux/mm/compaction.c + * + * Memory compaction for the reduction of external fragmentation. Note that + * this heavily depends upon page migration to do all the real heavy + * lifting + * + * Copyright IBM Corp. 
2007-2010 Mel Gorman <mel@csn.ul.ie> + */ +#include <linux/swap.h> +#include <linux/migrate.h> +#include <linux/compaction.h> +#include <linux/mm_inline.h> +#include <linux/backing-dev.h> +#include <linux/sysctl.h> +#include <linux/sysfs.h> +#include "internal.h" + +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + + /* Account for isolated anon and file pages */ + unsigned long nr_anon; + unsigned long nr_file; + + unsigned int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +}; + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long count = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + list_del(&page->lru); + __free_page(page); + count++; + } + + return count; +} + +/* Isolate free pages onto a private freelist. 
Must hold zone->lock */ +static unsigned long isolate_freepages_block(struct zone *zone, + unsigned long blockpfn, + struct list_head *freelist) +{ + unsigned long zone_end_pfn, end_pfn; + int total_isolated = 0; + struct page *cursor; + + /* Get the last PFN we should scan for free pages at */ + zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); + + /* Find the first usable PFN in the block to initialse page cursor */ + for (; blockpfn < end_pfn; blockpfn++) { + if (pfn_valid_within(blockpfn)) + break; + } + cursor = pfn_to_page(blockpfn); + + /* Isolate free pages. This assumes the block is valid */ + for (; blockpfn < end_pfn; blockpfn++, cursor++) { + int isolated, i; + struct page *page = cursor; + + if (!pfn_valid_within(blockpfn)) + continue; + + if (!PageBuddy(page)) + continue; + + /* Found a free page, break it into order-0 pages */ + isolated = split_free_page(page); + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ + if (isolated) { + blockpfn += isolated - 1; + cursor += isolated - 1; + } + } + + return total_isolated; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE, allow migration */ + if (migratetype == MIGRATE_MOVABLE) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable 
for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn; + unsigned long flags; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + pfn = cc->free_pfn; + low_pfn = cc->migrate_pfn + pageblock_nr_pages; + high_pfn = low_pfn; + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + spin_lock_irqsave(&zone->lock, flags); + for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* Found a block suitable for isolating free pages from */ + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + + /* + * Record the highest PFN we isolated pages from. 
When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) + high_pfn = max(high_pfn, pfn); + } + spin_unlock_irqrestore(&zone->lock, flags); + + /* split_free_page does not map the pages */ + list_for_each_entry(page, freelist, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } + + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; +} + +/* Update the number of anon and file isolated pages in the zone */ +static void acct_isolated(struct zone *zone, struct compact_control *cc) +{ + struct page *page; + unsigned int count[NR_LRU_LISTS] = { 0, }; + + list_for_each_entry(page, &cc->migratepages, lru) { + int lru = page_lru_base_type(page); + count[lru]++; + } + + cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); +} + +/* Similar to reclaim, but different enough that they don't share logic */ +static bool too_many_isolated(struct zone *zone) +{ + + unsigned long inactive, isolated; + + inactive = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_FILE) + + zone_page_state(zone, NR_ISOLATED_ANON); + + return isolated > inactive; +} + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. 
+ */ +static unsigned long isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + struct list_head *migratelist = &cc->migratepages; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return 0; + } + + /* + * Ensure that there are not too many pages isolated from the LRU + * list by either parallel reclaimers or compaction. If there are, + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(zone))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + if (fatal_signal_pending(current)) + return 0; + } + + /* Time to isolate some pages for migration */ + spin_lock_irq(&zone->lru_lock); + for (; low_pfn < end_pfn; low_pfn++) { + struct page *page; + if (!pfn_valid_within(low_pfn)) + continue; + + /* Get the page and skip if free */ + page = pfn_to_page(low_pfn); + if (PageBuddy(page)) + continue; + + /* Try isolate the page */ + if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) + continue; + + /* Successfully isolated */ + del_page_from_lru_list(zone, page, page_lru(page)); + list_add(&page->lru, migratelist); + mem_cgroup_del_lru(page); + cc->nr_migratepages++; + + /* Avoid isolating too much */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; + } + + acct_isolated(zone, cc); + + spin_unlock_irq(&zone->lru_lock); + cc->migrate_pfn = low_pfn; + + return cc->nr_migratepages; +} + +/* + * This is a migrate-callback that "allocates" freepages by taking pages + * from the isolated freelists in the block we are migrating to. 
+ */ +static struct page *compaction_alloc(struct page *migratepage, + unsigned long data, + int **result) +{ + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; + + /* Isolate free pages if necessary */ + if (list_empty(&cc->freepages)) { + isolate_freepages(cc->zone, cc); + + if (list_empty(&cc->freepages)) + return NULL; + } + + freepage = list_entry(cc->freepages.next, struct page, lru); + list_del(&freepage->lru); + cc->nr_freepages--; + + return freepage; +} + +/* + * We cannot control nr_migratepages and nr_freepages fully when migration is + * running as migrate_pages() has no knowledge of compact_control. When + * migration is complete, we count the number of pages on the lists by hand. + */ +static void update_nr_listpages(struct compact_control *cc) +{ + int nr_migratepages = 0; + int nr_freepages = 0; + struct page *page; + + list_for_each_entry(page, &cc->migratepages, lru) + nr_migratepages++; + list_for_each_entry(page, &cc->freepages, lru) + nr_freepages++; + + cc->nr_migratepages = nr_migratepages; + cc->nr_freepages = nr_freepages; +} + +static int compact_finished(struct zone *zone, + struct compact_control *cc) +{ + unsigned int order; + unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + + if (fatal_signal_pending(current)) + return COMPACT_PARTIAL; + + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) + return COMPACT_COMPLETE; + + /* Compaction run is not finished if the watermark is not met */ + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + return COMPACT_CONTINUE; + + if (cc->order == -1) + return COMPACT_CONTINUE; + + /* Direct compactor: Is a suitable page free? 
*/ + for (order = cc->order; order < MAX_ORDER; order++) { + /* Job done if page is free of the right migratetype */ + if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (order >= pageblock_order && zone->free_area[order].nr_free) + return COMPACT_PARTIAL; + } + + return COMPACT_CONTINUE; +} + +static int compact_zone(struct zone *zone, struct compact_control *cc) +{ + int ret; + + /* Setup to move all movable pages to the end of the zone */ + cc->migrate_pfn = zone->zone_start_pfn; + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); + + migrate_prep_local(); + + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + unsigned long nr_migrate, nr_remaining; + + if (!isolate_migratepages(zone, cc)) + continue; + + nr_migrate = cc->nr_migratepages; + migrate_pages(&cc->migratepages, compaction_alloc, + (unsigned long)cc, 0); + update_nr_listpages(cc); + nr_remaining = cc->nr_migratepages; + + count_vm_event(COMPACTBLOCKS); + count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); + if (nr_remaining) + count_vm_events(COMPACTPAGEFAILED, nr_remaining); + + /* Release LRU pages not migrated */ + if (!list_empty(&cc->migratepages)) { + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; + } + + } + + /* Release free pages and check accounting */ + cc->nr_freepages -= release_freepages(&cc->freepages); + VM_BUG_ON(cc->nr_freepages != 0); + + return ret; +} + +static unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask) +{ + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = order, + .migratetype = allocflags_to_migratetype(gfp_mask), + .zone = zone, + }; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + return compact_zone(zone, &cc); +} + +int sysctl_extfrag_threshold = 500; + +/** + * try_to_compact_pages - Direct 
compact to satisfy a high-order allocation + * @zonelist: The zonelist used for the current allocation + * @order: The order of the current allocation + * @gfp_mask: The GFP mask of the current allocation + * @nodemask: The allowed nodes to allocate from + * + * This is the main entry point for direct page compaction. + */ +unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + int may_enter_fs = gfp_mask & __GFP_FS; + int may_perform_io = gfp_mask & __GFP_IO; + unsigned long watermark; + struct zoneref *z; + struct zone *zone; + int rc = COMPACT_SKIPPED; + + /* + * Check whether it is worth even starting compaction. The order check is + * made because an assumption is made that the page allocator can satisfy + * the "cheaper" orders without taking special steps + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + return rc; + + count_vm_event(COMPACTSTALL); + + /* Compact each zone in the list */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + nodemask) { + int fragindex; + int status; + + /* + * Watermarks for order-0 must be met for compaction. Note + * the 2UL. This is because during migration, copies of + * pages need to be allocated and for a short time, the + * footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + continue; + + /* + * fragmentation index determines if allocation failures are + * due to low memory or external fragmentation + * + * index of -1 implies allocations might succeed depending + * on watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. 
+ */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + continue; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { + rc = COMPACT_PARTIAL; + break; + } + + status = compact_zone_order(zone, order, gfp_mask); + rc = max(status, rc); + + if (zone_watermark_ok(zone, order, watermark, 0, 0)) + break; + } + + return rc; +} + + +/* Compact all zones within a node */ +static int compact_node(int nid) +{ + int zoneid; + pg_data_t *pgdat; + struct zone *zone; + + if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) + return -EINVAL; + pgdat = NODE_DATA(nid); + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = -1, + }; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + return 0; +} + +/* Compact all nodes in the system */ +static int compact_nodes(void) +{ + int nid; + + for_each_online_node(nid) + compact_node(nid); + + return COMPACT_COMPLETE; +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return compact_nodes(); + + return 0; +} + +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +ssize_t sysfs_compact_node(struct 
sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + compact_node(dev->id); + + return count; +} +static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); + +int compaction_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_compact); +} + +void compaction_unregister_node(struct node *node) +{ + return sysdev_remove_file(&node->sysdev, &attr_compact); +} +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ diff --git a/mm/filemap.c b/mm/filemap.c index 140ebda9640f..20e5642e9f9f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -151,6 +151,7 @@ void remove_from_page_cache(struct page *page) spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); } +EXPORT_SYMBOL(remove_from_page_cache); static int sync_page(void *word) { @@ -441,7 +442,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, /* * Splice_read and readahead add shmem/tmpfs pages into the page cache * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * need to go on the anon lru below, and mem_cgroup_cache_charge * (called in add_to_page_cache) needs to know where they're going too. 
*/ if (mapping_cap_swap_backed(mapping)) @@ -452,7 +453,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, if (page_is_file_cache(page)) lru_cache_add_file(page); else - lru_cache_add_active_anon(page); + lru_cache_add_anon(page); } return ret; } @@ -461,9 +462,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } @@ -1099,6 +1106,12 @@ page_not_up_to_date_locked: } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); @@ -1263,7 +1276,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg; + unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; @@ -1290,21 +1303,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); } - if (retval > 0) + if (retval > 0) { *ppos = pos + retval; - if (retval) { + count -= retval; + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. 
+ */ + if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } + count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; + loff_t offset = 0; + + /* + * If we did a short DIO read we need to skip the section of the + * iov that we've already read data into. + */ + if (count) { + if (count > iov[seg].iov_len) { + count -= iov[seg].iov_len; + continue; + } + offset = count; + count = 0; + } desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; + desc.arg.buf = iov[seg].iov_base + offset; + desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0; diff --git a/mm/highmem.c b/mm/highmem.c index bed8a8bfd01f..66baa20f78f5 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,7 +422,7 @@ void __init page_address_init(void) #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) +#ifdef CONFIG_DEBUG_HIGHMEM void debug_kmap_atomic(enum km_type type) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c9e6bbf3772..54d42b009dbe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. 
This check ensures that reservations are @@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } @@ -318,14 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, struct anon_vma *anon_vma) { rmap_item->anon_vma = anon_vma; - atomic_inc(&anon_vma->ksm_refcount); + atomic_inc(&anon_vma->external_refcount); } static void drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty = list_empty(&anon_vma->head); spin_unlock(&anon_vma->lock); if (empty) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c8569bc298ff..c6ece0a57595 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -149,16 +149,35 @@ struct mem_cgroup_threshold { u64 threshold; }; +/* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below usage. */ - atomic_t current_threshold; + int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[0]; }; +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. 
+ */ + struct mem_cgroup_threshold_ary *spare; +}; + +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; + static void mem_cgroup_threshold(struct mem_cgroup *mem); +static void mem_cgroup_oom_notify(struct mem_cgroup *mem); /* * The memory controller data structure. The memory controller controls both @@ -207,6 +226,8 @@ struct mem_cgroup { atomic_t refcnt; unsigned int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; /* set when res.limit == memsw.limit */ bool memsw_is_minimum; @@ -215,17 +236,19 @@ struct mem_cgroup { struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_threshold_ary *thresholds; + struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_threshold_ary *memsw_thresholds; + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; - /* * percpu counter. */ @@ -239,6 +262,7 @@ struct mem_cgroup { */ enum move_type { MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ + MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ NR_MOVE_TYPE, }; @@ -255,6 +279,18 @@ static struct move_charge_struct { .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; +static bool move_anon(void) +{ + return test_bit(MOVE_CHARGE_TYPE_ANON, + &mc.to->move_charge_at_immigrate); +} + +static bool move_file(void) +{ + return test_bit(MOVE_CHARGE_TYPE_FILE, + &mc.to->move_charge_at_immigrate); +} + /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. 
@@ -282,9 +318,12 @@ enum charge_type { /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) +#define _OOM_TYPE (2) #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* Used for OOM nofiier */ +#define OOM_CONTROL (0) /* * Reclaim flags for mem_cgroup_hierarchical_reclaim @@ -1293,14 +1332,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) static DEFINE_MUTEX(memcg_oom_mutex); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); +struct oom_wait_info { + struct mem_cgroup *mem; + wait_queue_t wait; +}; + +static int memcg_oom_wake_function(wait_queue_t *wait, + unsigned mode, int sync, void *arg) +{ + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct oom_wait_info *oom_wait_info; + + oom_wait_info = container_of(wait, struct oom_wait_info, wait); + + if (oom_wait_info->mem == wake_mem) + goto wakeup; + /* if no hierarchy, no match */ + if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) + return 0; + /* + * Both of oom_wait_info->mem and wake_mem are stable under us. + * Then we can use css_is_ancestor without taking care of RCU. + */ + if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && + !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + return 0; + +wakeup: + return autoremove_wake_function(wait, mode, sync, arg); +} + +static void memcg_wakeup_oom(struct mem_cgroup *mem) +{ + /* for filtering, pass "mem" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); +} + +static void memcg_oom_recover(struct mem_cgroup *mem) +{ + if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) + memcg_wakeup_oom(mem); +} + /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. 
*/ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) { - DEFINE_WAIT(wait); - bool locked; + struct oom_wait_info owait; + bool locked, need_to_kill; + owait.mem = mem; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); + need_to_kill = true; /* At first, try to OOM lock hierarchy under mem.*/ mutex_lock(&memcg_oom_mutex); locked = mem_cgroup_oom_lock(mem); @@ -1309,32 +1396,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL * under OOM is always welcomed, use TASK_KILLABLE here. */ - if (!locked) - prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + if (!locked || mem->oom_kill_disable) + need_to_kill = false; + if (locked) + mem_cgroup_oom_notify(mem); mutex_unlock(&memcg_oom_mutex); - if (locked) + if (need_to_kill) { + finish_wait(&memcg_oom_waitq, &owait.wait); mem_cgroup_out_of_memory(mem, mask); - else { + } else { schedule(); - finish_wait(&memcg_oom_waitq, &wait); + finish_wait(&memcg_oom_waitq, &owait.wait); } mutex_lock(&memcg_oom_mutex); mem_cgroup_oom_unlock(mem); - /* - * Here, we use global waitq .....more fine grained waitq ? - * Assume following hierarchy. - * A/ - * 01 - * 02 - * assume OOM happens both in A and 01 at the same time. Tthey are - * mutually exclusive by lock. (kill in 01 helps A.) - * When we use per memcg waitq, we have to wake up waiters on A and 02 - * in addtion to waiters on 01. We use global waitq for avoiding mess. - * It will not be a big problem. - * (And a task may be moved to other groups while it's waiting for OOM.) 
- */ - wake_up_all(&memcg_oom_waitq); + memcg_wakeup_oom(mem); mutex_unlock(&memcg_oom_mutex); if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) @@ -2118,15 +2196,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; batch = ¤t->memcg_batch; /* @@ -2137,6 +2206,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->memcg) batch->memcg = mem; /* + * do_batch > 0 when unmapping pages or inode invalidate/truncate. + * In those cases, all pages freed continously can be expected to be in + * the same cgroup and we have chance to coalesce uncharges. + * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) + * because we want to do uncharge as soon as possible. + */ + + if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) + goto direct_uncharge; + + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. * If not, we uncharge res_counter ony by one. 
@@ -2152,6 +2232,8 @@ direct_uncharge: res_counter_uncharge(&mem->res, PAGE_SIZE); if (uncharge_memsw) res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (unlikely(batch->memcg != mem)) + memcg_oom_recover(mem); return; } @@ -2188,7 +2270,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: case MEM_CGROUP_CHARGE_TYPE_DROP: - if (page_mapped(page)) + /* See mem_cgroup_prepare_migration() */ + if (page_mapped(page) || PageCgroupMigration(pc)) goto unlock_out; break; case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: @@ -2288,6 +2371,7 @@ void mem_cgroup_uncharge_end(void) res_counter_uncharge(&batch->memcg->res, batch->bytes); if (batch->memsw_bytes) res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; } @@ -2410,10 +2494,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. */ -int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) +int mem_cgroup_prepare_migration(struct page *page, + struct page *newpage, struct mem_cgroup **ptr) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + enum charge_type ctype; int ret = 0; if (mem_cgroup_disabled()) @@ -2424,69 +2510,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); + /* + * At migrating an anonymous page, its mapcount goes down + * to 0 and uncharge() will be called. But, even if it's fully + * unmapped, migration may fail and this page has to be + * charged again. We set MIGRATION flag here and delay uncharge + * until end_migration() is called + * + * Corner Case Thinking + * A) + * When the old page was mapped as Anon and it's unmap-and-freed + * while migration was ongoing. 
+ * If unmap finds the old page, uncharge() of it will be delayed + * until end_migration(). If unmap finds a new page, it's + * uncharged when it make mapcount to be 1->0. If unmap code + * finds swap_migration_entry, the new page will not be mapped + * and end_migration() will find it(mapcount==0). + * + * B) + * When the old page was mapped but migraion fails, the kernel + * remaps it. A charge for it is kept by MIGRATION flag even + * if mapcount goes down to 0. We can do remap successfully + * without charging it again. + * + * C) + * The "old" page is under lock_page() until the end of + * migration, so, the old page itself will not be swapped-out. + * If the new page is swapped out before end_migraton, our + * hook to usual swap-out path will catch the event. + */ + if (PageAnon(page)) + SetPageCgroupMigration(pc); } unlock_page_cgroup(pc); + /* + * If the page is not charged at this point, + * we return here. + */ + if (!mem) + return 0; *ptr = mem; - if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); - css_put(&mem->css); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + css_put(&mem->css);/* drop extra refcnt */ + if (ret || *ptr == NULL) { + if (PageAnon(page)) { + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + /* + * The old page may be fully unmapped while we kept it. + */ + mem_cgroup_uncharge_page(page); + } + return -ENOMEM; } + /* + * We charge new page before it's used/mapped. So, even if unlock_page() + * is called before end_migration, we can catch all events on this new + * page. In the case new page is migrated but not remapped, new page's + * mapcount will be finally 0 and we call uncharge in end_migration(). 
+ */ + pc = lookup_page_cgroup(newpage); + if (PageAnon(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + else if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + __mem_cgroup_commit_charge(mem, pc, ctype); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage) { - struct page *target, *unused; + struct page *used, *unused; struct page_cgroup *pc; - enum charge_type ctype; if (!mem) return; + /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { - target = oldpage; - unused = NULL; + used = oldpage; + unused = newpage; } else { - target = newpage; + used = newpage; unused = oldpage; } - - if (PageAnon(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; - else if (page_is_file_cache(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - else - ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - - /* unused page is not on radix-tree now. */ - if (unused) - __mem_cgroup_uncharge_common(unused, ctype); - - pc = lookup_page_cgroup(target); /* - * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. - * So, double-counting is effectively avoided. + * We disallowed uncharge of pages under migration because mapcount + * of the page goes down to zero, temporarly. + * Clear the flag and check the page should be charged. */ - __mem_cgroup_commit_charge(mem, pc, ctype); + pc = lookup_page_cgroup(oldpage); + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + + if (unused != oldpage) + pc = lookup_page_cgroup(unused); + __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); + pc = lookup_page_cgroup(used); /* - * Both of oldpage and newpage are still under lock_page(). - * Then, we don't have to care about race in radix-tree. 
- * But we have to be careful that this page is unmapped or not. - * - * There is a case for !page_mapped(). At the start of - * migration, oldpage was mapped. But now, it's zapped. - * But we know *target* page is not freed/reused under us. - * mem_cgroup_uncharge_page() does all necessary checks. + * If a page is a file cache, radix-tree replacement is very atomic + * and we can skip this check. When it was an Anon page, its mapcount + * goes down to 0. But because we added MIGRATION flage, it's not + * uncharged yet. There are several case but page->mapcount check + * and USED bit check in mem_cgroup_uncharge_page() will do enough + * check. (see prepare_charge() also) */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) - mem_cgroup_uncharge_page(target); + if (PageAnon(used)) + mem_cgroup_uncharge_page(used); /* - * At migration, we may charge account against cgroup which has no tasks + * At migration, we may charge account against cgroup which has no + * tasks. * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. 
*/ @@ -2524,10 +2666,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memswlimit; + u64 memswlimit, memlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); u64 curusage, oldusage; + int enlarge; /* * For keeping hierarchical_reclaim simple, how long we should retry @@ -2538,6 +2681,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + enlarge = 0; while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -2555,6 +2699,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + + memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); + if (memlimit < val) + enlarge = 1; + ret = res_counter_set_limit(&memcg->res, val); if (!ret) { if (memswlimit == val) @@ -2576,6 +2725,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } @@ -2584,9 +2735,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memlimit, oldusage, curusage; + u64 memlimit, memswlimit, oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; + int enlarge = 0; /* see mem_cgroup_resize_res_limit */ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; @@ -2608,6 +2760,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + if (memswlimit < val) + enlarge = 1; ret = res_counter_set_limit(&memcg->memsw, val); if (!ret) { if (memlimit == val) @@ -2630,6 +2785,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } @@ -2821,6 +2978,7 @@ move_account: if (ret) break; } + 
memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) goto try_to_free; @@ -3311,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) rcu_read_lock(); if (!swap) - t = rcu_dereference(memcg->thresholds); + t = rcu_dereference(memcg->thresholds.primary); else - t = rcu_dereference(memcg->memsw_thresholds); + t = rcu_dereference(memcg->memsw_thresholds.primary); if (!t) goto unlock; @@ -3325,7 +3483,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) * If it's not true, a threshold was crossed after last * call of __mem_cgroup_threshold(). */ - i = atomic_read(&t->current_threshold); + i = t->current_threshold; /* * Iterate backward over array of thresholds starting from @@ -3349,7 +3507,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) eventfd_signal(t->entries[i].eventfd, 1); /* Update current_threshold */ - atomic_set(&t->current_threshold, i - 1); + t->current_threshold = i - 1; unlock: rcu_read_unlock(); } @@ -3369,106 +3527,117 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_eventfd_list *ev; + + list_for_each_entry(ev, &mem->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); +} + +static int mem_cgroup_usage_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; int type = 
MEMFILE_TYPE(cft->private); u64 threshold, usage; - int size; - int i, ret; + int i, size, ret; ret = res_counter_memparse_write_strategy(args, &threshold); if (ret) return ret; mutex_lock(&memcg->thresholds_lock); + if (type == _MEM) - thresholds = memcg->thresholds; + thresholds = &memcg->thresholds; else if (type == _MEMSWAP) - thresholds = memcg->memsw_thresholds; + thresholds = &memcg->memsw_thresholds; else BUG(); usage = mem_cgroup_usage(memcg, type == _MEMSWAP); /* Check if a threshold crossed before adding a new one */ - if (thresholds) + if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); - if (thresholds) - size = thresholds->size + 1; - else - size = 1; + size = thresholds->primary ? thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - thresholds_new = kmalloc(sizeof(*thresholds_new) + - size * sizeof(struct mem_cgroup_threshold), + new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), GFP_KERNEL); - if (!thresholds_new) { + if (!new) { ret = -ENOMEM; goto unlock; } - thresholds_new->size = size; + new->size = size; /* Copy thresholds (if any) to new array */ - if (thresholds) - memcpy(thresholds_new->entries, thresholds->entries, - thresholds->size * + if (thresholds->primary) { + memcpy(new->entries, thresholds->primary->entries, (size - 1) * sizeof(struct mem_cgroup_threshold)); + } + /* Add new threshold */ - thresholds_new->entries[size - 1].eventfd = eventfd; - thresholds_new->entries[size - 1].threshold = threshold; + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; /* Sort thresholds. 
Registering of new threshold isn't time-critical */ - sort(thresholds_new->entries, size, - sizeof(struct mem_cgroup_threshold), + sort(new->entries, size, sizeof(struct mem_cgroup_threshold), compare_thresholds, NULL); /* Find current threshold */ - atomic_set(&thresholds_new->current_threshold, -1); + new->current_threshold = -1; for (i = 0; i < size; i++) { - if (thresholds_new->entries[i].threshold < usage) { + if (new->entries[i].threshold < usage) { /* - * thresholds_new->current_threshold will not be used - * until rcu_assign_pointer(), so it's safe to increment + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment * it here. */ - atomic_inc(&thresholds_new->current_threshold); + ++new->current_threshold; } } - if (type == _MEM) - rcu_assign_pointer(memcg->thresholds, thresholds_new); - else - rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); - /* To be sure that nobody uses thresholds before freeing it */ + /* To be sure that nobody uses thresholds */ synchronize_rcu(); - kfree(thresholds); unlock: mutex_unlock(&memcg->thresholds_lock); return ret; } -static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd) +static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; int type = MEMFILE_TYPE(cft->private); u64 usage; - int size = 0; - int i, j, ret; + int i, j, size; mutex_lock(&memcg->thresholds_lock); if (type == _MEM) - thresholds = memcg->thresholds; + thresholds = &memcg->thresholds; else if (type == 
_MEMSWAP) - thresholds = memcg->memsw_thresholds; + thresholds = &memcg->memsw_thresholds; else BUG(); @@ -3484,59 +3653,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, __mem_cgroup_threshold(memcg, type == _MEMSWAP); /* Calculate new number of threshold */ - for (i = 0; i < thresholds->size; i++) { - if (thresholds->entries[i].eventfd != eventfd) + size = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) size++; } + new = thresholds->spare; + /* Set thresholds array to NULL if we don't have thresholds */ if (!size) { - thresholds_new = NULL; - goto assign; + kfree(new); + new = NULL; + goto swap_buffers; } - /* Allocate memory for new array of thresholds */ - thresholds_new = kmalloc(sizeof(*thresholds_new) + - size * sizeof(struct mem_cgroup_threshold), - GFP_KERNEL); - if (!thresholds_new) { - ret = -ENOMEM; - goto unlock; - } - thresholds_new->size = size; + new->size = size; /* Copy thresholds and find current threshold */ - atomic_set(&thresholds_new->current_threshold, -1); - for (i = 0, j = 0; i < thresholds->size; i++) { - if (thresholds->entries[i].eventfd == eventfd) + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) continue; - thresholds_new->entries[j] = thresholds->entries[i]; - if (thresholds_new->entries[j].threshold < usage) { + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold < usage) { /* - * thresholds_new->current_threshold will not be used + * new->current_threshold will not be used * until rcu_assign_pointer(), so it's safe to increment * it here. 
*/ - atomic_inc(&thresholds_new->current_threshold); + ++new->current_threshold; } j++; } -assign: - if (type == _MEM) - rcu_assign_pointer(memcg->thresholds, thresholds_new); - else - rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + rcu_assign_pointer(thresholds->primary, new); - /* To be sure that nobody uses thresholds before freeing it */ + /* To be sure that nobody uses thresholds */ synchronize_rcu(); - kfree(thresholds); -unlock: mutex_unlock(&memcg->thresholds_lock); +} - return ret; +static int mem_cgroup_oom_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *event; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + mutex_lock(&memcg_oom_mutex); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? 
*/ + if (atomic_read(&memcg->oom_lock)) + eventfd_signal(eventfd, 1); + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *ev, *tmp; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + + mutex_lock(&memcg_oom_mutex); + + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + mutex_unlock(&memcg_oom_mutex); +} + +static int mem_cgroup_oom_control_read(struct cgroup *cgrp, + struct cftype *cft, struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); + + if (atomic_read(&mem->oom_lock)) + cb->fill(cb, "under_oom", 1); + else + cb->fill(cb, "under_oom", 0); + return 0; +} + +/* + */ +static int mem_cgroup_oom_control_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *parent; + + /* cannot set to root cgroup and only 0 and 1 are allowed */ + if (!cgrp->parent || !((val == 0) || (val == 1))) + return -EINVAL; + + parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* oom-kill-disable is a flag for subhierarchy. 
*/ + if ((parent->use_hierarchy) || + (mem->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); + return -EINVAL; + } + mem->oom_kill_disable = val; + cgroup_unlock(); + return 0; } static struct cftype mem_cgroup_files[] = { @@ -3544,8 +3790,8 @@ static struct cftype mem_cgroup_files[] = { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_register_event, - .unregister_event = mem_cgroup_unregister_event, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -3594,6 +3840,14 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_move_charge_read, .write_u64 = mem_cgroup_move_charge_write, }, + { + .name = "oom_control", + .read_map = mem_cgroup_oom_control_read, + .write_u64 = mem_cgroup_oom_control_write, + .register_event = mem_cgroup_oom_register_event, + .unregister_event = mem_cgroup_oom_unregister_event, + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -3602,8 +3856,8 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_register_event, - .unregister_event = mem_cgroup_unregister_event, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -3831,6 +4085,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; + mem->oom_kill_disable = parent->oom_kill_disable; } if (parent && parent->use_hierarchy) { @@ -3849,6 +4104,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); + 
INIT_LIST_HEAD(&mem->oom_notify); if (parent) mem->swappiness = get_swappiness(parent); @@ -3976,6 +4232,80 @@ enum mc_target_type { MC_TARGET_SWAP, }; +static struct page *mc_handle_present_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent) +{ + struct page *page = vm_normal_page(vma, addr, ptent); + + if (!page || !page_mapped(page)) + return NULL; + if (PageAnon(page)) { + /* we don't move shared anon */ + if (!move_anon() || page_mapcount(page) > 2) + return NULL; + } else if (!move_file()) + /* we ignore mapcount for file pages */ + return NULL; + if (!get_page_unless_zero(page)) + return NULL; + + return page; +} + +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + int usage_count; + struct page *page = NULL; + swp_entry_t ent = pte_to_swp_entry(ptent); + + if (!move_anon() || non_swap_entry(ent)) + return NULL; + usage_count = mem_cgroup_count_swap_user(ent, &page); + if (usage_count > 1) { /* we don't move shared anon */ + if (page) + put_page(page); + return NULL; + } + if (do_swap_account) + entry->val = ent.val; + + return page; +} + +static struct page *mc_handle_file_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + struct inode *inode; + struct address_space *mapping; + pgoff_t pgoff; + + if (!vma->vm_file) /* anonymous vma */ + return NULL; + if (!move_file()) + return NULL; + + inode = vma->vm_file->f_path.dentry->d_inode; + mapping = vma->vm_file->f_mapping; + if (pte_none(ptent)) + pgoff = linear_page_index(vma, addr); + else /* pte_file(ptent) is true */ + pgoff = pte_to_pgoff(ptent); + + /* page is moved even if it's not RSS of this task(page-faulted). */ + if (!mapping_cap_swap_backed(mapping)) { /* normal file */ + page = find_get_page(mapping, pgoff); + } else { /* shmem/tmpfs file. we should take account of swap too. 
*/ + swp_entry_t ent; + mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + if (do_swap_account) + entry->val = ent.val; + } + + return page; +} + static int is_target_pte_for_mc(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { @@ -3983,43 +4313,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, struct page_cgroup *pc; int ret = 0; swp_entry_t ent = { .val = 0 }; - int usage_count = 0; - bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, - &mc.to->move_charge_at_immigrate); - if (!pte_present(ptent)) { - /* TODO: handle swap of shmes/tmpfs */ - if (pte_none(ptent) || pte_file(ptent)) - return 0; - else if (is_swap_pte(ptent)) { - ent = pte_to_swp_entry(ptent); - if (!move_anon || non_swap_entry(ent)) - return 0; - usage_count = mem_cgroup_count_swap_user(ent, &page); - } - } else { - page = vm_normal_page(vma, addr, ptent); - if (!page || !page_mapped(page)) - return 0; - /* - * TODO: We don't move charges of file(including shmem/tmpfs) - * pages for now. - */ - if (!move_anon || !PageAnon(page)) - return 0; - if (!get_page_unless_zero(page)) - return 0; - usage_count = page_mapcount(page); - } - if (usage_count > 1) { - /* - * TODO: We don't move charges of shared(used by multiple - * processes) pages for now. 
- */ - if (page) - put_page(page); + if (pte_present(ptent)) + page = mc_handle_present_pte(vma, addr, ptent); + else if (is_swap_pte(ptent)) + page = mc_handle_swap_pte(vma, addr, ptent, &ent); + else if (pte_none(ptent) || pte_file(ptent)) + page = mc_handle_file_pte(vma, addr, ptent, &ent); + + if (!page && !ent.val) return 0; - } if (page) { pc = lookup_page_cgroup(page); /* @@ -4035,8 +4338,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, if (!ret || !target) put_page(page); } - /* throught */ - if (ent.val && do_swap_account && !ret && + /* There is a swap entry and a page doesn't exist or isn't charged */ + if (ent.val && !ret && css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { ret = MC_TARGET_SWAP; if (target) @@ -4077,9 +4380,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) }; if (is_vm_hugetlb_page(vma)) continue; - /* TODO: We don't move charges of shmem/tmpfs pages for now. */ - if (vma->vm_flags & VM_SHARED) - continue; walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } @@ -4102,6 +4402,7 @@ static void mem_cgroup_clear_mc(void) if (mc.precharge) { __mem_cgroup_cancel_charge(mc.to, mc.precharge); mc.precharge = 0; + memcg_oom_recover(mc.to); } /* * we didn't uncharge from mc.from at mem_cgroup_move_account(), so @@ -4110,6 +4411,7 @@ static void mem_cgroup_clear_mc(void) if (mc.moved_charge) { __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; + memcg_oom_recover(mc.from); } /* we must fixup refcnts and charges */ if (mc.moved_swap) { @@ -4274,9 +4576,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) }; if (is_vm_hugetlb_page(vma)) continue; - /* TODO: We don't move charges of shmem/tmpfs pages for now. 
*/ - if (vma->vm_flags & VM_SHARED) - continue; ret = walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_move_charge_walk); if (ret) diff --git a/mm/memory.c b/mm/memory.c index 833952d8b74d..119b7ccdf39b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1227,8 +1227,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/* - * Do a quick page-table lookup for a single page. +/** + * follow_page - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index be211a582930..a4cfcdc00455 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -415,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) * This means the page allocator ignores this zone. * So, zonelist must be updated after online. 
*/ + mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -429,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + if (need_zonelists_rebuild) + build_all_zonelists(zone); + else + zone_pcp_update(zone); - zone_pcp_update(zone); + mutex_unlock(&zonelists_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -438,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } - if (need_zonelists_rebuild) - build_all_zonelists(); - else - vm_total_pages = nr_free_pagecache_pages(); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); @@ -482,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } +/* + * called by cpu_up() to online a node without onlined memory. 
+ */ +int mem_online_node(int nid) +{ + pg_data_t *pgdat; + int ret; + + lock_system_sleep(); + pgdat = hotadd_new_pgdat(nid, 0); + if (pgdat) { + ret = -ENOMEM; + goto out; + } + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + +out: + unlock_system_sleep(); + return ret; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 08f40a2f3fe0..5d6fb339de03 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -119,7 +119,22 @@ struct mempolicy default_policy = { static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. 
+ * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; /* Check that the nodemask contains at least one populated zone */ @@ -127,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask) { int nd, k; - /* Check that there is something useful in this mask */ - k = policy_zone; - for_each_node_mask(nd, *nodemask) { struct zone *z; @@ -145,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask) static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); + return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, @@ -277,12 +289,19 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { } -static void mpol_rebind_nodemask(struct mempolicy *pol, - const nodemask_t *nodes) +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -291,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE 
|| step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); } - pol->v.nodes = tmp; + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + if (!node_isset(current->il_next, tmp)) { current->il_next = next_node(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) @@ -307,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes) + const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -330,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, } } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. 
+ * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && step == 0 && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->mode].rebind(pol, newmask); + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); } /* @@ -349,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, * Called with task's alloc_lock held. 
*/ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) { - mpol_rebind_policy(tsk->mempolicy, new); + mpol_rebind_policy(tsk->mempolicy, new, step); } /* @@ -366,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); up_write(&mm->mmap_sem); } @@ -859,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) @@ -1444,15 +1513,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) /* * Normally, MPOL_BIND allocations are node-local within the * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node is part of the mask, we use the zonelist for + * current node isn't part of the mask, we use the zonelist for * the first node in the mask instead. */ if (unlikely(gfp & __GFP_THISNODE) && unlikely(!node_isset(nd, policy->v.nodes))) nd = first_node(policy->v.nodes); break; - case MPOL_INTERLEAVE: /* should not happen */ - break; default: BUG(); } @@ -1572,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. 
+ * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1617,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) if (!(mask && current->mempolicy)) return false; + task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: @@ -1636,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) default: BUG(); } + task_unlock(current); return true; } @@ -1683,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1699,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1729,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return 
alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); @@ -1750,6 +1833,9 @@ EXPORT_SYMBOL(alloc_pages_current); * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ @@ -1759,13 +1845,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - mpol_rebind_policy(old, &mems); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } rcu_read_unlock(); - *new = *old; atomic_set(&new->refcnt, 1); return new; } @@ -1792,16 +1889,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, return tompol; } -static int mpol_match_intent(const struct mempolicy *a, - const struct mempolicy *b) -{ - if (a->flags != b->flags) - return 0; - if (!mpol_store_user_nodemask(a)) - return 1; - return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); -} - /* Slow path of a mempolicy comparison */ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -1809,8 +1896,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return 0; if (a->mode != b->mode) 
return 0; - if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) + if (a->flags != b->flags) return 0; + if (mpol_store_user_nodemask(a)) + if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) + return 0; + switch (a->mode) { case MPOL_BIND: /* Fall through */ @@ -2006,27 +2097,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) { - mpol_put(mpol); /* drop our ref on sb mpol */ - NODEMASK_SCRATCH_FREE(scratch); - return; /* no valid nodemask intersection */ - } + if (IS_ERR(new)) + goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ - if (ret) { - NODEMASK_SCRATCH_FREE(scratch); - mpol_put(new); - return; - } + if (ret) + goto put_free; /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + +put_free: mpol_put(new); /* drop initial ref */ +free_scratch: NODEMASK_SCRATCH_FREE(scratch); } } @@ -2132,9 +2220,15 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) -static const char * const policy_types[] = - { "default", "prefer", "bind", "interleave", "local" }; +#define MPOL_LOCAL MPOL_MAX +static const char * const policy_modes[] = +{ + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "prefer", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local" +}; #ifdef CONFIG_TMPFS @@ -2159,12 +2253,11 @@ static const char * const policy_types[] = int mpol_parse_str(char *str, struct mempolicy **mpol, int 
no_context) { struct mempolicy *new = NULL; - unsigned short uninitialized_var(mode); + unsigned short mode; unsigned short uninitialized_var(mode_flags); nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int i; int err = 1; if (nodelist) { @@ -2180,13 +2273,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (i = 0; i <= MPOL_LOCAL; i++) { - if (!strcmp(str, policy_types[i])) { - mode = i; + for (mode = 0; mode <= MPOL_LOCAL; mode++) { + if (!strcmp(str, policy_modes[mode])) { break; } } - if (i > MPOL_LOCAL) + if (mode > MPOL_LOCAL) goto out; switch (mode) { @@ -2250,7 +2342,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - { + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } else { int ret; NODEMASK_SCRATCH(scratch); if (scratch) { @@ -2266,10 +2361,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) } } err = 0; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } out: /* Restore string for error message */ @@ -2338,11 +2429,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) BUG(); } - l = strlen(policy_types[mode]); + l = strlen(policy_modes[mode]); if (buffer + maxlen < p + l + 1) return -ENOSPC; - strcpy(p, policy_types[mode]); + strcpy(p, policy_modes[mode]); p += l; if (flags & MPOL_MODE_FLAGS) { diff --git a/mm/migrate.c b/mm/migrate.c index d3f3f7f81075..4205b1d6049e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -40,7 +40,8 @@ /* * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). + * to be migrated using isolate_lru_page(). 
If scheduling work on other CPUs is + * undesirable, use migrate_prep_local() */ int migrate_prep(void) { @@ -55,26 +56,29 @@ int migrate_prep(void) return 0; } +/* Do the necessary work of migrate_prep but not if it involves other CPUs */ +int migrate_prep_local(void) +{ + lru_add_drain(); + + return 0; +} + /* * Add isolated pages on the list back to the LRU under page lock * to avoid leaking evictable pages back onto unevictable list. - * - * returns the number of pages put back. */ -int putback_lru_pages(struct list_head *l) +void putback_lru_pages(struct list_head *l) { struct page *page; struct page *page2; - int count = 0; list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); - count++; } - return count; } /* @@ -490,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping, * < 0 - error code * == 0 - success */ -static int move_to_new_page(struct page *newpage, struct page *page) +static int move_to_new_page(struct page *newpage, struct page *page, + int remap_swapcache) { struct address_space *mapping; int rc; @@ -525,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page) else rc = fallback_migrate_page(mapping, newpage, page); - if (!rc) - remove_migration_ptes(page, newpage); - else + if (rc) { newpage->mapping = NULL; + } else { + if (remap_swapcache) + remove_migration_ptes(page, newpage); + } unlock_page(newpage); @@ -545,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); + int remap_swapcache = 1; int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; + struct anon_vma *anon_vma = NULL; if (!newpage) return -ENOMEM; @@ -581,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* charge against new page */ - charge = 
mem_cgroup_prepare_migration(page, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; @@ -604,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (PageAnon(page)) { rcu_read_lock(); rcu_locked = 1; + + /* Determine how to safely use anon_vma */ + if (!page_mapped(page)) { + if (!PageSwapCache(page)) + goto rcu_unlock; + + /* + * We cannot be sure that the anon_vma of an unmapped + * swapcache page is safe to use because we don't + * know in advance if the VMA that this page belonged + * to still exists. If the VMA and others sharing the + * data have been freed, then the anon_vma could + * already be invalid. + * + * To avoid this possibility, swapcache pages get + * migrated but are not remapped when migration + * completes + */ + remap_swapcache = 0; + } else { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + anon_vma = page_anon_vma(page); + atomic_inc(&anon_vma->external_refcount); + } } /* @@ -638,11 +675,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page); + rc = move_to_new_page(newpage, page, remap_swapcache); - if (rc) + if (rc && remap_swapcache) remove_migration_ptes(page, page); rcu_unlock: + + /* Drop an anon_vma reference if we took one */ + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } + if (rcu_locked) rcu_read_unlock(); uncharge: diff --git a/mm/mincore.c b/mm/mincore.c index f77433c20279..9ac42dc6d7b6 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -19,6 +19,40 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> +static void mincore_hugetlb_page_range(struct vm_area_struct *vma, + 
unsigned long addr, unsigned long end, + unsigned char *vec) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct hstate *h; + + h = hstate_vma(vma); + while (1) { + unsigned char present; + pte_t *ptep; + /* + * Huge pages are always in RAM for now, but + * theoretically it needs to be checked. + */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = ptep && !huge_pte_none(huge_ptep_get(ptep)); + while (1) { + *vec = present; + vec++; + addr += PAGE_SIZE; + if (addr == end) + return; + /* check hugepage border */ + if (!(addr & ~huge_page_mask(h))) + break; + } + } +#else + BUG(); +#endif +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) return present; } -/* - * Do a chunk of "sys_mincore()". We've already checked - * all the arguments, we hold the mmap semaphore: we should - * just return the amount of info we're asked for. - */ -static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) +static void mincore_unmapped_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; - unsigned long nr; + unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; - pgoff_t pgoff; - struct vm_area_struct *vma = find_vma(current->mm, addr); - /* - * find_vma() didn't find anything above us, or we're - * in an unmapped hole in the address space: ENOMEM. 
- */ - if (!vma || addr < vma->vm_start) - return -ENOMEM; - -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) { - struct hstate *h; - unsigned long nr_huge; - unsigned char present; + if (vma->vm_file) { + pgoff_t pgoff; - i = 0; - nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); - h = hstate_vma(vma); - nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) - - (addr >> huge_page_shift(h)) + 1; - nr_huge = min(nr_huge, - (vma->vm_end - addr) >> huge_page_shift(h)); - while (1) { - /* hugepage always in RAM for now, - * but generally it needs to be check */ - ptep = huge_pte_offset(current->mm, - addr & huge_page_mask(h)); - present = !!(ptep && - !huge_pte_none(huge_ptep_get(ptep))); - while (1) { - vec[i++] = present; - addr += PAGE_SIZE; - /* reach buffer limit */ - if (i == nr) - return nr; - /* check hugepage border */ - if (!((addr & ~huge_page_mask(h)) - >> PAGE_SHIFT)) - break; - } - } - return nr; + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; } -#endif - - /* - * Calculate how many pages there are left in the last level of the - * PTE array for our address. 
- */ - nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)); - - /* - * Don't overrun this vma - */ - nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT); - - /* - * Don't return more than the caller asked for - */ - nr = min(nr, pages); +} - pgd = pgd_offset(vma->vm_mm, addr); - if (pgd_none_or_clear_bad(pgd)) - goto none_mapped; - pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) - goto none_mapped; - pmd = pmd_offset(pud, addr); - if (pmd_none_or_clear_bad(pmd)) - goto none_mapped; +static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + spinlock_t *ptl; + pte_t *ptep; ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { - unsigned char present; + do { pte_t pte = *ptep; + pgoff_t pgoff; - if (pte_present(pte)) { - present = 1; - - } else if (pte_none(pte)) { - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - present = mincore_page(vma->vm_file->f_mapping, - pgoff); - } else - present = 0; - - } else if (pte_file(pte)) { + next = addr + PAGE_SIZE; + if (pte_none(pte)) + mincore_unmapped_range(vma, addr, next, vec); + else if (pte_present(pte)) + *vec = 1; + else if (pte_file(pte)) { pgoff = pte_to_pgoff(pte); - present = mincore_page(vma->vm_file->f_mapping, pgoff); - + *vec = mincore_page(vma->vm_file->f_mapping, pgoff); } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); + if (is_migration_entry(entry)) { /* migration entries are always uptodate */ - present = 1; + *vec = 1; } else { #ifdef CONFIG_SWAP pgoff = entry.val; - present = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(&swapper_space, pgoff); #else WARN_ON(1); - present = 1; + *vec = 1; #endif } } + vec++; + } while (ptep++, addr = next, addr != end); + pte_unmap_unlock(ptep - 1, ptl); +} - vec[i] = present; - } - pte_unmap_unlock(ptep-1, ptl); +static void 
mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pmd_t *pmd; - return nr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pte_range(vma, pmd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pmd++, addr = next, addr != end); +} -none_mapped: - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - for (i = 0; i < nr; i++, pgoff++) - vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { - for (i = 0; i < nr; i++) - vec[i] = 0; +static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pud_t *pud; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pmd_range(vma, pud, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pud++, addr = next, addr != end); +} + +static void mincore_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pgd_t *pgd; + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pud_range(vma, pgd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pgd++, addr = next, addr != end); +} + +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. 
+ */ +static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) +{ + struct vm_area_struct *vma; + unsigned long end; + + vma = find_vma(current->mm, addr); + if (!vma || addr < vma->vm_start) + return -ENOMEM; + + end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + + if (is_vm_hugetlb_page(vma)) { + mincore_hugetlb_page_range(vma, addr, end, vec); + return (end - addr) >> PAGE_SHIFT; } - return nr; + end = pmd_addr_end(addr, end); + + if (is_vm_hugetlb_page(vma)) + mincore_hugetlb_page_range(vma, addr, end, vec); + else + mincore_page_range(vma, addr, end, vec); + + return (end - addr) >> PAGE_SHIFT; } /* @@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, * the temporary buffer size. */ down_read(&current->mm->mmap_sem); - retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); up_read(&current->mm->mmap_sem); if (retval <= 0) diff --git a/mm/nommu.c b/mm/nommu.c index 63fa17d121f0..b76f3ee0abe0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -918,14 +918,6 @@ static int validate_mmap_request(struct file *file, if (!(capabilities & BDI_CAP_MAP_DIRECT)) return -ENODEV; - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) - ) { - printk("MAP_SHARED not completely supported on !MMU\n"); - return -EINVAL; - } - /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; } @@ -941,6 +933,20 @@ static int validate_mmap_request(struct file *file, capabilities &= ~BDI_CAP_MAP_DIRECT; } + if (capabilities & BDI_CAP_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || + ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || + ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + ) { + capabilities &= ~BDI_CAP_MAP_DIRECT; + if (flags & MAP_SHARED) { + printk(KERN_WARNING +
"MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + } + } + /* handle executable mappings and implied executable * mappings */ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { @@ -996,22 +1002,20 @@ static unsigned long determine_vm_flags(struct file *file, unsigned long vm_flags; vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); - vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* vm_flags |= mm->def_flags; */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) vm_flags |= VM_MAYSHARE; - } - else { + } else { /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ + vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); if (flags & MAP_SHARED) - vm_flags |= VM_MAYSHARE | VM_SHARED; - else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) - vm_flags |= VM_MAYSHARE; + vm_flags |= VM_SHARED; } /* refuse to let anyone share private mappings with this process if diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b68e802a7a7d..709aedfaa014 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -479,12 +479,9 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) read_lock(&tasklist_lock); retry: p = select_bad_process(&points, mem); - if (PTR_ERR(p) == -1UL) + if (!p || PTR_ERR(p) == -1UL) goto out; - if (!p) - p = current; - if (oom_kill_process(p, gfp_mask, 0, points, mem, "Memory cgroup out of memory")) goto retry; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6326c71b663..431214b941ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,6 +49,7 @@ #include <linux/debugobjects.h> #include <linux/kmemleak.h> #include <linux/memory.h> +#include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> @@ -56,6 +57,22 @@ #include <asm/div64.h> 
#include "internal.h" +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +DEFINE_PER_CPU(int, numa_node); +EXPORT_PER_CPU_SYMBOL(numa_node); +#endif + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. + * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. + * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() + * defined in <linux/topology.h>. + */ +DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ +EXPORT_PER_CPU_SYMBOL(_numa_mem_); +#endif + /* * Array of node states. */ @@ -475,6 +492,8 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; + unsigned long combined_idx; + struct page *buddy; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -488,9 +507,6 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - unsigned long combined_idx; - struct page *buddy; - buddy = __page_find_buddy(page, page_idx, order); if (!page_is_buddy(page, buddy, order)) break; @@ -505,8 +521,29 @@ static inline void __free_one_page(struct page *page, order++; } set_page_order(page, order); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + + /* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. 
In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ + if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = __find_combined_index(page_idx, order); + higher_page = page + combined_idx - page_idx; + higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); + goto out; + } + } + + list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); +out: zone->free_area[order].nr_free++; } @@ -599,20 +636,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order, spin_unlock(&zone->lock); } -static void __free_pages_ok(struct page *page, unsigned int order) +static bool free_pages_prepare(struct page *page, unsigned int order) { - unsigned long flags; int i; int bad = 0; - int wasMlocked = __TestClearPageMlocked(page); trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0 ; i < (1 << order) ; ++i) - bad += free_pages_check(page + i); + for (i = 0; i < (1 << order); i++) { + struct page *pg = page + i; + + if (PageAnon(pg)) + pg->mapping = NULL; + bad += free_pages_check(pg); + } if (bad) - return; + return false; if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); @@ -622,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); + return true; +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; + int wasMlocked = __TestClearPageMlocked(page); + + if (!free_pages_prepare(page, order)) + return; + local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); @@ -1107,21 +1158,9 @@ 
void free_hot_cold_page(struct page *page, int cold) int migratetype; int wasMlocked = __TestClearPageMlocked(page); - trace_mm_page_free_direct(page, 0); - kmemcheck_free_shadow(page, 0); - - if (PageAnon(page)) - page->mapping = NULL; - if (free_pages_check(page)) + if (!free_pages_prepare(page, 0)) return; - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), PAGE_SIZE); - debug_check_no_obj_freed(page_address(page), PAGE_SIZE); - } - arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); local_irq_save(flags); @@ -1188,6 +1227,51 @@ void split_page(struct page *page, unsigned int order) } /* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. 
+ */ +int split_free_page(struct page *page) +{ + unsigned int order; + unsigned long watermark; + struct zone *zone; + + BUG_ON(!PageBuddy(page)); + + zone = page_zone(page); + order = page_order(page); + + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + /* Remove page from free list */ + list_del(&page->lru); + zone->free_area[order].nr_free--; + rmv_page_order(page); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } + + return 1 << order; +} + +/* * Really, prep_compound_page() should be called from __rmqueue_bulk(). But * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. 
@@ -1693,6 +1777,62 @@ out: return page; } +#ifdef CONFIG_COMPACTION +/* Try memory compaction for high-order allocations before reclaim */ +static struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page; + + if (!order || compaction_deferred(preferred_zone)) + return NULL; + + *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + nodemask); + if (*did_some_progress != COMPACT_SKIPPED) { + + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); + + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags, preferred_zone, + migratetype); + if (page) { + preferred_zone->compact_considered = 0; + preferred_zone->compact_defer_shift = 0; + count_vm_event(COMPACTSUCCESS); + return page; + } + + /* + * It's bad if compaction run occurs and fails. + * The most likely reason is that pages exist, + * but not enough to satisfy watermarks. 
+ */ + count_vm_event(COMPACTFAIL); + defer_compaction(preferred_zone); + + cond_resched(); + } + + return NULL; +} +#else +static inline struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + return NULL; +} +#endif /* CONFIG_COMPACTION */ + /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, @@ -1879,6 +2019,15 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; + /* Try direct compaction */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1970,10 +2119,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -1983,6 +2135,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; @@ -2434,8 +2587,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, strncpy((char*)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order 
= oldval; - } else if (oldval != user_zonelist_order) - build_all_zonelists(); + } else if (oldval != user_zonelist_order) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } } out: mutex_unlock(&zl_order_mutex); @@ -2582,7 +2738,7 @@ static int default_zonelist_order(void) * ZONE_DMA and ZONE_DMA32 can be very small area in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + * This function detect ZONE_DMA/DMA32 size and configures zone order. */ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ low_kmem_size = 0; @@ -2594,6 +2750,15 @@ static int default_zonelist_order(void) if (zone_type < ZONE_NORMAL) low_kmem_size += z->present_pages; total_size += z->present_pages; + } else if (zone_type == ZONE_NORMAL) { + /* + * If any node has only lowmem, then node order + * is preferred to allow kernel allocations + * locally; otherwise, they can easily infringe + * on other nodes when there is an abundance of + * lowmem available to allocate from. + */ + return ZONELIST_ORDER_NODE; } } } @@ -2707,6 +2872,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * Return node id of node used for "local" allocations. + * I.e., first node id of first zone in arg node's generic zonelist. + * Used for initializing percpu 'numa_mem', which is used primarily + * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 
+ */ +int local_memory_node(int node) +{ + struct zone *zone; + + (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + gfp_zone(GFP_KERNEL), + NULL, + &zone); + return zone->node; +} +#endif #else /* CONFIG_NUMA */ @@ -2776,9 +2959,16 @@ static void build_zonelist_cache(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static void setup_zone_pageset(struct zone *zone); + +/* + * Global mutex to protect against size modification of zonelists + * as well as to serialize pageset setup for the new populated zone. + */ +DEFINE_MUTEX(zonelists_mutex); /* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *dummy) +static __init_refok int __build_all_zonelists(void *data) { int nid; int cpu; @@ -2793,6 +2983,14 @@ static int __build_all_zonelists(void *dummy) build_zonelist_cache(pgdat); } +#ifdef CONFIG_MEMORY_HOTPLUG + /* Setup real pagesets for the new zone */ + if (data) { + struct zone *zone = data; + setup_zone_pageset(zone); + } +#endif + /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -2806,13 +3004,31 @@ static int __build_all_zonelists(void *dummy) * needs the percpu allocator in order to allocate its pagesets * (a chicken-egg dilemma). */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { setup_pageset(&per_cpu(boot_pageset, cpu), 0); +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * We now know the "local memory node" for each node-- + * i.e., the node of the first zone in the generic zonelist. + * Set up numa_mem percpu variable for on-line cpus. During + * boot, only the boot cpu should be on-line; we'll init the + * secondary cpus' numa_mem as they come on-line. During + * node/memory hotplug, we'll fixup all on-line cpus. 
+ */ + if (cpu_online(cpu)) + set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); +#endif + } + return 0; } -void build_all_zonelists(void) +/* + * Called with zonelists_mutex held always + * unless system_state == SYSTEM_BOOTING. + */ +void build_all_zonelists(void *data) { set_zonelist_order(); @@ -2823,7 +3039,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, data, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -3146,31 +3362,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } +static __meminit void setup_zone_pageset(struct zone *zone) +{ + int cpu; + + zone->pageset = alloc_percpu(struct per_cpu_pageset); + + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + setup_pageset(pcp, zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); + } +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. - * Boot pagesets will no longer be used by this processorr - * after setup_per_cpu_pageset(). 
*/ void __init setup_per_cpu_pageset(void) { struct zone *zone; - int cpu; - - for_each_populated_zone(zone) { - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->present_pages / - percpu_pagelist_fraction)); - } - } + for_each_populated_zone(zone) + setup_zone_pageset(zone); } static noinline __init_refok diff --git a/mm/readahead.c b/mm/readahead.c index dfa9a1a03a11..77506a291a2d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @req_size: hint: total size of the read which the caller is performing in * pagecache pages * - * page_cache_async_ondemand() should be called when a page is used which + * page_cache_async_readahead() should be called when a page is used which * has the PG_readahead flag; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. 
diff --git a/mm/rmap.c b/mm/rmap.c index 0feeef860a8f..38a336e2eea1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -250,7 +250,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); + empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); spin_unlock(&anon_vma->lock); if (empty) @@ -274,7 +274,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); - ksm_refcount_init(anon_vma); + anonvma_external_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } @@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } +static bool is_vma_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method @@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); + unsigned long address; + + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. 
+ */ + if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && + is_vma_temporary_stack(vma)) + continue; + + address = vma_address(page, vma); if (address == -EFAULT) continue; ret = try_to_unmap_one(page, vma, address, flags); @@ -1355,10 +1383,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma() * because that depends on page_mapped(); but not all its usages - * are holding mmap_sem, which also gave the necessary guarantee - * (that this anon_vma's slab has not already been destroyed). - * This needs to be reviewed later: avoiding page_lock_anon_vma() - * is risky, and currently limits the usefulness of rmap_walk(). + * are holding mmap_sem. Users without mmap_sem are required to + * take a reference count to prevent the anon_vma disappearing */ anon_vma = page_anon_vma(page); if (!anon_vma) diff --git a/mm/shmem.c b/mm/shmem.c index 0cd7f66f1c66..7e5030ae18ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); - if (page) - set_page_private(page, 0); spin_lock(&info->lock); if (!page) { @@ -729,10 +727,11 @@ done2: if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { /* * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since vmtruncate or - * generic_delete_inode did it, before we lowered next_index. - * Also, though shmem_getpage checks i_size before adding to - * cache, no recheck after: so fix the narrow window there too. + * may have swizzled a page in from swap since + * truncate_pagecache or generic_delete_inode did it, before we + * lowered next_index. Also, though shmem_getpage checks + * i_size before adding to cache, no recheck after: so fix the + * narrow window there too. 
* * Recalling truncate_inode_pages_range and unmap_mapping_range * every time for punch_hole (which never got a chance to clear @@ -762,19 +761,16 @@ done2: } } -static void shmem_truncate(struct inode *inode) -{ - shmem_truncate_range(inode, inode->i_size, (loff_t)-1); -} - static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - struct page *page = NULL; int error; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size < inode->i_size) { + loff_t newsize = attr->ia_size; + struct page *page = NULL; + + if (newsize < inode->i_size) { /* * If truncating down to a partial page, then * if that page is already allocated, hold it @@ -782,9 +778,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * truncate_partial_page cannnot miss it were * it assigned to swap. */ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (newsize & (PAGE_CACHE_SIZE-1)) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + newsize >> PAGE_CACHE_SHIFT, &page, SGP_READ, NULL); if (page) unlock_page(page); @@ -796,24 +792,29 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * if it's being fully truncated to zero-length: the * nrpages check is efficient enough in that case. 
*/ - if (attr->ia_size) { + if (newsize) { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); info->flags &= ~SHMEM_PAGEIN; spin_unlock(&info->lock); } } + + error = simple_setsize(inode, newsize); + if (page) + page_cache_release(page); + if (error) + return error; + shmem_truncate_range(inode, newsize, (loff_t)-1); } error = inode_change_ok(inode, attr); if (!error) - error = inode_setattr(inode, attr); + generic_setattr(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL if (!error && (attr->ia_valid & ATTR_MODE)) error = generic_acl_chmod(inode); #endif - if (page) - page_cache_release(page); return error; } @@ -821,11 +822,11 @@ static void shmem_delete_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - if (inode->i_op->truncate == shmem_truncate) { + if (inode->i_mapping->a_ops == &shmem_aops) { truncate_inode_pages(inode->i_mapping, 0); shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; - shmem_truncate(inode); + shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->swaplist)) { mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); @@ -2024,7 +2025,6 @@ static const struct inode_operations shmem_symlink_inline_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { - .truncate = shmem_truncate, .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, @@ -2435,14 +2435,13 @@ static const struct file_operations shmem_file_operations = { .write = do_sync_write, .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, #endif }; static const struct inode_operations shmem_inode_operations = { - .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_POSIX_ACL @@ -2561,6 +2560,45 @@ out4: return error; } 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + swp_entry_t entry = { .val = 0 }, *ptr; + struct page *page = NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, pgoff, NULL); +#ifdef CONFIG_SWAP + if (ptr && ptr->val) { + entry.val = ptr->val; + page = find_get_page(&swapper_space, entry.val); + } else +#endif + page = find_get_page(inode->i_mapping, pgoff); + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); +out: + *pagep = page; + *ent = entry; +} +#endif + #else /* !CONFIG_SHMEM */ /* @@ -2600,6 +2638,31 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. 
+ */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + struct page *page = NULL; + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + page = find_get_page(inode->i_mapping, pgoff); +out: + *pagep = page; + *ent = (swp_entry_t){ .val = 0 }; +} +#endif + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) diff --git a/mm/slab.c b/mm/slab.c index 50a73fca19c4..e49f8f46f46d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -821,7 +821,7 @@ static void init_reap_node(int cpu) { int node; - node = next_node(cpu_to_node(cpu), node_online_map); + node = next_node(cpu_to_mem(cpu), node_online_map); if (node == MAX_NUMNODES) node = first_node(node_online_map); @@ -1050,7 +1050,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = numa_mem_id(); /* * Make sure we are not freeing a object from another node to the array @@ -1129,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { @@ -1194,7 +1194,7 @@ static int __cpuinit cpuup_prepare(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); int err; /* @@ -1321,7 +1321,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, mutex_unlock(&cache_chain_mutex); break; } - return err ? 
NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpucache_notifier = { @@ -1479,7 +1479,7 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ - node = numa_node_id(); + node = numa_mem_id(); /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); @@ -2121,7 +2121,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } } - cachep->nodelists[numa_node_id()]->next_reap = + cachep->nodelists[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; @@ -2452,7 +2452,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); + assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); #endif } @@ -2479,7 +2479,7 @@ static void do_drain(void *arg) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_node_id(); + int node = numa_mem_id(); check_irq_off(); ac = cpu_cache_get(cachep); @@ -3012,7 +3012,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) retry: check_irq_off(); - node = numa_node_id(); + node = numa_mem_id(); ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -3216,11 +3216,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; - nid_alloc = nid_here = numa_node_id(); + nid_alloc = nid_here = numa_mem_id(); + get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) - nid_alloc = cpuset_mem_spread_node(); + nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); 
return NULL; @@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3278,7 +3281,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_node_id()); + obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3302,6 +3305,7 @@ retry: } } } + put_mems_allowed(); return obj; } @@ -3385,6 +3389,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, { unsigned long save_flags; void *ptr; + int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; @@ -3397,7 +3402,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_save(save_flags); if (nodeid == -1) - nodeid = numa_node_id(); + nodeid = slab_node; if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ @@ -3405,7 +3410,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == slab_node) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback @@ -3449,8 +3454,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_mem_id()); out: return objp; @@ -3547,7 +3552,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); batchcount = ac->batchcount; #if DEBUG @@ -3981,7 +3986,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return -ENOMEM; for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) @@ -4003,9 +4008,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); + spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4111,7 +4116,7 @@ static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) diff --git a/mm/slub.c b/mm/slub.c index c2d6e6951f33..578f68f3c51f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, 
zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } diff --git a/mm/sparse.c b/mm/sparse.c index dc0cc4d43ff3..95ac219af379 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) { struct page *map; + unsigned long size; map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; - map = alloc_bootmem_pages_node(NODE_DATA(nid), - PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); + size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); + map = __alloc_bootmem_node_high(NODE_DATA(nid), size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); return map; } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); + map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) diff --git a/mm/swap.c b/mm/swap.c index 7cd60bf0a972..3ce7bc373a52 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -224,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } +EXPORT_SYMBOL(__lru_cache_add); /** * lru_cache_add_lru - add a page to a page list diff --git a/mm/truncate.c b/mm/truncate.c index f42675a3615d..937571b8b233 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -548,18 +548,18 @@ 
EXPORT_SYMBOL(truncate_pagecache); * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last * incomplete page. Ugly, but necessary. + * + * This function is deprecated and simple_setsize or truncate_pagecache + * should be used instead. */ int vmtruncate(struct inode *inode, loff_t offset) { - loff_t oldsize; int error; - error = inode_newsize_ok(inode, offset); + error = simple_setsize(inode, offset); if (error) return error; - oldsize = inode->i_size; - i_size_write(inode, offset); - truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) inode->i_op->truncate(inode); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ff3311447f5..915dceb487c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -73,10 +73,14 @@ struct scan_control { int swappiness; - int all_unreclaimable; - int order; + /* + * Intend to reclaim enough contenious memory rather than to reclaim + * enough amount memory. I.e, it's the mode for high order allocation. + */ + bool lumpy_reclaim_mode; + /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -85,12 +89,6 @@ struct scan_control { * are scanned. */ nodemask_t *nodemask; - - /* Pluggable isolate pages callback */ - unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, - unsigned long *scanned, int order, int mode, - struct zone *z, struct mem_cgroup *mem_cont, - int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -575,7 +573,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + if (sc->lumpy_reclaim_mode) return PAGEREF_RECLAIM; /* @@ -839,11 +837,6 @@ keep: return nr_reclaimed; } -/* LRU Isolation modes. */ -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ -#define ISOLATE_ACTIVE 1 /* Isolate active pages. 
*/ -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ - /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -1011,7 +1004,6 @@ static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, - struct mem_cgroup *mem_cont, int active, int file) { int lru = LRU_BASE; @@ -1130,7 +1122,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int lumpy_reclaim = 0; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1140,17 +1131,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, return SWAP_CLUSTER_MAX; } - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_reclaim = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1163,15 +1143,15 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + int mode = sc->lumpy_reclaim_mode ? 
ISOLATE_BOTH : ISOLATE_INACTIVE; unsigned long nr_anon; unsigned long nr_file; - nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, sc->order, mode, - zone, sc->mem_cgroup, 0, file); - if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, + &page_list, &nr_scan, + sc->order, mode, + zone, 0, file); zone->pages_scanned += nr_scan; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1179,6 +1159,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + } else { + nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX, + &page_list, &nr_scan, + sc->order, mode, + zone, sc->mem_cgroup, + 0, file); + /* + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. + */ } if (nr_taken == 0) @@ -1216,7 +1206,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - lumpy_reclaim) { + sc->lumpy_reclaim_mode) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* @@ -1356,16 +1346,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, lru_add_drain(); spin_lock_irq(&zone->lru_lock); - nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1, file); - /* - * zone->pages_scanned is used for detect zone's oom - * mem_cgroup remembers nr_scan by itself. - */ if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + 1, file); zone->pages_scanned += pgscanned; + } else { + nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_cgroup, 1, file); + /* + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. 
+ */ } + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); @@ -1519,21 +1516,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, } /* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= SWAP_CLUSTER_MAX) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} + +/* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * percent[0] specifies how much pressure to put on ram/swap backed - * memory, while percent[1] determines pressure on the file LRUs. + * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_ratio(struct zone *zone, struct scan_control *sc, - unsigned long *percent) +static void get_scan_count(struct zone *zone, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + u64 fraction[2], denominator; + enum lru_list l; + int noswap = 0; + + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + fraction[0] = 0; + fraction[1] = 1; + denominator = 1; + goto out; + } anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); @@ -1545,9 +1573,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, /* If we have very few page cache pages, force-scan anon pages. 
*/ if (unlikely(file + free <= high_wmark_pages(zone))) { - percent[0] = 100; - percent[1] = 0; - return; + fraction[0] = 1; + fraction[1] = 0; + denominator = 1; + goto out; } } @@ -1594,29 +1623,37 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - /* Normalize to percentages */ - percent[0] = 100 * ap / (ap + fp + 1); - percent[1] = 100 - percent[0]; + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp + 1; +out: + for_each_evictable_lru(l) { + int file = is_file_lru(l); + unsigned long scan; + + scan = zone_nr_lru_pages(zone, sc, l); + if (priority || noswap) { + scan >>= priority; + scan = div64_u64(scan * fraction[file], denominator); + } + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l]); + } } -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) { - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. 
+ */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = 1; else - nr = 0; - - return nr; + sc->lumpy_reclaim_mode = 0; } /* @@ -1627,33 +1664,13 @@ static void shrink_zone(int priority, struct zone *zone, { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int noswap = 0; - - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - percent[0] = 0; - percent[1] = 100; - } else - get_scan_ratio(zone, sc, percent); - for_each_evictable_lru(l) { - int file = is_file_lru(l); - unsigned long scan; + get_scan_count(zone, sc, nr, priority); - scan = zone_nr_lru_pages(zone, sc, l); - if (priority || noswap) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); - } + set_lumpy_reclaim_mode(priority, sc); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -1707,14 +1724,14 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. 
*/ -static void shrink_zones(int priority, struct zonelist *zonelist, +static int shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; + int progress = 0; - sc->all_unreclaimable = 1; for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, sc->nodemask) { if (!populated_zone(zone)) @@ -1730,19 +1747,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist, if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - sc->all_unreclaimable = 0; } else { /* * Ignore cpuset limitation here. We just want to reduce * # of used pages by us regardless of memory shortage. */ - sc->all_unreclaimable = 0; mem_cgroup_note_reclaim_priority(sc->mem_cgroup, priority); } shrink_zone(priority, zone, sc); + progress = 1; } + return progress; } /* @@ -1774,6 +1791,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) @@ -1795,7 +1813,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - shrink_zones(priority, zonelist, sc); + ret = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1832,7 +1850,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? 
don't OOM, then */ - if (!sc->all_unreclaimable && scanning_global_lru(sc)) + if (ret && scanning_global_lru(sc)) ret = sc->nr_reclaimed; out: /* @@ -1857,6 +1875,7 @@ out: mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); delayacct_freepages_end(); + put_mems_allowed(); return ret; } @@ -1873,7 +1892,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, .nodemask = nodemask, }; @@ -1894,7 +1912,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, - .isolate_pages = mem_cgroup_isolate_pages, }; nodemask_t nm = nodemask_of_node(nid); @@ -1928,7 +1945,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, - .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; @@ -2006,7 +2022,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, }; /* * temp_priority is used to remember the scanning priority at which @@ -2385,7 +2400,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .hibernation_mode = 1, .swappiness = vm_swappiness, .order = 0, - .isolate_pages = isolate_pages_global, }; struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -2570,7 +2584,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, - .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; diff --git a/mm/vmstat.c b/mm/vmstat.c index fa12ea3051fb..7759941d4e77 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/vmstat.h> 
#include <linux/sched.h> +#include <linux/math64.h> #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -379,7 +380,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) } #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_COMPACTION +struct contig_page_info { + unsigned long free_pages; + unsigned long free_blocks_total; + unsigned long free_blocks_suitable; +}; + +/* + * Calculate the number of free pages in a zone, how many contiguous + * pages are free and how many are large enough to satisfy an allocation of + * the target size. Note that this function makes no attempt to estimate + * how many suitable free blocks there *might* be if MOVABLE pages were + * migrated. Calculating that is possible, but expensive and can be + * figured out from userspace + */ +static void fill_contig_page_info(struct zone *zone, + unsigned int suitable_order, + struct contig_page_info *info) +{ + unsigned int order; + + info->free_pages = 0; + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + + for (order = 0; order < MAX_ORDER; order++) { + unsigned long blocks; + + /* Count number of free blocks */ + blocks = zone->free_area[order].nr_free; + info->free_blocks_total += blocks; + + /* Count free base pages */ + info->free_pages += blocks << order; + + /* Count the suitable free blocks */ + if (order >= suitable_order) + info->free_blocks_suitable += blocks << + (order - suitable_order); + } +} + +/* + * A fragmentation index only makes sense if an allocation of a requested + * size would fail. If that is true, the fragmentation index indicates + * whether external fragmentation or a lack of memory was the problem. 
+ * The value can be used to determine if page reclaim or compaction + * should be used + */ +static int __fragmentation_index(unsigned int order, struct contig_page_info *info) +{ + unsigned long requested = 1UL << order; + + if (!info->free_blocks_total) + return 0; + + /* Fragmentation index only makes sense when a request would fail */ + if (info->free_blocks_suitable) + return -1000; + + /* + * Index is between 0 and 1 so return within 3 decimal places + * + * 0 => allocation would fail due to lack of memory + * 1 => allocation would fail due to fragmentation + */ + return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); +} + +/* Same as __fragmentation index but allocs contig_page_info on stack */ +int fragmentation_index(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + return __fragmentation_index(order, &info); +} +#endif + +#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -432,7 +512,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, spin_unlock_irqrestore(&zone->lock, flags); } } +#endif +#ifdef CONFIG_PROC_FS static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -693,6 +775,16 @@ static const char * const vmstat_text[] = { "allocstall", "pgrotated", + +#ifdef CONFIG_COMPACTION + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", + "compact_stall", + "compact_fail", + "compact_success", +#endif + #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", @@ -954,3 +1046,162 @@ static int __init setup_vmstat(void) return 0; } module_init(setup_vmstat) + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) +#include <linux/debugfs.h> + +static struct dentry *extfrag_debug_root; + +/* + * Return an index indicating how much of the available free memory is 
+ * unusable for an allocation of the requested size. + */ +static int unusable_free_index(unsigned int order, + struct contig_page_info *info) +{ + /* No free memory is interpreted as all free memory is unusable */ + if (info->free_pages == 0) + return 1000; + + /* + * Index should be a value between 0 and 1. Return a value to 3 + * decimal places. + * + * 0 => no fragmentation + * 1 => high fragmentation + */ + return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); + +} + +static void unusable_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display unusable free space index + * + * The unusable free space index measures how much of the available free + * memory cannot be used to satisfy an allocation of a given size and is a + * value between 0 and 1. The higher the value, the more of free memory is + * unusable and by implication, the worse the external fragmentation is. This + * can be expressed as a percentage by multiplying by 100. 
+ */ +static int unusable_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + return 0; + + walk_zones_in_node(m, pgdat, unusable_show_print); + + return 0; +} + +static const struct seq_operations unusable_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = unusable_show, +}; + +static int unusable_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unusable_op); +} + +static const struct file_operations unusable_file_ops = { + .open = unusable_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void extfrag_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + + /* Alloc on stack as interrupts are disabled for zone walk */ + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = __fragmentation_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display fragmentation index for orders that allocations would fail for + */ +static int extfrag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, extfrag_show_print); + + return 0; +} + +static const struct seq_operations extfrag_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = extfrag_show, +}; + +static int extfrag_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &extfrag_op); +} + +static const struct file_operations extfrag_file_ops = { + .open = extfrag_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init extfrag_debug_init(void) +{ + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); + 
if (!extfrag_debug_root) + return -ENOMEM; + + if (!debugfs_create_file("unusable_index", 0444, + extfrag_debug_root, NULL, &unusable_file_ops)) + return -ENOMEM; + + if (!debugfs_create_file("extfrag_index", 0444, + extfrag_debug_root, NULL, &extfrag_file_ops)) + return -ENOMEM; + + return 0; +} + +module_init(extfrag_debug_init); +#endif |