diff options
author | Johannes Weiner <jweiner@redhat.com> | 2012-01-12 17:18:15 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-12 20:13:05 -0800 |
commit | 925b7673cce39116ce61e7a06683a4a0dad1e72a (patch) | |
tree | 66c134db836e531e196ee3dfc23c124ff74ac827 /mm/memcontrol.c | |
parent | 6290df545814990ca2663baf6e894669132d5f73 (diff) |
mm: make per-memcg LRU lists exclusive
Now that all code that operated on global per-zone LRU lists is
converted to operate on per-memory cgroup LRU lists instead, there is no
reason to keep the double-LRU scheme around any longer.
The pc->lru member is removed and page->lru is linked directly to the
per-memory cgroup LRU lists, which removes two pointers from a
descriptor that exists for every page frame in the system.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 311 |
1 files changed, 151 insertions, 160 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e7f849a1a9e..972878b648c2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -995,6 +995,27 @@ out: } EXPORT_SYMBOL(mem_cgroup_count_vm_event); +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @mem: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return &zone->lruvec; + + mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + return &mz->lruvec; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -1009,104 +1030,123 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); * When moving account, the page is not on LRU. It's isolated. */ -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) +/** + * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec + * @zone: zone of the page + * @page: the page + * @lru: current lru + * + * This function accounts for @page being added to @lru, and returns + * the lruvec for the given @zone and the memcg @page is charged to. + * + * The callsite is then responsible for physically linking the page to + * the returned lruvec->lists[@lru]. + */ +struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, + enum lru_list lru) { - struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + struct page_cgroup *pc; if (mem_cgroup_disabled()) - return; + return &zone->lruvec; + pc = lookup_page_cgroup(page); - /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) - return; - VM_BUG_ON(!pc->mem_cgroup); + VM_BUG_ON(PageCgroupAcctLRU(pc)); /* - * We don't check PCG_USED bit. It's cleared when the "page" is finally - * removed from global LRU. + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. */ - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - VM_BUG_ON(list_empty(&pc->lru)); - list_del_init(&pc->lru); -} - -void mem_cgroup_del_lru(struct page *page) -{ - mem_cgroup_del_lru_list(page, page_lru(page)); + smp_mb(); + /* + * If the page is uncharged, it may be freed soon, but it + * could also be swap cache (readahead, swapoff) that needs to + * be reclaimable in the future. root_mem_cgroup will babysit + * it for the time being. + */ + if (PageCgroupUsed(pc)) { + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + memcg = pc->mem_cgroup; + SetPageCgroupAcctLRU(pc); + } else + memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* compound_order() is stabilized through lru_lock */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); + return &mz->lruvec; } -/* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. +/** + * mem_cgroup_lru_del_list - account for removing an lru page + * @page: the page + * @lru: target lru + * + * This function accounts for @page being removed from @lru. + * + * The callsite is then responsible for physically unlinking + * @page->lru. */ -void mem_cgroup_rotate_reclaimable_page(struct page *page) +void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; - enum lru_list lru = page_lru(page); if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lruvec.lists[lru]); + /* + * root_mem_cgroup babysits uncharged LRU pages, but + * PageCgroupUsed is cleared when the page is about to get + * freed. PageCgroupAcctLRU remembers whether the + * LRU-accounting happened against pc->mem_cgroup or + * root_mem_cgroup. + */ + if (TestClearPageCgroupAcctLRU(pc)) { + VM_BUG_ON(!pc->mem_cgroup); + memcg = pc->mem_cgroup; + } else + memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); } -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +void mem_cgroup_lru_del(struct page *page) { - struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(page); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lruvec.lists[lru]); + mem_cgroup_lru_del_list(page, page_lru(page)); } -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) +/** + * mem_cgroup_lru_move_lists - account for moving a page between lrus + * @zone: zone of the page + * @page: the page + * @from: current lru + * @to: target lru + * + * This function accounts for @page being moved between the lrus @from + * and @to, and returns the lruvec for the given @zone and the memcg + * @page is charged to. + * + * The callsite is then responsible for physically relinking + * @page->lru to the returned lruvec->lists[@to]. + */ +struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); - SetPageCgroupAcctLRU(pc); - list_add(&pc->lru, &mz->lruvec.lists[lru]); + /* XXX: Optimize this, especially for @from == @to */ + mem_cgroup_lru_del_list(page, from); + return mem_cgroup_lru_add_list(zone, page, to); } /* @@ -1117,6 +1157,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) */ static void mem_cgroup_lru_del_before_commit(struct page *page) { + enum lru_list lru; unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1133,17 +1174,28 @@ static void mem_cgroup_lru_del_before_commit(struct page *page) return; spin_lock_irqsave(&zone->lru_lock, flags); + lru = page_lru(page); /* - * Forget old LRU when this page_cgroup is *not* used. This Used bit - * is guarded by lock_page() because the page is SwapCache. + * The uncharged page could still be registered to the LRU of + * the stale pc->mem_cgroup. + * + * As pc->mem_cgroup is about to get overwritten, the old LRU + * accounting needs to be taken care of. Let root_mem_cgroup + * babysit the page until the new memcg is responsible for it. + * + * The PCG_USED bit is guarded by lock_page() as the page is + * swapcache/pagecache. */ - if (!PageCgroupUsed(pc)) - mem_cgroup_del_lru_list(page, page_lru(page)); + if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) { + del_page_from_lru_list(zone, page, lru); + add_page_to_lru_list(zone, page, lru); + } spin_unlock_irqrestore(&zone->lru_lock, flags); } static void mem_cgroup_lru_add_after_commit(struct page *page) { + enum lru_list lru; unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1161,22 +1213,22 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) if (likely(!PageLRU(page))) return; spin_lock_irqsave(&zone->lru_lock, flags); - /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) - mem_cgroup_add_lru_list(page, page_lru(page)); + lru = page_lru(page); + /* + * If the page is not on the LRU, someone will soon put it + * there. If it is, and also already accounted for on the + * memcg-side, it must be on the right lruvec as setting + * pc->mem_cgroup and PageCgroupUsed is properly ordered. + * Otherwise, root_mem_cgroup has been babysitting the page + * during the charge. Move it to the new memcg now. + */ + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) { + del_page_from_lru_list(zone, page, lru); + add_page_to_lru_list(zone, page, lru); + } spin_unlock_irqrestore(&zone->lru_lock, flags); } - -void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to) -{ - if (mem_cgroup_disabled()) - return; - mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); -} - /* * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree @@ -1282,68 +1334,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return &mz->reclaim_stat; } -unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file) -{ - unsigned long nr_taken = 0; - struct page *page; - unsigned long scan; - LIST_HEAD(pc_list); - struct list_head *src; - struct page_cgroup *pc, *tmp; - int nid = zone_to_nid(z); - int zid = zone_idx(z); - struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * file + active; - int ret; - - BUG_ON(!mem_cont); - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lruvec.lists[lru]; - - scan = 0; - list_for_each_entry_safe_reverse(pc, tmp, src, lru) { - if (scan >= nr_to_scan) - break; - - if (unlikely(!PageCgroupUsed(pc))) - continue; - - page = lookup_cgroup_page(pc); - - if (unlikely(!PageLRU(page))) - continue; - - scan++; - ret = __isolate_lru_page(page, mode, file); - switch (ret) { - case 0: - list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); - break; - case -EBUSY: - /* we don't affect global LRU but rotate in our LRU */ - mem_cgroup_rotate_lru_list(page, page_lru(page)); - break; - default: - break; - } - } - - *scanned = scan; - - trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, - 0, 0, 0, mode); - - return nr_taken; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -3726,11 +3716,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct zone *zone; struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc, *busy; unsigned long flags, loop; struct list_head *list; + struct page *busy; + struct zone *zone; int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; @@ -3742,6 +3732,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, loop += 256; busy = NULL; while (loop--) { + struct page_cgroup *pc; struct page *page; ret = 0; @@ -3750,16 +3741,16 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, spin_unlock_irqrestore(&zone->lru_lock, flags); break; } - pc = list_entry(list->prev, struct page_cgroup, lru); - if (busy == pc) { - list_move(&pc->lru, list); + page = list_entry(list->prev, struct page, lru); + if (busy == page) { + list_move(&page->lru, list); busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = lookup_cgroup_page(pc); + pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); if (ret == -ENOMEM) @@ -3767,7 +3758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ - busy = pc; + busy = page; cond_resched(); } else busy = NULL; |