diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 281 |
1 files changed, 184 insertions, 97 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bac2de4b9c42..2d8549ae1b30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -88,13 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; -/* Whether the swap controller is active */ -#ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __ro_after_init; -#else -#define cgroup_memory_noswap 1 -#endif - #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -102,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -662,6 +655,81 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_MEMCG_EVENTS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_MEMCG_EVENTS]; +}; + +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -809,27 +877,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - if (mem_cgroup_disabled()) + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats.events[event]); + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { long x = 0; int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); return x; } @@ -1136,7 +1214,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } while ((memcg = parent_mem_cgroup(memcg))); /* - * When cgruop1 non-hierarchy mode is used, + * When cgroup1 non-hierarchy mode is used, * parent_mem_cgroup() does not walk all the way up to the * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. @@ -1461,29 +1539,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, -#endif -}; - static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; @@ -1524,10 +1579,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT)); - for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { + if (memcg_vm_event_stat[i] == PGPGIN || + memcg_vm_event_stat[i] == PGPGOUT) + continue; + seq_buf_printf(&s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); + } /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); @@ -1601,17 +1661,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max = READ_ONCE(memcg->memory.max); - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - if (mem_cgroup_swappiness(memcg)) - max += min(READ_ONCE(memcg->swap.max), - (unsigned long)total_swap_pages); - } else { /* v1 */ + if (do_memsw_account()) { if (mem_cgroup_swappiness(memcg)) { /* Calculate swap excess capacity from memsw limit */ unsigned long swap = READ_ONCE(memcg->memsw.max) - max; max += min(swap, (unsigned long)total_swap_pages); } + } else { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); } return max; } @@ -2783,6 +2843,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() */ folio->memcg_data = (unsigned long)memcg; } @@ -3356,7 +3417,7 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -3969,6 +4030,8 @@ static const unsigned int memcg1_stats[] = { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, MEMCG_SWAP, }; @@ -3982,6 +4045,8 @@ static const char *const memcg1_stat_names[] = { "mapped_file", "dirty", "writeback", + "workingset_refault_anon", + "workingset_refault_file", "swap", }; @@ -4010,7 +4075,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4041,7 +4107,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * PAGE_SIZE); + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -5158,12 +5224,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); } static void mem_cgroup_free(struct mem_cgroup *memcg) { + lru_gen_exit_memcg(memcg); memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -5186,6 +5254,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5222,6 +5294,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); return memcg; fail: mem_cgroup_id_remove(memcg); @@ -5256,6 +5329,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); @@ -5404,9 +5478,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. */ - delta = memcg->vmstats.state_pending[i]; + delta = memcg->vmstats->state_pending[i]; if (delta) - memcg->vmstats.state_pending[i] = 0; + memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ v = READ_ONCE(statc->state[i]); @@ -5419,15 +5493,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) continue; /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats.state[i] += delta; + memcg->vmstats->state[i] += delta; if (parent) - parent->vmstats.state_pending[i] += delta; + parent->vmstats->state_pending[i] += delta; } - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - delta = memcg->vmstats.events_pending[i]; + for (i = 0; i < NR_MEMCG_EVENTS; i++) { + delta = memcg->vmstats->events_pending[i]; if (delta) - memcg->vmstats.events_pending[i] = 0; + memcg->vmstats->events_pending[i] = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { @@ -5438,9 +5512,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (!delta) continue; - memcg->vmstats.events[i] += delta; + memcg->vmstats->events[i] += delta; if (parent) - parent->vmstats.events_pending[i] += delta; + parent->vmstats->events_pending[i] += delta; } for_each_node_state(nid, N_MEMORY) { @@ -5555,7 +5629,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Because lookup_swap_cache() updates some statistics counter, + * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); @@ -5865,7 +5939,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6163,9 +6237,7 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } @@ -6190,6 +6262,30 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + /* find the first leader if there is any */ + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && READ_ONCE(task->mm->owner) == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6595,6 +6691,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, @@ -6807,21 +6904,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin - * @page: page to charge + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the page is allocated + * @entry: swap entry for which the folio is allocated * - * This function charges a page allocated for swapin. Please call this before - * adding the page to the swapcache. + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { - struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; @@ -7194,7 +7290,7 @@ static int __init mem_cgroup_init(void) } subsys_initcall(mem_cgroup_init); -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { @@ -7232,7 +7328,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (mem_cgroup_disabled()) return; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!do_memsw_account()) return; memcg = folio_memcg(folio); @@ -7261,7 +7357,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (!cgroup_memory_noswap && memcg != swap_memcg) { + if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7297,7 +7393,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return 0; memcg = folio_memcg(folio); @@ -7313,7 +7409,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && + if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7341,15 +7437,18 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, nr_pages); - else + if (!mem_cgroup_is_root(memcg)) { + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + else + page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); @@ -7361,7 +7460,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7370,18 +7469,18 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } -bool mem_cgroup_swap_full(struct page *page) +bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return false; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return false; @@ -7398,10 +7497,9 @@ bool mem_cgroup_swap_full(struct page *page) static int __init setup_swap_account(char *s) { - if (!strcmp(s, "1")) - cgroup_memory_noswap = false; - else if (!strcmp(s, "0")) - cgroup_memory_noswap = true; + pr_warn_once("The swapaccount= commandline option is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); @@ -7670,20 +7768,9 @@ static struct cftype zswap_files[] = { }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -/* - * If mem_cgroup_swap_init() is implemented as a subsys_initcall() - * instead of a core_initcall(), this could mean cgroup_memory_noswap still - * remains set to false even when memcg is disabled via "cgroup_disable=memory" - * boot parameter. This may result in premature OOPS inside - * mem_cgroup_get_nr_swap_pages() function in corner cases. - */ static int __init mem_cgroup_swap_init(void) { - /* No memory control -> no swap control */ if (mem_cgroup_disabled()) - cgroup_memory_noswap = true; - - if (cgroup_memory_noswap) return 0; WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); @@ -7693,6 +7780,6 @@ static int __init mem_cgroup_swap_init(void) #endif return 0; } -core_initcall(mem_cgroup_swap_init); +subsys_initcall(mem_cgroup_swap_init); -#endif /* CONFIG_MEMCG_SWAP */ +#endif /* CONFIG_SWAP */ |