summary | refs | log | tree | commit | diff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2021-09-03 10:08:28 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2021-09-03 10:08:28 -0700
commit 14726903c835101cd8d0a703b609305094350d61 (patch)
tree 5cdcf5d2f06ca14be76efd33a4de0e3b28a70de0 /mm/memcontrol.c
parent a9c9a6f741cdaa2fa9ba24a790db8d07295761e3 (diff)
parent d5fffc5aff269717a035baa087630adca612a6c4 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
 "173 patches.

  Subsystems affected by this series: ia64, ocfs2, block, and mm (debug,
  pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap,
  bootmem, sparsemem, vmalloc, kasan, pagealloc, memory-failure,
  hugetlb, userfaultfd, vmscan, compaction, mempolicy, memblock,
  oom-kill, migration, ksm, percpu, vmstat, and madvise)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (173 commits)
  mm/madvise: add MADV_WILLNEED to process_madvise()
  mm/vmstat: remove unneeded return value
  mm/vmstat: simplify the array size calculation
  mm/vmstat: correct some wrong comments
  mm/percpu,c: remove obsolete comments of pcpu_chunk_populated()
  selftests: vm: add COW time test for KSM pages
  selftests: vm: add KSM merging time test
  mm: KSM: fix data type
  selftests: vm: add KSM merging across nodes test
  selftests: vm: add KSM zero page merging test
  selftests: vm: add KSM unmerge test
  selftests: vm: add KSM merge test
  mm/migrate: correct kernel-doc notation
  mm: wire up syscall process_mrelease
  mm: introduce process_mrelease system call
  memblock: make memblock_find_in_range method private
  mm/mempolicy.c: use in_task() in mempolicy_slab_node()
  mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies
  mm/mempolicy: advertise new MPOL_PREFERRED_MANY
  mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY
  ...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- mm/memcontrol.c 228
1 files changed, 101 insertions, 127 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 389b5766e74f..b762215d73eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,6 +103,14 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
+/* memcg and lruvec stats flushing */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static void flush_memcg_stats_work(struct work_struct *w);
+static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work);
+static DEFINE_PER_CPU(unsigned int, stats_flush_threshold);
+static DEFINE_SPINLOCK(stats_flush_lock);
+
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
@@ -248,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
return &memcg->vmpressure;
}
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
- return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+ return container_of(vmpr, struct mem_cgroup, vmpressure);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -646,17 +654,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
-static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
long x = 0;
@@ -671,23 +668,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
return x;
}
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
-{
- struct mem_cgroup *parent;
-
- parent = parent_mem_cgroup(pn->memcg);
- if (!parent)
- return NULL;
- return parent->nodeinfo[nid];
-}
-
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -696,21 +681,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__mod_memcg_state(memcg, idx, val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
- if (vmstat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
-
- x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > threshold)) {
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
- struct mem_cgroup_per_node *pi;
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
- atomic_long_add(x, &pi->lruvec_stat[idx]);
- x = 0;
- }
- __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+ if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH))
+ queue_work(system_unbound_wq, &stats_flush_work);
}
/**
@@ -905,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task);
static __always_inline struct mem_cgroup *active_memcg(void)
{
- if (in_interrupt())
+ if (!in_task())
return this_cpu_read(int_active_memcg);
else
return current->active_memcg;
@@ -2205,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy)
unsigned long flags;
/*
- * The only protection from memory hotplug vs. drain_stock races is
- * that we always operate on local CPU stock here with IRQ disabled
+ * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
+ * drain_stock races is that we always operate on local CPU stock
+ * here with IRQ disabled
*/
local_irq_save(flags);
@@ -2273,7 +2247,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
- if (obj_stock_flush_required(stock, root_memcg))
+ else if (obj_stock_flush_required(stock, root_memcg))
flush = true;
rcu_read_unlock();
@@ -2289,40 +2263,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
-{
- int nid;
-
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
- unsigned long stat[NR_VM_NODE_STAT_ITEMS];
- struct batched_lruvec_stat *lstatc;
- int i;
-
- lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- stat[i] = lstatc->count[i];
- lstatc->count[i] = 0;
- }
-
- do {
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
-}
-
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- for_each_mem_cgroup(memcg)
- memcg_flush_lruvec_page_state(memcg, cpu);
-
return 0;
}
@@ -4116,7 +4063,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val > 100)
+ if (val > 200)
return -EINVAL;
if (!mem_cgroup_is_root(memcg))
@@ -4668,7 +4615,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
atomic_read(&frn->done.cnt) == 1) {
frn->at = 0;
trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
- cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
WB_REASON_FOREIGN_FLUSH,
&frn->done);
}
@@ -4892,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
vfs_poll(efile.file, &event->pt);
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irq(&memcg->event_list_lock);
fdput(cfile);
fdput(efile);
@@ -5129,17 +5076,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_local) {
- kfree(pn);
- return 1;
- }
-
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_cpu) {
- free_percpu(pn->lruvec_stat_local);
+ pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stats_percpu) {
kfree(pn);
return 1;
}
@@ -5160,8 +5099,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;
- free_percpu(pn->lruvec_stat_cpu);
- free_percpu(pn->lruvec_stat_local);
+ free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5177,15 +5115,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
- int cpu;
-
memcg_wb_domain_exit(memcg);
- /*
- * Flush percpu lruvec stats to guarantee the value
- * correctness on parent's and all ancestor levels.
- */
- for_each_online_cpu(cpu)
- memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5321,6 +5251,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ 2UL*HZ);
return 0;
}
@@ -5334,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irq(&memcg->event_list_lock);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irq(&memcg->event_list_lock);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
@@ -5412,13 +5346,33 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
+void mem_cgroup_flush_stats(void)
+{
+ if (!spin_trylock(&stats_flush_lock))
+ return;
+
+ cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+ spin_unlock(&stats_flush_lock);
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
+static void flush_memcg_stats_work(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+}
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
long delta, v;
- int i;
+ int i, nid;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5466,6 +5420,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent)
parent->vmstats.events_pending[i] += delta;
}
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *ppn = NULL;
+ struct lruvec_stats_percpu *lstatc;
+
+ if (parent)
+ ppn = parent->nodeinfo[nid];
+
+ lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ delta = pn->lruvec_stats.state_pending[i];
+ if (delta)
+ pn->lruvec_stats.state_pending[i] = 0;
+
+ v = READ_ONCE(lstatc->state[i]);
+ if (v != lstatc->state_prev[i]) {
+ delta += v - lstatc->state_prev[i];
+ lstatc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
+ }
}
#ifdef CONFIG_MMU
@@ -6399,6 +6383,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@@ -6704,8 +6690,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
-static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
- gfp_t gfp)
+static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
{
unsigned int nr_pages = thp_nr_pages(page);
int ret;
@@ -6726,7 +6711,7 @@ out:
}
/**
- * mem_cgroup_charge - charge a newly allocated page to a cgroup
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
@@ -6739,16 +6724,14 @@ out:
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
struct mem_cgroup *memcg;
int ret;
- if (mem_cgroup_disabled())
- return 0;
-
memcg = get_mem_cgroup_from_mm(mm);
- ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+ ret = charge_memcg(page, memcg, gfp_mask);
css_put(&memcg->css);
return ret;
@@ -6783,7 +6766,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
memcg = get_mem_cgroup_from_mm(mm);
rcu_read_unlock();
- ret = __mem_cgroup_charge(page, memcg, gfp);
+ ret = charge_memcg(page, memcg, gfp);
css_put(&memcg->css);
return ret;
@@ -6919,18 +6902,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
}
/**
- * mem_cgroup_uncharge - uncharge a page
+ * __mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
- * Uncharge a page previously charged with mem_cgroup_charge().
+ * Uncharge a page previously charged with __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct page *page)
{
struct uncharge_gather ug;
- if (mem_cgroup_disabled())
- return;
-
/* Don't touch page->lru of any random page, pre-check: */
if (!page_memcg(page))
return;
@@ -6941,20 +6921,17 @@ void mem_cgroup_uncharge(struct page *page)
}
/**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of page
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_charge().
+ * __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
struct uncharge_gather ug;
struct page *page;
- if (mem_cgroup_disabled())
- return;
-
uncharge_gather_clear(&ug);
list_for_each_entry(page, page_list, lru)
uncharge_page(page, &ug);
@@ -7244,7 +7221,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
}
/**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
@@ -7252,16 +7229,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
unsigned int nr_pages = thp_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
- if (mem_cgroup_disabled())
- return 0;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
@@ -7297,11 +7271,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
}
/**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;