Diffstat (limited to 'mm/vmstat.c')
-rw-r--r-- | mm/vmstat.c | 417
1 file changed, 341 insertions, 76 deletions
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7997f52935c9..89cec42d19ff 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
  *
  * vm_stat contains the global counters
  */
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-EXPORT_SYMBOL(vm_stat);
+atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
 
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
  */
 void refresh_zone_stat_thresholds(void)
 {
+	struct pglist_data *pgdat;
 	struct zone *zone;
 	int cpu;
 	int threshold;
 
+	/* Zero current pgdat thresholds */
+	for_each_online_pgdat(pgdat) {
+		for_each_online_cpu(cpu) {
+			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
+		}
+	}
+
 	for_each_populated_zone(zone) {
+		struct pglist_data *pgdat = zone->zone_pgdat;
 		unsigned long max_drift, tolerate_drift;
 
 		threshold = calculate_normal_threshold(zone);
 
-		for_each_online_cpu(cpu)
+		for_each_online_cpu(cpu) {
+			int pgdat_threshold;
+
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
 
+			/* Base nodestat threshold on the largest populated zone. */
+			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
+			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
+				= max(threshold, pgdat_threshold);
+		}
+
 		/*
 		 * Only set percpu_drift_mark if there is a danger that
 		 * NR_FREE_PAGES reports the low watermark is ok when in fact
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+				long delta)
+{
+	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+	s8 __percpu *p = pcp->vm_node_stat_diff + item;
+	long x;
+	long t;
+
+	x = delta + __this_cpu_read(*p);
+
+	t = __this_cpu_read(pcp->stat_threshold);
+
+	if (unlikely(x > t || x < -t)) {
+		node_page_state_add(x, pgdat, item);
+		x = 0;
+	}
+	__this_cpu_write(*p, x);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
 /*
  * Optimized increment and decrement functions.
  *
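As a rough illustration of what __mod_node_page_state() above does, here is a small stand-alone user-space sketch of the same scheme: each CPU accumulates a small signed delta and only folds it into the shared counter once the delta crosses the threshold. The names (mod_state, STAT_THRESHOLD, cpu_diff, NCPUS) and the sizes are invented for the example; this is a model of the idea, not kernel code.

#include <stdio.h>

#define NCPUS		4
#define STAT_THRESHOLD	32

static long global_count;		/* models pgdat->vm_stat[item] */
static signed char cpu_diff[NCPUS];	/* models pcp->vm_node_stat_diff[item] */

static void mod_state(int cpu, long delta)
{
	long x = delta + cpu_diff[cpu];

	if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
		global_count += x;	/* the fold: node_page_state_add() */
		x = 0;
	}
	cpu_diff[cpu] = x;
}

int main(void)
{
	int i;

	for (i = 0; i < 1000; i++)
		mod_state(i % NCPUS, 1);

	/* The shared counter can lag by roughly NCPUS * STAT_THRESHOLD. */
	printf("global=%ld (true total 1000)\n", global_count);
	return 0;
}

The threshold is exactly this trade-off: updates stay CPU-local and cheap, at the cost of a bounded error in the globally visible value.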
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 	}
 }
 
+void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+	s8 __percpu *p = pcp->vm_node_stat_diff + item;
+	s8 v, t;
+
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;
+
+		node_page_state_add(v + overstep, pgdat, item);
+		__this_cpu_write(*p, -overstep);
+	}
+}
+
 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	__inc_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(__inc_zone_page_state);
 
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+	__inc_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 	}
 }
 
+void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+	s8 __percpu *p = pcp->vm_node_stat_diff + item;
+	s8 v, t;
+
+	v = __this_cpu_dec_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v < - t)) {
+		s8 overstep = t >> 1;
+
+		node_page_state_add(v - overstep, pgdat, item);
+		__this_cpu_write(*p, overstep);
+	}
+}
+
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	__dec_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+	__dec_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+
 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 /*
  * If we have cmpxchg_local support then we do not need to incur the overhead
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
  *     1       Overstepping half of threshold
  *     -1      Overstepping minus half of threshold
  */
-static inline void mod_state(struct zone *zone, enum zone_stat_item item,
-			     long delta, int overstep_mode)
+static inline void mod_zone_state(struct zone *zone,
+	enum zone_stat_item item, long delta, int overstep_mode)
 {
 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
 	s8 __percpu *p = pcp->vm_stat_diff + item;
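The __inc_node_state()/__dec_node_state() variants above do not reset the per-CPU delta to zero when they fold; they overstep by half a threshold so the next fold is pushed further away from the trigger point. A minimal single-slot model of that behaviour, with made-up names and a deliberately tiny threshold:

#include <stdio.h>

#define THRESHOLD 8

static long node_total;		/* shared counter */
static signed char diff;	/* this CPU's pending delta */

static void inc_state(void)
{
	signed char v = ++diff;

	if (v > THRESHOLD) {
		signed char overstep = THRESHOLD >> 1;

		node_total += v + overstep;	/* fold, plus half a threshold */
		diff = -overstep;		/* restart half a threshold below zero */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 40; i++)
		inc_state();
	printf("folded=%ld local=%d sum=%ld\n",
	       node_total, diff, node_total + diff);
	return 0;
}

At every point node_total + diff equals the true count; only the split between the shared and the per-CPU part changes.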
@@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 					long delta)
 {
-	mod_state(zone, item, delta, 0);
+	mod_zone_state(zone, item, delta, 0);
 }
 EXPORT_SYMBOL(mod_zone_page_state);
 
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
-	mod_state(zone, item, 1, 1);
-}
-
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-	mod_state(page_zone(page), item, 1, 1);
+	mod_zone_state(page_zone(page), item, 1, 1);
 }
 EXPORT_SYMBOL(inc_zone_page_state);
 
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-	mod_state(page_zone(page), item, -1, -1);
+	mod_zone_state(page_zone(page), item, -1, -1);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+	enum node_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+	s8 __percpu *p = pcp->vm_node_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to node counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong the cpu if we get
+		 * rescheduled while executing here. However, the next
+		 * counter update will apply the threshold again and
+		 * therefore bring the counter under the threshold again.
+		 *
+		 * Most of the time the thresholds are the same anyways
+		 * for all cpus in a node.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1) ;
+
+			/* Overflow must be added to node counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);
+
+	if (z)
+		node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+					long delta)
+{
+	mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+	mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+	mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+	mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
 #else
 /*
  * Use interrupt disable to serialize counter updates
@@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 }
 EXPORT_SYMBOL(mod_zone_page_state);
 
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__inc_zone_state(zone, item);
-	local_irq_restore(flags);
-}
-
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	unsigned long flags;
@@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
-#endif
 
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_node_state(pgdat, item);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_state);
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+					long delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_node_page_state(pgdat, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+	unsigned long flags;
+	struct pglist_data *pgdat;
+
+	pgdat = page_pgdat(page);
+	local_irq_save(flags);
+	__inc_node_state(pgdat, item);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__dec_node_page_state(page, item);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+#endif
 
 /*
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
-static int fold_diff(int *diff)
+static int fold_diff(int *zone_diff, int *node_diff)
 {
 	int i;
 	int changes = 0;
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		if (diff[i]) {
-			atomic_long_add(diff[i], &vm_stat[i]);
+		if (zone_diff[i]) {
+			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+			changes++;
+		}
+
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+		if (node_diff[i]) {
+			atomic_long_add(node_diff[i], &vm_node_stat[i]);
 			changes++;
 		}
 	return changes;
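On architectures with HAVE_CMPXCHG_LOCAL, mod_node_state() above avoids disabling interrupts and instead retries a this_cpu_cmpxchg() until the per-CPU slot is swapped atomically. The sketch below shows the shape of that loop using a single C11 atomic slot in user space; it is a simplified analogue (one slot instead of a real per-CPU variable), and all names are invented:

#include <stdatomic.h>
#include <stdio.h>

#define THRESHOLD 16

static long node_total;			/* shared counter */
static _Atomic signed char diff;	/* stands in for the per-CPU slot */

static void mod_state(int delta, int overstep_mode)
{
	signed char o, n;
	long z;

	do {
		z = 0;			/* overflow to the shared counter */
		o = atomic_load(&diff);
		n = o + delta;

		if (n > THRESHOLD || n < -THRESHOLD) {
			int os = overstep_mode * (THRESHOLD >> 1);

			z = n + os;	/* amount to fold out */
			n = -os;
		}
	} while (!atomic_compare_exchange_weak(&diff, &o, n));

	if (z)
		node_total += z;	/* node_page_state_add() in the kernel */
}

int main(void)
{
	int i;

	for (i = 0; i < 1000; i++)
		mod_state(1, 1);
	printf("total=%ld (shared %ld + local %d)\n",
	       node_total + atomic_load(&diff), node_total, atomic_load(&diff));
	return 0;
}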
@@ -462,9 +641,11 @@ static int fold_diff(int *diff)
  */
 static int refresh_cpu_vm_stats(bool do_pagesets)
 {
+	struct pglist_data *pgdat;
 	struct zone *zone;
 	int i;
-	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 	int changes = 0;
 
 	for_each_populated_zone(zone) {
@@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 
 			if (v) {
 				atomic_long_add(v, &zone->vm_stat[i]);
-				global_diff[i] += v;
+				global_zone_diff[i] += v;
 #ifdef CONFIG_NUMA
 				/* 3 seconds idle till flush */
 				__this_cpu_write(p->expire, 3);
@@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		}
 #endif
 	}
-	changes += fold_diff(global_diff);
+
+	for_each_online_pgdat(pgdat) {
+		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
+
+		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+			int v;
+
+			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
+			if (v) {
+				atomic_long_add(v, &pgdat->vm_stat[i]);
+				global_node_diff[i] += v;
+			}
+		}
+	}
+
+	changes += fold_diff(global_zone_diff, global_node_diff);
 
 	return changes;
 }
@@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
  */
 void cpu_vm_stats_fold(int cpu)
 {
+	struct pglist_data *pgdat;
 	struct zone *zone;
 	int i;
-	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *p;
@@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu)
 				v = p->vm_stat_diff[i];
 				p->vm_stat_diff[i] = 0;
 				atomic_long_add(v, &zone->vm_stat[i]);
-				global_diff[i] += v;
+				global_zone_diff[i] += v;
 			}
 	}
 
-	fold_diff(global_diff);
+	for_each_online_pgdat(pgdat) {
+		struct per_cpu_nodestat *p;
+
+		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+
+		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+			if (p->vm_node_stat_diff[i]) {
+				int v;
+
+				v = p->vm_node_stat_diff[i];
+				p->vm_node_stat_diff[i] = 0;
+				atomic_long_add(v, &pgdat->vm_stat[i]);
+				global_node_diff[i] += v;
+			}
+	}
+
+	fold_diff(global_zone_diff, global_node_diff);
 }
 
 /*
@@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 			int v = pset->vm_stat_diff[i];
 			pset->vm_stat_diff[i] = 0;
 			atomic_long_add(v, &zone->vm_stat[i]);
-			atomic_long_add(v, &vm_stat[i]);
+			atomic_long_add(v, &vm_zone_stat[i]);
 		}
 }
 #endif
 
 #ifdef CONFIG_NUMA
 /*
- * Determine the per node value of a stat item.
+ * Determine the per node value of a stat item. This function
+ * is called frequently in a NUMA machine, so try to be as
+ * frugal as possible.
  */
-unsigned long node_page_state(int node, enum zone_stat_item item)
+unsigned long sum_zone_node_page_state(int node,
+				 enum zone_stat_item item)
 {
 	struct zone *zones = NODE_DATA(node)->node_zones;
 	int i;
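refresh_cpu_vm_stats() and cpu_vm_stats_fold() above now walk two per-CPU diff arrays, one indexed by zone_stat_item and one by node_stat_item, and hand both residues to fold_diff(). A toy model of that folding step, with hypothetical item and CPU counts:

#include <stdio.h>

#define NCPUS		2
#define NR_ZONE_ITEMS	3
#define NR_NODE_ITEMS	2

static long vm_zone_stat[NR_ZONE_ITEMS];
static long vm_node_stat[NR_NODE_ITEMS];
static signed char zone_diff[NCPUS][NR_ZONE_ITEMS];
static signed char node_diff[NCPUS][NR_NODE_ITEMS];

/* Fold one CPU's residual deltas into both global arrays. */
static void cpu_stats_fold(int cpu)
{
	int i;

	for (i = 0; i < NR_ZONE_ITEMS; i++) {
		vm_zone_stat[i] += zone_diff[cpu][i];
		zone_diff[cpu][i] = 0;
	}
	for (i = 0; i < NR_NODE_ITEMS; i++) {
		vm_node_stat[i] += node_diff[cpu][i];
		node_diff[cpu][i] = 0;
	}
}

int main(void)
{
	zone_diff[1][0] = 5;	/* leftovers parked on CPU 1 ... */
	node_diff[1][1] = -3;
	cpu_stats_fold(1);	/* ... folded when that CPU is drained */
	printf("zone[0]=%ld node[1]=%ld\n", vm_zone_stat[0], vm_node_stat[1]);
	return 0;
}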
@@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
 
 	return count;
 }
+
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(struct pglist_data *pgdat,
+				enum node_stat_item item)
+{
+	long x = atomic_long_read(&pgdat->vm_stat[item]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 const char * const vmstat_text[] = {
 	/* enum zone_stat_item countes */
 	"nr_free_pages",
-	"nr_alloc_batch",
-	"nr_inactive_anon",
-	"nr_active_anon",
-	"nr_inactive_file",
-	"nr_active_file",
-	"nr_unevictable",
+	"nr_zone_inactive_anon",
+	"nr_zone_active_anon",
+	"nr_zone_inactive_file",
+	"nr_zone_active_file",
+	"nr_zone_unevictable",
+	"nr_zone_write_pending",
 	"nr_mlock",
-	"nr_anon_pages",
-	"nr_mapped",
-	"nr_file_pages",
-	"nr_dirty",
-	"nr_writeback",
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",
 	"nr_page_table_pages",
 	"nr_kernel_stack",
-	"nr_unstable",
 	"nr_bounce",
-	"nr_vmscan_write",
-	"nr_vmscan_immediate_reclaim",
-	"nr_writeback_temp",
-	"nr_isolated_anon",
-	"nr_isolated_file",
-	"nr_shmem",
-	"nr_dirtied",
-	"nr_written",
-	"nr_pages_scanned",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
 #endif
@@ -729,13 +944,35 @@ const char * const vmstat_text[] = {
 	"numa_local",
 	"numa_other",
 #endif
+	"nr_free_cma",
+
+	/* Node-based counters */
+	"nr_inactive_anon",
+	"nr_active_anon",
+	"nr_inactive_file",
+	"nr_active_file",
+	"nr_unevictable",
+	"nr_isolated_anon",
+	"nr_isolated_file",
+	"nr_pages_scanned",
 	"workingset_refault",
 	"workingset_activate",
 	"workingset_nodereclaim",
-	"nr_anon_transparent_hugepages",
+	"nr_anon_pages",
+	"nr_mapped",
+	"nr_file_pages",
+	"nr_dirty",
+	"nr_writeback",
+	"nr_writeback_temp",
+	"nr_shmem",
 	"nr_shmem_hugepages",
 	"nr_shmem_pmdmapped",
-	"nr_free_cma",
+	"nr_anon_transparent_hugepages",
+	"nr_unstable",
+	"nr_vmscan_write",
+	"nr_vmscan_immediate_reclaim",
+	"nr_dirtied",
+	"nr_written",
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
@@ -749,6 +986,8 @@ const char * const vmstat_text[] = {
 	"pswpout",
 
 	TEXTS_FOR_ZONES("pgalloc")
+	TEXTS_FOR_ZONES("allocstall")
+	TEXTS_FOR_ZONES("pgskip")
 
 	"pgfree",
 	"pgactivate",
@@ -758,11 +997,11 @@ const char * const vmstat_text[] = {
 	"pgmajfault",
 	"pglazyfreed",
 
-	TEXTS_FOR_ZONES("pgrefill")
-	TEXTS_FOR_ZONES("pgsteal_kswapd")
-	TEXTS_FOR_ZONES("pgsteal_direct")
-	TEXTS_FOR_ZONES("pgscan_kswapd")
-	TEXTS_FOR_ZONES("pgscan_direct")
+	"pgrefill",
+	"pgsteal_kswapd",
+	"pgsteal_direct",
+	"pgscan_kswapd",
+	"pgscan_direct",
 	"pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
@@ -774,7 +1013,6 @@ const char * const vmstat_text[] = {
 	"kswapd_low_wmark_hit_quickly",
 	"kswapd_high_wmark_hit_quickly",
 	"pageoutrun",
-	"allocstall",
 
 	"pgrotated",
 
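Because deltas are folded lazily and per CPU, the global node counter can be read while matching increments are still parked in another CPU's diff, so it can transiently appear negative; that is why the SMP build of node_page_state() above clamps negative values to zero. A small model of that situation (all values invented):

#include <stdio.h>

#define NCPUS 2

static long vm_node_stat_item = -2;		/* transiently negative global */
static signed char cpu_diff[NCPUS] = { 3, 1 };	/* updates not yet folded */

static long node_page_state_read(void)
{
	long x = vm_node_stat_item;

	if (x < 0)	/* clamp, as the SMP node_page_state() does */
		x = 0;
	return x;
}

int main(void)
{
	printf("reported=%ld, eventually %ld\n", node_page_state_read(),
	       vm_node_stat_item + cpu_diff[0] + cpu_diff[1]);
	return 0;
}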
@@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = {
 	.release	= seq_release,
 };
 
+static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
+{
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct zone *compare = &pgdat->node_zones[zid];
+
+		if (populated_zone(compare))
+			return zone == compare;
+	}
+
+	/* The zone must be somewhere! */
+	WARN_ON_ONCE(1);
+	return false;
+}
+
 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 							struct zone *zone)
 {
 	int i;
 	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+	if (is_zone_first_populated(pgdat, zone)) {
+		seq_printf(m, "\n  per-node stats");
+		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+			seq_printf(m, "\n      %-12s %lu",
+				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+				node_page_state(pgdat, i));
+		}
+	}
 	seq_printf(m,
 		   "\n  pages free     %lu"
 		   "\n        min      %lu"
 		   "\n        low      %lu"
 		   "\n        high     %lu"
-		   "\n        scanned  %lu"
+		   "\n   node_scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu"
 		   "\n        managed  %lu",
@@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone_page_state(zone, NR_PAGES_SCANNED),
+		   node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
 		   zone->spanned_pages,
 		   zone->present_pages,
 		   zone->managed_pages);
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
+		seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
 				zone_page_state(zone, i));
 
 	seq_printf(m,
@@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 #endif
 	}
 	seq_printf(m,
-		   "\n  all_unreclaimable:   %u"
-		   "\n  start_pfn:           %lu"
-		   "\n  inactive_ratio:      %u",
-		   !zone_reclaimable(zone),
+		   "\n  node_unreclaimable:  %u"
+		   "\n  start_pfn:           %lu"
+		   "\n  node_inactive_ratio: %u",
+		   !pgdat_reclaimable(zone->zone_pgdat),
 		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   zone->zone_pgdat->inactive_ratio);
 	seq_putc(m, '\n');
 }
 
@@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
@@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 		v[i] = global_page_state(i);
 	v += NR_VM_ZONE_STAT_ITEMS;
 
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+		v[i] = global_node_page_state(i);
+	v += NR_VM_NODE_STAT_ITEMS;
+
 	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
 			    v + NR_DIRTY_THRESHOLD);
 	v += NR_VM_WRITEBACK_STAT_ITEMS;
@@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
 {
 	unsigned long *l = arg;
 	unsigned long off = l - (unsigned long *)m->private;
-
 	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
 	return 0;
 }
@@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write,
 	if (err)
 		return err;
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
-		val = atomic_long_read(&vm_stat[i]);
+		val = atomic_long_read(&vm_zone_stat[i]);
 		if (val < 0) {
 			switch (i) {
-			case NR_ALLOC_BATCH:
 			case NR_PAGES_SCANNED:
 				/*
-				 * These are often seen to go negative in
+				 * This is often seen to go negative in
 				 * recent kernels, but not to go permanently
 				 * negative.  Whilst it would be nicer not to
 				 * have exceptions, rooting them out would be
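On the /proc side, vmstat_start() above sizes one flat buffer and fills it in the same order as vmstat_text: zone items first, then the new node items, then the writeback thresholds, so the printed names and values line up. A stand-alone sketch of that packing, with made-up item names, counts and values:

#include <stdio.h>

#define NR_ZONE_ITEMS		2
#define NR_NODE_ITEMS		2
#define NR_WRITEBACK_ITEMS	2

static const char * const text[] = {
	"nr_free_pages", "nr_mlock",		/* zone items         */
	"nr_inactive_anon", "nr_active_anon",	/* node items follow  */
	"nr_dirty_threshold", "nr_dirty_background_threshold",
};

int main(void)
{
	unsigned long v[NR_ZONE_ITEMS + NR_NODE_ITEMS + NR_WRITEBACK_ITEMS];
	unsigned long *p = v;
	int i;

	for (i = 0; i < NR_ZONE_ITEMS; i++)		/* global_page_state()      */
		*p++ = 10 + i;
	for (i = 0; i < NR_NODE_ITEMS; i++)		/* global_node_page_state() */
		*p++ = 20 + i;
	for (i = 0; i < NR_WRITEBACK_ITEMS; i++)	/* dirty thresholds         */
		*p++ = 30 + i;

	for (i = 0; i < (int)(sizeof(v) / sizeof(v[0])); i++)
		printf("%s %lu\n", text[i], v[i]);
	return 0;
}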