diff options
Diffstat (limited to 'mm/vmstat.c')
| -rw-r--r-- | mm/vmstat.c | 1251 |
1 files changed, 774 insertions, 477 deletions
diff --git a/mm/vmstat.c b/mm/vmstat.c index 83b30edc2f7f..65de88cdf40e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/vmstat.c * @@ -6,7 +7,7 @@ * * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., - * Christoph Lameter <christoph@lameter.com> + * Christoph Lameter <cl@gentwo.org> * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> @@ -25,26 +26,27 @@ #include <linux/writeback.h> #include <linux/compaction.h> #include <linux/mm_inline.h> -#include <linux/page_ext.h> #include <linux/page_owner.h> +#include <linux/sched/isolation.h> #include "internal.h" -#define NUMA_STATS_THRESHOLD (U16_MAX - 2) - +#ifdef CONFIG_PROC_FS #ifdef CONFIG_NUMA -int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; +#define ENABLE_NUMA_STAT 1 +static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; /* zero numa counters within a zone */ static void zero_zone_numa_counters(struct zone *zone) { int item, cpu; - for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { - atomic_long_set(&zone->vm_numa_stat[item], 0); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_event[item], 0); + for_each_online_cpu(cpu) { + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] = 0; + } } } @@ -62,8 +64,8 @@ static void zero_global_numa_counters(void) { int item; - for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) - atomic_long_set(&vm_numa_stat[item], 0); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + atomic_long_set(&vm_numa_event[item], 0); } static void invalid_numa_statistics(void) @@ -74,8 +76,8 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); -int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -102,6 +104,7 @@ out: return ret; } #endif +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -129,9 +132,9 @@ static void sum_vm_events(unsigned long *ret) */ void all_vm_events(unsigned long *ret) { - get_online_cpus(); + cpus_read_lock(); sum_vm_events(ret); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(all_vm_events); @@ -160,12 +163,39 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; -atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); -EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); +#ifdef CONFIG_NUMA +static void fold_vm_zone_numa_events(struct zone *zone) +{ + unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; + int cpu; + enum numa_stat_item item; + + for_each_online_cpu(cpu) { + struct per_cpu_zonestat *pzstats; + + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); + } + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_event_add(zone_numa_events[item], zone, item); +} + +void fold_vm_numa_events(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + fold_vm_zone_numa_events(zone); +} +#endif + #ifdef CONFIG_SMP int calculate_pressure_threshold(struct zone *zone) @@ -205,7 +235,7 @@ int calculate_normal_threshold(struct zone *zone) * * Some sample thresholds: * - * Threshold Processors (fls) Zonesize fls(mem+1) + * Threshold Processors (fls) Zonesize fls(mem)+1 * ------------------------------------------------------------------ * 8 1 1 0.9-1 GB 4 * 16 2 2 0.9-1 GB 4 @@ -265,7 +295,7 @@ void refresh_zone_stat_thresholds(void) for_each_online_cpu(cpu) { int pgdat_threshold; - per_cpu_ptr(zone->pageset, cpu)->stat_threshold + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold = threshold; /* Base nodestat threshold on the largest populated zone. */ @@ -302,7 +332,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, threshold = (*calculate_pressure)(zone); for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold = threshold; } } @@ -315,20 +345,31 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; long x; long t; + /* + * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels, + * atomicity is provided by IRQs being disabled -- either explicitly + * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables + * CPU migrations and preemption potentially corrupts a counter so + * disable preemption. + */ + preempt_disable_nested(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { zone_page_state_add(x, zone, item); x = 0; } __this_cpu_write(*p, x); + + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -340,15 +381,31 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long x; long t; + if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + + /* See __mod_zone_page_state() */ + preempt_disable_nested(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { node_page_state_add(x, pgdat, item); x = 0; } __this_cpu_write(*p, x); + + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -377,10 +434,13 @@ EXPORT_SYMBOL(__mod_node_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_zone_page_state() */ + preempt_disable_nested(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -389,6 +449,8 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + + preempt_enable_nested(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -397,6 +459,11 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) s8 __percpu *p = pcp->vm_node_stat_diff + item; s8 v, t; + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + /* See __mod_zone_page_state() */ + preempt_disable_nested(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -405,6 +472,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); } + + preempt_enable_nested(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -421,10 +490,13 @@ EXPORT_SYMBOL(__inc_node_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_zone_page_state() */ + preempt_disable_nested(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -433,6 +505,8 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + + preempt_enable_nested(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -441,6 +515,11 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) s8 __percpu *p = pcp->vm_node_stat_diff + item; s8 v, t; + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + /* See __mod_zone_page_state() */ + preempt_disable_nested(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -449,6 +528,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); } + + preempt_enable_nested(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) @@ -479,10 +560,12 @@ EXPORT_SYMBOL(__dec_node_page_state); static inline void mod_zone_state(struct zone *zone, enum zone_stat_item item, long delta, int overstep_mode) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; - long o, n, t, z; + long n, t, z; + s8 o; + o = this_cpu_read(*p); do { z = 0; /* overflow to zone counters */ @@ -498,17 +581,16 @@ static inline void mod_zone_state(struct zone *zone, */ t = this_cpu_read(pcp->stat_threshold); - o = this_cpu_read(*p); - n = delta + o; + n = delta + (long)o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to zone counters */ z = n + os; n = -os; } - } while (this_cpu_cmpxchg(*p, o, n) != o); + } while (!this_cpu_try_cmpxchg(*p, &o, n)); if (z) zone_page_state_add(z, zone, item); @@ -538,8 +620,21 @@ static inline void mod_node_state(struct pglist_data *pgdat, { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; s8 __percpu *p = pcp->vm_node_stat_diff + item; - long o, n, t, z; + long n, t, z; + s8 o; + + if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + o = this_cpu_read(*p); do { z = 0; /* overflow to node counters */ @@ -555,17 +650,16 @@ static inline void mod_node_state(struct pglist_data *pgdat, */ t = this_cpu_read(pcp->stat_threshold); - o = this_cpu_read(*p); - n = delta + o; + n = delta + (long)o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to node counters */ z = n + os; n = -os; } - } while (this_cpu_cmpxchg(*p, o, n) != o); + } while (!this_cpu_try_cmpxchg(*p, &o, n)); if (z) node_page_state_add(z, pgdat, item); @@ -677,53 +771,29 @@ EXPORT_SYMBOL(dec_node_page_state); /* * Fold a differential into the global counters. - * Returns the number of counters updated. + * Returns whether counters were updated. */ -#ifdef CONFIG_NUMA -static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) -{ - int i; - int changes = 0; - - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (zone_diff[i]) { - atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; - } - - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (numa_diff[i]) { - atomic_long_add(numa_diff[i], &vm_numa_stat[i]); - changes++; - } - - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - if (node_diff[i]) { - atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; - } - return changes; -} -#else static int fold_diff(int *zone_diff, int *node_diff) { int i; - int changes = 0; + bool changed = false; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; + changed = true; + } } - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + changed = true; + } } - return changes; + + return changed; } -#endif /* CONFIG_NUMA */ /* * Update the zone counters for the current cpu. @@ -739,52 +809,42 @@ static int fold_diff(int *zone_diff, int *node_diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. * - * The function returns the number of global counters updated. + * The function returns whether global counters were updated. */ -static int refresh_cpu_vm_stats(bool do_pagesets) +static bool refresh_cpu_vm_stats(bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; -#ifdef CONFIG_NUMA - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; -#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; - int changes = 0; + bool changed = false; for_each_populated_zone(zone) { - struct per_cpu_pageset __percpu *p = zone->pageset; + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; + struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { int v; - v = this_cpu_xchg(p->vm_stat_diff[i], 0); + v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0); if (v) { atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ - __this_cpu_write(p->expire, 3); + __this_cpu_write(pcp->expire, 3); #endif } } -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - int v; - - v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); - if (v) { - - atomic_long_add(v, &zone->vm_numa_stat[i]); - global_numa_diff[i] += v; - __this_cpu_write(p->expire, 3); - } - } if (do_pagesets) { cond_resched(); + + if (decay_pcp_high(zone, this_cpu_ptr(pcp))) + changed = true; +#ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this * processor @@ -792,27 +852,29 @@ static int refresh_cpu_vm_stats(bool do_pagesets) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!__this_cpu_read(p->expire) || - !__this_cpu_read(p->pcp.count)) + if (!__this_cpu_read(pcp->expire) || + !__this_cpu_read(pcp->count)) continue; /* * We never drain zones local to this processor. */ if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); + __this_cpu_write(pcp->expire, 0); continue; } - if (__this_cpu_dec_return(p->expire)) + if (__this_cpu_dec_return(pcp->expire)) { + changed = true; continue; + } - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(pcp->count)) { + drain_zone_pages(zone, this_cpu_ptr(pcp)); + changed = true; } - } #endif + } } for_each_online_pgdat(pgdat) { @@ -829,13 +891,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } -#ifdef CONFIG_NUMA - changes += fold_diff(global_zone_diff, global_numa_diff, - global_node_diff); -#else - changes += fold_diff(global_zone_diff, global_node_diff); -#endif - return changes; + if (fold_diff(global_zone_diff, global_node_diff)) + changed = true; + return changed; } /* @@ -849,36 +907,33 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; -#ifdef CONFIG_NUMA - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; -#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { - struct per_cpu_pageset *p; + struct per_cpu_zonestat *pzstats; - p = per_cpu_ptr(zone->pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (p->vm_stat_diff[i]) { + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + if (pzstats->vm_stat_diff[i]) { int v; - v = p->vm_stat_diff[i]; - p->vm_stat_diff[i] = 0; + v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } - + } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (p->vm_numa_stat_diff[i]) { - int v; + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + unsigned long v; - v = p->vm_numa_stat_diff[i]; - p->vm_numa_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_numa_stat[i]); - global_numa_diff[i] += v; + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); } + } #endif } @@ -898,58 +953,39 @@ void cpu_vm_stats_fold(int cpu) } } -#ifdef CONFIG_NUMA - fold_diff(global_zone_diff, global_numa_diff, global_node_diff); -#else fold_diff(global_zone_diff, global_node_diff); -#endif } /* * this is only called if !populated_zone(zone), which implies no other users of - * pset->vm_stat_diff[] exsist. + * pset->vm_stat_diff[] exist. */ -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { + unsigned long v; int i; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (pset->vm_stat_diff[i]) { - int v = pset->vm_stat_diff[i]; - pset->vm_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_stat[i]); - atomic_long_add(v, &vm_zone_stat[i]); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + if (pzstats->vm_stat_diff[i]) { + v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; + zone_page_state_add(v, zone, i); } + } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pset->vm_numa_stat_diff[i]) { - int v = pset->vm_numa_stat_diff[i]; - - pset->vm_numa_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_numa_stat[i]); - atomic_long_add(v, &vm_numa_stat[i]); + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); } + } #endif } #endif #ifdef CONFIG_NUMA -void __inc_numa_state(struct zone *zone, - enum numa_stat_item item) -{ - struct per_cpu_pageset __percpu *pcp = zone->pageset; - u16 __percpu *p = pcp->vm_numa_stat_diff + item; - u16 v; - - v = __this_cpu_inc_return(*p); - - if (unlikely(v > NUMA_STATS_THRESHOLD)) { - zone_numa_state_add(v, zone, item); - __this_cpu_write(*p, 0); - } -} - /* * Determine the per node value of a stat item. This function * is called frequently in a NUMA machine, so try to be as @@ -968,19 +1004,16 @@ unsigned long sum_zone_node_page_state(int node, return count; } -/* - * Determine the per node value of a numa stat item. To avoid deviation, - * the per cpu stat number in vm_numa_stat_diff[] is also included. - */ -unsigned long sum_zone_numa_state(int node, +/* Determine the per node value of a numa stat item. */ +unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; - int i; unsigned long count = 0; + int i; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state_snapshot(zones + i, item); + count += zone_numa_event_state(zones + i, item); return count; } @@ -988,8 +1021,8 @@ unsigned long sum_zone_numa_state(int node, /* * Determine the per node value of a stat item. */ -unsigned long node_page_state(struct pglist_data *pgdat, - enum node_stat_item item) +unsigned long node_page_state_pages(struct pglist_data *pgdat, + enum node_stat_item item) { long x = atomic_long_read(&pgdat->vm_stat[item]); #ifdef CONFIG_SMP @@ -998,8 +1031,34 @@ unsigned long node_page_state(struct pglist_data *pgdat, #endif return x; } + +unsigned long node_page_state(struct pglist_data *pgdat, + enum node_stat_item item) +{ + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + return node_page_state_pages(pgdat, item); +} #endif +/* + * Count number of pages "struct page" and "struct page_ext" consume. + * nr_memmap_boot_pages: # of pages allocated by boot allocator + * nr_memmap_pages: # of pages that were allocated by buddy allocator + */ +static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0); +static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0); + +void memmap_boot_pages_add(long delta) +{ + atomic_long_add(delta, &nr_memmap_boot_pages); +} + +void memmap_pages_add(long delta) +{ + atomic_long_add(delta, &nr_memmap_pages); +} + #ifdef CONFIG_COMPACTION struct contig_page_info { @@ -1026,11 +1085,16 @@ static void fill_contig_page_info(struct zone *zone, info->free_blocks_total = 0; info->free_blocks_suitable = 0; - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { unsigned long blocks; - /* Count number of free blocks */ - blocks = zone->free_area[order].nr_free; + /* + * Count number of free blocks. + * + * Access to nr_free is lockless as nr_free is used only for + * diagnostic purposes. Use data_race to avoid KCSAN warning. + */ + blocks = data_race(zone->free_area[order].nr_free); info->free_blocks_total += blocks; /* Count free base pages */ @@ -1054,7 +1118,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (WARN_ON_ONCE(order > MAX_PAGE_ORDER)) return 0; if (!info->free_blocks_total) @@ -1073,6 +1137,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } +/* + * Calculates external fragmentation within a zone wrt the given order. + * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. + */ +unsigned int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + /* Same as __fragmentation index but allocs contig_page_info on stack */ int fragmentation_index(struct zone *zone, unsigned int order) { @@ -1083,219 +1165,346 @@ int fragmentation_index(struct zone *zone, unsigned int order) } #endif -#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) +#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \ + defined(CONFIG_NUMA) || defined(CONFIG_MEMCG) #ifdef CONFIG_ZONE_DMA -#define TEXT_FOR_DMA(xx) xx "_dma", +#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma", #else -#define TEXT_FOR_DMA(xx) +#define TEXT_FOR_DMA(xx, yy) #endif #ifdef CONFIG_ZONE_DMA32 -#define TEXT_FOR_DMA32(xx) xx "_dma32", +#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32", #else -#define TEXT_FOR_DMA32(xx) +#define TEXT_FOR_DMA32(xx, yy) #endif #ifdef CONFIG_HIGHMEM -#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high", #else -#define TEXT_FOR_HIGHMEM(xx) +#define TEXT_FOR_HIGHMEM(xx, yy) #endif -#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", +#ifdef CONFIG_ZONE_DEVICE +#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device", +#else +#define TEXT_FOR_DEVICE(xx, yy) +#endif + +#define TEXTS_FOR_ZONES(xx, yy) \ + TEXT_FOR_DMA(xx, yy) \ + TEXT_FOR_DMA32(xx, yy) \ + [xx##_NORMAL] = yy "_normal", \ + TEXT_FOR_HIGHMEM(xx, yy) \ + [xx##_MOVABLE] = yy "_movable", \ + TEXT_FOR_DEVICE(xx, yy) const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ - "nr_free_pages", - "nr_zone_inactive_anon", - "nr_zone_active_anon", - "nr_zone_inactive_file", - "nr_zone_active_file", - "nr_zone_unevictable", - "nr_zone_write_pending", - "nr_mlock", - "nr_page_table_pages", - "nr_kernel_stack", - "nr_bounce", + /* enum zone_stat_item counters */ +#define I(x) (x) + [I(NR_FREE_PAGES)] = "nr_free_pages", + [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks", + [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon", + [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon", + [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file", + [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file", + [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable", + [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending", + [I(NR_MLOCK)] = "nr_mlock", #if IS_ENABLED(CONFIG_ZSMALLOC) - "nr_zspages", + [I(NR_ZSPAGES)] = "nr_zspages", +#endif + [I(NR_FREE_CMA_PAGES)] = "nr_free_cma", +#ifdef CONFIG_UNACCEPTED_MEMORY + [I(NR_UNACCEPTED)] = "nr_unaccepted", #endif - "nr_free_cma", +#undef I /* enum numa_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + x) #ifdef CONFIG_NUMA - "numa_hit", - "numa_miss", - "numa_foreign", - "numa_interleave", - "numa_local", - "numa_other", + [I(NUMA_HIT)] = "numa_hit", + [I(NUMA_MISS)] = "numa_miss", + [I(NUMA_FOREIGN)] = "numa_foreign", + [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave", + [I(NUMA_LOCAL)] = "numa_local", + [I(NUMA_OTHER)] = "numa_other", #endif - - /* Node-based counters */ - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", - "nr_unevictable", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", - "nr_isolated_anon", - "nr_isolated_file", - "workingset_nodes", - "workingset_refault", - "workingset_activate", - "workingset_restore", - "workingset_nodereclaim", - "nr_anon_pages", - "nr_mapped", - "nr_file_pages", - "nr_dirty", - "nr_writeback", - "nr_writeback_temp", - "nr_shmem", - "nr_shmem_hugepages", - "nr_shmem_pmdmapped", - "nr_anon_transparent_hugepages", - "nr_unstable", - "nr_vmscan_write", - "nr_vmscan_immediate_reclaim", - "nr_dirtied", - "nr_written", - "nr_kernel_misc_reclaimable", - - /* enum writeback_stat_item counters */ - "nr_dirty_threshold", - "nr_dirty_background_threshold", - -#ifdef CONFIG_VM_EVENT_COUNTERS +#undef I + + /* enum node_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x) + [I(NR_INACTIVE_ANON)] = "nr_inactive_anon", + [I(NR_ACTIVE_ANON)] = "nr_active_anon", + [I(NR_INACTIVE_FILE)] = "nr_inactive_file", + [I(NR_ACTIVE_FILE)] = "nr_active_file", + [I(NR_UNEVICTABLE)] = "nr_unevictable", + [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable", + [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable", + [I(NR_ISOLATED_ANON)] = "nr_isolated_anon", + [I(NR_ISOLATED_FILE)] = "nr_isolated_file", + [I(WORKINGSET_NODES)] = "workingset_nodes", + [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon", + [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file", + [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon", + [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file", + [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon", + [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file", + [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim", + [I(NR_ANON_MAPPED)] = "nr_anon_pages", + [I(NR_FILE_MAPPED)] = "nr_mapped", + [I(NR_FILE_PAGES)] = "nr_file_pages", + [I(NR_FILE_DIRTY)] = "nr_dirty", + [I(NR_WRITEBACK)] = "nr_writeback", + [I(NR_SHMEM)] = "nr_shmem", + [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages", + [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped", + [I(NR_FILE_THPS)] = "nr_file_hugepages", + [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped", + [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages", + [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write", + [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim", + [I(NR_DIRTIED)] = "nr_dirtied", + [I(NR_WRITTEN)] = "nr_written", + [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written", + [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable", + [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired", + [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released", + [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack", +#endif + [I(NR_PAGETABLE)] = "nr_page_table_pages", + [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages", +#ifdef CONFIG_IOMMU_SUPPORT + [I(NR_IOMMU_PAGES)] = "nr_iommu_pages", +#endif +#ifdef CONFIG_SWAP + [I(NR_SWAPCACHE)] = "nr_swapcached", +#endif +#ifdef CONFIG_NUMA_BALANCING + [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", + [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", + [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl", +#endif + [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", + [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", + [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged", + [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive", +#ifdef CONFIG_HUGETLB_PAGE + [I(NR_HUGETLB)] = "nr_hugetlb", +#endif + [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", + [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", +#undef I + + /* system-wide enum vm_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + x) + [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold", + [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold", + [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages", + [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages", +#undef I + +#if defined(CONFIG_VM_EVENT_COUNTERS) /* enum vm_event_item counters */ - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - - TEXTS_FOR_ZONES("pgalloc") - TEXTS_FOR_ZONES("allocstall") - TEXTS_FOR_ZONES("pgskip") - - "pgfree", - "pgactivate", - "pgdeactivate", - "pglazyfree", - - "pgfault", - "pgmajfault", - "pglazyfreed", - - "pgrefill", - "pgsteal_kswapd", - "pgsteal_direct", - "pgscan_kswapd", - "pgscan_direct", - "pgscan_direct_throttle", +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x) + + [I(PGPGIN)] = "pgpgin", + [I(PGPGOUT)] = "pgpgout", + [I(PSWPIN)] = "pswpin", + [I(PSWPOUT)] = "pswpout", + +#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS) + TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc") + TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall") + TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip") +#undef OFF + + [I(PGFREE)] = "pgfree", + [I(PGACTIVATE)] = "pgactivate", + [I(PGDEACTIVATE)] = "pgdeactivate", + [I(PGLAZYFREE)] = "pglazyfree", + + [I(PGFAULT)] = "pgfault", + [I(PGMAJFAULT)] = "pgmajfault", + [I(PGLAZYFREED)] = "pglazyfreed", + + [I(PGREFILL)] = "pgrefill", + [I(PGREUSE)] = "pgreuse", + [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", + [I(PGSTEAL_DIRECT)] = "pgsteal_direct", + [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", + [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", + [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", + [I(PGSCAN_DIRECT)] = "pgscan_direct", + [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", + [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", + [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle", + [I(PGSCAN_ANON)] = "pgscan_anon", + [I(PGSCAN_FILE)] = "pgscan_file", + [I(PGSTEAL_ANON)] = "pgsteal_anon", + [I(PGSTEAL_FILE)] = "pgsteal_file", #ifdef CONFIG_NUMA - "zone_reclaim_failed", + [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success", + [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed", #endif - "pginodesteal", - "slabs_scanned", - "kswapd_inodesteal", - "kswapd_low_wmark_hit_quickly", - "kswapd_high_wmark_hit_quickly", - "pageoutrun", + [I(PGINODESTEAL)] = "pginodesteal", + [I(SLABS_SCANNED)] = "slabs_scanned", + [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal", + [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly", + [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly", + [I(PAGEOUTRUN)] = "pageoutrun", - "pgrotated", + [I(PGROTATED)] = "pgrotated", - "drop_pagecache", - "drop_slab", - "oom_kill", + [I(DROP_PAGECACHE)] = "drop_pagecache", + [I(DROP_SLAB)] = "drop_slab", + [I(OOM_KILL)] = "oom_kill", #ifdef CONFIG_NUMA_BALANCING - "numa_pte_updates", - "numa_huge_pte_updates", - "numa_hint_faults", - "numa_hint_faults_local", - "numa_pages_migrated", + [I(NUMA_PTE_UPDATES)] = "numa_pte_updates", + [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates", + [I(NUMA_HINT_FAULTS)] = "numa_hint_faults", + [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local", + [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated", #endif #ifdef CONFIG_MIGRATION - "pgmigrate_success", - "pgmigrate_fail", + [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success", + [I(PGMIGRATE_FAIL)] = "pgmigrate_fail", + [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success", + [I(THP_MIGRATION_FAIL)] = "thp_migration_fail", + [I(THP_MIGRATION_SPLIT)] = "thp_migration_split", #endif #ifdef CONFIG_COMPACTION - "compact_migrate_scanned", - "compact_free_scanned", - "compact_isolated", - "compact_stall", - "compact_fail", - "compact_success", - "compact_daemon_wake", - "compact_daemon_migrate_scanned", - "compact_daemon_free_scanned", + [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned", + [I(COMPACTFREE_SCANNED)] = "compact_free_scanned", + [I(COMPACTISOLATED)] = "compact_isolated", + [I(COMPACTSTALL)] = "compact_stall", + [I(COMPACTFAIL)] = "compact_fail", + [I(COMPACTSUCCESS)] = "compact_success", + [I(KCOMPACTD_WAKE)] = "compact_daemon_wake", + [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned", + [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE - "htlb_buddy_alloc_success", - "htlb_buddy_alloc_fail", + [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success", + [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail", +#endif +#ifdef CONFIG_CMA + [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success", + [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail", #endif - "unevictable_pgs_culled", - "unevictable_pgs_scanned", - "unevictable_pgs_rescued", - "unevictable_pgs_mlocked", - "unevictable_pgs_munlocked", - "unevictable_pgs_cleared", - "unevictable_pgs_stranded", + [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled", + [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned", + [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued", + [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked", + [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked", + [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared", + [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded", #ifdef CONFIG_TRANSPARENT_HUGEPAGE - "thp_fault_alloc", - "thp_fault_fallback", - "thp_collapse_alloc", - "thp_collapse_alloc_failed", - "thp_file_alloc", - "thp_file_mapped", - "thp_split_page", - "thp_split_page_failed", - "thp_deferred_split_page", - "thp_split_pmd", + [I(THP_FAULT_ALLOC)] = "thp_fault_alloc", + [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback", + [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge", + [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc", + [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed", + [I(THP_FILE_ALLOC)] = "thp_file_alloc", + [I(THP_FILE_FALLBACK)] = "thp_file_fallback", + [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge", + [I(THP_FILE_MAPPED)] = "thp_file_mapped", + [I(THP_SPLIT_PAGE)] = "thp_split_page", + [I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed", + [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page", + [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page", + [I(THP_SPLIT_PMD)] = "thp_split_pmd", + [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte", + [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte", + [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - "thp_split_pud", + [I(THP_SPLIT_PUD)] = "thp_split_pud", #endif - "thp_zero_page_alloc", - "thp_zero_page_alloc_failed", - "thp_swpout", - "thp_swpout_fallback", + [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc", + [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed", + [I(THP_SWPOUT)] = "thp_swpout", + [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON - "balloon_inflate", - "balloon_deflate", + [I(BALLOON_INFLATE)] = "balloon_inflate", + [I(BALLOON_DEFLATE)] = "balloon_deflate", #ifdef CONFIG_BALLOON_COMPACTION - "balloon_migrate", + [I(BALLOON_MIGRATE)] = "balloon_migrate", #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP - "nr_tlb_remote_flush", - "nr_tlb_remote_flush_received", -#else - "", /* nr_tlb_remote_flush */ - "", /* nr_tlb_remote_flush_received */ -#endif /* CONFIG_SMP */ - "nr_tlb_local_flush_all", - "nr_tlb_local_flush_one", + [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", + [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received", + [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all", + [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - "vmacache_find_calls", - "vmacache_find_hits", -#endif #ifdef CONFIG_SWAP - "swap_ra", - "swap_ra_hit", + [I(SWAP_RA)] = "swap_ra", + [I(SWAP_RA_HIT)] = "swap_ra_hit", + [I(SWPIN_ZERO)] = "swpin_zero", + [I(SWPOUT_ZERO)] = "swpout_zero", +#ifdef CONFIG_KSM + [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy", +#endif #endif -#endif /* CONFIG_VM_EVENTS_COUNTERS */ +#ifdef CONFIG_KSM + [I(COW_KSM)] = "cow_ksm", +#endif +#ifdef CONFIG_ZSWAP + [I(ZSWPIN)] = "zswpin", + [I(ZSWPOUT)] = "zswpout", + [I(ZSWPWB)] = "zswpwb", +#endif +#ifdef CONFIG_X86 + [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits", + [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits", + [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses", + [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses", +#endif +#ifdef CONFIG_PER_VMA_LOCK_STATS + [I(VMA_LOCK_SUCCESS)] = "vma_lock_success", + [I(VMA_LOCK_ABORT)] = "vma_lock_abort", + [I(VMA_LOCK_RETRY)] = "vma_lock_retry", + [I(VMA_LOCK_MISS)] = "vma_lock_miss", +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + [I(KSTACK_1K)] = "kstack_1k", +#if THREAD_SIZE > 1024 + [I(KSTACK_2K)] = "kstack_2k", +#endif +#if THREAD_SIZE > 2048 + [I(KSTACK_4K)] = "kstack_4k", +#endif +#if THREAD_SIZE > 4096 + [I(KSTACK_8K)] = "kstack_8k", +#endif +#if THREAD_SIZE > 8192 + [I(KSTACK_16K)] = "kstack_16k", +#endif +#if THREAD_SIZE > 16384 + [I(KSTACK_32K)] = "kstack_32k", +#endif +#if THREAD_SIZE > 32768 + [I(KSTACK_64K)] = "kstack_64k", +#endif +#if THREAD_SIZE > 65536 + [I(KSTACK_REST)] = "kstack_rest", +#endif +#endif +#undef I +#endif /* CONFIG_VM_EVENT_COUNTERS */ }; -#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ +#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_PROC_FS) @@ -1356,8 +1565,12 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, int order; seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + for (order = 0; order < NR_PAGE_ORDERS; ++order) + /* + * Access to nr_free is lockless as nr_free is used only for + * printing purposes. Use data_race to avoid KCSAN warning. + */ + seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free)); seq_putc(m, '\n'); } @@ -1381,36 +1594,51 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, pgdat->node_id, zone->name, migratetype_names[mtype]); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { unsigned long freecount = 0; struct free_area *area; struct list_head *curr; + bool overflow = false; area = &(zone->free_area[order]); - list_for_each(curr, &area->free_list[mtype]) - freecount++; - seq_printf(m, "%6lu ", freecount); + list_for_each(curr, &area->free_list[mtype]) { + /* + * Cap the free_list iteration because it might + * be really large and we are under a spinlock + * so a long time spent here could trigger a + * hard lockup detector. Anyway this is a + * debugging tool so knowing there is a handful + * of pages of this order should be more than + * sufficient. + */ + if (++freecount >= 100000) { + overflow = true; + break; + } + } + seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); + spin_unlock_irq(&zone->lock); + cond_resched(); + spin_lock_irq(&zone->lock); } seq_putc(m, '\n'); } } /* Print out the free pages at each order for each migatetype */ -static int pagetypeinfo_showfree(struct seq_file *m, void *arg) +static void pagetypeinfo_showfree(struct seq_file *m, void *arg) { int order; pg_data_t *pgdat = (pg_data_t *)arg; /* Print header */ seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); - for (order = 0; order < MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); - - return 0; } static void pagetypeinfo_showblockcount_print(struct seq_file *m, @@ -1429,10 +1657,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!page) continue; - /* Watch for unexpected holes punched in the memmap */ - if (!memmap_valid_within(pfn, page, zone)) - continue; - if (page_zone(page) != zone) continue; @@ -1450,7 +1674,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, } /* Print out the number of pageblocks for each migratetype */ -static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1461,8 +1685,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showblockcount_print); - - return 0; } /* @@ -1549,27 +1771,35 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, if (is_zone_first_populated(pgdat, zone)) { seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS], - node_page_state(pgdat, i)); + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; + seq_printf(m, "\n %-12s %lu", node_stat_name(i), + pages); } } seq_printf(m, "\n pages free %lu" + "\n boost %lu" "\n min %lu" "\n low %lu" "\n high %lu" + "\n promo %lu" "\n spanned %lu" "\n present %lu" - "\n managed %lu", + "\n managed %lu" + "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), + zone->watermark_boost, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), + promo_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone_managed_pages(zone)); + zone_managed_pages(zone), + zone_cma_pages(zone)); seq_printf(m, "\n protection: (%ld", @@ -1585,40 +1815,50 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); + seq_printf(m, "\n %-12s %lu", zone_stat_name(i), + zone_page_state(zone, i)); #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - zone_numa_state_snapshot(zone, i)); + fold_vm_zone_numa_events(zone); + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", numa_stat_name(i), + zone_numa_event_state(zone, i)); #endif seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { - struct per_cpu_pageset *pageset; + struct per_cpu_pages *pcp; + struct per_cpu_zonestat __maybe_unused *pzstats; - pageset = per_cpu_ptr(zone->pageset, i); + pcp = per_cpu_ptr(zone->per_cpu_pageset, i); seq_printf(m, "\n cpu: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", + "\n count: %i" + "\n high: %i" + "\n batch: %i" + "\n high_min: %i" + "\n high_max: %i", i, - pageset->pcp.count, - pageset->pcp.high, - pageset->pcp.batch); + pcp->count, + pcp->high, + pcp->batch, + pcp->high_min, + pcp->high_max); #ifdef CONFIG_SMP + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i); seq_printf(m, "\n vm stats threshold: %d", - pageset->stat_threshold); + pzstats->stat_threshold); #endif } seq_printf(m, "\n node_unreclaimable: %u" - "\n start_pfn: %lu", - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, - zone->zone_start_pfn); + "\n start_pfn: %lu" + "\n reserved_highatomic: %lu" + "\n free_highatomic: %lu", + atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, + zone->zone_start_pfn, + zone->nr_reserved_highatomic, + zone->nr_free_highatomic); seq_putc(m, '\n'); } @@ -1643,31 +1883,24 @@ static const struct seq_operations zoneinfo_op = { .show = zoneinfo_show, }; -enum writeback_stat_item { - NR_DIRTY_THRESHOLD, - NR_DIRTY_BG_THRESHOLD, - NR_VM_WRITEBACK_STAT_ITEMS, -}; +#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + \ + NR_VM_STAT_ITEMS + \ + (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ + NR_VM_EVENT_ITEMS : 0)) static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; - int i, stat_items_size; + int i; - if (*pos >= ARRAY_SIZE(vmstat_text)) + if (*pos >= NR_VMSTAT_ITEMS) return NULL; - stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + - NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + - NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + - NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); - -#ifdef CONFIG_VM_EVENT_COUNTERS - stat_items_size += sizeof(struct vm_event_state); -#endif - BUILD_BUG_ON(stat_items_size != - ARRAY_SIZE(vmstat_text) * sizeof(unsigned long)); - v = kmalloc(stat_items_size, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS); + fold_vm_numa_events(); + v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); @@ -1676,18 +1909,23 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v += NR_VM_ZONE_STAT_ITEMS; #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - v[i] = global_numa_state(i); - v += NR_VM_NUMA_STAT_ITEMS; + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) + v[i] = global_numa_event_state(i); + v += NR_VM_NUMA_EVENT_ITEMS; #endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - v[i] = global_node_page_state(i); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + v[i] = global_node_page_state_pages(i); + if (vmstat_item_print_in_thp(i)) + v[i] /= HPAGE_PMD_NR; + } v += NR_VM_NODE_STAT_ITEMS; global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); - v += NR_VM_WRITEBACK_STAT_ITEMS; + v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages); + v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages); + v += NR_VM_STAT_ITEMS; #ifdef CONFIG_VM_EVENT_COUNTERS all_vm_events(v); @@ -1700,7 +1938,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) { (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) + if (*pos >= NR_VMSTAT_ITEMS) return NULL; return (unsigned long *)m->private + *pos; } @@ -1713,6 +1951,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } @@ -1732,7 +1978,8 @@ static const struct seq_operations vmstat_op = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); -int sysctl_stat_interval __read_mostly = HZ; +static int sysctl_stat_interval __read_mostly = HZ; +static int vmstat_late_init_done; #ifdef CONFIG_PROC_FS static void refresh_vm_stats(struct work_struct *work) @@ -1740,8 +1987,8 @@ static void refresh_vm_stats(struct work_struct *work) refresh_cpu_vm_stats(true); } -int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int vmstat_refresh(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; @@ -1763,25 +2010,34 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_ZONE_WRITE_PENDING: + case NR_FREE_CMA_PAGES: + continue; + } val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; + __func__, zone_stat_name(i), val); } } -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_numa_stat[i]); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_WRITEBACK: + continue; + } + val = atomic_long_read(&vm_node_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); - err = -EINVAL; + __func__, node_stat_name(i), val); } } -#endif - if (err) - return err; if (write) *ppos += *lenp; else @@ -1805,37 +2061,30 @@ static void vmstat_update(struct work_struct *w) } /* - * Switch off vmstat processing and then fold all the remaining differentials - * until the diffs stay at zero. The function is used by NOHZ and can only be - * invoked when tick processing is not active. - */ -/* * Check if the diffs for a certain cpu indicate that * an update is needed. */ static bool need_update(int cpu) { + pg_data_t *last_pgdat = NULL; struct zone *zone; for_each_populated_zone(zone) { - struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); - - BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); -#ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); -#endif + struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + struct per_cpu_nodestat *n; /* * The fast way of checking if there are any vmstat diffs. */ - if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * - sizeof(p->vm_stat_diff[0]))) + if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff))) return true; -#ifdef CONFIG_NUMA - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * - sizeof(p->vm_numa_stat_diff[0]))) + + if (last_pgdat == zone->zone_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); + if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff))) return true; -#endif } return false; } @@ -1879,15 +2128,31 @@ static void vmstat_shepherd(struct work_struct *w) { int cpu; - get_online_cpus(); + cpus_read_lock(); /* Check processors whose vmstat worker threads have been disabled */ for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + /* + * In kernel users of vmstat counters either require the precise value and + * they are using zone_page_state_snapshot interface or they can live with + * an imprecision as the regular flushing can happen at arbitrary time and + * cumulative error can grow (see calculate_normal_threshold). + * + * From that POV the regular flushing can be postponed for CPUs that have + * been isolated from the kernel interference without critical + * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd + * for all isolated CPUs to avoid interference with the isolated workload. + */ + if (cpu_is_isolated(cpu)) + continue; + if (!delayed_work_pending(dw) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); + + cond_resched(); } - put_online_cpus(); + cpus_read_unlock(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); @@ -1897,10 +2162,20 @@ static void __init start_shepherd_timer(void) { int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); + /* + * For secondary CPUs during CPU hotplug scenarios, + * vmstat_cpu_online() will enable the work. + * mm/vmstat:online enables and disables vmstat_work + * symmetrically during CPU hotplug events. + */ + if (!cpu_online(cpu)) + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + } + schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -1910,21 +2185,27 @@ static void __init init_cpu_node_state(void) int node; for_each_online_node(node) { - if (cpumask_weight(cpumask_of_node(node)) > 0) + if (!cpumask_empty(cpumask_of_node(node))) node_set_state(node, N_CPU); } } static int vmstat_cpu_online(unsigned int cpu) { - refresh_zone_stat_thresholds(); - node_set_state(cpu_to_node(cpu), N_CPU); + if (vmstat_late_init_done) + refresh_zone_stat_thresholds(); + + if (!node_state(cpu_to_node(cpu), N_CPU)) { + node_set_state(cpu_to_node(cpu), N_CPU); + } + enable_delayed_work(&per_cpu(vmstat_work, cpu)); + return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } @@ -1937,13 +2218,54 @@ static int vmstat_cpu_dead(unsigned int cpu) refresh_zone_stat_thresholds(); node_cpus = cpumask_of_node(node); - if (cpumask_weight(node_cpus) > 0) + if (!cpumask_empty(node_cpus)) return 0; node_clear_state(node, N_CPU); + + return 0; +} + +static int __init vmstat_late_init(void) +{ + refresh_zone_stat_thresholds(); + vmstat_late_init_done = 1; + return 0; } +late_initcall(vmstat_late_init); +#endif +#ifdef CONFIG_PROC_FS +static const struct ctl_table vmstat_table[] = { +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +}; #endif struct workqueue_struct *mm_percpu_wq; @@ -1966,17 +2288,18 @@ void __init init_mm_internals(void) if (ret < 0) pr_err("vmstat: failed to register 'online' hotplug state\n"); - get_online_cpus(); + cpus_read_lock(); init_cpu_node_state(); - put_online_cpus(); + cpus_read_unlock(); start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); - proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op); + proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); proc_create_seq("vmstat", 0444, NULL, &vmstat_op); proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); + register_sysctl_init("vm", vmstat_table); #endif } @@ -2014,7 +2337,7 @@ static void unusable_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = unusable_free_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); @@ -2045,24 +2368,14 @@ static int unusable_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations unusable_op = { +static const struct seq_operations unusable_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = unusable_show, }; -static int unusable_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &unusable_op); -} - -static const struct file_operations unusable_file_ops = { - .open = unusable_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(unusable); static void extfrag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -2076,10 +2389,10 @@ static void extfrag_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); - seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); } seq_putc(m, '\n'); @@ -2097,46 +2410,30 @@ static int extfrag_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations extfrag_op = { +static const struct seq_operations extfrag_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = extfrag_show, }; -static int extfrag_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &extfrag_op); -} - -static const struct file_operations extfrag_file_ops = { - .open = extfrag_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(extfrag); static int __init extfrag_debug_init(void) { struct dentry *extfrag_debug_root; extfrag_debug_root = debugfs_create_dir("extfrag", NULL); - if (!extfrag_debug_root) - return -ENOMEM; - if (!debugfs_create_file("unusable_index", 0444, - extfrag_debug_root, NULL, &unusable_file_ops)) - goto fail; + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, + &unusable_fops); - if (!debugfs_create_file("extfrag_index", 0444, - extfrag_debug_root, NULL, &extfrag_file_ops)) - goto fail; + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, + &extfrag_fops); return 0; -fail: - debugfs_remove_recursive(extfrag_debug_root); - return -ENOMEM; } module_init(extfrag_debug_init); + #endif |
