Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1762798AbdLSM2z (ORCPT ); Tue, 19 Dec 2017 07:28:55 -0500 Received: from mx2.suse.de ([195.135.220.15]:33921 "EHLO mx2.suse.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751281AbdLSM2x (ORCPT ); Tue, 19 Dec 2017 07:28:53 -0500 Date: Tue, 19 Dec 2017 13:28:48 +0100 From: Michal Hocko To: Kemi Wang Cc: Greg Kroah-Hartman , Andrew Morton , Vlastimil Babka , Mel Gorman , Johannes Weiner , Christopher Lameter , YASUAKI ISHIMATSU , Andrey Ryabinin , Nikolay Borisov , Pavel Tatashin , David Rientjes , Sebastian Andrzej Siewior , Dave , Andi Kleen , Tim Chen , Jesper Dangaard Brouer , Ying Huang , Aaron Lu , Aubrey Li , Linux MM , Linux Kernel Subject: Re: [PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node Message-ID: <20171219122848.GM2787@dhcp22.suse.cz> References: <1513665566-4465-1-git-send-email-kemi.wang@intel.com> <1513665566-4465-2-git-send-email-kemi.wang@intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1513665566-4465-2-git-send-email-kemi.wang@intel.com> User-Agent: Mutt/1.9.1 (2017-09-22) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 18438 Lines: 575 On Tue 19-12-17 14:39:22, Kemi Wang wrote: > There is not really any use to get NUMA stats separated by zone, and > current per-zone NUMA stats is only consumed in /proc/zoneinfo. For code > cleanup purpose, we move NUMA stats from per-zone to per-node and reuse the > existed per-cpu infrastructure. Let's hope that nobody really depends on the per-zone numbers. It would be really strange as those counters are inherently per-node and that is what users should care about but who knows... Anyway, I hoped we could get rid of NR_VM_NUMA_STAT_ITEMS but your patch keeps it and follow up patches even use it further. 
I will comment on those separately, but this still makes these few counters really special, which I think is wrong. > Suggested-by: Andi Kleen > Suggested-by: Michal Hocko > Signed-off-by: Kemi Wang I have to fully grasp the rest of the series before I'll give my Ack, but I _really_ like the simplification this brings to the code. I believe it can be even simpler. > --- > drivers/base/node.c | 23 +++---- > include/linux/mmzone.h | 27 ++++---- > include/linux/vmstat.h | 31 --------- > mm/mempolicy.c | 2 +- > mm/page_alloc.c | 16 +++-- > mm/vmstat.c | 177 +++++-------------------------------------------- > 6 files changed, 46 insertions(+), 230 deletions(-) > > diff --git a/drivers/base/node.c b/drivers/base/node.c > index ee090ab..a045ea1 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev, > "interleave_hit %lu\n" > "local_node %lu\n" > "other_node %lu\n", > - sum_zone_numa_state(dev->id, NUMA_HIT), > - sum_zone_numa_state(dev->id, NUMA_MISS), > - sum_zone_numa_state(dev->id, NUMA_FOREIGN), > - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), > - sum_zone_numa_state(dev->id, NUMA_LOCAL), > - sum_zone_numa_state(dev->id, NUMA_OTHER)); > + node_page_state(NODE_DATA(dev->id), NUMA_HIT), > + node_page_state(NODE_DATA(dev->id), NUMA_MISS), > + node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN), > + node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT), > + node_page_state(NODE_DATA(dev->id), NUMA_LOCAL), > + node_page_state(NODE_DATA(dev->id), NUMA_OTHER)); > } > + > static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); > > static ssize_t node_read_vmstat(struct device *dev, > @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev, > n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], > sum_zone_node_page_state(nid, i)); > > -#ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - n += sprintf(buf+n, "%s %lu\n", > - vmstat_text[i + 
NR_VM_ZONE_STAT_ITEMS], > - sum_zone_numa_state(nid, i)); > -#endif > - > for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > n += sprintf(buf+n, "%s %lu\n", > - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + > - NR_VM_NUMA_STAT_ITEMS], > + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], > node_page_state(pgdat, i)); > > return n; > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 67f2e3c..c06d880 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -115,20 +115,6 @@ struct zone_padding { > #define ZONE_PADDING(name) > #endif > > -#ifdef CONFIG_NUMA > -enum numa_stat_item { > - NUMA_HIT, /* allocated in intended node */ > - NUMA_MISS, /* allocated in non intended node */ > - NUMA_FOREIGN, /* was intended here, hit elsewhere */ > - NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ > - NUMA_LOCAL, /* allocation from local node */ > - NUMA_OTHER, /* allocation from other node */ > - NR_VM_NUMA_STAT_ITEMS > -}; > -#else > -#define NR_VM_NUMA_STAT_ITEMS 0 > -#endif > - > enum zone_stat_item { > /* First 128 byte cacheline (assuming 64 bit words) */ > NR_FREE_PAGES, > @@ -151,7 +137,18 @@ enum zone_stat_item { > NR_VM_ZONE_STAT_ITEMS }; > > enum node_stat_item { > - NR_LRU_BASE, > +#ifdef CONFIG_NUMA > + NUMA_HIT, /* allocated in intended node */ > + NUMA_MISS, /* allocated in non intended node */ > + NUMA_FOREIGN, /* was intended here, hit elsewhere */ > + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ > + NUMA_LOCAL, /* allocation from local node */ > + NUMA_OTHER, /* allocation from other node */ > + NR_VM_NUMA_STAT_ITEMS, > +#else > +#define NR_VM_NUMA_STAT_ITEMS 0 > +#endif > + NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS, > NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ > NR_ACTIVE_ANON, /* " " " " " */ > NR_INACTIVE_FILE, /* " " " " " */ > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > index 1779c98..80bf290 100644 > --- a/include/linux/vmstat.h > +++ b/include/linux/vmstat.h > @@ -118,37 
+118,8 @@ static inline void vm_events_fold_cpu(int cpu) > * Zone and node-based page accounting with per cpu differentials. > */ > extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; > -extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; > extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; > > -#ifdef CONFIG_NUMA > -static inline void zone_numa_state_add(long x, struct zone *zone, > - enum numa_stat_item item) > -{ > - atomic_long_add(x, &zone->vm_numa_stat[item]); > - atomic_long_add(x, &vm_numa_stat[item]); > -} > - > -static inline unsigned long global_numa_state(enum numa_stat_item item) > -{ > - long x = atomic_long_read(&vm_numa_stat[item]); > - > - return x; > -} > - > -static inline unsigned long zone_numa_state_snapshot(struct zone *zone, > - enum numa_stat_item item) > -{ > - long x = atomic_long_read(&zone->vm_numa_stat[item]); > - int cpu; > - > - for_each_online_cpu(cpu) > - x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; > - > - return x; > -} > -#endif /* CONFIG_NUMA */ > - > static inline void zone_page_state_add(long x, struct zone *zone, > enum zone_stat_item item) > { > @@ -234,10 +205,8 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, > > > #ifdef CONFIG_NUMA > -extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); > extern unsigned long sum_zone_node_page_state(int node, > enum zone_stat_item item); > -extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); > extern unsigned long node_page_state(struct pglist_data *pgdat, > enum node_stat_item item); > #else > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 4ce44d3..b2293e3 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -1920,7 +1920,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, > return page; > if (page && page_to_nid(page) == nid) { > preempt_disable(); > - __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); > + inc_node_state(page_pgdat(page), 
NUMA_INTERLEAVE_HIT); > preempt_enable(); > } > return page; > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 7e5e775..81e8d8f 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -2793,22 +2793,24 @@ int __isolate_free_page(struct page *page, unsigned int order) > static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) > { > #ifdef CONFIG_NUMA > - enum numa_stat_item local_stat = NUMA_LOCAL; > + int preferred_nid = preferred_zone->node; > + int nid = z->node; > + enum node_stat_item local_stat = NUMA_LOCAL; > > /* skip numa counters update if numa stats is disabled */ > if (!static_branch_likely(&vm_numa_stat_key)) > return; > > - if (z->node != numa_node_id()) > + if (nid != numa_node_id()) > local_stat = NUMA_OTHER; > > - if (z->node == preferred_zone->node) > - __inc_numa_state(z, NUMA_HIT); > + if (nid == preferred_nid) > + inc_node_state(NODE_DATA(nid), NUMA_HIT); > else { > - __inc_numa_state(z, NUMA_MISS); > - __inc_numa_state(preferred_zone, NUMA_FOREIGN); > + inc_node_state(NODE_DATA(nid), NUMA_MISS); > + inc_node_state(NODE_DATA(preferred_nid), NUMA_FOREIGN); > } > - __inc_numa_state(z, local_stat); > + inc_node_state(NODE_DATA(nid), local_stat); > #endif > } > > diff --git a/mm/vmstat.c b/mm/vmstat.c > index 40b2db6..1dd12ae 100644 > --- a/mm/vmstat.c > +++ b/mm/vmstat.c > @@ -30,46 +30,44 @@ > > #include "internal.h" > > -#define NUMA_STATS_THRESHOLD (U16_MAX - 2) > - > #ifdef CONFIG_NUMA > int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; > > -/* zero numa counters within a zone */ > -static void zero_zone_numa_counters(struct zone *zone) > +/* zero numa stats within a node */ > +static void zero_node_numa_stats(int node) > { > int item, cpu; > > for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { > - atomic_long_set(&zone->vm_numa_stat[item], 0); > + atomic_long_set(&(NODE_DATA(node)->vm_stat[item]), 0); > for_each_online_cpu(cpu) > - per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] > - = 0; > + 
per_cpu_ptr(NODE_DATA(node)->per_cpu_nodestats, > + cpu)->vm_node_stat_diff[item] = 0; > } > } > > -/* zero numa counters of all the populated zones */ > -static void zero_zones_numa_counters(void) > +/* zero numa stats of all the online nodes */ > +static void zero_nodes_numa_stats(void) > { > - struct zone *zone; > + int node; > > - for_each_populated_zone(zone) > - zero_zone_numa_counters(zone); > + for_each_online_node(node) > + zero_node_numa_stats(node); > } > > -/* zero global numa counters */ > -static void zero_global_numa_counters(void) > +/* zero global numa stats */ > +static void zero_global_numa_stats(void) > { > int item; > > for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) > - atomic_long_set(&vm_numa_stat[item], 0); > + atomic_long_set(&vm_node_stat[item], 0); > } > > static void invalid_numa_statistics(void) > { > - zero_zones_numa_counters(); > - zero_global_numa_counters(); > + zero_nodes_numa_stats(); > + zero_global_numa_stats(); > } > > static DEFINE_MUTEX(vm_numa_stat_lock); > @@ -160,10 +158,8 @@ void vm_events_fold_cpu(int cpu) > * vm_stat contains the global counters > */ > atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; > -atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; > atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; > EXPORT_SYMBOL(vm_zone_stat); > -EXPORT_SYMBOL(vm_numa_stat); > EXPORT_SYMBOL(vm_node_stat); > > #ifdef CONFIG_SMP > @@ -679,32 +675,6 @@ EXPORT_SYMBOL(dec_node_page_state); > * Fold a differential into the global counters. > * Returns the number of counters updated. 
> */ > -#ifdef CONFIG_NUMA > -static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) > -{ > - int i; > - int changes = 0; > - > - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) > - if (zone_diff[i]) { > - atomic_long_add(zone_diff[i], &vm_zone_stat[i]); > - changes++; > - } > - > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - if (numa_diff[i]) { > - atomic_long_add(numa_diff[i], &vm_numa_stat[i]); > - changes++; > - } > - > - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > - if (node_diff[i]) { > - atomic_long_add(node_diff[i], &vm_node_stat[i]); > - changes++; > - } > - return changes; > -} > -#else > static int fold_diff(int *zone_diff, int *node_diff) > { > int i; > @@ -723,7 +693,6 @@ static int fold_diff(int *zone_diff, int *node_diff) > } > return changes; > } > -#endif /* CONFIG_NUMA */ > > /* > * Update the zone counters for the current cpu. > @@ -747,9 +716,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) > struct zone *zone; > int i; > int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; > -#ifdef CONFIG_NUMA > - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; > -#endif > int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; > int changes = 0; > > @@ -771,18 +737,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) > } > } > #ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { > - int v; > - > - v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); > - if (v) { > - > - atomic_long_add(v, &zone->vm_numa_stat[i]); > - global_numa_diff[i] += v; > - __this_cpu_write(p->expire, 3); > - } > - } > - > if (do_pagesets) { > cond_resched(); > /* > @@ -829,12 +783,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) > } > } > > -#ifdef CONFIG_NUMA > - changes += fold_diff(global_zone_diff, global_numa_diff, > - global_node_diff); > -#else > changes += fold_diff(global_zone_diff, global_node_diff); > -#endif > return changes; > } > > @@ -849,9 +798,6 @@ void cpu_vm_stats_fold(int cpu) > struct zone *zone; > int i; > int 
global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; > -#ifdef CONFIG_NUMA > - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; > -#endif > int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; > > for_each_populated_zone(zone) { > @@ -868,18 +814,6 @@ void cpu_vm_stats_fold(int cpu) > atomic_long_add(v, &zone->vm_stat[i]); > global_zone_diff[i] += v; > } > - > -#ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - if (p->vm_numa_stat_diff[i]) { > - int v; > - > - v = p->vm_numa_stat_diff[i]; > - p->vm_numa_stat_diff[i] = 0; > - atomic_long_add(v, &zone->vm_numa_stat[i]); > - global_numa_diff[i] += v; > - } > -#endif > } > > for_each_online_pgdat(pgdat) { > @@ -898,11 +832,7 @@ void cpu_vm_stats_fold(int cpu) > } > } > > -#ifdef CONFIG_NUMA > - fold_diff(global_zone_diff, global_numa_diff, global_node_diff); > -#else > fold_diff(global_zone_diff, global_node_diff); > -#endif > } > > /* > @@ -920,36 +850,10 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) > atomic_long_add(v, &zone->vm_stat[i]); > atomic_long_add(v, &vm_zone_stat[i]); > } > - > -#ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - if (pset->vm_numa_stat_diff[i]) { > - int v = pset->vm_numa_stat_diff[i]; > - > - pset->vm_numa_stat_diff[i] = 0; > - atomic_long_add(v, &zone->vm_numa_stat[i]); > - atomic_long_add(v, &vm_numa_stat[i]); > - } > -#endif > } > #endif > > #ifdef CONFIG_NUMA > -void __inc_numa_state(struct zone *zone, > - enum numa_stat_item item) > -{ > - struct per_cpu_pageset __percpu *pcp = zone->pageset; > - u16 __percpu *p = pcp->vm_numa_stat_diff + item; > - u16 v; > - > - v = __this_cpu_inc_return(*p); > - > - if (unlikely(v > NUMA_STATS_THRESHOLD)) { > - zone_numa_state_add(v, zone, item); > - __this_cpu_write(*p, 0); > - } > -} > - > /* > * Determine the per node value of a stat item. 
This function > * is called frequently in a NUMA machine, so try to be as > @@ -969,23 +873,6 @@ unsigned long sum_zone_node_page_state(int node, > } > > /* > - * Determine the per node value of a numa stat item. To avoid deviation, > - * the per cpu stat number in vm_numa_stat_diff[] is also included. > - */ > -unsigned long sum_zone_numa_state(int node, > - enum numa_stat_item item) > -{ > - struct zone *zones = NODE_DATA(node)->node_zones; > - int i; > - unsigned long count = 0; > - > - for (i = 0; i < MAX_NR_ZONES; i++) > - count += zone_numa_state_snapshot(zones + i, item); > - > - return count; > -} > - > -/* > * Determine the per node value of a stat item. > */ > unsigned long node_page_state(struct pglist_data *pgdat, > @@ -1569,8 +1456,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, > seq_printf(m, "\n per-node stats"); > for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { > seq_printf(m, "\n %-12s %lu", > - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + > - NR_VM_NUMA_STAT_ITEMS], > + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], > node_page_state(pgdat, i)); > } > } > @@ -1607,13 +1493,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, > seq_printf(m, "\n %-12s %lu", vmstat_text[i], > zone_page_state(zone, i)); > > -#ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - seq_printf(m, "\n %-12s %lu", > - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], > - zone_numa_state_snapshot(zone, i)); > -#endif > - > seq_printf(m, "\n pagesets"); > for_each_online_cpu(i) { > struct per_cpu_pageset *pageset; > @@ -1688,7 +1567,6 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) > if (*pos >= ARRAY_SIZE(vmstat_text)) > return NULL; > stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + > - NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + > NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + > NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); > > @@ -1704,12 +1582,6 @@ static void *vmstat_start(struct 
seq_file *m, loff_t *pos) > v[i] = global_zone_page_state(i); > v += NR_VM_ZONE_STAT_ITEMS; > > -#ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - v[i] = global_numa_state(i); > - v += NR_VM_NUMA_STAT_ITEMS; > -#endif > - > for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > v[i] = global_node_page_state(i); > v += NR_VM_NODE_STAT_ITEMS; > @@ -1811,16 +1683,6 @@ int vmstat_refresh(struct ctl_table *table, int write, > err = -EINVAL; > } > } > -#ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { > - val = atomic_long_read(&vm_numa_stat[i]); > - if (val < 0) { > - pr_warn("%s: %s %ld\n", > - __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); > - err = -EINVAL; > - } > - } > -#endif > if (err) > return err; > if (write) > @@ -1862,9 +1724,6 @@ static bool need_update(int cpu) > struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); > > BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); > -#ifdef CONFIG_NUMA > - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); > -#endif > > /* > * The fast way of checking if there are any vmstat diffs. > @@ -1872,10 +1731,6 @@ static bool need_update(int cpu) > */ > if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) > return true; > -#ifdef CONFIG_NUMA > - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) > - return true; > -#endif > } > return false; > } > -- > 2.7.4 > -- Michal Hocko SUSE Labs