Subject: Re: [PATCH] sched/numa: advanced per-cgroup numa statistic
From: 王贇 <yun.wang@linux.alibaba.com>
To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
    Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
    linux-kernel@vger.kernel.org, mkoutny@suse.com
References: <46b0fd25-7b73-aa80-372a-9fcd025154cb@linux.alibaba.com>
Date: Thu, 24 Oct 2019 11:16:06 +0800

Hi Peter, Michal,

The statistic has proved practically useful for detecting NUMA issues,
especially on boxes running complex, mixed workloads.

It is also enabled for cgroup-v2 now :-)

Regards,
Michael Wang

On 2019/10/24 11:08 AM, 王贇 wrote:
> Currently there is no good approach to monitoring per-cgroup NUMA
> efficiency. This is troublesome especially when groups share CPUs:
> it is impossible to tell which group caused the remote-memory accesses
> just by reading hardware counters, since multiple workloads may share
> the same CPU, which makes it painful to find the root cause and fix
> the issue.
>
> In order to address this, we introduce new per-cgroup statistics
> for NUMA:
>   * the NUMA locality, to reflect the NUMA balancing efficiency
>   * the NUMA execution time on each node
>
> The task locality is the local page access ratio traced on NUMA
> balancing page faults, and the group locality is the distribution of
> task execution time, sectioned by locality into 8 regions.
>
> For example, the new entry 'cpu.numa_stat' shows:
>   locality 15393 21259 13023 44461 21247 17012 28496 145402
>   exectime 311900 407166
>
> Here we know the workloads executed for 311900 ms on node_0 and
> 407166 ms on node_1; tasks with locality around 0~12% executed for
> 15393 ms, and tasks with locality around 88~100% executed for
> 145402 ms, which implies that most of the memory accesses of this
> group's workloads are local.
>
> By monitoring the new statistics, we can tell the NUMA efficiency of
> each cgroup's workloads on the machine, whether they share CPUs or
> not, and find out which one introduces most of the remote accesses.
>
> Besides, the per-node memory topology from 'memory.numa_stat' becomes
> more useful when we have the per-node execution time; a workload that
> always executes on node_0 while all of its memory is on node_1 is
> usually a bad case.
>
> Cc: Peter Zijlstra
> Cc: Michal Koutný
> Signed-off-by: Michael Wang
> ---
>  include/linux/sched.h |  8 ++++++-
>  kernel/sched/core.c   | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/debug.c  |  7 ++++++
>  kernel/sched/fair.c   | 51 ++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h  | 29 +++++++++++++++++++++++++
>  5 files changed, 153 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 263cf089d1b3..46995be622c1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1114,8 +1114,14 @@ struct task_struct {
>  	 * scan window were remote/local or failed to migrate. The task scan
>  	 * period is adapted based on the locality of the faults with different
>  	 * weights depending on whether they were shared or private faults
> +	 *
> +	 * 0 -- remote faults
> +	 * 1 -- local faults
> +	 * 2 -- page migration failure
> +	 * 3 -- remote page accessing
> +	 * 4 -- local page accessing
>  	 */
> -	unsigned long numa_faults_locality[3];
> +	unsigned long numa_faults_locality[5];
>
>  	unsigned long numa_pages_migrated;
>  #endif /* CONFIG_NUMA_BALANCING */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index eb42b71faab9..4364da279f04 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6544,6 +6544,10 @@ static struct kmem_cache *task_group_cache __read_mostly;
>  DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
>  DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
>
> +#ifdef CONFIG_NUMA_BALANCING
> +DECLARE_PER_CPU(struct numa_stat, root_numa_stat);
> +#endif
> +
>  void __init sched_init(void)
>  {
>  	unsigned long ptr = 0;
> @@ -6593,6 +6597,10 @@ void __init sched_init(void)
>  	init_defrootdomain();
>  #endif
>
> +#ifdef CONFIG_NUMA_BALANCING
> +	root_task_group.numa_stat = &root_numa_stat;
> +#endif
> +
>  #ifdef CONFIG_RT_GROUP_SCHED
>  	init_rt_bandwidth(&root_task_group.rt_bandwidth,
>  			global_rt_period(), global_rt_runtime());
> @@ -6918,6 +6926,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
>
>  static void sched_free_group(struct task_group *tg)
>  {
> +	free_tg_numa_stat(tg);
>  	free_fair_sched_group(tg);
>  	free_rt_sched_group(tg);
>  	autogroup_free(tg);
> @@ -6933,6 +6942,9 @@ struct task_group *sched_create_group(struct task_group *parent)
>  	if (!tg)
>  		return ERR_PTR(-ENOMEM);
>
> +	if (!alloc_tg_numa_stat(tg))
> +		goto err;
> +
>  	if (!alloc_fair_sched_group(tg, parent))
>  		goto err;
>
> @@ -7638,6 +7650,40 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>  }
>  #endif /* CONFIG_RT_GROUP_SCHED */
>
> +#ifdef CONFIG_NUMA_BALANCING
> +static int cpu_numa_stat_show(struct seq_file *sf, void *v)
> +{
> +	int nr;
> +	struct task_group *tg = css_tg(seq_css(sf));
> +
> +	seq_puts(sf, "locality");
> +	for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
> +		int cpu;
> +		u64 sum = 0;
> +
> +		for_each_possible_cpu(cpu)
> +			sum += per_cpu(tg->numa_stat->locality[nr], cpu);
> +
> +		seq_printf(sf, " %u", jiffies_to_msecs(sum));
> +	}
> +	seq_putc(sf, '\n');
> +
> +	seq_puts(sf, "exectime");
> +	for_each_online_node(nr) {
> +		int cpu;
> +		u64 sum = 0;
> +
> +		for_each_cpu(cpu, cpumask_of_node(nr))
> +			sum += per_cpu(tg->numa_stat->jiffies, cpu);
> +
> +		seq_printf(sf, " %u", jiffies_to_msecs(sum));
> +	}
> +	seq_putc(sf, '\n');
> +
> +	return 0;
> +}
> +#endif
> +
>  static struct cftype cpu_legacy_files[] = {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	{
> @@ -7687,6 +7733,12 @@ static struct cftype cpu_legacy_files[] = {
>  		.seq_show = cpu_uclamp_max_show,
>  		.write = cpu_uclamp_max_write,
>  	},
> +#endif
> +#ifdef CONFIG_NUMA_BALANCING
> +	{
> +		.name = "numa_stat",
> +		.seq_show = cpu_numa_stat_show,
> +	},
>  #endif
>  	{ }	/* Terminate */
>  };
> @@ -7868,6 +7920,13 @@ static struct cftype cpu_files[] = {
>  		.seq_show = cpu_uclamp_max_show,
>  		.write = cpu_uclamp_max_write,
>  	},
> +#endif
> +#ifdef CONFIG_NUMA_BALANCING
> +	{
> +		.name = "numa_stat",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.seq_show = cpu_numa_stat_show,
> +	},
>  #endif
>  	{ }	/* terminate */
>  };
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index f7e4579e746c..a22b2a62aee2 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -848,6 +848,13 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
>  	P(total_numa_faults);
>  	SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
>  			task_node(p), task_numa_group_id(p));
> +	SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ",
> +			p->numa_faults_locality[1],
> +			p->numa_faults_locality[0],
> +			p->numa_faults_locality[2]);
> +	SEQ_printf(m, "lhit=%lu rhit=%lu\n",
> +			p->numa_faults_locality[4],
> +			p->numa_faults_locality[3]);
>  	show_numa_stats(p, m);
>  	mpol_put(pol);
>  #endif
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a81c36472822..4ba3a41cdca3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2466,6 +2466,12 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>  	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
>  	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
>  	p->numa_faults_locality[local] += pages;
> +	/*
> +	 * We want to have the real local/remote page access statistic
> +	 * here, so use 'mem_node' which is the real residential node of
> +	 * page after migrate_misplaced_page().
> +	 */
> +	p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
>  }
>
>  static void reset_ptenuma_scan(struct task_struct *p)
> @@ -2672,6 +2678,49 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
>  	}
>  }
>
> +DEFINE_PER_CPU(struct numa_stat, root_numa_stat);
> +
> +int alloc_tg_numa_stat(struct task_group *tg)
> +{
> +	tg->numa_stat = alloc_percpu(struct numa_stat);
> +	if (!tg->numa_stat)
> +		return 0;
> +
> +	return 1;
> +}
> +
> +void free_tg_numa_stat(struct task_group *tg)
> +{
> +	free_percpu(tg->numa_stat);
> +}
> +
> +static void update_tg_numa_stat(struct task_struct *p)
> +{
> +	struct task_group *tg;
> +	unsigned long remote = p->numa_faults_locality[3];
> +	unsigned long local = p->numa_faults_locality[4];
> +	int idx = -1;
> +
> +	/* Tobe scaled? */
> +	if (remote || local)
> +		idx = NR_NL_INTERVAL * local / (remote + local + 1);
> +
> +	rcu_read_lock();
> +
> +	tg = task_group(p);
> +	while (tg) {
> +		/* skip account when there are no faults records */
> +		if (idx != -1)
> +			this_cpu_inc(tg->numa_stat->locality[idx]);
> +
> +		this_cpu_inc(tg->numa_stat->jiffies);
> +
> +		tg = tg->parent;
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
>  /*
>   * Drive the periodic memory faults..
>   */
> @@ -2686,6 +2735,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
>  	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
>  		return;
>
> +	update_tg_numa_stat(curr);
> +
>  	/*
>  	 * Using runtime rather than walltime has the dual advantage that
>  	 * we (mostly) drive the selection from busy threads and that the
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 0db2c1b3361e..fd1ea597e349 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -351,6 +351,18 @@ struct cfs_bandwidth {
>  #endif
>  };
>
> +#ifdef CONFIG_NUMA_BALANCING
> +
> +/* NUMA Locality Interval, 8 bucket for cache align */
> +#define NR_NL_INTERVAL	8
> +
> +struct numa_stat {
> +	u64 locality[NR_NL_INTERVAL];
> +	u64 jiffies;
> +};
> +
> +#endif
> +
>  /* Task group related information */
>  struct task_group {
>  	struct cgroup_subsys_state css;
> @@ -401,8 +413,25 @@ struct task_group {
>  	struct uclamp_se	uclamp[UCLAMP_CNT];
>  #endif
>
> +#ifdef CONFIG_NUMA_BALANCING
> +	struct numa_stat __percpu *numa_stat;
> +#endif
>  };
>
> +#ifdef CONFIG_NUMA_BALANCING
> +int alloc_tg_numa_stat(struct task_group *tg);
> +void free_tg_numa_stat(struct task_group *tg);
> +#else
> +static int alloc_tg_numa_stat(struct task_group *tg)
> +{
> +	return 1;
> +}
> +
> +static void free_tg_numa_stat(struct task_group *tg)
> +{
> +}
> +#endif
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  #define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
>
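
For anyone who wants to play with the interface, below is a minimal
user-space sketch (not part of the patch) of how the 'locality' line
of cpu.numa_stat could be parsed and condensed into a single locality
estimate, by weighting each of the 8 buckets with its midpoint. The
cgroup path and the midpoint weighting are assumptions for illustration
only; the patch itself only exposes the raw per-bucket and per-node
times in milliseconds.

/*
 * Illustrative sketch, not part of the patch. Assumes a cgroup-v1
 * cpu controller group at the example path below; on cgroup-v2 the
 * file would live directly under the cgroup directory.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Assumed example path, replace with the group of interest. */
	const char *path = "/sys/fs/cgroup/cpu/mygroup/cpu.numa_stat";
	FILE *fp = fopen(path, "r");
	char key[32];
	unsigned long long ms[8], total = 0;
	double weighted = 0.0;
	int i;

	if (!fp) {
		perror("fopen");
		return 1;
	}

	/* First line: "locality <8 values in ms>", one per 12.5% bucket. */
	if (fscanf(fp, "%31s", key) != 1 || strcmp(key, "locality") != 0) {
		fprintf(stderr, "unexpected format\n");
		fclose(fp);
		return 1;
	}
	for (i = 0; i < 8; i++) {
		if (fscanf(fp, "%llu", &ms[i]) != 1) {
			fprintf(stderr, "unexpected format\n");
			fclose(fp);
			return 1;
		}
		total += ms[i];
		/* Bucket i covers [12.5*i, 12.5*(i+1))%; use its midpoint. */
		weighted += ms[i] * (12.5 * i + 6.25);
	}
	fclose(fp);

	if (total)
		printf("estimated locality: %.1f%% over %llu ms\n",
		       weighted / total, total);
	else
		printf("no samples yet\n");
	return 0;
}

The 'exectime' line could be read the same way to see how the group's
runtime splits across nodes, which is what makes the comparison with
'memory.numa_stat' mentioned above possible.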