Currently there is no good approach to monitoring per-cgroup numa
efficiency, which can be a problem especially when groups are
sharing CPUs: it's impossible to tell which one caused the
remote-memory accesses by reading hardware counters, since multiple
workloads could be sharing the same CPU, which makes it painful
when one wants to find out the root cause and fix the issue.
In order to address this, we introduce new per-cgroup statistics
for numa:
* the numa locality to indicate the numa balancing efficiency
* the numa execution time on each node
The task locality is the local page access ratio traced at numa
balancing page faults, and the group locality is a histogram of task
execution time, sectioned by the locality into 8 regions.
For example, the new entry 'cpu.numa_stat' shows:
locality 15393 21259 13023 44461 21247 17012 28496 145402
exectime 311900 407166
Here we know the workloads executed 311900ms on node_0 and 407166ms
on node_1; tasks with locality around 0~12% executed for 15393 ms, and
tasks with locality around 88~100% executed for 145402 ms, which
implies that most of the memory accesses of this group's workloads
are local.
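As a rough single-number summary, weighting each of the 8 buckets by
its midpoint (6.25%, 18.75%, ..., 93.75%) gives:

  (15393 * 6.25 + 21259 * 18.75 + ... + 145402 * 93.75) / 306293 ~= 69%

that is, around 69% average locality for this sample.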
By monitoring the new statistics, we will be able to know the numa
efficiency of each cgroup's workloads on the machine, whether they
share the CPUs or not, and to find out which one introduced most of
the remote accesses.
Besides, the per-node memory topology from 'memory.numa_stat' becomes
more useful when we have the per-node execution time: a workload
always executing on node_0 while its memory is all on node_1 is
usually a bad case.
Cc: Peter Zijlstra <[email protected]>
Cc: Michal Koutný <[email protected]>
Signed-off-by: Michael Wang <[email protected]>
---
include/linux/sched.h | 8 ++++++-
kernel/sched/core.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/debug.c | 7 ++++++
kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 29 +++++++++++++++++++++++++
5 files changed, 153 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 263cf089d1b3..46995be622c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1114,8 +1114,14 @@ struct task_struct {
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
+ *
+ * 0 -- remote faults
+ * 1 -- local faults
+ * 2 -- page migration failure
+ * 3 -- remote page accessing
+ * 4 -- local page accessing
*/
- unsigned long numa_faults_locality[3];
+ unsigned long numa_faults_locality[5];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb42b71faab9..4364da279f04 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6544,6 +6544,10 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+#ifdef CONFIG_NUMA_BALANCING
+DECLARE_PER_CPU(struct numa_stat, root_numa_stat);
+#endif
+
void __init sched_init(void)
{
unsigned long ptr = 0;
@@ -6593,6 +6597,10 @@ void __init sched_init(void)
init_defrootdomain();
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ root_task_group.numa_stat = &root_numa_stat;
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -6918,6 +6926,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
static void sched_free_group(struct task_group *tg)
{
+ free_tg_numa_stat(tg);
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
@@ -6933,6 +6942,9 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!tg)
return ERR_PTR(-ENOMEM);
+ if (!alloc_tg_numa_stat(tg))
+ goto err;
+
if (!alloc_fair_sched_group(tg, parent))
goto err;
@@ -7638,6 +7650,40 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+static int cpu_numa_stat_show(struct seq_file *sf, void *v)
+{
+ int nr;
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_puts(sf, "locality");
+ for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
+ int cpu;
+ u64 sum = 0;
+
+ for_each_possible_cpu(cpu)
+ sum += per_cpu(tg->numa_stat->locality[nr], cpu);
+
+ seq_printf(sf, " %u", jiffies_to_msecs(sum));
+ }
+ seq_putc(sf, '\n');
+
+ seq_puts(sf, "exectime");
+ for_each_online_node(nr) {
+ int cpu;
+ u64 sum = 0;
+
+ for_each_cpu(cpu, cpumask_of_node(nr))
+ sum += per_cpu(tg->numa_stat->jiffies, cpu);
+
+ seq_printf(sf, " %u", jiffies_to_msecs(sum));
+ }
+ seq_putc(sf, '\n');
+
+ return 0;
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -7687,6 +7733,12 @@ static struct cftype cpu_legacy_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .name = "numa_stat",
+ .seq_show = cpu_numa_stat_show,
+ },
#endif
{ } /* Terminate */
};
@@ -7868,6 +7920,13 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .name = "numa_stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_numa_stat_show,
+ },
#endif
{ } /* terminate */
};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..a22b2a62aee2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -848,6 +848,13 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
+ SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ",
+ p->numa_faults_locality[1],
+ p->numa_faults_locality[0],
+ p->numa_faults_locality[2]);
+ SEQ_printf(m, "lhit=%lu rhit=%lu\n",
+ p->numa_faults_locality[4],
+ p->numa_faults_locality[3]);
show_numa_stats(p, m);
mpol_put(pol);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a81c36472822..4ba3a41cdca3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2466,6 +2466,12 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
+ /*
+ * We want the real local/remote page access statistics here,
+ * so use 'mem_node', which is the node the page actually
+ * resides on after migrate_misplaced_page().
+ */
+ p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -2672,6 +2678,49 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
}
}
+DEFINE_PER_CPU(struct numa_stat, root_numa_stat);
+
+int alloc_tg_numa_stat(struct task_group *tg)
+{
+ tg->numa_stat = alloc_percpu(struct numa_stat);
+ if (!tg->numa_stat)
+ return 0;
+
+ return 1;
+}
+
+void free_tg_numa_stat(struct task_group *tg)
+{
+ free_percpu(tg->numa_stat);
+}
+
+static void update_tg_numa_stat(struct task_struct *p)
+{
+ struct task_group *tg;
+ unsigned long remote = p->numa_faults_locality[3];
+ unsigned long local = p->numa_faults_locality[4];
+ int idx = -1;
+
+ /* any fault records to scale into a locality bucket? */
+ if (remote || local)
+ idx = NR_NL_INTERVAL * local / (remote + local + 1);
+
+ rcu_read_lock();
+
+ tg = task_group(p);
+ while (tg) {
+ /* skip accounting when there are no fault records */
+ if (idx != -1)
+ this_cpu_inc(tg->numa_stat->locality[idx]);
+
+ this_cpu_inc(tg->numa_stat->jiffies);
+
+ tg = tg->parent;
+ }
+
+ rcu_read_unlock();
+}
+
/*
* Drive the periodic memory faults..
*/
@@ -2686,6 +2735,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
return;
+ update_tg_numa_stat(curr);
+
/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0db2c1b3361e..fd1ea597e349 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -351,6 +351,18 @@ struct cfs_bandwidth {
#endif
};
+#ifdef CONFIG_NUMA_BALANCING
+
+/* NUMA Locality Interval, 8 buckets for cache alignment */
+#define NR_NL_INTERVAL 8
+
+struct numa_stat {
+ u64 locality[NR_NL_INTERVAL];
+ u64 jiffies;
+};
+
+#endif
+
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -401,8 +413,25 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ struct numa_stat __percpu *numa_stat;
+#endif
};
+#ifdef CONFIG_NUMA_BALANCING
+int alloc_tg_numa_stat(struct task_group *tg);
+void free_tg_numa_stat(struct task_group *tg);
+#else
+static int alloc_tg_numa_stat(struct task_group *tg)
+{
+ return 1;
+}
+
+static void free_tg_numa_stat(struct task_group *tg)
+{
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
--
2.14.4.44.g2045bb6
Hi Peter, Michal
The statistics have proven practically useful for detecting numa
issues, especially on boxes running complex mixed workloads.
Also enabled for cgroup-v2 now :-)
Regards,
Michael Wang
On 2019/10/24 11:08 AM, 王贇 wrote:
> [snip]
On Thu, Oct 24, 2019 at 11:08:01AM +0800, 王贇 wrote:
> [snip]
Mel, can you have a peek at this too?
So this is the part I like least:
> +DEFINE_PER_CPU(struct numa_stat, root_numa_stat);
> +
> +int alloc_tg_numa_stat(struct task_group *tg)
> +{
> + tg->numa_stat = alloc_percpu(struct numa_stat);
> + if (!tg->numa_stat)
> + return 0;
> +
> + return 1;
> +}
> +
> +void free_tg_numa_stat(struct task_group *tg)
> +{
> + free_percpu(tg->numa_stat);
> +}
> +
> +static void update_tg_numa_stat(struct task_struct *p)
> +{
> + struct task_group *tg;
> + unsigned long remote = p->numa_faults_locality[3];
> + unsigned long local = p->numa_faults_locality[4];
> + int idx = -1;
> +
> + /* any fault records to scale into a locality bucket? */
> + if (remote || local)
> + idx = NR_NL_INTERVAL * local / (remote + local + 1);
> +
> + rcu_read_lock();
> +
> + tg = task_group(p);
> + while (tg) {
> + /* skip accounting when there are no fault records */
> + if (idx != -1)
> + this_cpu_inc(tg->numa_stat->locality[idx]);
> +
> + this_cpu_inc(tg->numa_stat->jiffies);
> +
> + tg = tg->parent;
> + }
> +
> + rcu_read_unlock();
> +}
Thing is, we already have a cgroup hierarchy walk in the tick; see
task_tick_fair().
On top of that, you're walking an entirely different set of pointers,
instead of cfs_rq, you're walking tg->parent, which pretty much
guarantees you're adding even more cache misses.
How about you stick those numa_stats in cfs_rq (with cacheline
alignment) and see if you can frob your update loop into the cgroup walk
we already do.
On 2019/10/28 9:02 PM, Peter Zijlstra wrote:
[snip]
>> + tg = task_group(p);
>> + while (tg) {
>> + /* skip accounting when there are no fault records */
>> + if (idx != -1)
>> + this_cpu_inc(tg->numa_stat->locality[idx]);
>> +
>> + this_cpu_inc(tg->numa_stat->jiffies);
>> +
>> + tg = tg->parent;
>> + }
>> +
>> + rcu_read_unlock();
>> +}
>
> Thing is, we already have a cgroup hierarchy walk in the tick; see
> task_tick_fair().
>
> On top of that, you're walking an entirely different set of pointers,
> instead of cfs_rq, you're walking tg->parent, which pretty much
> guarantees you're adding even more cache misses.
>
> How about you stick those numa_stats in cfs_rq (with cacheline
> alignment) and see if you can frob your update loop into the cgroup walk
> we already do.
Thanks for the reply :-)
The hierarchy walk you mean here is the loop of entity_tick(), correct?
That should work if we introduce per-cfs_rq numa_stat accounting and do
the update there; I'll try to rework the implementation in the next version.
Regards,
Michael Wang
>
Currently there is no good approach to monitoring per-cgroup numa
efficiency, which can be a problem especially when groups are
sharing CPUs: it's impossible to tell which one caused the
remote-memory accesses by reading hardware counters, since multiple
workloads could be sharing the same CPU, which makes it painful
when one wants to find out the root cause and fix the issue.
In order to address this, we introduce new per-cgroup statistics
for numa:
* the numa locality to indicate the numa balancing efficiency
* the numa execution time on each node
The task locality is the local page access ratio traced at numa
balancing page faults, and the group locality is a histogram of task
execution time, sectioned by the locality into 7 regions.
For example, the new entry 'cpu.numa_stat' shows:
locality 39541 60962 36842 72519 118605 721778 946553
exectime 1220127 1458684
Here we know the workloads in the hierarchy executed 1220127ms on
node_0 and 1458684ms on node_1 in total; tasks with locality around
0~13% executed for 39541 ms, and tasks with locality around 87~100%
executed for 946553 ms, which implies that most of the memory
accesses are local.
By monitoring the new statistics, we will be able to know the numa
efficiency of each cgroup's workloads on the machine, whether they
share the CPUs or not, and to find out which one introduced most of
the remote accesses.
Besides, the per-node memory topology from 'memory.numa_stat' becomes
more useful when we have the per-node execution time: a workload
always executing on node_0 while its memory is all on node_1 is
usually a bad case.
Cc: Peter Zijlstra <[email protected]>
Cc: Michal Koutný <[email protected]>
Signed-off-by: Michael Wang <[email protected]>
---
Since v1:
* reform the implementation based on per-cfs_rq accounting (suggested by Peter)
now update the cache-aligned numa_stat in entity_tick() for each cfs_rq
* fix the calculation issue when the numbers are too small
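For instance, with local = 1 and remote = 0, the v1 formula

  idx = NR_NL_INTERVAL * local / (remote + local + 1) = 8 * 1 / 2 = 4

put a fully-local sample into a middle bucket, while the v2 formula

  idx = (NR_NL_INTERVAL - 1) * local / (remote + local) = 6 * 1 / 1 = 6

puts it into the top bucket as expected.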
include/linux/sched.h | 8 +++++++-
kernel/sched/core.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/debug.c | 7 +++++++
kernel/sched/fair.c | 25 +++++++++++++++++++++++++
kernel/sched/sched.h | 13 +++++++++++++
5 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 263cf089d1b3..46995be622c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1114,8 +1114,14 @@ struct task_struct {
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
+ *
+ * 0 -- remote faults
+ * 1 -- local faults
+ * 2 -- page migration failure
+ * 3 -- remote page accessing
+ * 4 -- local page accessing
*/
- unsigned long numa_faults_locality[3];
+ unsigned long numa_faults_locality[5];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb42b71faab9..c87673f746b7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7638,6 +7638,45 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
+{
+ return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
+}
+
+static int cpu_numa_stat_show(struct seq_file *sf, void *v)
+{
+ int nr;
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_puts(sf, "locality");
+ for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
+ int cpu;
+ u64 sum = 0;
+
+ for_each_possible_cpu(cpu)
+ sum += tg_cfs_rq(tg, cpu)->nstat.locality[nr];
+
+ seq_printf(sf, " %u", jiffies_to_msecs(sum));
+ }
+ seq_putc(sf, '\n');
+
+ seq_puts(sf, "exectime");
+ for_each_online_node(nr) {
+ int cpu;
+ u64 sum = 0;
+
+ for_each_cpu(cpu, cpumask_of_node(nr))
+ sum += tg_cfs_rq(tg, cpu)->nstat.jiffies;
+
+ seq_printf(sf, " %u", jiffies_to_msecs(sum));
+ }
+ seq_putc(sf, '\n');
+
+ return 0;
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -7687,6 +7726,12 @@ static struct cftype cpu_legacy_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .name = "numa_stat",
+ .seq_show = cpu_numa_stat_show,
+ },
#endif
{ } /* Terminate */
};
@@ -7868,6 +7913,13 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .name = "numa_stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_numa_stat_show,
+ },
#endif
{ } /* terminate */
};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..a22b2a62aee2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -848,6 +848,13 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
+ SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ",
+ p->numa_faults_locality[1],
+ p->numa_faults_locality[0],
+ p->numa_faults_locality[2]);
+ SEQ_printf(m, "lhit=%lu rhit=%lu\n",
+ p->numa_faults_locality[4],
+ p->numa_faults_locality[3]);
show_numa_stats(p, m);
mpol_put(pol);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a81c36472822..9d4db18da548 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2466,6 +2466,12 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
+ /*
+ * We want the real local/remote page access statistics here,
+ * so use 'mem_node', which is the node the page actually
+ * resides on after migrate_misplaced_page().
+ */
+ p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -2739,6 +2745,20 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
p->numa_scan_period = task_scan_start(p);
}
+static void update_numa_statistics(struct cfs_rq *cfs_rq)
+{
+ int idx;
+ unsigned long remote = current->numa_faults_locality[3];
+ unsigned long local = current->numa_faults_locality[4];
+
+ cfs_rq->nstat.jiffies++;
+
+ if (!remote && !local)
+ return;
+
+ idx = (NR_NL_INTERVAL - 1) * local / (remote + local);
+ cfs_rq->nstat.locality[idx]++;
+}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2756,6 +2776,9 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
+static void update_numa_statistics(struct cfs_rq *cfs_rq)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -4288,6 +4311,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
+ update_numa_statistics(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0db2c1b3361e..6579a5499154 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -486,6 +486,16 @@ struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+/* NUMA Locality Interval, 7 buckets for cache alignment */
+#define NR_NL_INTERVAL 7
+
+struct numa_statistics {
+ u64 jiffies;
+ u64 locality[NR_NL_INTERVAL];
+};
+#endif
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -575,6 +585,9 @@ struct cfs_rq {
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+ struct numa_statistics nstat ____cacheline_aligned;
+#endif
};
static inline int rt_bandwidth_enabled(void)
--
2.14.4.44.g2045bb6
On Thu, Oct 24, 2019 at 11:08:01AM +0800, 王贇 wrote:
> Currently there is no good approach to monitoring per-cgroup numa
> efficiency, which can be a problem especially when groups are
> sharing CPUs: it's impossible to tell which one caused the
> remote-memory accesses by reading hardware counters, since multiple
> workloads could be sharing the same CPU, which makes it painful
> when one wants to find out the root cause and fix the issue.
>
> In order to address this, we introduce new per-cgroup statistics
> for numa:
> * the numa locality to indicate the numa balancing efficiency
That is clear to some extent with the obvious caveat that a cgroup bound
to a memory node will report this as being nearly 100% with shared pages
being a possible exception. Locality might be fine but there could be
large latencies due to reclaim if the cgroup limits are exceeded or the
NUMA node is too small. It can give an artificial sense of benefit.
> * the numa execution time on each node
>
This is less obvious because it does not define what NUMA execution time
is and it's not documented by the patch.
> The task locality is the local page access ratio traced at numa
> balancing page faults, and the group locality is a histogram of task
> execution time, sectioned by the locality into 8 regions.
>
This is another important limitation. Disabling NUMA balancing will also
disable the stats which may be surprising to some users. It's not a
show-stopper but arguably the file should be non-existent or always
zeros if NUMA balancing is disabled.
> For example, the new entry 'cpu.numa_stat' shows:
> locality 15393 21259 13023 44461 21247 17012 28496 145402
> exectime 311900 407166
>
> Here we know the workloads executed 311900ms on node_0 and 407166ms
> on node_1; tasks with locality around 0~12% executed for 15393 ms, and
> tasks with locality around 88~100% executed for 145402 ms, which
> implies that most of the memory accesses of this group's workloads
> are local.
>
This needs documentation because it's not obvious at all that the
locality values are roughly percentiles. It's also not obvious what a
sysadmin would *do* with this information.
> By monitoring the new statistics, we will be able to know the numa
> efficiency of each cgroup's workloads on the machine, whether they
> share the CPUs or not, and to find out which one introduced most of
> the remote accesses.
>
> Besides, the per-node memory topology from 'memory.numa_stat' becomes
> more useful when we have the per-node execution time: a workload
> always executing on node_0 while its memory is all on node_1 is
> usually a bad case.
>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Michal Koutný <[email protected]>
> Signed-off-by: Michael Wang <[email protected]>
So, superficially, I'm struggling to see what action a sysadmin would take,
if any, with this information. It makes sense to track what individual
tasks are doing but this can be done with ftrace if required.
> ---
> include/linux/sched.h | 8 ++++++-
> kernel/sched/core.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/debug.c | 7 ++++++
> kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 29 +++++++++++++++++++++++++
> 5 files changed, 153 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 263cf089d1b3..46995be622c1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1114,8 +1114,14 @@ struct task_struct {
> * scan window were remote/local or failed to migrate. The task scan
> * period is adapted based on the locality of the faults with different
> * weights depending on whether they were shared or private faults
> + *
> + * 0 -- remote faults
> + * 1 -- local faults
> + * 2 -- page migration failure
> + * 3 -- remote page accessing
> + * 4 -- local page accessing
> */
> - unsigned long numa_faults_locality[3];
> + unsigned long numa_faults_locality[5];
>
Not clear from the comment what the difference between a remote fault
and a remote page access is. Superficially, they're the same thing.
> unsigned long numa_pages_migrated;
> #endif /* CONFIG_NUMA_BALANCING */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index eb42b71faab9..4364da279f04 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6544,6 +6544,10 @@ static struct kmem_cache *task_group_cache __read_mostly;
> DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
> DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
>
> +#ifdef CONFIG_NUMA_BALANCING
> +DECLARE_PER_CPU(struct numa_stat, root_numa_stat);
> +#endif
> +
> void __init sched_init(void)
> {
> unsigned long ptr = 0;
> @@ -6593,6 +6597,10 @@ void __init sched_init(void)
> init_defrootdomain();
> #endif
>
> +#ifdef CONFIG_NUMA_BALANCING
> + root_task_group.numa_stat = &root_numa_stat;
> +#endif
> +
> #ifdef CONFIG_RT_GROUP_SCHED
> init_rt_bandwidth(&root_task_group.rt_bandwidth,
> global_rt_period(), global_rt_runtime());
This is an indication that there is a universal hit to collect this data
whether cgroups are enabled or not. That hits the same problem I had
with PSI when it was first introduced. For users that care, the
information is useful, particularly as they generally have an
application consuming the data.
> @@ -6918,6 +6926,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
>
> static void sched_free_group(struct task_group *tg)
> {
> + free_tg_numa_stat(tg);
> free_fair_sched_group(tg);
> free_rt_sched_group(tg);
> autogroup_free(tg);
> @@ -6933,6 +6942,9 @@ struct task_group *sched_create_group(struct task_group *parent)
> if (!tg)
> return ERR_PTR(-ENOMEM);
>
> + if (!alloc_tg_numa_stat(tg))
> + goto err;
> +
> if (!alloc_fair_sched_group(tg, parent))
> goto err;
>
While this is very unlikely to fail, I find it odd to think that an
entire cgroup could fail to be created simply because stats cannot be
collected, particularly when no userspace component may care at all.
> @@ -7638,6 +7650,40 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
> }
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> +#ifdef CONFIG_NUMA_BALANCING
> +static int cpu_numa_stat_show(struct seq_file *sf, void *v)
> +{
> + int nr;
> + struct task_group *tg = css_tg(seq_css(sf));
> +
> + seq_puts(sf, "locality");
> + for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
> + int cpu;
> + u64 sum = 0;
> +
> + for_each_possible_cpu(cpu)
> + sum += per_cpu(tg->numa_stat->locality[nr], cpu);
> +
> + seq_printf(sf, " %u", jiffies_to_msecs(sum));
> + }
> + seq_putc(sf, '\n');
> +
> + seq_puts(sf, "exectime");
> + for_each_online_node(nr) {
> + int cpu;
> + u64 sum = 0;
> +
> + for_each_cpu(cpu, cpumask_of_node(nr))
> + sum += per_cpu(tg->numa_stat->jiffies, cpu);
> +
> + seq_printf(sf, " %u", jiffies_to_msecs(sum));
> + }
> + seq_putc(sf, '\n');
> +
> + return 0;
> +}
> +#endif
> +
Ok, expensive on large machines but only hit when the proc file is read
so that's ok.
> static struct cftype cpu_legacy_files[] = {
> #ifdef CONFIG_FAIR_GROUP_SCHED
> {
> @@ -7687,6 +7733,12 @@ static struct cftype cpu_legacy_files[] = {
> .seq_show = cpu_uclamp_max_show,
> .write = cpu_uclamp_max_write,
> },
> +#endif
> +#ifdef CONFIG_NUMA_BALANCING
> + {
> + .name = "numa_stat",
> + .seq_show = cpu_numa_stat_show,
> + },
> #endif
> { } /* Terminate */
> };
> @@ -7868,6 +7920,13 @@ static struct cftype cpu_files[] = {
> .seq_show = cpu_uclamp_max_show,
> .write = cpu_uclamp_max_write,
> },
> +#endif
> +#ifdef CONFIG_NUMA_BALANCING
> + {
> + .name = "numa_stat",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = cpu_numa_stat_show,
> + },
> #endif
> { } /* terminate */
> };
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index f7e4579e746c..a22b2a62aee2 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -848,6 +848,13 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
> P(total_numa_faults);
> SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
> task_node(p), task_numa_group_id(p));
> + SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ",
> + p->numa_faults_locality[1],
> + p->numa_faults_locality[0],
> + p->numa_faults_locality[2]);
This should be a separate patch. "failed=" does not tell much. It should
be at least migfailed to give some indication it's about failed
migrations.
It might still be misleading because the failure could be due to lack of
memory or because the page is pinned. However, I would not worry about
that in this case.
What *does* make this dangerous is that numa_faults_locality is often
cleared. A user could easily think that this data somehow accumulates
over time but it does not. This exposes implementation details as
numa_faults_locality could change its behaviour in the future and tools
should not rely on the contents being stable. While I recognise that
some numa balancing information is already exposed in that file, it's
relatively harmless with the possible exception of numa_scan_seq but
at least that value is almost worthless (other than detecting if NUMA
balancing is completely broken) and very unlikely to change behaviour.
> + SEQ_printf(m, "lhit=%lu rhit=%lu\n",
> + p->numa_faults_locality[4],
> + p->numa_faults_locality[3]);
> show_numa_stats(p, m);
> mpol_put(pol);
> #endif
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a81c36472822..4ba3a41cdca3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2466,6 +2466,12 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
> p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
> p->numa_faults_locality[local] += pages;
> + /*
> + * We want the real local/remote page access statistics here,
> + * so use 'mem_node', which is the node the page actually
> + * resides on after migrate_misplaced_page().
> + */
> + p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
> }
>
This may be misleading if a task is using a preferred node policy that
happens to be remote. It'll report bad locality but it's how the task
was configured. Similarly, shared pages that are interleaved will show
as remote accesses which is not necessarily bad. It goes back to "what
does a sysadmin do with this information?"
> static void reset_ptenuma_scan(struct task_struct *p)
> @@ -2672,6 +2678,49 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
> }
> }
>
> +DEFINE_PER_CPU(struct numa_stat, root_numa_stat);
> +
> +int alloc_tg_numa_stat(struct task_group *tg)
> +{
> + tg->numa_stat = alloc_percpu(struct numa_stat);
> + if (!tg->numa_stat)
> + return 0;
> +
> + return 1;
> +}
> +
> +void free_tg_numa_stat(struct task_group *tg)
> +{
> + free_percpu(tg->numa_stat);
> +}
> +
> +static void update_tg_numa_stat(struct task_struct *p)
> +{
> + struct task_group *tg;
> + unsigned long remote = p->numa_faults_locality[3];
> + unsigned long local = p->numa_faults_locality[4];
> + int idx = -1;
> +
> + /* any fault records to scale into a locality bucket? */
> + if (remote || local)
> + idx = NR_NL_INTERVAL * local / (remote + local + 1);
> +
> + rcu_read_lock();
> +
> + tg = task_group(p);
> + while (tg) {
> + /* skip accounting when there are no fault records */
> + if (idx != -1)
> + this_cpu_inc(tg->numa_stat->locality[idx]);
> +
> + this_cpu_inc(tg->numa_stat->jiffies);
> +
> + tg = tg->parent;
> + }
> +
> + rcu_read_unlock();
> +}
> +
This is potentially a long walk to do in the task tick context if there
are a lot of groups. Also, if no faults are recorded because the scanner is
running slowly, what does that mean for detecting locality? The information
could be way off if a lot of accesses are remote and simply not caught
by faults because of recent changes.
So, overall, I see the general intent that you want to identify cgroups
that have bad locality but it incurs a universal cost regardless of
whether the user wants it or not. The documentation is non-existent but
the biggest kicker is that it's unclear how a sysadmin would consume this
information. The problem is compounded by the fact that interpreting
the data accurately and making a decision requires knowledge of the
implementation and that severely limits who can benefit. In my mind, it
makes more sense to track the locality and NUMA behaviour of individual
tasks which can be already done with ftrace. If cgroup tracking was
required, a userspace application using ftrace could correlate task
activity with what cgroup it belongs to and collect the data from userspace
-- this is not necessarily easy and tracepoints might need updating, but
it makes more sense to track this in userspace instead of in the kernel
which never consumes the data.
--
Mel Gorman
SUSE Labs
Hi, Mel
Thanks for the comments :-)
On 2019/10/30 5:55 PM, Mel Gorman wrote:
[snip]
>> In order to address this, we introduce new per-cgroup statistics
>> for numa:
>> * the numa locality to indicate the numa balancing efficiency
>
> That is clear to some extent with the obvious caveat that a cgroup bound
> to a memory node will report this as being nearly 100% with shared pages
> being a possible exception. Locality might be fine but there could be
> large latencies due to reclaim if the cgroup limits are exceeded or the
> NUMA node is too small. It can give an artificial sense of benefit.
Currently we rely on the locality to tell whether there are issues,
rather than the benefit; mostly deployment and configuration issues.
For example, tasks bound to the CPUs of node_0 could have their memory
on node_1 when node_0 has almost run out of memory; numa balancing may
not be able to help in this case, but by reading the locality we can
tell how critical the problem is, and may take action to rebind the
CPUs to node_1 or reclaim the memory of node_0.
>
>> * the numa execution time on each node
>>
>
> This is less obvious because it does not define what NUMA execution time
> is and it's not documented by the patch.
>
>> The task locality is the local page access ratio traced at numa
>> balancing page faults, and the group locality is a histogram of task
>> execution time, sectioned by the locality into 8 regions.
>>
>
> This is another important limitation. Disabling NUMA balancing will also
> disable the stats which may be surprising to some users. It's not a
> show-stopper but arguably the file should be non-existent or always
> zeros if NUMA balancing is disabled.
Maybe a check on sched_numa_balancing before showing the data could
be helpful? When the user turns off numa balancing dynamically, we
would no longer show the data.
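Something like the below at the top of cpu_numa_stat_show(), as a
rough sketch relying on the existing sched_numa_balancing static key:

	/* show nothing when numa balancing is disabled */
	if (!static_branch_likely(&sched_numa_balancing))
		return 0;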
>
>> For example, the new entry 'cpu.numa_stat' shows:
>> locality 15393 21259 13023 44461 21247 17012 28496 145402
>> exectime 311900 407166
>>
>> Here we know the workloads executed 311900ms on node_0 and 407166ms
>> on node_1; tasks with locality around 0~12% executed for 15393 ms, and
>> tasks with locality around 88~100% executed for 145402 ms, which
>> implies that most of the memory accesses of this group's workloads
>> are local.
>>
>
> This needs documentation because it's not obvious at all that the
> locality values are roughly percentiles. It's also not obvious what a
> sysadmin would *do* with this information.
Makes sense, I'll draft a doc on how to read and act on these
data then.
>
>> By monitoring the new statistics, we will be able to know the numa
>> efficiency of each cgroup's workloads on the machine, whether they
>> share the CPUs or not, and to find out which one introduced most of
>> the remote accesses.
>>
>> Besides, the per-node memory topology from 'memory.numa_stat' becomes
>> more useful when we have the per-node execution time: a workload
>> always executing on node_0 while its memory is all on node_1 is
>> usually a bad case.
>>
>> Cc: Peter Zijlstra <[email protected]>
>> Cc: Michal Koutný <[email protected]>
>> Signed-off-by: Michael Wang <[email protected]>
>
> So, superficially, I'm struggling to see what action a sysadmin would take,
> if any, with this information. It makes sense to track what individual
> tasks are doing but this can be done with ftrace if required.
The idea here is to give users the ability to monitor numa efficiency
per cgroup, just like monitoring per-cgroup CPU or memory usage; ftrace
is a good way to debug further, while IMHO not that suitable for daily
monitoring.
One common use case is to alert the user when some cgroup's locality
stays low; usually this is caused by a wrong memory policy or an
exhausted numa node, and the user can then adjust the policy, bind to
other nodes, or reclaim the exhausted node.
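As an illustration of what such a monitor could do, a minimal
userspace sketch is below; the cgroup path and the 50% threshold are
made-up examples, and it condenses the 8-bucket histogram by
weighting each bucket with its midpoint:

#include <stdio.h>

#define NR_BUCKETS	8	/* matches NR_NL_INTERVAL in the v1 patch */

int main(void)
{
	/* hypothetical cgroup path, adjust for the real hierarchy */
	const char *path = "/sys/fs/cgroup/cpu/mygroup/cpu.numa_stat";
	unsigned long long ms, total = 0;
	double weighted = 0;
	FILE *f = fopen(path, "r");
	int i;

	if (!f || fscanf(f, " locality") == EOF)
		return 1;
	for (i = 0; i < NR_BUCKETS; i++) {
		if (fscanf(f, "%llu", &ms) != 1)
			return 1;
		total += ms;
		/* weight by the bucket midpoint, in percent */
		weighted += ms * (100.0 * (i + 0.5) / NR_BUCKETS);
	}
	fclose(f);

	if (total && weighted / total < 50.0)
		printf("low average locality: %.1f%%\n", weighted / total);
	return 0;
}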
>
>> ---
>> include/linux/sched.h | 8 ++++++-
>> kernel/sched/core.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/debug.c | 7 ++++++
>> kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/sched.h | 29 +++++++++++++++++++++++++
>> 5 files changed, 153 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 263cf089d1b3..46995be622c1 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1114,8 +1114,14 @@ struct task_struct {
>> * scan window were remote/local or failed to migrate. The task scan
>> * period is adapted based on the locality of the faults with different
>> * weights depending on whether they were shared or private faults
>> + *
>> + * 0 -- remote faults
>> + * 1 -- local faults
>> + * 2 -- page migration failure
>> + * 3 -- remote page accessing
>> + * 4 -- local page accessing
>> */
>> - unsigned long numa_faults_locality[3];
>> + unsigned long numa_faults_locality[5];
>>
>
> Not clear from the comment what the difference between a remote fault
> and a remote page access is. Superficially, they're the same thing.
The 'fault' is recorded before page migration while the 'access' is
recorded after; they can differ when the page has been migrated.
>
>> unsigned long numa_pages_migrated;
[snip]
>> unsigned long ptr = 0;
>> @@ -6593,6 +6597,10 @@ void __init sched_init(void)
>> init_defrootdomain();
>> #endif
>>
>> +#ifdef CONFIG_NUMA_BALANCING
>> + root_task_group.numa_stat = &root_numa_stat;
>> +#endif
>> +
>> #ifdef CONFIG_RT_GROUP_SCHED
>> init_rt_bandwidth(&root_task_group.rt_bandwidth,
>> global_rt_period(), global_rt_runtime());
>
> This is an indication that there is a universal hit to collect this data
> whether cgroups are enabled or not. That hits the same problem I had
> with PSI when it was first introduced. For users that care, the
> information is useful, particularly as they generally have an
> application consuming the data.
>> @@ -6918,6 +6926,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
>>
>> static void sched_free_group(struct task_group *tg)
>> {
>> + free_tg_numa_stat(tg);
>> free_fair_sched_group(tg);
>> free_rt_sched_group(tg);
>> autogroup_free(tg);
>> @@ -6933,6 +6942,9 @@ struct task_group *sched_create_group(struct task_group *parent)
>> if (!tg)
>> return ERR_PTR(-ENOMEM);
>>
>> + if (!alloc_tg_numa_stat(tg))
>> + goto err;
>> +
>> if (!alloc_fair_sched_group(tg, parent))
>> goto err;
>>
>
> While this is very unlikely to fail, I find it odd to think that an
> entire cgroup could fail to be created simply because stats cannot be
> collected, particularly when no userspace component may care at all.
We have reworked the implementation in v2 (sent) and put the data into
the cfs_rq, so all this init work goes away :-)
>
[snip]
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index f7e4579e746c..a22b2a62aee2 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -848,6 +848,13 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
>> P(total_numa_faults);
>> SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
>> task_node(p), task_numa_group_id(p));
>> + SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ",
>> + p->numa_faults_locality[1],
>> + p->numa_faults_locality[0],
>> + p->numa_faults_locality[2]);
>
> This should be a separate patch. "failed=" does not tell much. It should
> be at least migfailed to give some indication it's about failed
> migrations.
I'm always bad at naming :-P
>
> It might still be misleading because the failure could be due to lack of
> memory or because the page is pinned. However, I would not worry about
> that in this case.
>
> What *does* make this dangerous is that numa_faults_locality is often
> cleared. A user could easily think that this data somehow accumulates
> over time but it does not. This exposes implementation details as
> numa_faults_locality could change its behaviour in the future and tools
> should not rely on the contents being stable. While I recognise that
> some numa balancing information is already exposed in that file, it's
> relatively harmless with the possible exception of numa_scan_seq but
> at least that value is almost worthless (other than detecting if NUMA
> balancing is completely broken) and very unlikely to change behaviour.
Yeah, each new scan drops the fault data of the last scan, which is
important for group locality accumulation, since we don't want to keep
tracing outdated data for too long.
I think I get your point: these per-task faults are not accumulated, so
they are meaningless for periodic checking; actually they are not that
helpful when we consider the cgroup as the unit for numa adaptation, so
we will remove them in the next version.
>
>> + SEQ_printf(m, "lhit=%lu rhit=%lu\n",
>> + p->numa_faults_locality[4],
>> + p->numa_faults_locality[3]);
>> show_numa_stats(p, m);
>> mpol_put(pol);
>> #endif
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index a81c36472822..4ba3a41cdca3 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2466,6 +2466,12 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>> p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
>> p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
>> p->numa_faults_locality[local] += pages;
>> + /*
>> + * We want the real local/remote page access statistics here,
>> + * so use 'mem_node', which is the node the page actually
>> + * resides on after migrate_misplaced_page().
>> + */
>> + p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
>> }
>>
>
> This may be misleading if a task is using a preferred node policy that
> happens to be remote. It'll report bad locality but it's how the task
> was configured. Similarly, shared pages that are interleaved will show
> as remote accesses which is not necessarily bad. It goes back to "what
> does a sysadmin do with this information?"
IMHO a very important use of these data is to tell numa users that
"something may be going wrong"; it could be on purpose or a real
configuration issue, and users can then try to figure out whether
this is a false alarm or not.
Maybe we could give users some hints on how to handle a downturn in
locality in the coming docs :-)
>
>> static void reset_ptenuma_scan(struct task_struct *p)
[snip]
>> +void free_tg_numa_stat(struct task_group *tg)
>> +{
>> + free_percpu(tg->numa_stat);
>> +}
>> +
>> +static void update_tg_numa_stat(struct task_struct *p)
>> +{
>> + struct task_group *tg;
>> + unsigned long remote = p->numa_faults_locality[3];
>> + unsigned long local = p->numa_faults_locality[4];
>> + int idx = -1;
>> +
>> + /* any fault records to scale into a locality bucket? */
>> + if (remote || local)
>> + idx = NR_NL_INTERVAL * local / (remote + local + 1);
>> +
>> + rcu_read_lock();
>> +
>> + tg = task_group(p);
>> + while (tg) {
>> + /* skip accounting when there are no fault records */
>> + if (idx != -1)
>> + this_cpu_inc(tg->numa_stat->locality[idx]);
>> +
>> + this_cpu_inc(tg->numa_stat->jiffies);
>> +
>> + tg = tg->parent;
>> + }
>> +
>> + rcu_read_unlock();
>> +}
>> +
>
> This is potentially a long walk to do in the task tick context if there
> are a lot of groups. Also, if no faults are recorded because the scanner is
> running slowly, what does that mean for detecting locality? The information
> could be way off if a lot of accesses are remote and simply not caught
> by faults because of recent changes.
We have merged the work into entity_tick() in v2, which just adds a
few lines to the existing hierarchy updating.
As you point out, locality assumes the task-page node placement won't
change too much between scans, which is definitely not always true.
However, the scan period adapts itself based on the fault info, which
means that when it gets longer, the task-page node relationship is
becoming stable and the locality is more reliable, and when it gets
shorter, the faults are updated more rapidly, which also makes the
locality more accurate.
Anyway, locality is not supposed to be accurate all the time, but it
should always show the right tendency; if one finds the locality is
always low, the task-page node relationship there can't be good.
>
> So, overall, I see the general intent that you want to identify cgroups
> that have bad locality but it incurs a universal cost regardless of
> whether the user wants it or not. The documentation is non-existent but
> the biggest kicker is that it's unclear how a sysadmin would consume this
> information. The problem is compounded by the fact that interpreting
> the data accurately and making a decision requires knowledge of the
> implementation and that severely limits who can benefit. In my mind, it
> makes more sense to track the locality and NUMA behaviour of individual
> tasks which can be already done with ftrace. If cgroup tracking was
> required, a userspace application using ftrace could correlate task
> activity with what cgroup it belongs to and collect the data from userspace
> -- this is not necessarily easy and tracepoints might need updating, but
> it makes more sense to track this in userspace instead of in the kernel
> which never consumes the data.
Yeah, per-cgroup ftrace tracing might work, but IMHO it also introduces
overhead (maybe even more), and is not very likely to become a general
monitoring approach. I suppose locality could be something similar to
load_1/5/15: not so accurate, but helpful to warn about possible issues
(see the consumer sketch below).
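A minimal sketch of such a consumer, assuming the 'locality' line layout
from this patch and an example cgroup path (a real tool would sample
twice and diff, since these counters accumulate):

#include <stdio.h>
#include <string.h>

#define NR_REGIONS 8

int main(void)
{
	FILE *fp = fopen("/sys/fs/cgroup/cpu/example/cpu.numa_stat", "r");
	unsigned long r[NR_REGIONS], total = 0;
	double score = 0.0;
	char tag[32];
	int i;

	if (!fp || fscanf(fp, "%31s", tag) != 1 || strcmp(tag, "locality"))
		return 1;
	for (i = 0; i < NR_REGIONS; i++) {
		if (fscanf(fp, "%lu", &r[i]) != 1)
			return 1;
		total += r[i];
		/* weight each region by its midpoint locality */
		score += r[i] * (i + 0.5) / NR_REGIONS;
	}
	fclose(fp);
	if (total)
		printf("approx locality: %.1f%%\n", 100.0 * score / total);
	return 0;
}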
We have sent v2, which reworked the implementation according to Peter's
suggestions; we would very much appreciate it if you could help review
whether the overhead concern has been addressed, and we will:
* stop showing the per-task faults info
* add a check on sched_numa_balancing before showing locality
in the next version; the doc is also on the way ;-)
Best Regards,
Michael Wang
>
On Thu, Oct 31, 2019 at 11:31:24AM +0800, 王贇 wrote:
> On 2019/10/30 下午5:55, Mel Gorman wrote:
> [snip]
> >> In order to address this, we introduced new per-cgroup statistic
> >> for numa:
> >> * the numa locality to imply the numa balancing efficiency
> >
> > That is clear to some extent with the obvious caveat that a cgroup bound
> > to a memory node will report this as being nearly 100%, with shared pages
> > being a possible exception. Locality might be fine but there could be
> > large latencies due to reclaim if the cgroup limits are exceeded or the
> > NUMA node is too small. It can give an artificial sense of benefit.
>
> Currently we rely on locality to tell whether there are issues rather
> than benefits, mostly deployment and configuration issues.
>
> For example, tasks bound to the cpus of node_0 could have their memory
> on node_1 when node_0 has almost run out of memory; numa balancing may
> not be able to help in this case, while by reading locality we could
> know how critical the problem is, and may take action to rebind cpus
> to node_1 or reclaim the memory of node_0.
>
You can already do this by walking each cgroup, identifying what tasks are
in it and look at /proc/PID/numa_maps and /proc/PID/status to see what
CPU bindings if any exist. This would use the actual memory placements
and not those sampled by NUMA balancing, would not require NUMA balancing
and would work on older kernels. It would be costly to access so I would
not suggest doing it at high frequency but it makes sense for the tool
that cares to pay the cost instead of spreading tiny bits of cost to
every task whether there is an interested tool or not.
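A rough sketch of such a walker (cgroup path is an example, error
handling kept minimal); it sums the per-node N<node>=<pages> counts that
numa_maps reports for every task in the group:

#include <stdio.h>
#include <string.h>

#define MAX_NODES 8

int main(void)
{
	const char *procs = "/sys/fs/cgroup/cpu/example/cgroup.procs";
	unsigned long pages[MAX_NODES] = { 0 };
	char path[64], line[4096];
	unsigned long cnt;
	int pid, node;
	FILE *fp, *fm;

	fp = fopen(procs, "r");
	if (!fp)
		return 1;
	while (fscanf(fp, "%d", &pid) == 1) {
		snprintf(path, sizeof(path), "/proc/%d/numa_maps", pid);
		fm = fopen(path, "r");
		if (!fm)
			continue; /* task may have exited already */
		while (fgets(line, sizeof(line), fm)) {
			char *tok = strtok(line, " \n");

			for (; tok; tok = strtok(NULL, " \n"))
				if (sscanf(tok, "N%d=%lu", &node, &cnt) == 2 &&
				    node >= 0 && node < MAX_NODES)
					pages[node] += cnt;
		}
		fclose(fm);
	}
	fclose(fp);
	for (node = 0; node < MAX_NODES; node++)
		printf("N%d: %lu pages\n", node, pages[node]);
	return 0;
}

This reflects actual placement rather than fault samples, at the access
cost already described above.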
> >
> >> * the numa execution time on each node
> >>
> >
> > This is less obvious because it does not define what NUMA execution time
> > is and it's not documented by the patch.
> >
> >> The task locality is the local page accessing ratio traced on numa
> >> balancing PF, and the group locality is the topology of task execution
> >> time, sectioned by the locality into 8 regions.
> >>
> >
> > This is another important limitation. Disabling NUMA balancing will also
> > disable the stats which may be surprising to some users. It's not a
> > show-stopper but arguably the file should be non-existent or always
> > zeros if NUMA balancing is disabled.
>
> Maybe a check on sched_numa_balancing before showing the data could
> be helpful? When the user turns off numa balancing dynamically, we no
> longer show the data.
>
That would be an improvement but it would not address the point that the
data required can already be gathered from userspace.
> >> By monitoring the new statistic, we will be able to know the numa
> >> efficiency of each per-cgroup workloads on machine, whatever they
> >> sharing the CPUs or not, we will be able to find out which one
> >> introduced the remote access mostly.
> >>
> >> Besides, per-node memory topology from 'memory.numa_stat' become
> >> more useful when we have the per-node execution time, workloads
> >> always executing on node_0 while it's memory is all on node_1 is
> >> usually a bad case.
> >>
> >> Cc: Peter Zijlstra <[email protected]>
> >> Cc: Michal Koutný <[email protected]>
> >> Signed-off-by: Michael Wang <[email protected]>
> >
> > So, superficially, I'm struggling to see what action a sysadmin would take,
> > if any, with this information. It makes sense to track what individual
> > tasks are doing but this can be done with ftrace if required.
>
> The idea here is to give users the ability to monitor numa efficiency
> per cgroup, just like monitoring per-cgroup CPU or memory usage; ftrace
> is a good way of doing further debugging, while IMHO it is not that
> suitable for daily monitoring.
>
Basic monitoring of efficiency per cgroup can be done from userspace.
Further debugging with ftrace could be needed in some cases with or
without this patch.
> One common use case is to alert the user when some cgroup's locality is
> always low; usually this is caused by a wrong memory policy or an
> exhausted numa node, and the user can then adjust the policy, bind
> other nodes, or reclaim the exhausted node.
>
Again, can be done from userspace.
> >> include/linux/sched.h | 8 ++++++-
> >> kernel/sched/core.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >> kernel/sched/debug.c | 7 ++++++
> >> kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++++
> >> kernel/sched/sched.h | 29 +++++++++++++++++++++++++
> >> 5 files changed, 153 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> index 263cf089d1b3..46995be622c1 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -1114,8 +1114,14 @@ struct task_struct {
> >> * scan window were remote/local or failed to migrate. The task scan
> >> * period is adapted based on the locality of the faults with different
> >> * weights depending on whether they were shared or private faults
> >> + *
> >> + * 0 -- remote faults
> >> + * 1 -- local faults
> >> + * 2 -- page migration failure
> >> + * 3 -- remote page accessing
> >> + * 4 -- local page accessing
> >> */
> >> - unsigned long numa_faults_locality[3];
> >> + unsigned long numa_faults_locality[5];
> >>
> >
> > Not clear from the comment what the difference between a remote fault
> > and a remote page access is. Superficially, they're the same thing.
>
> The 'fault' is recorded before page migration while the 'accessing' is
> recorded after; they could differ when the page has been migrated.
>
That's a very small distinction given that the counters may only differ
by number of pages migrated which may be a very small number.
> > It might still be misleading because the failure could be due to lack of
> > memory or because the page is pinned. However, I would not worry about
> > that in this case.
> >
> > What *does* make this dangerous is that numa_faults_locality is often
> > cleared. A user could easily think that this data somehow accumulates
> > over time but it does not. This exposes implementation details as
> > numa_faults_locality could change its behaviour in the future and tools
> > should not rely on the contents being stable. While I recognise that
> > some numa balancing information is already exposed in that file, it's
> > relatively harmless with the possible exception of numa_scan_seq but
> > at least that value is almost worthless (other than detecting if NUMA
> > balancing is completely broken) and very unlikely to change behaviour.
>
> Yeah, each new scan drops the fault data of the last scan, which is
> important for group locality accumulation since we don't want to keep
> tracing outdated data for too long.
>
> I think I get your point: these per-task faults are not accumulated, so
> they are meaningless for periodic checking; actually they are not so
> helpful when we consider the cgroup as a unit for numa adaptation, so
> we will remove them in the next version.
>
And besides, gathering the information from userspace reflects the
reality of placement and not guesswork.
> >
> >> + SEQ_printf(m, "lhit=%lu rhit=%lu\n",
> >> + p->numa_faults_locality[4],
> >> + p->numa_faults_locality[3]);
> >> show_numa_stats(p, m);
> >> mpol_put(pol);
> >> #endif
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index a81c36472822..4ba3a41cdca3 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -2466,6 +2466,12 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> >> p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
> >> p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
> >> p->numa_faults_locality[local] += pages;
> >> + /*
> >> + * We want the real local/remote page access statistics here, so
> >> + * use 'mem_node', which is the node where the page actually
> >> + * resides after migrate_misplaced_page().
> >> + */
> >> + p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
> >> }
> >>
> >
> > This may be misleading if a task is using a preferred node policy that
> > happens to be remote. It'll report bad locality but it's how the task
> > was configured. Similarly, shared pages that are interleaved will show
> > as remote accesses, which is not necessarily bad. It goes back to "what
> > does a sysadmin do with this information?"
>
> IMHO a very important usage of these data is to tell NUMA users that
> "something may be going wrong"; it could be on purpose or a real
> configuration issue, and users could then try to figure out whether it
> is a false alarm or not.
>
While this has value, again I think it should be done in userspace. I
didn't read v2 or respond to the individual points because I cannot
convince myself this needs to be in the kernel at all. If different
historical information was ever needed, it's easier to do that in a
userspace tool that is independent of the kernel version.
--
Mel Gorman
SUSE Labs
On 2019/10/31 下午9:17, Mel Gorman wrote:
> On Thu, Oct 31, 2019 at 11:31:24AM +0800, 王贇 wrote:
[snip]
>> For example, tasks bound to the cpus of node_0 could have their memory
>> on node_1 when node_0 has almost run out of memory; numa balancing may
>> not be able to help in this case, while by reading locality we could
>> know how critical the problem is, and may take action to rebind cpus
>> to node_1 or reclaim the memory of node_0.
>>
>
> You can already do this by walking each cgroup, identifying what tasks are
> in it and look at /proc/PID/numa_maps and /proc/PID/status to see what
> CPU bindings if any exist. This would use the actual memory placements
> and not those sampled by NUMA balancing, would not require NUMA balancing
> and would work on older kernels. It would be costly to access so I would
> not suggest doing it at high frequency but it makes sense for the tool
> that cares to pay the cost instead of spreading tiny bits of cost to
> every task whether there is an interested tool or not.
I see the biggest concern now is the necessity of having the kernel
provide these data; IMHO there are actually good reasons here:
* there are too many tasks to gather data from; reading proc costs a lot
* tasks live and die; data is lost between sample windows
For example, in our cases we could have hundreds of cgroups, each
containing hundreds of tasks, and these worker threads could live and
die at any moment; for gathering we need to cat the list of tasks and
then read these proc files one by one, which traps into the kernel
rapidly and may even need to hold some locks. This introduces a big
latency impact and gives no accurate output, since some tasks may
already have died before we read their data.
Then before the next sample window, the info of tasks that died during
the window can't be acquired anymore.
We need the kernel's help to preserve the data, since a tool can't catch
it in time before it is lost; we also have to avoid rapid proc reading,
which really costs a lot and, furthermore, introduces big latency in
each sample window.
>
>>>
>>>> * the numa execution time on each node
>>>>
>>>
>>> This is less obvious because it does not define what NUMA execution time
>>> is and it's not documented by the patch.
>>>
>>>> The task locality is the local page accessing ratio traced on numa
>>>> balancing PF, and the group locality is the topology of task execution
>>>> time, sectioned by the locality into 8 regions.
>>>>
>>>
>>> This is another important limitation. Disabling NUMA balancing will also
>>> disable the stats which may be surprising to some users. It's not a
>>> show-stopper but arguably the file should be non-existent or always
>>> zeros if NUMA balancing is disabled.
>>
>> Maybe a check on sched_numa_balancing before showing the data could
>> be helpful? When the user turns off numa balancing dynamically, we no
>> longer show the data.
>>
[snip]
>>>
>>> Not clear from the comment what the difference between a remote fault
>>> and a remote page access is. Superficially, they're the same thing.
>>
>> The 'fault' is recorded before page migration while the 'accessing' is
>> recorded after; they could differ when the page has been migrated.
>>
>
> That's a very small distinction given that the counters may only differ
> by number of pages migrated which may be a very small number.
This could be very different when we switch a workload from one node to
another by changing its allowed cpus; a lot of page migration will happen.
>
>>> It might still be misleading because the failure could be due to lack of
>>> memory or because the page is pinned. However, I would not worry about
>>> that in this case.
>>>
>>> What *does* make this dangerous is that numa_faults_locality is often
>>> cleared. A user could easily think that this data somehow accumulates
>>> over time but it does not. This exposes implementation details as
>>> numa_faults_locality could change its behaviour in the future and tools
>>> should not rely on the contents being stable. While I recognise that
>>> some numa balancing information is already exposed in that file, it's
>>> relatively harmless with the possible exception of numa_scan_seq but
>>> at least that value is almost worthless (other than detecting if NUMA
>>> balancing is completely broken) and very unlikely to change behaviour.
[snip]
>>> This may be misleading if a task is using a preferred node policy that
>>> happens to be remote. It'll report bad locality but it's how the task
>>> was configured. Similarly, shared pages that are interleaved will show
>>> as remote accesses, which is not necessarily bad. It goes back to "what
>>> does a sysadmin do with this information?"
>>
>> IMHO a very important usage of these data is to tell NUMA users that
>> "something may be going wrong"; it could be on purpose or a real
>> configuration issue, and users could then try to figure out whether it
>> is a false alarm or not.
>>
>
> While this has value, again I think it should be done in userspace. I
> didn't read v2 or respond to the individual points because I cannot
> convince myself this needs to be in the kernel at all. If different
> historical information was ever needed, it's easier to do that in a
> userspace tool that is independent of the kernel version.
IMHO gathering numa info from userspace can't fully address the problem;
per-cgroup numa stats from the kernel provide a general/easy/lightweight
way of daily monitoring, which could be really necessary on numa platforms.
Regards,
Michael Wang
>
On Fri, Nov 01, 2019 at 09:49:20AM +0800, 王贇 wrote:
>
>
> On 2019/10/31 下午9:17, Mel Gorman wrote:
> > On Thu, Oct 31, 2019 at 11:31:24AM +0800, 王贇 wrote:
> [snip]
> >> For example, tasks bound to the cpus of node_0 could have their memory
> >> on node_1 when node_0 has almost run out of memory; numa balancing may
> >> not be able to help in this case, while by reading locality we could
> >> know how critical the problem is, and may take action to rebind cpus
> >> to node_1 or reclaim the memory of node_0.
> >>
> >
> > You can already do this by walking each cgroup, identifying what tasks are
> > in it and look at /proc/PID/numa_maps and /proc/PID/status to see what
> > CPU bindings if any exist. This would use the actual memory placements
> > and not those sampled by NUMA balancing, would not require NUMA balancing
> > and would work on older kernels. It would be costly to access so I would
> > not suggest doing it at high frequency but it makes sense for the tool
> > that cares to pay the cost instead of spreading tiny bits of cost to
> > every task whether there is an interested tool or not.
>
> I see the biggest concern now is the necessity of having the kernel
> provide these data; IMHO there are actually good reasons here:
> * there are too many tasks to gather data from; reading proc costs a lot
> * tasks live and die; data is lost between sample windows
>
> For example, in our cases we could have hundreds of cgroups, each
> containing hundreds of tasks, and these worker threads could live and
> die at any moment; for gathering we need to cat the list of tasks and
> then read these proc files one by one, which traps into the kernel
> rapidly and may even need to hold some locks. This introduces a big
> latency impact and gives no accurate output, since some tasks may
> already have died before we read their data.
>
> Then before the next sample window, the info of tasks that died during
> the window can't be acquired anymore.
>
> We need the kernel's help to preserve the data, since a tool can't catch
> it in time before it is lost; we also have to avoid rapid proc reading,
> which really costs a lot and, furthermore, introduces big latency in
> each sample window.
>
There is somewhat of a disconnect here. You say that the information must
be accurate and historical yet are relying on NUMA hinting faults to build
the picture which may not be accurate at all given that faults are not
guaranteed to happen. For short-lived tasks, it is also potentially skewed
information if short-lived tasks dominated remote accesses for whatever
reason even though it does not matter -- the tasks were short-lived and
their performance is probably irrelevant. Short-lived tasks may not even
show up if they do not run longer than sysctl_numa_balancing_scan_delay
so the data gathered already has holes in it.
While it's a bit more of a stretch, even this could still be done from
userspace if numa_hint_fault was probed and the event handled (eBPF,
systemtap etc) to build the picture or add a tracepoint. That would give
a much higher degree of flexibility on what information is tracked and
allow flexibility on how the data is consumed.
So, overall I think this can be done outside the kernel but recognise
that it may not be suitable in all cases. If you feel it must be done
inside the kernel, split out the patch that adds information on failed
page migrations as it stands apart. Put it behind its own kconfig entry
that is disabled by default -- do not tie it directly to NUMA balancing
because of the data structure changes. When enabled, it should still be
disabled by default at runtime and only activated via kernel command line
parameter so that the only people who pay the cost are those that take
deliberate action to enable it.
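For illustration, the shape of such an entry might be something like the
below (the name is hypothetical, not something this patch defines):

config SCHED_CGROUP_NUMA_STAT
	bool "Per-cgroup NUMA locality statistics"
	depends on NUMA_BALANCING
	default n
	help
	  Collect per-cgroup NUMA locality and per-node execution time.
	  Even when built in, keep it disabled at runtime unless it is
	  explicitly enabled on the kernel command line.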
--
Mel Gorman
SUSE Labs
On 2019/11/1 下午5:13, Mel Gorman wrote:
[snip]
>> For example, in our cases we could have hundreds of cgroups, each
>> containing hundreds of tasks, and these worker threads could live and
>> die at any moment; for gathering we need to cat the list of tasks and
>> then read these proc files one by one, which traps into the kernel
>> rapidly and may even need to hold some locks. This introduces a big
>> latency impact and gives no accurate output, since some tasks may
>> already have died before we read their data.
>>
>> Then before the next sample window, the info of tasks that died during
>> the window can't be acquired anymore.
>>
>> We need the kernel's help to preserve the data, since a tool can't catch
>> it in time before it is lost; we also have to avoid rapid proc reading,
>> which really costs a lot and, furthermore, introduces big latency in
>> each sample window.
>>
>
> There is somewhat of a disconnect here. You say that the information must
> be accurate and historical yet are relying on NUMA hinting faults to build
> the picture which may not be accurate at all given that faults are not
> guaranteed to happen. For short-lived tasks, it is also potentially skewed
> information if short-lived tasks dominated remote accesses for whatever
> reason even though it does not matter -- the tasks were short-lived and
> their performance is probably irrelevant. Short-lived tasks may not even
> show up if they do not run longer than sysctl_numa_balancing_scan_delay
> so the data gathered already has holes in it.
>
> While it's a bit more of a stretch, even this could still be done from
> userspace if numa_hint_fault was probed and the event handled (eBPF,
> systemtap etc) to build the picture or add a tracepoint. That would give
> a much higher degree of flexibility on what information is tracked and
> allow flexibility on how the data is consumed.
>
> So, overall I think this can be done outside the kernel but recognise
> that it may not be suitable in all cases. If you feel it must be done
> inside the kernel, split out the patch that adds information on failed
> page migrations as it stands apart. Put it behind its own kconfig entry
> that is disabled by default -- do not tie it directly to NUMA balancing
> because of the data structure changes. When enabled, it should still be
> disabled by default at runtime and only activated via kernel command line
> parameter so that the only people who pay the cost are those that take
> deliberate action to enable it.
Agree, we could have the per-task faults info there, giving the
possibility to implement a practical userland tool, meanwhile having
these kernel numa data disabled by default; folks who have no tool but
want to do easy monitoring can just turn on the switch :-)
We will have these in the next version:
* separate patch for showing per-task faults info
* new CONFIG for numa stat (disabled by default)
* dynamic runtime switch for numa stat (disabled by default)
* doc to explain the numa stat and give hints on how to handle it
Best Regards,
Michael Wang
>
On Fri, Nov 01, 2019 at 07:52:15PM +0800, 王贇 wrote:
> > a much higher degree of flexibility on what information is tracked and
> > allow flexibility on how the data is consumed.
> >
> > So, overall I think this can be done outside the kernel but recognise
> > that it may not be suitable in all cases. If you feel it must be done
> > inside the kernel, split out the patch that adds information on failed
> > page migrations as it stands apart. Put it behind its own kconfig entry
> > that is disabled by default -- do not tie it directly to NUMA balancing
> > because of the data structure changes. When enabled, it should still be
> > disabled by default at runtime and only activated via kernel command line
> > parameter so that the only people who pay the cost are those that take
> > deliberate action to enable it.
>
> Agree, we could have the per-task faults info there, giving the
> possibility to implement a practical userland tool,
I'd prefer not because that would still require the space in the locality
array to store the data. I'd also prefer that numa_faults_locality[]
information is not exposed unless this feature is enabled. That information
is subject to change and interpreting it requires knowledge of the
internals of automatic NUMA balancing.
There are just too many corner cases where the information is garbage.
Tasks with a memory policy would never update the counters, short-lived
tasks may not update it, interleaving will give confused information about
locality, the timing of the reads matter because it might be cleared,
the frequency at which they clear is unknown as the frequency is adaptive
-- the list goes on. I find it very very difficult to believe that a
tool based on faults_locality will be able to give anything but the
most superficial help and any sensible decision will require ftrace or
numa_maps to get real information.
> meanwhile having these kernel
> numa data disabled by default; folks who have no tool but want to do
> easy monitoring can just turn on the switch :-)
>
> We will have these in the next version:
>
> * separate patch for showing per-task faults info
Please only expose the failed= (or migfailed=) in that patch. Do not
expose numa_faults_locality unless it is explicitly enabled on behalf of
a tool that claims it can sensibly interpret it.
> * new CONFIG for numa stat (disabled by default)
> * dynamic runtime switch for numa stat (disabled by default)
Dynamic runtime enabling will mean that if it's turned on, the information
will be temporarily useless until stats are accumulated. Make sure to
note that in any associated documentation stating a preference to
enabling it with a kernel parameter.
--
Mel Gorman
SUSE Labs
Hello Yun.
On Tue, Oct 29, 2019 at 03:57:20PM +0800, 王贇 <[email protected]> wrote:
> +static void update_numa_statistics(struct cfs_rq *cfs_rq)
> +{
> + int idx;
> + unsigned long remote = current->numa_faults_locality[3];
> + unsigned long local = current->numa_faults_locality[4];
> +
> + cfs_rq->nstat.jiffies++;
This statistic effectively duplicates what
kernel/sched/cpuacct.c:cpuacct_charge() does (measuring per-CPU time).
Hence it seems redundant.
> +
> + if (!remote && !local)
> + return;
> +
> + idx = (NR_NL_INTERVAL - 1) * local / (remote + local);
> + cfs_rq->nstat.locality[idx]++;
IIUC the mechanism behind the numa_faults_locality values, this
statistic only estimates the access locality based on NUMA balancing
samples, i.e. there exists a more precise source of that information.
All in all, I'd concur with Mel's suggestion of external measurement.
Michal
On 2019/11/1 下午9:35, Mel Gorman wrote:
> On Fri, Nov 01, 2019 at 07:52:15PM +0800, 王贇 wrote:
>>> a much higher degree of flexibility on what information is tracked and
>>> allow flexibility on how the data is consumed.
>>>
>>> So, overall I think this can be done outside the kernel but recognise
>>> that it may not be suitable in all cases. If you feel it must be done
>>> inside the kernel, split out the patch that adds information on failed
>>> page migrations as it stands apart. Put it behind its own kconfig entry
>>> that is disabled by default -- do not tie it directly to NUMA balancing
>>> because of the data structure changes. When enabled, it should still be
>>> disabled by default at runtime and only activated via kernel command line
>>> parameter so that the only people who pay the cost are those that take
>>> deliberate action to enable it.
>>
>> Agree, we could have the per-task faults info there, giving the
>> possibility to implement a practical userland tool,
>
> I'd prefer not because that would still require the space in the locality
> array to store the data. I'd also prefer that numa_faults_locality[]
> information is not exposed unless this feature is enabled. That information
> is subject to change and interpreting it requires knowledge of the
> internals of automatic NUMA balancing.
>
> There are just too many corner cases where the information is garbage.
> Tasks with a memory policy would never update the counters, short-lived
> tasks may not update it, interleaving will give confused information about
> locality, the timing of the reads matter because it might be cleared,
> the frequency at which they clear is unknown as the frequency is adaptive
> -- the list goes on. I find it very very difficult to believe that a
> tool based on faults_locality will be able to give anything but the
> most superficial help and any sensible decision will require ftrace or
> numa_maps to get real information.
Yeah, there is no very good approach to estimating numa efficiency,
since there is no per-task counter to tell local from remote accesses
at the hardware level; numa balancing won't cover all the cases,
otherwise we would have the overhead issue again.
Well, at least we will be able to know whether this feature itself is
working well or not by reading locality; especially for cross-node
workloads, locality could at least tell us whether numa balancing is
helping or just wasting resources :-P
>
>> meanwhile having these kernel
>> numa data disabled by default; folks who have no tool but want to do
>> easy monitoring can just turn on the switch :-)
>>
>> Will have these in next version:
>>
>> * separate patch for showing per-task faults info
>
> Please only expose the failed= (or migfailed=) in that patch. Do not
> expose numa_faults_locality unless it is explicitly enabled on behalf of
> a tool that claims it can sensibly interpret it.
Sounds fair enough, no such tool yet.
>
>> * new CONFIG for numa stat (disabled by default)
>> * dynamic runtime switch for numa stat (disabled by default)
>
> Dynamic runtime enabling will mean that if it's turned on, the information
> will be temporarily useless until stats are accumulated. Make sure to
> note that in any associated documentation stating a preference to
> enabling it with a kernel parameter.
That's right, we will highlight that in the doc.
Regards,
Michael Wang
>
Hi, Michal
On 2019/11/2 上午1:39, Michal Koutný wrote:
> Hello Yun.
>
> On Tue, Oct 29, 2019 at 03:57:20PM +0800, 王贇 <[email protected]> wrote:
>> +static void update_numa_statistics(struct cfs_rq *cfs_rq)
>> +{
>> + int idx;
>> + unsigned long remote = current->numa_faults_locality[3];
>> + unsigned long local = current->numa_faults_locality[4];
>> +
>> + cfs_rq->nstat.jiffies++;
> This statistic effectively duplicates what
> kernel/sched/cpuacct.c:cpuacct_charge() does (measuring per-CPU time).
> Hence it seems redundant.
Yes, but since there is no guarantee that the cpu cgroup is always
bound together with cpuacct in v1, we can't rely on that...
>
>> +
>> + if (!remote && !local)
>> + return;
>> +
>> + idx = (NR_NL_INTERVAL - 1) * local / (remote + local);
>> + cfs_rq->nstat.locality[idx]++;
> IIUC the mechanism behind the numa_faults_locality values, this
> statistic only estimates the access locality based on NUMA balancing
> samples, i.e. there exists a more precise source of that information.
>
> All in all, I'd concur with Mel's suggestion of external measurement.
Currently I can only find numa balancing telling the real story; at
least we know that after the PF, the task does access the page on that
CPU. Although it can't cover all the cases, it still gives good hints :-)
It would be great if we could find more similar indicators, like the
migration failure counter Mel mentioned, which gives good hints on
memory policy problems and could be used as external measurement.
Regards,
Michael Wang
>
> Michal
>