Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg with load_avg in NUMA load balancing and track
other statistics, such as the utilization and the number of running
tasks, to get a better view of the current state of a node.
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++------------
1 file changed, 75 insertions(+), 27 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27450c4ddc81..12266b248f52 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
- return cfs_rq_runnable_load_avg(&rq->cfs);
-}
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+ /* The node has spare capacity that can be used to run more tasks. */
+ node_has_spare = 0,
+ /*
+ * The node is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ node_fully_busy,
+ /*
+ * The node is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ node_overloaded
+};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
-
+ unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ unsigned int nr_running;
+ unsigned int weight;
+ enum numa_type node_type;
};
-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
-{
- int cpu;
-
- memset(ns, 0, sizeof(*ns));
- for_each_cpu(cpu, cpumask_of_node(nid)) {
- struct rq *rq = cpu_rq(cpu);
-
- ns->load += cpu_runnable_load(rq);
- ns->compute_capacity += capacity_of(cpu);
- }
-
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -1521,6 +1518,47 @@ struct task_numa_env {
int best_cpu;
};
+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_util(int cpu);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+ struct numa_stats *ns)
+{
+ if ((ns->nr_running > ns->weight) &&
+ ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ return node_overloaded;
+
+ if ((ns->nr_running < ns->weight) ||
+ ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ return node_has_spare;
+
+ return node_fully_busy;
+}
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct task_numa_env *env,
+ struct numa_stats *ns, int nid)
+{
+ int cpu;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->load += cpu_load(rq);
+ ns->util += cpu_util(cpu);
+ ns->nr_running += rq->cfs.h_nr_running;
+ ns->compute_capacity += capacity_of(cpu);
+ }
+
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
+}
+
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
@@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
+
+ /* If dst node has spare capacity, there is no real load imbalance */
+ if (env->dst_stats.node_type == node_has_spare)
+ return false;
+
/*
* The load is corrected for the CPU capacity available on each node.
*
@@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env, &env.src_stats, env.src_nid);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -5446,6 +5489,11 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
return load;
}
+static unsigned long cpu_runnable_load(struct rq *rq)
+{
+ return cfs_rq_runnable_load_avg(&rq->cfs);
+}
+
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
--
2.17.1
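For illustration, here is a standalone userspace sketch (not part of the
patch) that replays the numa_classify() logic above with made-up numbers.
The imbalance_pct value and the per-node figures are arbitrary examples.

#include <stdio.h>

enum numa_type { node_has_spare, node_fully_busy, node_overloaded };

struct numa_stats {
        unsigned long util;
        unsigned long compute_capacity;
        unsigned int nr_running;
        unsigned int weight;
};

static enum numa_type numa_classify(unsigned int imbalance_pct,
                                    struct numa_stats *ns)
{
        if ((ns->nr_running > ns->weight) &&
            ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
                return node_overloaded;

        if ((ns->nr_running < ns->weight) ||
            ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
                return node_has_spare;

        return node_fully_busy;
}

int main(void)
{
        static const char * const name[] = {
                "has_spare", "fully_busy", "overloaded"
        };
        struct numa_stats ex[] = {
                /* 4 CPUs, 2 runnable tasks, low util -> spare capacity */
                { .util = 1024, .compute_capacity = 4096, .nr_running = 2, .weight = 4 },
                /* 4 CPUs, 4 tasks, util close to capacity -> fully busy */
                { .util = 3700, .compute_capacity = 4096, .nr_running = 4, .weight = 4 },
                /* 4 CPUs, 6 tasks, util above the margin -> overloaded */
                { .util = 3900, .compute_capacity = 4096, .nr_running = 6, .weight = 4 },
        };
        int i;

        for (i = 0; i < 3; i++)
                printf("node %d -> %s\n", i, name[numa_classify(117, &ex[i])]);

        return 0;
}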
On 14/02/2020 16:27, Vincent Guittot wrote:
[...]
> /*
> * The load is corrected for the CPU capacity available on each node.
> *
> @@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
> dist = env.dist = node_distance(env.src_nid, env.dst_nid);
> taskweight = task_weight(p, env.src_nid, dist);
> groupweight = group_weight(p, env.src_nid, dist);
> - update_numa_stats(&env.src_stats, env.src_nid);
> + update_numa_stats(&env, &env.src_stats, env.src_nid);
This looks strange. Can you do:
-static void update_numa_stats(struct task_numa_env *env,
+static void update_numa_stats(unsigned int imbalance_pct,
struct numa_stats *ns, int nid)
- update_numa_stats(&env, &env.src_stats, env.src_nid);
+ update_numa_stats(env.imbalance_pct, &env.src_stats, env.src_nid);
[...]
> +static unsigned long cpu_runnable_load(struct rq *rq)
> +{
> + return cfs_rq_runnable_load_avg(&rq->cfs);
> +}
> +
Why not remove cpu_runnable_load() in this patch rather than moving it?
kernel/sched/fair.c:5492:22: warning: ‘cpu_runnable_load’ defined but
not used [-Wunused-function]
static unsigned long cpu_runnable_load(struct rq *rq)
On Tue, Feb 18, 2020 at 01:37:45PM +0100, Dietmar Eggemann wrote:
> On 14/02/2020 16:27, Vincent Guittot wrote:
>
> [...]
>
> > /*
> > * The load is corrected for the CPU capacity available on each node.
> > *
> > @@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
> > dist = env.dist = node_distance(env.src_nid, env.dst_nid);
> > taskweight = task_weight(p, env.src_nid, dist);
> > groupweight = group_weight(p, env.src_nid, dist);
> > - update_numa_stats(&env.src_stats, env.src_nid);
> > + update_numa_stats(&env, &env.src_stats, env.src_nid);
>
> This looks strange. Can you do:
>
> -static void update_numa_stats(struct task_numa_env *env,
> +static void update_numa_stats(unsigned int imbalance_pct,
> struct numa_stats *ns, int nid)
>
> - update_numa_stats(&env, &env.src_stats, env.src_nid);
> + update_numa_stats(env.imbalance_pct, &env.src_stats, env.src_nid);
>
You'd also have to pass in env->p and while it could be done, I do not
think it's worthwhile.
> [...]
>
> > +static unsigned long cpu_runnable_load(struct rq *rq)
> > +{
> > + return cfs_rq_runnable_load_avg(&rq->cfs);
> > +}
> > +
>
> Why not remove cpu_runnable_load() in this patch rather moving it?
>
> kernel/sched/fair.c:5492:22: warning: ‘cpu_runnable_load’ defined but
> not used [-Wunused-function]
> static unsigned long cpu_runnable_load(struct rq *rq)
>
I took the liberty of addressing that when I picked up Vincent's patches
for "Reconcile NUMA balancing decisions with the load balancer v3" to fix
a build warning. I did not highlight it when I posted because it was such
a trivial change.
--
Mel Gorman
SUSE Labs
On Tue, 18 Feb 2020 at 14:51, Mel Gorman <[email protected]> wrote:
>
> On Tue, Feb 18, 2020 at 01:37:45PM +0100, Dietmar Eggemann wrote:
> > On 14/02/2020 16:27, Vincent Guittot wrote:
> >
> > [...]
> >
> > > /*
> > > * The load is corrected for the CPU capacity available on each node.
> > > *
> > > @@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
> > > dist = env.dist = node_distance(env.src_nid, env.dst_nid);
> > > taskweight = task_weight(p, env.src_nid, dist);
> > > groupweight = group_weight(p, env.src_nid, dist);
> > > - update_numa_stats(&env.src_stats, env.src_nid);
> > > + update_numa_stats(&env, &env.src_stats, env.src_nid);
> >
> > This looks strange. Can you do:
> >
> > -static void update_numa_stats(struct task_numa_env *env,
> > +static void update_numa_stats(unsigned int imbalance_pct,
> > struct numa_stats *ns, int nid)
> >
> > - update_numa_stats(&env, &env.src_stats, env.src_nid);
> > + update_numa_stats(env.imbalance_pct, &env.src_stats, env.src_nid);
> >
>
> You'd also have to pass in env->p and while it could be done, I do not
> think its worthwhile.
I agree
>
> > [...]
> >
> > > +static unsigned long cpu_runnable_load(struct rq *rq)
> > > +{
> > > + return cfs_rq_runnable_load_avg(&rq->cfs);
> > > +}
> > > +
> >
> > Why not remove cpu_runnable_load() in this patch rather moving it?
> >
> > kernel/sched/fair.c:5492:22: warning: ‘cpu_runnable_load’ defined but
> > not used [-Wunused-function]
> > static unsigned long cpu_runnable_load(struct rq *rq)
> >
>
> I took the liberty of addressing that when I picked up Vincent's patches
> for "Reconcile NUMA balancing decisions with the load balancer v3" to fix
> a build warning. I did not highlight it when I posted because it was such
> a trivial change.
Yes, I have noticed that.
Thanks
>
> --
> Mel Gorman
> SUSE Labs
On 18/02/2020 15:17, Vincent Guittot wrote:
> On Tue, 18 Feb 2020 at 14:51, Mel Gorman <[email protected]> wrote:
>>
>> On Tue, Feb 18, 2020 at 01:37:45PM +0100, Dietmar Eggemann wrote:
>>> On 14/02/2020 16:27, Vincent Guittot wrote:
[...]
>>> -static void update_numa_stats(struct task_numa_env *env,
>>> +static void update_numa_stats(unsigned int imbalance_pct,
>>> struct numa_stats *ns, int nid)
>>>
>>> - update_numa_stats(&env, &env.src_stats, env.src_nid);
>>> + update_numa_stats(env.imbalance_pct, &env.src_stats, env.src_nid);
>>>
>>
>> You'd also have to pass in env->p and while it could be done, I do not
>> think its worthwhile.
>
> I agree
Ah, another patch in Mel's patch-set:
https://lore.kernel.org/r/[email protected]
I see.
[...]
On 2/14/20 3:27 PM, Vincent Guittot wrote:
> @@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
> group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
> }
>
> -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
> -
> -static unsigned long cpu_runnable_load(struct rq *rq)
> -{
> - return cfs_rq_runnable_load_avg(&rq->cfs);
> -}
> +/*
> + * 'numa_type' describes the node at the moment of load balancing.
> + */
> +enum numa_type {
> + /* The node has spare capacity that can be used to run more tasks. */
> + node_has_spare = 0,
> + /*
> + * The node is fully used and the tasks don't compete for more CPU
> + * cycles. Nevertheless, some tasks might wait before running.
> + */
> + node_fully_busy,
> + /*
> + * The node is overloaded and can't provide expected CPU cycles to all
> + * tasks.
> + */
> + node_overloaded
> +};
Could we reuse group_type instead? The definitions are the same modulo
s/group/node/.
>
> /* Cached statistics for all CPUs within a node */
> struct numa_stats {
> unsigned long load;
> -
> + unsigned long util;
> /* Total compute capacity of CPUs on a node */
> unsigned long compute_capacity;
> + unsigned int nr_running;
> + unsigned int weight;
> + enum numa_type node_type;
> };
>
> -/*
> - * XXX borrowed from update_sg_lb_stats
> - */
> -static void update_numa_stats(struct numa_stats *ns, int nid)
> -{
> - int cpu;
> -
> - memset(ns, 0, sizeof(*ns));
> - for_each_cpu(cpu, cpumask_of_node(nid)) {
> - struct rq *rq = cpu_rq(cpu);
> -
> - ns->load += cpu_runnable_load(rq);
> - ns->compute_capacity += capacity_of(cpu);
> - }
> -
> -}
> -
> struct task_numa_env {
> struct task_struct *p;
>
> @@ -1521,6 +1518,47 @@ struct task_numa_env {
> int best_cpu;
> };
>
> +static unsigned long cpu_load(struct rq *rq);
> +static unsigned long cpu_util(int cpu);
> +
> +static inline enum
> +numa_type numa_classify(unsigned int imbalance_pct,
> + struct numa_stats *ns)
> +{
> + if ((ns->nr_running > ns->weight) &&
> + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
> + return node_overloaded;
> +
> + if ((ns->nr_running < ns->weight) ||
> + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
> + return node_has_spare;
> +
> + return node_fully_busy;
> +}
> +
As Mel pointed out, this is group_is_overloaded() and group_has_capacity().
@Mel, you mentioned having a common helper, do you have that laying around?
I haven't seen it in your reconciliation series.
What I'm naively thinking here is that we could either move the whole
thing to just sg_lb_stats (AFAICT the fields of numa_stats are a subset of it),
or if we really care about the stack we could tweak the ordering to ensure
we can cast one into the other (not too enticed by that one though).
> +/*
> + * XXX borrowed from update_sg_lb_stats
> + */
> +static void update_numa_stats(struct task_numa_env *env,
> + struct numa_stats *ns, int nid)
> +{
> + int cpu;
> +
> + memset(ns, 0, sizeof(*ns));
> + for_each_cpu(cpu, cpumask_of_node(nid)) {
> + struct rq *rq = cpu_rq(cpu);
> +
> + ns->load += cpu_load(rq);
> + ns->util += cpu_util(cpu);
> + ns->nr_running += rq->cfs.h_nr_running;
> + ns->compute_capacity += capacity_of(cpu);
> + }
> +
> + ns->weight = cpumask_weight(cpumask_of_node(nid));
> +
> + ns->node_type = numa_classify(env->imbalance_pct, ns);
> +}
> +
> static void task_numa_assign(struct task_numa_env *env,
> struct task_struct *p, long imp)
> {
On Tue, 18 Feb 2020 at 15:54, Valentin Schneider
<[email protected]> wrote:
>
> On 2/14/20 3:27 PM, Vincent Guittot wrote:
> > @@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
> > group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
> > }
> >
> > -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
> > -
> > -static unsigned long cpu_runnable_load(struct rq *rq)
> > -{
> > - return cfs_rq_runnable_load_avg(&rq->cfs);
> > -}
> > +/*
> > + * 'numa_type' describes the node at the moment of load balancing.
> > + */
> > +enum numa_type {
> > + /* The node has spare capacity that can be used to run more tasks. */
> > + node_has_spare = 0,
> > + /*
> > + * The node is fully used and the tasks don't compete for more CPU
> > + * cycles. Nevertheless, some tasks might wait before running.
> > + */
> > + node_fully_busy,
> > + /*
> > + * The node is overloaded and can't provide expected CPU cycles to all
> > + * tasks.
> > + */
> > + node_overloaded
> > +};
>
> Could we reuse group_type instead? The definitions are the same modulo
> s/group/node/.
Also, imbalance might have a meaning at NUMA level, but misfit and asym
have none.
For now I prefer to keep them separate to ease code readability. Also,
more changes will come on top of this for NUMA balancing, which could
also end up with NUMA-dedicated states.
>
> >
> > /* Cached statistics for all CPUs within a node */
> > struct numa_stats {
> > unsigned long load;
> > -
> > + unsigned long util;
> > /* Total compute capacity of CPUs on a node */
> > unsigned long compute_capacity;
> > + unsigned int nr_running;
> > + unsigned int weight;
> > + enum numa_type node_type;
> > };
> >
> > -/*
> > - * XXX borrowed from update_sg_lb_stats
> > - */
> > -static void update_numa_stats(struct numa_stats *ns, int nid)
> > -{
> > - int cpu;
> > -
> > - memset(ns, 0, sizeof(*ns));
> > - for_each_cpu(cpu, cpumask_of_node(nid)) {
> > - struct rq *rq = cpu_rq(cpu);
> > -
> > - ns->load += cpu_runnable_load(rq);
> > - ns->compute_capacity += capacity_of(cpu);
> > - }
> > -
> > -}
> > -
> > struct task_numa_env {
> > struct task_struct *p;
> >
> > @@ -1521,6 +1518,47 @@ struct task_numa_env {
> > int best_cpu;
> > };
> >
> > +static unsigned long cpu_load(struct rq *rq);
> > +static unsigned long cpu_util(int cpu);
> > +
> > +static inline enum
> > +numa_type numa_classify(unsigned int imbalance_pct,
> > + struct numa_stats *ns)
> > +{
> > + if ((ns->nr_running > ns->weight) &&
> > + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
> > + return node_overloaded;
> > +
> > + if ((ns->nr_running < ns->weight) ||
> > + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
> > + return node_has_spare;
> > +
> > + return node_fully_busy;
> > +}
> > +
>
> As Mel pointed out, this is group_is_overloaded() and group_has_capacity().
> @Mel, you mentioned having a common helper, do you have that laying around?
> I haven't seen it in your reconciliation series.
>
> What I'm naively thinking here is that we could have either move the whole
> thing to just sg_lb_stats (AFAICT the fields of numa_stats are a subset of it),
> or if we really care about the stack we could tweak the ordering to ensure
> we can cast one into the other (not too enticed by that one though).
>
> > +/*
> > + * XXX borrowed from update_sg_lb_stats
> > + */
> > +static void update_numa_stats(struct task_numa_env *env,
> > + struct numa_stats *ns, int nid)
> > +{
> > + int cpu;
> > +
> > + memset(ns, 0, sizeof(*ns));
> > + for_each_cpu(cpu, cpumask_of_node(nid)) {
> > + struct rq *rq = cpu_rq(cpu);
> > +
> > + ns->load += cpu_load(rq);
> > + ns->util += cpu_util(cpu);
> > + ns->nr_running += rq->cfs.h_nr_running;
> > + ns->compute_capacity += capacity_of(cpu);
> > + }
> > +
> > + ns->weight = cpumask_weight(cpumask_of_node(nid));
> > +
> > + ns->node_type = numa_classify(env->imbalance_pct, ns);
> > +}
> > +
> > static void task_numa_assign(struct task_numa_env *env,
> > struct task_struct *p, long imp)
> > {
On Tue, Feb 18, 2020 at 02:54:14PM +0000, Valentin Schneider wrote:
> On 2/14/20 3:27 PM, Vincent Guittot wrote:
> > @@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
> > group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
> > }
> >
> > -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
> > -
> > -static unsigned long cpu_runnable_load(struct rq *rq)
> > -{
> > - return cfs_rq_runnable_load_avg(&rq->cfs);
> > -}
> > +/*
> > + * 'numa_type' describes the node at the moment of load balancing.
> > + */
> > +enum numa_type {
> > + /* The node has spare capacity that can be used to run more tasks. */
> > + node_has_spare = 0,
> > + /*
> > + * The node is fully used and the tasks don't compete for more CPU
> > + * cycles. Nevertheless, some tasks might wait before running.
> > + */
> > + node_fully_busy,
> > + /*
> > + * The node is overloaded and can't provide expected CPU cycles to all
> > + * tasks.
> > + */
> > + node_overloaded
> > +};
>
> Could we reuse group_type instead? The definitions are the same modulo
> s/group/node/.
>
I kept the naming because there is the remote possibility that NUMA
balancing will deviate in some fashion. Right now, it's harmless.
> >
> > /* Cached statistics for all CPUs within a node */
> > struct numa_stats {
> > unsigned long load;
> > -
> > + unsigned long util;
> > /* Total compute capacity of CPUs on a node */
> > unsigned long compute_capacity;
> > + unsigned int nr_running;
> > + unsigned int weight;
> > + enum numa_type node_type;
> > };
> >
> > -/*
> > - * XXX borrowed from update_sg_lb_stats
> > - */
> > -static void update_numa_stats(struct numa_stats *ns, int nid)
> > -{
> > - int cpu;
> > -
> > - memset(ns, 0, sizeof(*ns));
> > - for_each_cpu(cpu, cpumask_of_node(nid)) {
> > - struct rq *rq = cpu_rq(cpu);
> > -
> > - ns->load += cpu_runnable_load(rq);
> > - ns->compute_capacity += capacity_of(cpu);
> > - }
> > -
> > -}
> > -
> > struct task_numa_env {
> > struct task_struct *p;
> >
> > @@ -1521,6 +1518,47 @@ struct task_numa_env {
> > int best_cpu;
> > };
> >
> > +static unsigned long cpu_load(struct rq *rq);
> > +static unsigned long cpu_util(int cpu);
> > +
> > +static inline enum
> > +numa_type numa_classify(unsigned int imbalance_pct,
> > + struct numa_stats *ns)
> > +{
> > + if ((ns->nr_running > ns->weight) &&
> > + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
> > + return node_overloaded;
> > +
> > + if ((ns->nr_running < ns->weight) ||
> > + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
> > + return node_has_spare;
> > +
> > + return node_fully_busy;
> > +}
> > +
>
> As Mel pointed out, this is group_is_overloaded() and group_has_capacity().
> @Mel, you mentioned having a common helper, do you have that laying around?
> I haven't seen it in your reconciliation series.
>
I didn't merge that part of the first version of my series. I was
waiting to see what the implementation for allowing a small degree of
imbalance looks like. If it's entirely confined in adjust_numa_balance
then I'll create the common helper at the same time. For now, I left the
possibility open that numa_classify would use something different than
group_is_overloaded or group_has_capacity even if I find that hard to
imagine at the moment.
> What I'm naively thinking here is that we could have either move the whole
> thing to just sg_lb_stats (AFAICT the fields of numa_stats are a subset of it),
> or if we really care about the stack we could tweak the ordering to ensure
> we can cast one into the other (not too enticed by that one though).
>
Yikes, no I'd rather not do that. Basically all I did before was create
a common helper like __lb_has_capacity that only took basic types as
parameters. group_has_capacity and numa_has_capacity were simple wrappers
that read the correct fields from their respective stats structures.
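
As a rough illustration of the shape described above (a sketch only: the
__lb_has_capacity name comes from the paragraph above, while the
signatures and the sg_lb_stats/numa_stats field names are assumptions,
not the code that was actually posted):

static inline bool __lb_has_capacity(unsigned int imbalance_pct,
                                     unsigned int nr_running,
                                     unsigned int weight,
                                     unsigned long capacity,
                                     unsigned long util)
{
        /* Fewer runnable tasks than CPUs: clearly spare room. */
        if (nr_running < weight)
                return true;

        /* Otherwise compare capacity against utilization with a margin. */
        return (capacity * 100) > (util * imbalance_pct);
}

/* Wrapper reading the load balancer's sg_lb_stats fields. */
static inline bool group_has_capacity(unsigned int imbalance_pct,
                                      struct sg_lb_stats *sgs)
{
        return __lb_has_capacity(imbalance_pct, sgs->sum_nr_running,
                                 sgs->group_weight, sgs->group_capacity,
                                 sgs->group_util);
}

/* Wrapper reading the NUMA balancer's numa_stats fields. */
static inline bool numa_has_capacity(unsigned int imbalance_pct,
                                     struct numa_stats *ns)
{
        return __lb_has_capacity(imbalance_pct, ns->nr_running,
                                 ns->weight, ns->compute_capacity, ns->util);
}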
--
Mel Gorman
SUSE Labs
On Tue, 18 Feb 2020 at 16:38, Mel Gorman <[email protected]> wrote:
>
> On Tue, Feb 18, 2020 at 02:54:14PM +0000, Valentin Schneider wrote:
> > On 2/14/20 3:27 PM, Vincent Guittot wrote:
> > > @@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
> > > group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
> > > }
> > >
> > > -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
> > > -
> > > -static unsigned long cpu_runnable_load(struct rq *rq)
> > > -{
> > > - return cfs_rq_runnable_load_avg(&rq->cfs);
> > > -}
> > > +/*
> > > + * 'numa_type' describes the node at the moment of load balancing.
> > > + */
> > > +enum numa_type {
> > > + /* The node has spare capacity that can be used to run more tasks. */
> > > + node_has_spare = 0,
> > > + /*
> > > + * The node is fully used and the tasks don't compete for more CPU
> > > + * cycles. Nevertheless, some tasks might wait before running.
> > > + */
> > > + node_fully_busy,
> > > + /*
> > > + * The node is overloaded and can't provide expected CPU cycles to all
> > > + * tasks.
> > > + */
> > > + node_overloaded
> > > +};
> >
> > Could we reuse group_type instead? The definitions are the same modulo
> > s/group/node/.
> >
>
> I kept the naming because there is the remote possibility that NUMA
> balancing will deviate in some fashion. Right now, it's harmless.
+1
This 1st round mainly aims to align the NUMA way of working with the
load balancer, but we can imagine using some NUMA- or memory-specific
information to create new states.
>
> > >
> > > /* Cached statistics for all CPUs within a node */
> > > struct numa_stats {
> > > unsigned long load;
> > > -
> > > + unsigned long util;
> > > /* Total compute capacity of CPUs on a node */
> > > unsigned long compute_capacity;
> > > + unsigned int nr_running;
> > > + unsigned int weight;
> > > + enum numa_type node_type;
> > > };
> > >
> > > -/*
> > > - * XXX borrowed from update_sg_lb_stats
> > > - */
> > > -static void update_numa_stats(struct numa_stats *ns, int nid)
> > > -{
> > > - int cpu;
> > > -
> > > - memset(ns, 0, sizeof(*ns));
> > > - for_each_cpu(cpu, cpumask_of_node(nid)) {
> > > - struct rq *rq = cpu_rq(cpu);
> > > -
> > > - ns->load += cpu_runnable_load(rq);
> > > - ns->compute_capacity += capacity_of(cpu);
> > > - }
> > > -
> > > -}
> > > -
> > > struct task_numa_env {
> > > struct task_struct *p;
> > >
> > > @@ -1521,6 +1518,47 @@ struct task_numa_env {
> > > int best_cpu;
> > > };
> > >
> > > +static unsigned long cpu_load(struct rq *rq);
> > > +static unsigned long cpu_util(int cpu);
> > > +
> > > +static inline enum
> > > +numa_type numa_classify(unsigned int imbalance_pct,
> > > + struct numa_stats *ns)
> > > +{
> > > + if ((ns->nr_running > ns->weight) &&
> > > + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
> > > + return node_overloaded;
> > > +
> > > + if ((ns->nr_running < ns->weight) ||
> > > + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
> > > + return node_has_spare;
> > > +
> > > + return node_fully_busy;
> > > +}
> > > +
> >
> > As Mel pointed out, this is group_is_overloaded() and group_has_capacity().
> > @Mel, you mentioned having a common helper, do you have that laying around?
> > I haven't seen it in your reconciliation series.
> >
>
> I didn't merge that part of the first version of my series. I was
> waiting to see how the implementation for allowing a small degree of
> imbalance looks like. If it's entirely confined in adjust_numa_balance
> then I'll create the common helper at the same time. For now, I left the
> possibility open that numa_classify would use something different than
> group_is_overloaded or group_has_capacity even if I find that hard to
> imagine at the moment.
>
> > What I'm naively thinking here is that we could have either move the whole
> > thing to just sg_lb_stats (AFAICT the fields of numa_stats are a subset of it),
> > or if we really care about the stack we could tweak the ordering to ensure
> > we can cast one into the other (not too enticed by that one though).
> >
>
> Yikes, no I'd rather not do that. Basically all I did before was create
> a common helper like __lb_has_capacity that only took basic types as
> parameters. group_has_capacity and numa_has_capacity were simple wrappers
> that read the correct fields from their respective stats structures.
>
> --
> Mel Gorman
> SUSE Labs
On 18/02/2020 15:38, Mel Gorman wrote:
>>
>> Could we reuse group_type instead? The definitions are the same modulo
>> s/group/node/.
>>
>
> I kept the naming because there is the remote possibility that NUMA
> balancing will deviate in some fashion. Right now, it's harmless.
>
Since it's just a subset ATM, I'd go for the reuse and change that later
if it is shown that a split is required, but fair enough.
> I didn't merge that part of the first version of my series. I was
> waiting to see how the implementation for allowing a small degree of
> imbalance looks like. If it's entirely confined in adjust_numa_balance
^^^^^^^^^^^^^^^^^^^
Apologies if that's a newbie question, but I'm not familiar with that one.
Would that be added in your reconciliation series? I've only had a brief
look at it so far (it's next on the chopping block).
> then I'll create the common helper at the same time. For now, I left the
> possibility open that numa_classify would use something different than
> group_is_overloaded or group_has_capacity even if I find that hard to
> imagine at the moment.
>
>> What I'm naively thinking here is that we could have either move the whole
>> thing to just sg_lb_stats (AFAICT the fields of numa_stats are a subset of it),
>> or if we really care about the stack we could tweak the ordering to ensure
>> we can cast one into the other (not too enticed by that one though).
>>
>
> Yikes, no I'd rather not do that. Basically all I did before was create
> a common helper like __lb_has_capacity that only took basic types as
> parameters. group_has_capacity and numa_has_capacity were simple wrappers
> that read the correct fields from their respective stats structures.
>
That's more sensible indeed. It'd definitely be nice to actually reconcile
the two balancers with these common helpers, though I guess it doesn't
*have* to happen with this very patch.
On Tue, Feb 18, 2020 at 04:50:48PM +0000, Valentin Schneider wrote:
> On 18/02/2020 15:38, Mel Gorman wrote:
> >>
> >> Could we reuse group_type instead? The definitions are the same modulo
> >> s/group/node/.
> >>
> >
> > I kept the naming because there is the remote possibility that NUMA
> > balancing will deviate in some fashion. Right now, it's harmless.
> >
>
> Since it's just a subset ATM I'd go for the reuse and change that later if
> shown a split is required, but fair enough.
>
I would feel that I was churning code for the sake of it.
> > I didn't merge that part of the first version of my series. I was
> > waiting to see how the implementation for allowing a small degree of
> > imbalance looks like. If it's entirely confined in adjust_numa_balance
> ^^^^^^^^^^^^^^^^^^^
> Apologies if that's a newbie question, but I'm not familiar with that one.
> Would that be added in your reconciliation series? I've only had a brief
> look at it yet (it's next on the chopping block).
>
I should have written adjust_numa_imbalance, but yes, it's part of the
reconciled series so that NUMA balancing and the load balancer use the
same helper.
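
For context, a minimal sketch of the idea behind such a helper: tolerate
a small, fixed imbalance while the source domain is nearly idle, and
otherwise leave the computed imbalance alone. The threshold and the exact
signature here are illustrative assumptions; the reconciled series
defines the real adjust_numa_imbalance().

static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
        /*
         * Illustrative only: allow e.g. a pair of communicating tasks to
         * stay local when very few tasks are running in the domain.
         */
        if (nr_running <= 2)
                return 0;

        return imbalance;
}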
> > Yikes, no I'd rather not do that. Basically all I did before was create
> > a common helper like __lb_has_capacity that only took basic types as
> > parameters. group_has_capacity and numa_has_capacity were simple wrappers
> > that read the correct fields from their respective stats structures.
> >
>
> That's more sensible indeed. It'd definitely be nice to actually reconcile
> the two balancers with these common helpers, though I guess it doesn't
> *have* to happen with this very patch.
Yep, it can happen at a later time.
As it stands, I think the reconciled series stands on its own even
though there are further improvements that could be built on top.
--
Mel Gorman
SUSE Labs
On 18/02/2020 17:41, Mel Gorman wrote:
>>> I didn't merge that part of the first version of my series. I was
>>> waiting to see how the implementation for allowing a small degree of
>>> imbalance looks like. If it's entirely confined in adjust_numa_balance
>> ^^^^^^^^^^^^^^^^^^^
>> Apologies if that's a newbie question, but I'm not familiar with that one.
>> Would that be added in your reconciliation series? I've only had a brief
>> look at it yet (it's next on the chopping block).
>>
>
> I should have wrote adjust_numa_imbalance but yes, it's part of the
> reconciled series so that NUMA balancing and the load balancer use the
> same helper.
>
Okay, thanks, then I guess I'll forget about any reconciliation for now
and whinge about it when I get to your series ;)