2020-02-21 13:28:55

by Vincent Guittot

Subject: [PATCH v4 2/5] sched/numa: Replace runnable_load_avg by load_avg

Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg by load_avg in NUMA load balancing and track
other statistics, such as the utilization and the number of running
tasks, to get a better view of the current state of a node.

Signed-off-by: Vincent Guittot <[email protected]>
Reviewed-by: Dietmar Eggemann <[email protected]>
---
kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++--------------
1 file changed, 70 insertions(+), 32 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27450c4ddc81..637f4eb47889 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}

-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
- return cfs_rq_runnable_load_avg(&rq->cfs);
-}
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+ /* The node has spare capacity that can be used to run more tasks. */
+ node_has_spare = 0,
+ /*
+ * The node is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ node_fully_busy,
+ /*
+ * The node is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ node_overloaded
+};

/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
-
+ unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ unsigned int nr_running;
+ unsigned int weight;
+ enum numa_type node_type;
};

-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
-{
- int cpu;
-
- memset(ns, 0, sizeof(*ns));
- for_each_cpu(cpu, cpumask_of_node(nid)) {
- struct rq *rq = cpu_rq(cpu);
-
- ns->load += cpu_runnable_load(rq);
- ns->compute_capacity += capacity_of(cpu);
- }
-
-}
-
struct task_numa_env {
struct task_struct *p;

@@ -1521,6 +1518,47 @@ struct task_numa_env {
int best_cpu;
};

+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_util(int cpu);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+ struct numa_stats *ns)
+{
+ if ((ns->nr_running > ns->weight) &&
+ ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ return node_overloaded;
+
+ if ((ns->nr_running < ns->weight) ||
+ ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ return node_has_spare;
+
+ return node_fully_busy;
+}
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct task_numa_env *env,
+ struct numa_stats *ns, int nid)
+{
+ int cpu;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->load += cpu_load(rq);
+ ns->util += cpu_util(cpu);
+ ns->nr_running += rq->cfs.h_nr_running;
+ ns->compute_capacity += capacity_of(cpu);
+ }
+
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
+}
+
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
@@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;

+
+ /* If dst node has spare capacity, there is no real load imbalance */
+ if (env->dst_stats.node_type == node_has_spare)
+ return false;
+
/*
* The load is corrected for the CPU capacity available on each node.
*
@@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env, &env.src_stats, env.src_nid);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid);

/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)

env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -3685,11 +3728,6 @@ static void remove_entity_load_avg(struct sched_entity *se)
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}

-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.runnable_load_avg;
-}
-
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.load_avg;
--
2.17.1
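
For reference, here is a minimal, self-contained userspace sketch of the node
classification this patch introduces. It mirrors the numa_classify() conditions
from the diff above; the struct fields and the sample node values below are
illustrative only and are not taken from the kernel tree.

	#include <stdio.h>

	/* Same ordering as the patch: spare -> fully busy -> overloaded. */
	enum numa_type {
		node_has_spare = 0,
		node_fully_busy,
		node_overloaded
	};

	/* Simplified stand-in for struct numa_stats; values are illustrative. */
	struct numa_stats {
		unsigned long util;		/* sum of cpu_util() over the node */
		unsigned long compute_capacity;	/* sum of capacity_of() over the node */
		unsigned int nr_running;	/* sum of rq->cfs.h_nr_running */
		unsigned int weight;		/* number of CPUs in the node */
	};

	/* Same conditions as numa_classify() in the patch. */
	static enum numa_type numa_classify(unsigned int imbalance_pct,
					    const struct numa_stats *ns)
	{
		if ((ns->nr_running > ns->weight) &&
		    ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
			return node_overloaded;

		if ((ns->nr_running < ns->weight) ||
		    ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
			return node_has_spare;

		return node_fully_busy;
	}

	int main(void)
	{
		/* Hypothetical 4-CPU node, capacity 1024 per CPU, imbalance_pct = 112. */
		struct numa_stats spare = {
			.util = 1000, .compute_capacity = 4096, .nr_running = 2, .weight = 4
		};
		struct numa_stats overloaded = {
			.util = 3800, .compute_capacity = 4096, .nr_running = 6, .weight = 4
		};

		printf("%d\n", numa_classify(112, &spare));	 /* 0 == node_has_spare */
		printf("%d\n", numa_classify(112, &overloaded)); /* 2 == node_overloaded */
		return 0;
	}
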


2020-02-23 06:38:06

by Parth Shah

Subject: Re: [PATCH v4 2/5] sched/numa: Replace runnable_load_avg by load_avg



On 2/21/20 6:57 PM, Vincent Guittot wrote:
> Similarly to what has been done for the normal load balancer, we can
> replace runnable_load_avg by load_avg in NUMA load balancing and track
> other statistics, such as the utilization and the number of running
> tasks, to get a better view of the current state of a node.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> Reviewed-by: Dietmar Eggemann <[email protected]>
> ---
> kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++--------------
> 1 file changed, 70 insertions(+), 32 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 27450c4ddc81..637f4eb47889 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
> group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
> }
>
> -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
> -
> -static unsigned long cpu_runnable_load(struct rq *rq)
> -{
> - return cfs_rq_runnable_load_avg(&rq->cfs);
> -}
> +/*
> + * 'numa_type' describes the node at the moment of load balancing.
> + */
> +enum numa_type {
> + /* The node has spare capacity that can be used to run more tasks. */
> + node_has_spare = 0,
> + /*
> + * The node is fully used and the tasks don't compete for more CPU
> + * cycles. Nevertheless, some tasks might wait before running.
> + */
> + node_fully_busy,
> + /*
> + * The node is overloaded and can't provide expected CPU cycles to all
> + * tasks.
> + */
> + node_overloaded
> +};
>
> /* Cached statistics for all CPUs within a node */
> struct numa_stats {
> unsigned long load;
> -
> + unsigned long util;
> /* Total compute capacity of CPUs on a node */
> unsigned long compute_capacity;
> + unsigned int nr_running;
> + unsigned int weight;
> + enum numa_type node_type;
> };
>
> -/*
> - * XXX borrowed from update_sg_lb_stats
> - */
> -static void update_numa_stats(struct numa_stats *ns, int nid)
> -{
> - int cpu;
> -
> - memset(ns, 0, sizeof(*ns));
> - for_each_cpu(cpu, cpumask_of_node(nid)) {
> - struct rq *rq = cpu_rq(cpu);
> -
> - ns->load += cpu_runnable_load(rq);
> - ns->compute_capacity += capacity_of(cpu);
> - }
> -
> -}
> -
> struct task_numa_env {
> struct task_struct *p;
>
> @@ -1521,6 +1518,47 @@ struct task_numa_env {
> int best_cpu;
> };
>
> +static unsigned long cpu_load(struct rq *rq);
> +static unsigned long cpu_util(int cpu);
> +
> +static inline enum
> +numa_type numa_classify(unsigned int imbalance_pct,
> + struct numa_stats *ns)
> +{
> + if ((ns->nr_running > ns->weight) &&
> + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
> + return node_overloaded;
> +
> + if ((ns->nr_running < ns->weight) ||
> + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
> + return node_has_spare;
> +
> + return node_fully_busy;
> +}


I was pondering the possible cases in which node_fully_busy is returned here.

It will return fully busy only when the scaled util is exactly equal to the
capacity && ns->nr_running == ns->weight. From reading the patch set, I
failed to figure out the implications of this. Ideally, tasks should
neither be pulled to nor pulled from this node. Is that what it is used for?

If yes, then should load_too_imbalanced() also return false when we find that
env->dst_stats.node_type == node_fully_busy?
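
To put one concrete (purely illustrative) combination on that boundary: taking
imbalance_pct = 112 and a 4-CPU node with compute_capacity = 4096, a sample of
nr_running = 4 and util = 3700 fails the overloaded check (nr_running is not
greater than weight) and also fails the spare check (nr_running is not below
weight, and 4096 * 100 = 409600 is not greater than 3700 * 112 = 414400), so
numa_classify() falls through to node_fully_busy.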

[...]
> @@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
> long orig_src_load, orig_dst_load;
> long src_capacity, dst_capacity;
>
> +
> + /* If dst node has spare capacity, there is no real load imbalance */
> + if (env->dst_stats.node_type == node_has_spare)
> + return false;
[...]

- Parth