The load balancer tries to find, through find_busiest_group and
find_busiest_queue, a busy CPU with tasks that are running on
the wrong NUMA node.
However, the load balancer only moves runnable-but-not-running
tasks in most situations. This fails horribly when the current
task on a CPU is on the wrong NUMA node, but the other task(s)
on the run queue are placed correctly.
In that situation, what started out as one misplaced tasks
quickly turns into two misplaced tasks.
Try to avoid that by factoring out the placement of the current
task, in order to find groups and runqueues with misplaced tasks
that are not currently running.
Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++++++++++++-----
1 file changed, 38 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7696cbad82e0..265109566dc6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -810,6 +810,8 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+void fbq_classify_current(int cpu, struct rq *rq, int *numa, int *remote);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -6290,6 +6292,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
+ {
+ int numa, remote;
+ /*
+ * The current task cannot be moved. Pretend it is
+ * running on the right NUMA node, without counting
+ * it twice.
+ */
+ fbq_classify_current(i, rq, &numa, &remote);
+ sgs->nr_numa_running += numa;
+ sgs->nr_preferred_running += remote;
+ }
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
@@ -6368,11 +6381,31 @@ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
return all;
}
-static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+/*
+ * Is the current task running on the desired NUMA node?
+ * Must be called with the rcu_read_lock held.
+ */
+void fbq_classify_current(int cpu, struct rq *rq, int *numa, int *remote)
{
- if (rq->nr_running > rq->nr_numa_running)
+ struct task_struct *curr = rq->curr;
+ int curr_node = cpu_to_node(cpu);
+
+ *numa = curr->numa_preferred_nid != -1;
+ *remote = *numa && curr->numa_preferred_nid != curr_node;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq, int cpu)
+{
+ int numa, remote;
+ /*
+ * The current task cannot be moved by the load balancer.
+ * Pretend it is running on the right NUMA node, but be
+ * careful not to count it twice.
+ */
+ fbq_classify_current(cpu, rq, &numa, &remote);
+ if (rq->nr_running > rq->nr_numa_running + numa)
return regular;
- if (rq->nr_running > rq->nr_preferred_running)
+ if (rq->nr_running > rq->nr_preferred_running + remote)
return remote;
return all;
}
@@ -6382,7 +6415,7 @@ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
return all;
}
-static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+static inline enum fbq_type fbq_classify_rq(struct rq *rq, int cpu)
{
return regular;
}
@@ -6773,7 +6806,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
enum fbq_type rt;
rq = cpu_rq(i);
- rt = fbq_classify_rq(rq);
+ rt = fbq_classify_rq(rq, i);
/*
* We classify groups/runqueues into three groups:
On Tue, 2015-05-19 at 10:55 -0400, Rik van Riel wrote:
> The load balancer tries to find, through find_busiest_group and
> find_busiest_queue, a busy CPU with tasks that are running on
> the wrong NUMA node.
>
> However, the load balancer only moves runnable-but-not-running
> tasks in most situations. This fails horribly when the current
> task on a CPU is on the wrong NUMA node, but the other task(s)
> on the run queue are placed correctly.
>
> In that situation, what started out as one misplaced tasks
> quickly turns into two misplaced tasks.
>
> Try to avoid that by factoring out the placement of the current
> task, in order to find groups and runqueues with misplaced tasks
> that are not currently running.
>
> Signed-off-by: Rik van Riel <[email protected]>
This seem to give small improvement for our eCommerce web workload,
average sever response time went down from ~1.4 to ~1.34. I can run the
workload for longer time to get better numbers.
Thanks,
Artem.