2006-02-04 00:38:17

by John Hawkes

Subject: [PATCH] load_balance: "busiest CPU" -> "busier CPUs"

The load_balance algorithm determines a single "busiest CPU" and seeks
to pull tasks from it. If no migratable tasks are found, then
load_balance simply gives up. This algorithm is vulnerable to
workloads where the busiest CPU is filled with non-migratable tasks,
which is commonly the case when a job manager carves up the CPUs into
cpusets and runs different multitasking applications in each cpuset.
In these circumstances, an all-pinned "busiest CPU" effectively
disables load balancing.

This patch changes the load_balance search into a "busier CPUs"
algorithm. find_busiest_group() and find_busiest_queue() are called
iteratively with a constraining cpumask; if the busiest CPU yields no
migratable tasks, then that CPU is dropped from the constraining
cpumask and the search repeats until no busier group or CPU exists.

This constraining cpumask is passed by reference for performance
reasons, and no cpus_and() is used because that operation has a
higher overhead than the simpler cpu_isset() test.
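
In outline, the new retry loop in load_balance() looks like this (a
condensed sketch of the change below, with locking and schedstat
bookkeeping omitted):

	cpumask_t consider_cpus = CPU_MASK_ALL;

try_again:
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   &consider_cpus);
	if (!group)
		goto out_balanced;

	busiest = find_busiest_queue(group, idle, &consider_cpus);
	if (!busiest)
		goto out_balanced;

	nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd,
			      idle, &all_pinned);
	if (unlikely(all_pinned)) {
		/* every task on the busiest CPU is pinned; drop that CPU
		 * from consideration and look for the next-busiest one */
		cpu_clear(busiest->cpuid, consider_cpus);
		if (!cpus_empty(consider_cpus))
			goto try_again;
		goto out_balanced;
	}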

This patch is simple and effective. A possible enhancement is to
cap the number of iterations, in order to limit exposure to a
denial-of-service attack where a user floods numerous CPUs with
thousands of pinned tasks, thereby slowing down every load_balance
search. (However, the current "busiest CPU" algorithm is also
vulnerable to a user who pins many thousands of tasks to a single CPU
and slows down load_balance, as well as effectively disabling systemwide
load_balance.)
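
Such a cap could look something like this (sketch only, with
MAX_LB_RETRIES being a hypothetical new tunable and 'retries' a local
counter initialized to zero):

	if (unlikely(all_pinned)) {
		cpu_clear(busiest->cpuid, consider_cpus);
		if (!cpus_empty(consider_cpus) && ++retries <= MAX_LB_RETRIES)
			goto try_again;
		goto out_balanced;
	}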

Signed-off-by: John Hawkes <[email protected]>

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c 2006-01-30 13:57:32.000000000 -0800
+++ linux/kernel/sched.c 2006-02-01 11:17:01.000000000 -0800
@@ -243,6 +243,8 @@ struct runqueue {
int active_balance;
int push_cpu;

+ int cpuid; /* of this runqueue */
+
task_t *migration_thread;
struct list_head migration_queue;
#endif
@@ -425,7 +427,7 @@ static int show_schedstat(struct seq_fil
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
@@ -433,7 +435,8 @@ static int show_schedstat(struct seq_fil
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
- sd->lb_nobusyg[itype]);
+ sd->lb_nobusyg[itype],
+ sd->lb_retries[itype]);
}
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
@@ -2025,7 +2028,8 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+ cpumask_t *consider_cpus)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2051,6 +2055,9 @@ find_busiest_group(struct sched_domain *
avg_load = 0;

for_each_cpu_mask(i, group->cpumask) {
+ if (!cpu_isset(i, *consider_cpus))
+ continue;
+
if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;

@@ -2164,13 +2171,16 @@ out_balanced:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static runqueue_t *find_busiest_queue(struct sched_group *group,
- enum idle_type idle)
+ enum idle_type idle, cpumask_t *consider_cpus)
{
unsigned long load, max_load = 0;
runqueue_t *busiest = NULL;
int i;

for_each_cpu_mask(i, group->cpumask) {
+ if (!cpu_isset(i, *consider_cpus))
+ continue;
+
load = __source_load(i, 0, idle);

if (load > max_load) {
@@ -2203,19 +2213,22 @@ static int load_balance(int this_cpu, ru
int nr_moved, all_pinned = 0;
int active_balance = 0;
int sd_idle = 0;
+ cpumask_t consider_cpus = CPU_MASK_ALL;

if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
sd_idle = 1;

schedstat_inc(sd, lb_cnt[idle]);

- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+try_again:
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ &consider_cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}

- busiest = find_busiest_queue(group, idle);
+ busiest = find_busiest_queue(group, idle, &consider_cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2239,8 +2252,14 @@ static int load_balance(int this_cpu, ru
double_rq_unlock(this_rq, busiest);

/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(all_pinned))
- goto out_balanced;
+ if (unlikely(all_pinned)) {
+ cpu_clear(busiest->cpuid, consider_cpus);
+ if (!cpus_empty(consider_cpus)) {
+ schedstat_inc(sd, lb_retries[idle]);
+ goto try_again;
+ } else
+ goto out_balanced;
+ }
}

if (!nr_moved) {
@@ -2327,18 +2346,21 @@ static int load_balance_newidle(int this
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
+ cpumask_t consider_cpus = CPU_MASK_ALL;

if (sd->flags & SD_SHARE_CPUPOWER)
sd_idle = 1;

schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+try_again:
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+ &sd_idle, &consider_cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
}

- busiest = find_busiest_queue(group, NEWLY_IDLE);
+ busiest = find_busiest_queue(group, NEWLY_IDLE, &consider_cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
@@ -2355,6 +2377,14 @@ static int load_balance_newidle(int this
nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);
+
+ if (!nr_moved) {
+ cpu_clear(busiest->cpuid, consider_cpus);
+ if (!cpus_empty(consider_cpus)) {
+ schedstat_inc(sd, lb_retries[NEWLY_IDLE]);
+ goto try_again;
+ }
+ }
}

if (!nr_moved) {
@@ -6121,6 +6151,7 @@ void __init sched_init(void)
rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
+ rq->cpuid = i;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h 2006-01-30 13:57:30.000000000 -0800
+++ linux/include/linux/sched.h 2006-01-30 16:54:05.000000000 -0800
@@ -607,6 +607,7 @@ struct sched_domain {
unsigned long lb_hot_gained[MAX_IDLE_TYPES];
unsigned long lb_nobusyg[MAX_IDLE_TYPES];
unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+ unsigned long lb_retries[MAX_IDLE_TYPES];

/* Active load balancing */
unsigned long alb_cnt;


2006-02-04 02:37:57

by Suresh Siddha

Subject: Re: [PATCH] load_balance: "busiest CPU" -> "busier CPUs"

On Fri, Feb 03, 2006 at 04:38:07PM -0800, [email protected] wrote:
> In these circumstances, an all-pinned "busiest CPU" effectively
> disables load balancing.

Solving all the load balancing issues that occur in the all-pinned case is
tricky... For example, even with this patch, at a particular sched domain,
load balancing might still be disabled for the CPUs which belong to the same
sched group as the all-pinned "busiest CPU".

> @@ -243,6 +243,8 @@ struct runqueue {
> int active_balance;
> int push_cpu;
>
> + int cpuid; /* of this runqueue */
> +
> task_t *migration_thread;
> struct list_head migration_queue;

A simple change to find_busiest_queue() can avoid that addition to the
runqueue struct.
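
Roughly something like this (untested sketch): have find_busiest_queue()
return the CPU number and let the caller do cpu_rq() on it:

static int find_busiest_queue(struct sched_group *group, enum idle_type idle,
			      cpumask_t *consider_cpus)
{
	unsigned long load, max_load = 0;
	int busiest = -1, i;

	for_each_cpu_mask(i, group->cpumask) {
		if (!cpu_isset(i, *consider_cpus))
			continue;

		load = __source_load(i, 0, idle);
		if (load > max_load) {
			max_load = load;
			busiest = i;	/* remember the CPU id, not the rq */
		}
	}
	return busiest;
}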

thanks,
suresh

2006-02-07 01:27:40

by John Hawkes

Subject: Re: [PATCH] load_balance: "busiest CPU" -> "busier CPUs"

Second try, incorporating Suresh Siddha's suggestion to avoid adding
a field to the runqueue struct.

The load_balance algorithm determines a single "busiest CPU" and seeks
to pull tasks from it. If no migratable tasks are found, then
load_balance simply gives up. This algorithm is vulnerable to
workloads where the busiest CPU is filled with non-migratable tasks,
which is commonly the case when a job manager carves up the CPUs into
cpusets and runs different multitasking applications in each cpuset.
In these circumstances, an all-pinned "busiest CPU" effectively
disables load balancing.

This patch changes the load_balance search into a "busier CPUs"
algorithm. find_busiest_group() and find_busiest_queue() are called
iteratively with a constraining cpumask; if the busiest CPU yields no
migratable tasks, then that CPU is dropped from the constraining
cpumask and the search repeats until no busier group or CPU exists.

This constraining cpumask is passed by reference for performance
reasons, and no cpus_and() is used because that operation has a
higher overhead than the simpler cpu_isset() test.

This patch is simple and effective. A possible enhancement is to
cap the number of iterations, in order to limit exposure to a
denial-of-service attack where a user floods numerous CPUs with
thousands of pinned tasks, thereby slowing down every load_balance
search. (However, the current "busiest CPU" algorithm is also
vulnerable to a user who pins many thousands of tasks to a single CPU
and slows down load_balance, as well as effectively disabling systemwide
load_balance.)

Signed-off-by: John Hawkes <[email protected]>

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c 2006-02-06 17:19:15.000000000 -0800
+++ linux/kernel/sched.c 2006-02-06 17:21:09.000000000 -0800
@@ -425,7 +425,7 @@ static int show_schedstat(struct seq_fil
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
@@ -433,7 +433,8 @@ static int show_schedstat(struct seq_fil
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
- sd->lb_nobusyg[itype]);
+ sd->lb_nobusyg[itype],
+ sd->lb_retries[itype]);
}
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
@@ -2025,7 +2026,8 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+ cpumask_t *consider_cpus)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2051,6 +2053,9 @@ find_busiest_group(struct sched_domain *
avg_load = 0;

for_each_cpu_mask(i, group->cpumask) {
+ if (!cpu_isset(i, *consider_cpus))
+ continue;
+
if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;

@@ -2163,19 +2168,22 @@ out_balanced:
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static runqueue_t *find_busiest_queue(struct sched_group *group,
- enum idle_type idle)
+static int find_busiest_queue(struct sched_group *group, enum idle_type idle,
+ cpumask_t *consider_cpus)
{
unsigned long load, max_load = 0;
- runqueue_t *busiest = NULL;
+ int busiest = -1;
int i;

for_each_cpu_mask(i, group->cpumask) {
+ if (!cpu_isset(i, *consider_cpus))
+ continue;
+
load = __source_load(i, 0, idle);

if (load > max_load) {
max_load = load;
- busiest = cpu_rq(i);
+ busiest = i;
}
}

@@ -2198,49 +2206,60 @@ static int load_balance(int this_cpu, ru
struct sched_domain *sd, enum idle_type idle)
{
struct sched_group *group;
- runqueue_t *busiest;
+ runqueue_t *busiest_rq;
+ int busiest_cpu;
unsigned long imbalance;
int nr_moved, all_pinned = 0;
int active_balance = 0;
int sd_idle = 0;
+ cpumask_t consider_cpus = CPU_MASK_ALL;

if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
sd_idle = 1;

schedstat_inc(sd, lb_cnt[idle]);

- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+try_again:
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ &consider_cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}

- busiest = find_busiest_queue(group, idle);
- if (!busiest) {
+ busiest_cpu = find_busiest_queue(group, idle, &consider_cpus);
+ if (busiest_cpu < 0) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
- }
+ } else
+ busiest_rq = cpu_rq(busiest_cpu);

- BUG_ON(busiest == this_rq);
+ BUG_ON(busiest_rq == this_rq);

schedstat_add(sd, lb_imbalance[idle], imbalance);

nr_moved = 0;
- if (busiest->nr_running > 1) {
+ if (busiest_rq->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
- * an imbalance but busiest->nr_running <= 1, the group is
+ * an imbalance but busiest_rq->nr_running <= 1, the group is
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- double_rq_lock(this_rq, busiest);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ double_rq_lock(this_rq, busiest_rq);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest_rq,
imbalance, sd, idle, &all_pinned);
- double_rq_unlock(this_rq, busiest);
+ double_rq_unlock(this_rq, busiest_rq);

/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(all_pinned))
- goto out_balanced;
+ if (unlikely(all_pinned)) {
+ cpu_clear(busiest_cpu, consider_cpus);
+ if (!cpus_empty(consider_cpus)) {
+ schedstat_inc(sd, lb_retries[idle]);
+ goto try_again;
+ } else
+ goto out_balanced;
+ }
}

if (!nr_moved) {
@@ -2249,25 +2268,25 @@ static int load_balance(int this_cpu, ru

if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

- spin_lock(&busiest->lock);
+ spin_lock(&busiest_rq->lock);

/* don't kick the migration_thread, if the curr
* task on busiest cpu can't be moved to this_cpu
*/
- if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
- spin_unlock(&busiest->lock);
+ if (!cpu_isset(this_cpu, busiest_rq->curr->cpus_allowed)) {
+ spin_unlock(&busiest_rq->lock);
all_pinned = 1;
goto out_one_pinned;
}

- if (!busiest->active_balance) {
- busiest->active_balance = 1;
- busiest->push_cpu = this_cpu;
+ if (!busiest_rq->active_balance) {
+ busiest_rq->active_balance = 1;
+ busiest_rq->push_cpu = this_cpu;
active_balance = 1;
}
- spin_unlock(&busiest->lock);
+ spin_unlock(&busiest_rq->lock);
if (active_balance)
- wake_up_process(busiest->migration_thread);
+ wake_up_process(busiest_rq->migration_thread);

/*
* We've kicked active balancing, reset the failure
@@ -2323,38 +2342,51 @@ static int load_balance_newidle(int this
struct sched_domain *sd)
{
struct sched_group *group;
- runqueue_t *busiest = NULL;
+ runqueue_t *busiest_rq = NULL;
+ int busiest_cpu;
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
+ cpumask_t consider_cpus = CPU_MASK_ALL;

if (sd->flags & SD_SHARE_CPUPOWER)
sd_idle = 1;

schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+try_again:
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+ &sd_idle, &consider_cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
}

- busiest = find_busiest_queue(group, NEWLY_IDLE);
- if (!busiest) {
+ busiest_cpu = find_busiest_queue(group, NEWLY_IDLE, &consider_cpus);
+ if (busiest_cpu < 0) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
- }
+ } else
+ busiest_rq = cpu_rq(busiest_cpu);

- BUG_ON(busiest == this_rq);
+ BUG_ON(busiest_rq == this_rq);

schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);

nr_moved = 0;
- if (busiest->nr_running > 1) {
+ if (busiest_rq->nr_running > 1) {
/* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ double_lock_balance(this_rq, busiest_rq);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest_rq,
imbalance, sd, NEWLY_IDLE, NULL);
- spin_unlock(&busiest->lock);
+ spin_unlock(&busiest_rq->lock);
+
+ if (!nr_moved) {
+ cpu_clear(busiest_cpu, consider_cpus);
+ if (!cpus_empty(consider_cpus)) {
+ schedstat_inc(sd, lb_retries[NEWLY_IDLE]);
+ goto try_again;
+ }
+ }
}

if (!nr_moved) {
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h 2006-02-06 17:19:14.000000000 -0800
+++ linux/include/linux/sched.h 2006-02-06 17:21:09.000000000 -0800
@@ -607,6 +607,7 @@ struct sched_domain {
unsigned long lb_hot_gained[MAX_IDLE_TYPES];
unsigned long lb_nobusyg[MAX_IDLE_TYPES];
unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+ unsigned long lb_retries[MAX_IDLE_TYPES];

/* Active load balancing */
unsigned long alb_cnt;

2006-02-15 07:12:16

by Nick Piggin

Subject: Re: [PATCH] load_balance: "busiest CPU" -> "busier CPUs"

[email protected] wrote:
> Second try, incorporating Suresh Siddha's suggestion to avoid adding
> a field to the runqueue struct.
>

Sorry for not getting back to you earlier.

Looks like it should work, although obviously exclusive cpusets are
still the preferred way to do exclusive partitioning. Not sure what
the others think about adding complexity and overhead for a
relatively uncommon case; I'm not a huge fan.

I have a suggestion for an improvement: cycle through all CPUs in
the busiest group before moving on to find_busiest_group() again.
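
Something along these lines inside load_balance(), perhaps (rough,
uncompiled sketch); only once the whole group is exhausted would you
go back to find_busiest_group():

	busiest_cpu = find_busiest_queue(group, idle, &consider_cpus);
	while (busiest_cpu >= 0) {
		busiest_rq = cpu_rq(busiest_cpu);
		double_rq_lock(this_rq, busiest_rq);
		nr_moved = move_tasks(this_rq, this_cpu, busiest_rq,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest_rq);
		if (nr_moved || !all_pinned)
			break;
		/* all tasks pinned: try the next-busiest CPU in the group */
		cpu_clear(busiest_cpu, consider_cpus);
		busiest_cpu = find_busiest_queue(group, idle, &consider_cpus);
	}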

Do you really need the schedstat field? You'll have to bump the
schedstat version.

The idea I had when thinking about how to handle this is to have
something like your `consider_cpus` mask in the sched domain itself
(which can help you avoid doing too much work at one time, but has
other complexities, like when to reset the mask).

--
SUSE Labs, Novell Inc.