b0defa7ae03ec changed the load balancing logic to ignore env.max_loop if
all tasks examined to that point were pinned. The goal of the patch was
to make it more likely to be able to detach a task buried in a long list
of pinned tasks. However, this has the unfortunate side effect of
creating an O(n) iteration in detach_tasks(), as we now must fully
iterate every task on a cpu if all or most are pinned. Since this load
balance code is done with rq lock held, and often in softirq context, it
is very easy to trigger hard lockups. We observed such hard lockups with
a user who affined O(10k) threads to a single cpu.
My initial suggestion to Vincent was to revert the original patch and
instead bump sched_nr_migrate. However, he pointed out that we do still
want to have a limit on the number of tasks we actually detach (which is
an expensive operation), but still allow a deeper search.
As a result of the above, this patch now separates the number of tasks
we migrate from the number of tasks we can search. Now, the search limit
can be raised while keeping the nr_migrate fixed.
Signed-off-by: Josh Don <[email protected]>
---
kernel/sched/core.c | 3 ++-
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 19 ++++++++++++++-----
kernel/sched/sched.h | 1 +
4 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db4be4921e7f..820ffa9bbcfe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -144,10 +144,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
#endif /* CONFIG_SCHED_DEBUG */
/*
- * Number of tasks to iterate in a single balance run.
+ * Max number of tasks we can move/iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+const_debug unsigned int sysctl_sched_migrate_search_depth = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 168eecc209b4..d1701c98f996 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -356,6 +356,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+ debugfs_create_u32("migrate_search_depth", 0644, debugfs_sched, &sysctl_sched_migrate_search_depth);
mutex_lock(&sched_domains_mutex);
update_sched_domain_debugfs();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34fe6e9490c2..b4b26f39dd45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8764,6 +8764,9 @@ struct lb_env {
unsigned int loop_break;
unsigned int loop_max;
+ unsigned int num_detached;
+ unsigned int detach_max;
+
enum fbq_type fbq_type;
enum migration_type migration_type;
struct list_head tasks;
@@ -9044,11 +9047,11 @@ static int detach_tasks(struct lb_env *env)
env->loop++;
/*
- * We've more or less seen every task there is, call it quits
- * unless we haven't found any movable task yet.
+ * Call it quits if we're going to exceed our search limit;
+ * we can't search indefinitely even if we've not found a
+ * migratable task yet.
*/
- if (env->loop > env->loop_max &&
- !(env->flags & LBF_ALL_PINNED))
+ if (env->loop > env->loop_max)
break;
/* take a breather every nr_migrate tasks */
@@ -9116,6 +9119,9 @@ static int detach_tasks(struct lb_env *env)
list_add(&p->se.group_node, &env->tasks);
detached++;
+ env->num_detached++;
+ if (env->num_detached >= env->detach_max)
+ break;
#ifdef CONFIG_PREEMPTION
/*
@@ -11287,6 +11293,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
+ env.num_detached = 0;
+ env.detach_max = sysctl_sched_nr_migrate;
+
ld_moved = 0;
/* Clear this flag as soon as we find a pullable task */
env.flags |= LBF_ALL_PINNED;
@@ -11297,7 +11306,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+ env.loop_max = min(sysctl_sched_migrate_search_depth, busiest->nr_running);
more_balance:
rq_lock_irqsave(busiest, &rf);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77d..1597175d5f0b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2539,6 +2539,7 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern const_debug unsigned int sysctl_sched_migrate_search_depth;
extern unsigned int sysctl_sched_base_slice;
--
2.45.0.rc1.225.g2a3ae87e7f-goog