2021-05-12 09:48:58

by Valentin Schneider

[permalink] [raw]
Subject: [PATCH] sched: Initialize the idle task with preemption disabled

As pointed out by commit

de9b8f5dcbd9 ("sched: Fix crash trying to dequeue/enqueue the idle thread")

init_idle() can and will be invoked more than once on the same idle
task. At boot time, it is invoked for the boot CPU thread by
sched_init(). Then smp_init() creates the threads for all the secondary
CPUs and invokes init_idle() on them.

As the hotplug machinery brings the secondaries to life, it will issue
calls to idle_thread_get(), which itself invokes init_idle() yet again.
In this case it's invoked twice more per secondary: at _cpu_up(), and at
bringup_cpu().

Given smp_init() already initializes the idle tasks for all *possible*
CPUs, no further initialization should be required. Now, removing
init_idle() from idle_thread_get() exposes some interesting expectations
with regards to the idle task's preempt_count: the secondary startup always
issues a preempt_disable(), requiring some reset of the preempt count to 0
between hot-unplug and hotplug, which is currently served by
idle_thread_get() -> init_idle().

Given the idle task is supposed to have preemption disabled once and never
see it re-enabled, it seems that what we actually want is to initialize its
preempt_count to PREEMPT_DISABLED and leave it there. Do that, and remove
init_idle() from idle_thread_get().

Secondary startups were patched via coccinelle:

@begone@
@@

-preempt_disable();
...
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);

Signed-off-by: Valentin Schneider <[email protected]>
---
arch/alpha/kernel/smp.c | 1 -
arch/arc/kernel/smp.c | 1 -
arch/arm/kernel/smp.c | 1 -
arch/arm64/include/asm/preempt.h | 2 +-
arch/arm64/kernel/smp.c | 1 -
arch/csky/kernel/smp.c | 1 -
arch/ia64/kernel/smpboot.c | 1 -
arch/mips/kernel/smp.c | 1 -
arch/openrisc/kernel/smp.c | 2 --
arch/parisc/kernel/smp.c | 1 -
arch/powerpc/kernel/smp.c | 1 -
arch/riscv/kernel/smpboot.c | 1 -
arch/s390/include/asm/preempt.h | 4 ++--
arch/s390/kernel/smp.c | 1 -
arch/sh/kernel/smp.c | 2 --
arch/sparc/kernel/smp_32.c | 1 -
arch/sparc/kernel/smp_64.c | 3 ---
arch/x86/include/asm/preempt.h | 2 +-
arch/x86/kernel/smpboot.c | 1 -
arch/xtensa/kernel/smp.c | 1 -
include/asm-generic/preempt.h | 2 +-
init/main.c | 6 +-----
kernel/fork.c | 2 +-
kernel/sched/core.c | 2 +-
kernel/smpboot.c | 1 -
25 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index f4dd9f3f3001..4b2575f936d4 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -166,7 +166,6 @@ smp_callin(void)
DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
cpuid, current, current->active_mm));

- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 52906d314537..db0e104d6835 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -189,7 +189,6 @@ void start_kernel_secondary(void)
pr_info("## CPU%u LIVE ##: Executing Code...\n", cpu);

local_irq_enable();
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 74679240a9d8..c7bb168b0d97 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -432,7 +432,6 @@ asmlinkage void secondary_start_kernel(void)
#endif
pr_debug("CPU%u: Booted secondary processor\n", cpu);

- preempt_disable();
trace_hardirqs_off();

/*
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 80e946b2abee..e83f0982b99c 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -23,7 +23,7 @@ static inline void preempt_count_set(u64 pc)
} while (0)

#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)

static inline void set_preempt_need_resched(void)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 357590beaabb..48fd89256739 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -223,7 +223,6 @@ asmlinkage notrace void secondary_start_kernel(void)
init_gic_priority_masking();

rcu_cpu_starting(cpu);
- preempt_disable();
trace_hardirqs_off();

/*
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 0f9f5eef9338..e2993539af8e 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -281,7 +281,6 @@ void csky_start_secondary(void)
pr_info("CPU%u Online: %s...\n", cpu, __func__);

local_irq_enable();
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 49b488580939..d10f780c13b9 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -441,7 +441,6 @@ start_secondary (void *unused)
#endif
efi_map_pal_code();
cpu_init();
- preempt_disable();
smp_callin();

cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index ef86fbad8546..d542fb7af3ba 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -348,7 +348,6 @@ asmlinkage void start_secondary(void)
*/

calibrate_delay();
- preempt_disable();
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;

diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 48e1092a64de..415e209732a3 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -145,8 +145,6 @@ asmlinkage __init void secondary_start_kernel(void)
set_cpu_online(cpu, true);

local_irq_enable();
-
- preempt_disable();
/*
* OK, it's off to the idle thread for us
*/
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 10227f667c8a..1405b603b91b 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -302,7 +302,6 @@ void __init smp_callin(unsigned long pdce_proc)
#endif

smp_cpu_init(slave_id);
- preempt_disable();

flush_cache_all_local(); /* start with known state */
flush_tlb_all_local(NULL);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5a4d59a1070d..055ca3816eb7 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1505,7 +1505,6 @@ void start_secondary(void *unused)
smp_store_cpu_info(cpu);
set_dec(tb_ticks_per_jiffy);
rcu_cpu_starting(cpu);
- preempt_disable();
cpu_callin_map[cpu] = 1;

if (smp_ops->setup_cpu)
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 5e276c25646f..1941a6ce86a1 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -176,7 +176,6 @@ asmlinkage __visible void smp_callin(void)
* Disable preemption before enabling interrupts, so we don't try to
* schedule a CPU that hasn't actually started yet.
*/
- preempt_disable();
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index b49e0492842c..23ff51be7e29 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -32,7 +32,7 @@ static inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+ S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)

static inline void set_preempt_need_resched(void)
@@ -91,7 +91,7 @@ static inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+ S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)

static inline void set_preempt_need_resched(void)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 58c8afa3da65..d60c7374d807 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -877,7 +877,6 @@ static void smp_init_secondary(void)
restore_access_regs(S390_lowcore.access_regs_save_area);
cpu_init();
rcu_cpu_starting(cpu);
- preempt_disable();
init_cpu_timer();
vtime_init();
vdso_getcpu_init();
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 372acdc9033e..65924d9ec245 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -186,8 +186,6 @@ asmlinkage void start_secondary(void)

per_cpu_trap_init();

- preempt_disable();
-
notify_cpu_starting(cpu);

local_irq_enable();
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 50c127ab46d5..22b148e5a5f8 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -348,7 +348,6 @@ static void sparc_start_secondary(void *arg)
*/
arch_cpu_pre_starting(arg);

- preempt_disable();
cpu = smp_processor_id();

notify_cpu_starting(cpu);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index e38d8bf454e8..ae5faa1d989d 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -138,9 +138,6 @@ void smp_callin(void)

set_cpu_online(cpuid, true);

- /* idle thread is expected to have preempt disabled */
- preempt_disable();
-
local_irq_enable();

cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index f8cb8af4de5c..fe5efbcba824 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 16703c35a944..29713d0cf155 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -236,7 +236,6 @@ static void notrace start_secondary(void *unused)
cpu_init();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();

enable_start_cpu0 = 0;
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index cd85a7a2722b..1254da07ead1 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -145,7 +145,6 @@ void secondary_start_kernel(void)
cpumask_set_cpu(cpu, mm_cpumask(mm));
enter_lazy_tlb(mm, current);

- preempt_disable();
trace_hardirqs_off();

calibrate_delay();
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d683f5e6d791..b4d43a4af5f7 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -29,7 +29,7 @@ static __always_inline void preempt_count_set(int pc)
} while (0)

#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)

static __always_inline void set_preempt_need_resched(void)
diff --git a/init/main.c b/init/main.c
index 53b278845b88..d8580323110e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -918,11 +918,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
- /*
- * Disable preemption - early bootup scheduling is extremely
- * fragile until we cpu_idle() for the first time.
- */
- preempt_disable();
+
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index a1a763019bfb..9de7bc40be1d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2406,7 +2406,7 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}

-struct task_struct *fork_idle(int cpu)
+struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a0668acd876..43b903ae823b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7433,7 +7433,7 @@ void show_state_filter(unsigned long state_filter)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f25208e8df83..e4163042c4d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu)

if (!tsk)
return ERR_PTR(-ENOMEM);
- init_idle(tsk, cpu);
return tsk;
}

--
2.25.1


Subject: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

The following commit has been merged into the sched/core branch of tip:

Commit-ID: f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674
Gitweb: https://git.kernel.org/tip/f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674
Author: Valentin Schneider <[email protected]>
AuthorDate: Wed, 12 May 2021 10:46:36 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 12 May 2021 13:01:45 +02:00

sched/core: Initialize the idle task with preemption disabled

As pointed out by commit

de9b8f5dcbd9 ("sched: Fix crash trying to dequeue/enqueue the idle thread")

init_idle() can and will be invoked more than once on the same idle
task. At boot time, it is invoked for the boot CPU thread by
sched_init(). Then smp_init() creates the threads for all the secondary
CPUs and invokes init_idle() on them.

As the hotplug machinery brings the secondaries to life, it will issue
calls to idle_thread_get(), which itself invokes init_idle() yet again.
In this case it's invoked twice more per secondary: at _cpu_up(), and at
bringup_cpu().

Given smp_init() already initializes the idle tasks for all *possible*
CPUs, no further initialization should be required. Now, removing
init_idle() from idle_thread_get() exposes some interesting expectations
with regards to the idle task's preempt_count: the secondary startup always
issues a preempt_disable(), requiring some reset of the preempt count to 0
between hot-unplug and hotplug, which is currently served by
idle_thread_get() -> init_idle().

Given the idle task is supposed to have preemption disabled once and never
see it re-enabled, it seems that what we actually want is to initialize its
preempt_count to PREEMPT_DISABLED and leave it there. Do that, and remove
init_idle() from idle_thread_get().

Secondary startups were patched via coccinelle:

@begone@
@@

-preempt_disable();
...
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);

Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/alpha/kernel/smp.c | 1 -
arch/arc/kernel/smp.c | 1 -
arch/arm/kernel/smp.c | 1 -
arch/arm64/include/asm/preempt.h | 2 +-
arch/arm64/kernel/smp.c | 1 -
arch/csky/kernel/smp.c | 1 -
arch/ia64/kernel/smpboot.c | 1 -
arch/mips/kernel/smp.c | 1 -
arch/openrisc/kernel/smp.c | 2 --
arch/parisc/kernel/smp.c | 1 -
arch/powerpc/kernel/smp.c | 1 -
arch/riscv/kernel/smpboot.c | 1 -
arch/s390/include/asm/preempt.h | 4 ++--
arch/s390/kernel/smp.c | 1 -
arch/sh/kernel/smp.c | 2 --
arch/sparc/kernel/smp_32.c | 1 -
arch/sparc/kernel/smp_64.c | 3 ---
arch/x86/include/asm/preempt.h | 2 +-
arch/x86/kernel/smpboot.c | 1 -
arch/xtensa/kernel/smp.c | 1 -
include/asm-generic/preempt.h | 2 +-
init/main.c | 6 +-----
kernel/fork.c | 2 +-
kernel/sched/core.c | 2 +-
kernel/smpboot.c | 1 -
25 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index f4dd9f3..4b2575f 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -166,7 +166,6 @@ smp_callin(void)
DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
cpuid, current, current->active_mm));

- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 52906d3..db0e104 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -189,7 +189,6 @@ void start_kernel_secondary(void)
pr_info("## CPU%u LIVE ##: Executing Code...\n", cpu);

local_irq_enable();
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 7467924..c7bb168 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -432,7 +432,6 @@ asmlinkage void secondary_start_kernel(void)
#endif
pr_debug("CPU%u: Booted secondary processor\n", cpu);

- preempt_disable();
trace_hardirqs_off();

/*
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 80e946b..e83f098 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -23,7 +23,7 @@ static inline void preempt_count_set(u64 pc)
} while (0)

#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)

static inline void set_preempt_need_resched(void)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index dcd7041..6671000 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -224,7 +224,6 @@ asmlinkage notrace void secondary_start_kernel(void)
init_gic_priority_masking();

rcu_cpu_starting(cpu);
- preempt_disable();
trace_hardirqs_off();

/*
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 0f9f5ee..e299353 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -281,7 +281,6 @@ void csky_start_secondary(void)
pr_info("CPU%u Online: %s...\n", cpu, __func__);

local_irq_enable();
- preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 49b4885..d10f780 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -441,7 +441,6 @@ start_secondary (void *unused)
#endif
efi_map_pal_code();
cpu_init();
- preempt_disable();
smp_callin();

cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index ef86fba..d542fb7 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -348,7 +348,6 @@ asmlinkage void start_secondary(void)
*/

calibrate_delay();
- preempt_disable();
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;

diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 48e1092..415e209 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -145,8 +145,6 @@ asmlinkage __init void secondary_start_kernel(void)
set_cpu_online(cpu, true);

local_irq_enable();
-
- preempt_disable();
/*
* OK, it's off to the idle thread for us
*/
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 10227f6..1405b60 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -302,7 +302,6 @@ void __init smp_callin(unsigned long pdce_proc)
#endif

smp_cpu_init(slave_id);
- preempt_disable();

flush_cache_all_local(); /* start with known state */
flush_tlb_all_local(NULL);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e05c78..6c6e4d9 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1547,7 +1547,6 @@ void start_secondary(void *unused)
smp_store_cpu_info(cpu);
set_dec(tb_ticks_per_jiffy);
rcu_cpu_starting(cpu);
- preempt_disable();
cpu_callin_map[cpu] = 1;

if (smp_ops->setup_cpu)
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 9a408e2..bd82375 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -180,7 +180,6 @@ asmlinkage __visible void smp_callin(void)
* Disable preemption before enabling interrupts, so we don't try to
* schedule a CPU that hasn't actually started yet.
*/
- preempt_disable();
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index b49e049..23ff51b 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -32,7 +32,7 @@ static inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+ S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)

static inline void set_preempt_need_resched(void)
@@ -91,7 +91,7 @@ static inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+ S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)

static inline void set_preempt_need_resched(void)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2fec2b8..111909a 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -878,7 +878,6 @@ static void smp_init_secondary(void)
restore_access_regs(S390_lowcore.access_regs_save_area);
cpu_init();
rcu_cpu_starting(cpu);
- preempt_disable();
init_cpu_timer();
vtime_init();
vdso_getcpu_init();
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 372acdc..65924d9 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -186,8 +186,6 @@ asmlinkage void start_secondary(void)

per_cpu_trap_init();

- preempt_disable();
-
notify_cpu_starting(cpu);

local_irq_enable();
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 50c127a..22b148e 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -348,7 +348,6 @@ static void sparc_start_secondary(void *arg)
*/
arch_cpu_pre_starting(arg);

- preempt_disable();
cpu = smp_processor_id();

notify_cpu_starting(cpu);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index e38d8bf..ae5faa1 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -138,9 +138,6 @@ void smp_callin(void)

set_cpu_online(cpuid, true);

- /* idle thread is expected to have preempt disabled */
- preempt_disable();
-
local_irq_enable();

cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index f8cb8af..fe5efbc 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0ad5214..0936f5b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -236,7 +236,6 @@ static void notrace start_secondary(void *unused)
cpu_init();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();

enable_start_cpu0 = 0;
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index cd85a7a..1254da0 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -145,7 +145,6 @@ void secondary_start_kernel(void)
cpumask_set_cpu(cpu, mm_cpumask(mm));
enter_lazy_tlb(mm, current);

- preempt_disable();
trace_hardirqs_off();

calibrate_delay();
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d683f5e..b4d43a4 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -29,7 +29,7 @@ static __always_inline void preempt_count_set(int pc)
} while (0)

#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)

static __always_inline void set_preempt_need_resched(void)
diff --git a/init/main.c b/init/main.c
index eb01e12..7b027d9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -941,11 +941,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
- /*
- * Disable preemption - early bootup scheduling is extremely
- * fragile until we cpu_idle() for the first time.
- */
- preempt_disable();
+
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index e7fd928..ace4631 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2412,7 +2412,7 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}

-struct task_struct *fork_idle(int cpu)
+struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 55b2d93..9d00f49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8227,7 +8227,7 @@ void show_state_filter(unsigned long state_filter)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f25208e..e416304 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu)

if (!tsk)
return ERR_PTR(-ENOMEM);
- init_idle(tsk, cpu);
return tsk;
}

2021-07-06 19:47:05

by Guenter Roeck

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

Hi,

On Wed, May 12, 2021 at 11:15:54AM -0000, tip-bot2 for Valentin Schneider wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674
> Gitweb: https://git.kernel.org/tip/f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674
> Author: Valentin Schneider <[email protected]>
> AuthorDate: Wed, 12 May 2021 10:46:36 +01:00
> Committer: Ingo Molnar <[email protected]>
> CommitterDate: Wed, 12 May 2021 13:01:45 +02:00
>
> sched/core: Initialize the idle task with preemption disabled
>
> As pointed out by commit
>
> de9b8f5dcbd9 ("sched: Fix crash trying to dequeue/enqueue the idle thread")
>
> init_idle() can and will be invoked more than once on the same idle
> task. At boot time, it is invoked for the boot CPU thread by
> sched_init(). Then smp_init() creates the threads for all the secondary
> CPUs and invokes init_idle() on them.
>
> As the hotplug machinery brings the secondaries to life, it will issue
> calls to idle_thread_get(), which itself invokes init_idle() yet again.
> In this case it's invoked twice more per secondary: at _cpu_up(), and at
> bringup_cpu().
>
> Given smp_init() already initializes the idle tasks for all *possible*
> CPUs, no further initialization should be required. Now, removing
> init_idle() from idle_thread_get() exposes some interesting expectations
> with regards to the idle task's preempt_count: the secondary startup always
> issues a preempt_disable(), requiring some reset of the preempt count to 0
> between hot-unplug and hotplug, which is currently served by
> idle_thread_get() -> idle_init().
>
> Given the idle task is supposed to have preemption disabled once and never
> see it re-enabled, it seems that what we actually want is to initialize its
> preempt_count to PREEMPT_DISABLED and leave it there. Do that, and remove
> init_idle() from idle_thread_get().
>
> Secondary startups were patched via coccinelle:
>
> @begone@
> @@
>
> -preempt_disable();
> ...
> cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
>
> Signed-off-by: Valentin Schneider <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> Acked-by: Peter Zijlstra <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]

This patch results in several messages similar to the following
when booting s390 images in qemu.

[ 1.690807] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
[ 1.690925] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
[ 1.691053] no locks held by swapper/0/1.
[ 1.691310] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-11788-g79160a603bdb #1
[ 1.691469] Hardware name: QEMU 2964 QEMU (KVM/Linux)
[ 1.691612] Call Trace:
[ 1.691718] [<0000000000d98bb0>] show_stack+0x90/0xf8
[ 1.692040] [<0000000000da894c>] dump_stack_lvl+0x74/0xa8
[ 1.692134] [<0000000000187e52>] ___might_sleep+0x15a/0x170
[ 1.692228] [<000000000014f588>] cpus_read_lock+0x38/0xc0
[ 1.692320] [<0000000000182e8a>] smpboot_register_percpu_thread+0x2a/0x160
[ 1.692412] [<00000000014814b8>] cpuhp_threads_init+0x28/0x60
[ 1.692505] [<0000000001487a30>] smp_init+0x28/0x90
[ 1.692597] [<00000000014779a6>] kernel_init_freeable+0x1f6/0x270
[ 1.692689] [<0000000000db7466>] kernel_init+0x2e/0x160
[ 1.692779] [<0000000000103618>] __ret_from_fork+0x40/0x58
[ 1.692870] [<0000000000dc6e12>] ret_from_fork+0xa/0x30

Reverting this patch fixes the problem.
Bisect log is attached.

Guenter

---
# bad: [007b350a58754a93ca9fe50c498cc27780171153] Merge tag 'dlm-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm
# good: [62fb9874f5da54fdb243003b386128037319b219] Linux 5.13
git bisect start '007b350a5875' '62fb9874f5da'
# bad: [36824f198c621cebeb22966b5e244378fa341295] Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
git bisect bad 36824f198c621cebeb22966b5e244378fa341295
# bad: [9269d27e519ae9a89be8d288f59d1ec573b0c686] Merge tag 'timers-nohz-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect bad 9269d27e519ae9a89be8d288f59d1ec573b0c686
# good: [69609a91ac1d82f9c958a762614edfe0ac8498e3] Merge tag 'spi-v5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi
git bisect good 69609a91ac1d82f9c958a762614edfe0ac8498e3
# good: [a15286c63d113d4296c58867994cd266a28f5d6d] Merge tag 'locking-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good a15286c63d113d4296c58867994cd266a28f5d6d
# bad: [0159bb020ca9a43b17aa9149f1199643c1d49426] Documentation: Add usecases, design and interface for core scheduling
git bisect bad 0159bb020ca9a43b17aa9149f1199643c1d49426
# good: [97886d9dcd86820bdbc1fa73b455982809cbc8c2] sched: Migration changes for core scheduling
git bisect good 97886d9dcd86820bdbc1fa73b455982809cbc8c2
# bad: [fcb501704554eebfd27e3220b0540997fd2b24a8] delayacct: Document task_delayacct sysctl
git bisect bad fcb501704554eebfd27e3220b0540997fd2b24a8
# bad: [cc00c1988801dc71f63bb7bad019e85046865095] sched: Fix leftover comment typos
git bisect bad cc00c1988801dc71f63bb7bad019e85046865095
# good: [7ac592aa35a684ff1858fb9ec282886b9e3575ac] sched: prctl() core-scheduling interface
git bisect good 7ac592aa35a684ff1858fb9ec282886b9e3575ac
# bad: [f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674] sched/core: Initialize the idle task with preemption disabled
git bisect bad f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674
# good: [9f26990074931bbf797373e53104216059b300b1] kselftest: Add test for core sched prctl interface
git bisect good 9f26990074931bbf797373e53104216059b300b1
# first bad commit: [f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674] sched/core: Initialize the idle task with preemption disabled

2021-07-06 23:56:42

by Valentin Schneider

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled


Hi Guenter,

On 06/07/21 12:44, Guenter Roeck wrote:
> This patch results in several messages similar to the following
> when booting s390 images in qemu.
>
> [ 1.690807] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
> [ 1.690925] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
> [ 1.691053] no locks held by swapper/0/1.
> [ 1.691310] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-11788-g79160a603bdb #1
> [ 1.691469] Hardware name: QEMU 2964 QEMU (KVM/Linux)
> [ 1.691612] Call Trace:
> [ 1.691718] [<0000000000d98bb0>] show_stack+0x90/0xf8
> [ 1.692040] [<0000000000da894c>] dump_stack_lvl+0x74/0xa8
> [ 1.692134] [<0000000000187e52>] ___might_sleep+0x15a/0x170
> [ 1.692228] [<000000000014f588>] cpus_read_lock+0x38/0xc0
> [ 1.692320] [<0000000000182e8a>] smpboot_register_percpu_thread+0x2a/0x160
> [ 1.692412] [<00000000014814b8>] cpuhp_threads_init+0x28/0x60
> [ 1.692505] [<0000000001487a30>] smp_init+0x28/0x90
> [ 1.692597] [<00000000014779a6>] kernel_init_freeable+0x1f6/0x270
> [ 1.692689] [<0000000000db7466>] kernel_init+0x2e/0x160
> [ 1.692779] [<0000000000103618>] __ret_from_fork+0x40/0x58
> [ 1.692870] [<0000000000dc6e12>] ret_from_fork+0xa/0x30
>
> Reverting this patch fixes the problem.
> Bisect log is attached.
>
> Guenter
>

Thanks for the report.

So somehow the init task ends up with a non-zero preempt_count()? Per
FORK_PREEMPT_COUNT we should exit __ret_from_fork() with a zero count, are
you hitting the WARN_ONCE() in finish_task_switch()?

Does CONFIG_DEBUG_PREEMPT=y yield anything interesting?

I can't make sense of this right now, but it's a bit late :) I'll grab some
toolchain+qemu tomorrow and go poke at it (and while at it I need to do the
same with powerpc).

2021-07-07 04:08:26

by Guenter Roeck

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On 7/6/21 4:55 PM, Valentin Schneider wrote:
>
> Hi Guenter,
>
> On 06/07/21 12:44, Guenter Roeck wrote:
>> This patch results in several messages similar to the following
>> when booting s390 images in qemu.
>>
>> [ 1.690807] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
>> [ 1.690925] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
>> [ 1.691053] no locks held by swapper/0/1.
>> [ 1.691310] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-11788-g79160a603bdb #1
>> [ 1.691469] Hardware name: QEMU 2964 QEMU (KVM/Linux)
>> [ 1.691612] Call Trace:
>> [ 1.691718] [<0000000000d98bb0>] show_stack+0x90/0xf8
>> [ 1.692040] [<0000000000da894c>] dump_stack_lvl+0x74/0xa8
>> [ 1.692134] [<0000000000187e52>] ___might_sleep+0x15a/0x170
>> [ 1.692228] [<000000000014f588>] cpus_read_lock+0x38/0xc0
>> [ 1.692320] [<0000000000182e8a>] smpboot_register_percpu_thread+0x2a/0x160
>> [ 1.692412] [<00000000014814b8>] cpuhp_threads_init+0x28/0x60
>> [ 1.692505] [<0000000001487a30>] smp_init+0x28/0x90
>> [ 1.692597] [<00000000014779a6>] kernel_init_freeable+0x1f6/0x270
>> [ 1.692689] [<0000000000db7466>] kernel_init+0x2e/0x160
>> [ 1.692779] [<0000000000103618>] __ret_from_fork+0x40/0x58
>> [ 1.692870] [<0000000000dc6e12>] ret_from_fork+0xa/0x30
>>
>> Reverting this patch fixes the problem.
>> Bisect log is attached.
>>
>> Guenter
>>
>
> Thanks for the report.
>
> So somehow the init task ends up with a non-zero preempt_count()? Per
> FORK_PREEMPT_COUNT we should exit __ret_from_fork() with a zero count, are
> you hitting the WARN_ONCE() in finish_task_switch()?
>
> Does CONFIG_DEBUG_PREEMPT=y yield anything interesting?
>

My configuration doesn't have CONFIG_PREEMPT enabled.

Guenter

2021-07-07 12:04:35

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On Wed, Jul 07, 2021 at 12:55:20AM +0100, Valentin Schneider wrote:
>
> Hi Guenter,
>
> On 06/07/21 12:44, Guenter Roeck wrote:
> > This patch results in several messages similar to the following
> > when booting s390 images in qemu.
> >
> > [ 1.690807] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
> > [ 1.690925] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
> > [ 1.691053] no locks held by swapper/0/1.
> > [ 1.691310] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-11788-g79160a603bdb #1
> > [ 1.691469] Hardware name: QEMU 2964 QEMU (KVM/Linux)
> > [ 1.691612] Call Trace:
> > [ 1.691718] [<0000000000d98bb0>] show_stack+0x90/0xf8
> > [ 1.692040] [<0000000000da894c>] dump_stack_lvl+0x74/0xa8
> > [ 1.692134] [<0000000000187e52>] ___might_sleep+0x15a/0x170
> > [ 1.692228] [<000000000014f588>] cpus_read_lock+0x38/0xc0
> > [ 1.692320] [<0000000000182e8a>] smpboot_register_percpu_thread+0x2a/0x160
> > [ 1.692412] [<00000000014814b8>] cpuhp_threads_init+0x28/0x60
> > [ 1.692505] [<0000000001487a30>] smp_init+0x28/0x90
> > [ 1.692597] [<00000000014779a6>] kernel_init_freeable+0x1f6/0x270
> > [ 1.692689] [<0000000000db7466>] kernel_init+0x2e/0x160
> > [ 1.692779] [<0000000000103618>] __ret_from_fork+0x40/0x58
> > [ 1.692870] [<0000000000dc6e12>] ret_from_fork+0xa/0x30
> >
> > Reverting this patch fixes the problem.
> > Bisect log is attached.
> >
> > Guenter
> >
>
> Thanks for the report.
>
> So somehow the init task ends up with a non-zero preempt_count()? Per
> FORK_PREEMPT_COUNT we should exit __ret_from_fork() with a zero count, are
> you hitting the WARN_ONCE() in finish_task_switch()?
>
> Does CONFIG_DEBUG_PREEMPT=y yield anything interesting?
>
> I can't make sense of this right now, but it's a bit late :) I'll grab some
> toolchain+qemu tomorrow and go poke at it (and while at it I need to do the
> same with powerpc).

One possible issue is that s390's init_idle_preempt_count() doesn't apply on the
target idle task but on the _current_ CPU. And since smp_init() ->
idle_threads_init() is actually called remotely, we are overwriting the current
CPU preempt_count() instead of the target one.

2021-07-07 12:14:37

by Valentin Schneider

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On 07/07/21 14:03, Frederic Weisbecker wrote:
> On Wed, Jul 07, 2021 at 12:55:20AM +0100, Valentin Schneider wrote:
>> Thanks for the report.
>>
>> So somehow the init task ends up with a non-zero preempt_count()? Per
>> FORK_PREEMPT_COUNT we should exit __ret_from_fork() with a zero count, are
>> you hitting the WARN_ONCE() in finish_task_switch()?
>>
>> Does CONFIG_DEBUG_PREEMPT=y yield anything interesting?
>>
>> I can't make sense of this right now, but it's a bit late :) I'll grab some
>> toolchain+qemu tomorrow and go poke at it (and while at it I need to do the
>> same with powerpc).
>
> One possible issue is that s390's init_idle_preempt_count() doesn't apply on the
> target idle task but on the _current_ CPU. And since smp_init() ->
> idle_threads_init() is actually called remotely, we are overwriting the current
> CPU preempt_count() instead of the target one.

Indeed, this becomes quite obvious when tracing the preemption count
changes. This also means that s390 relied on the idle_thread_get() from the
hotplug machinery to properly setup the preempt count, rather than
init_idle_preempt_count() - which is quite yuck.

I'll write a patch for that and likely one for powerpc.

2021-07-07 15:02:46

by Valentin Schneider

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On 08/07/21 00:35, Alexey Kardashevskiy wrote:
> On 08/07/2021 00:14, Guenter Roeck wrote:
>>
>> Can you reproduce the problem with a powerpc qemu emulation ?
>> If so, how do you reproduce it there ? Reason for asking is that I don't
>> see
>> the problem with any of my powerpc emulations, and I would like to add test
>> case(s) if possible.
>
> I can reproduce the problem on powerpc easily - qemu with "-smp 2" does it.
>

So on powerpc I'm chasing a slightly different problem, reported at
[1]. I couldn't get it to trigger on qemu for powerpc64, and I'm still
struggling with powerpc. Could you please share your qemu invocation &
kernel .config? Thanks.

[1]: https://lore.kernel.org/linux-next/[email protected]/

>
> --
> Alexey

2021-07-07 15:36:19

by Guenter Roeck

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On 7/7/21 5:11 AM, Valentin Schneider wrote:
> On 07/07/21 14:03, Frederic Weisbecker wrote:
>> On Wed, Jul 07, 2021 at 12:55:20AM +0100, Valentin Schneider wrote:
>>> Thanks for the report.
>>>
>>> So somehow the init task ends up with a non-zero preempt_count()? Per
>>> FORK_PREEMPT_COUNT we should exit __ret_from_fork() with a zero count, are
>>> you hitting the WARN_ONCE() in finish_task_switch()?
>>>
>>> Does CONFIG_DEBUG_PREEMPT=y yield anything interesting?
>>>
>>> I can't make sense of this right now, but it's a bit late :) I'll grab some
>>> toolchain+qemu tomorrow and go poke at it (and while at it I need to do the
>>> same with powerpc).
>>
>> One possible issue is that s390's init_idle_preempt_count() doesn't apply on the
>> target idle task but on the _current_ CPU. And since smp_init() ->
>> idle_threads_init() is actually called remotely, we are overwriting the current
>> CPU preempt_count() instead of the target one.
>
> Indeed, this becomes quite obvious when tracing the preemption count
> changes. This also means that s390 relied on the idle_thread_get() from the
> hotplug machinery to properly setup the preempt count, rather than
> init_idle_preempt_count() - which is quite yuck.
>
> I'll write a patch for that and likely one for powerpc.
>

Can you reproduce the problem with a powerpc qemu emulation ?
If so, how do you reproduce it there ? Reason for asking is that I don't see
the problem with any of my powerpc emulations, and I would like to add test
case(s) if possible.

Thanks,
Guenter

2021-07-07 15:36:34

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled



On 08/07/2021 00:14, Guenter Roeck wrote:
> On 7/7/21 5:11 AM, Valentin Schneider wrote:
>> On 07/07/21 14:03, Frederic Weisbecker wrote:
>>> On Wed, Jul 07, 2021 at 12:55:20AM +0100, Valentin Schneider wrote:
>>>> Thanks for the report.
>>>>
>>>> So somehow the init task ends up with a non-zero preempt_count()? Per
>>>> FORK_PREEMPT_COUNT we should exit __ret_from_fork() with a zero
>>>> count, are
>>>> you hitting the WARN_ONCE() in finish_task_switch()?
>>>>
>>>> Does CONFIG_DEBUG_PREEMPT=y yield anything interesting?
>>>>
>>>> I can't make sense of this right now, but it's a bit late :) I'll
>>>> grab some
>>>> toolchain+qemu tomorrow and go poke at it (and while at it I need to
>>>> do the
>>>> same with powerpc).
>>>
>>> One possible issue is that s390's init_idle_preempt_count() doesn't
>>> apply on the
>>> target idle task but on the _current_ CPU. And since smp_init() ->
>>> idle_threads_init() is actually called remotely, we are overwriting
>>> the current
>>> CPU preempt_count() instead of the target one.
>>
>> Indeed, this becomes quite obvious when tracing the preemption count
>> changes. This also means that s390 relied on the idle_thread_get()
>> from the
>> hotplug machinery to properly setup the preempt count, rather than
>> init_idle_preempt_count() - which is quite yuck.
>>
>> I'll write a patch for that and likely one for powerpc.
>>
>
> Can you reproduce the problem with a powerpc qemu emulation ?
> If so, how do you reproduce it there ? Reason for asking is that I don't
> see
> the problem with any of my powerpc emulations, and I would like to add test
> case(s) if possible.

I can reproduce the problem on powerpc easily - qemu with "-smp 2" does it.


--
Alexey

2021-07-07 15:38:33

by Guenter Roeck

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On Wed, Jul 07, 2021 at 03:57:17PM +0100, Valentin Schneider wrote:
> On 08/07/21 00:35, Alexey Kardashevskiy wrote:
> > On 08/07/2021 00:14, Guenter Roeck wrote:
> >>
> >> Can you reproduce the problem with a powerpc qemu emulation ?
> >> If so, how do you reproduce it there ? Reason for asking is that I don't
> >> see
> >> the problem with any of my powerpc emulations, and I would like to add test
> >> case(s) if possible.
> >
> > I can reproduce the problem on powerpc easily - qemu with "-smp 2" does it.
> >
>
> So on powerpc I'm chasing a slightly different problem, reported at
> [1]. I couldn't get it to trigger on qemu for powerpc64, and I'm still
> struggling with powerpc. Could you please share your qemu invocation &
> kernel .config? Thanks.
>
Same here. Actually, worse: All 32-bit ppc emulations I tried to
run with more than 1 CPU crash when bringing up the 2nd CPU,
and that even happens with 5.13.

So, yes, please share your qemu command line and the kernel
configuration.

Thanks,
Guenter

2021-07-07 16:39:03

by Guenter Roeck

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On 7/7/21 7:57 AM, Valentin Schneider wrote:
> On 08/07/21 00:35, Alexey Kardashevskiy wrote:
>> On 08/07/2021 00:14, Guenter Roeck wrote:
>>>
>>> Can you reproduce the problem with a powerpc qemu emulation ?
>>> If so, how do you reproduce it there ? Reason for asking is that I don't
>>> see
>>> the problem with any of my powerpc emulations, and I would like to add test
>>> case(s) if possible.
>>
>> I can reproduce the problem on powerpc easily - qemu with "-smp 2" does it.
>>
>
> So on powerpc I'm chasing a slightly different problem, reported at
> [1]. I couldn't get it to trigger on qemu for powerpc64, and I'm still
> struggling with powerpc. Could you please share your qemu invocation &
> kernel .config? Thanks.
>

I think I have it. pseries_defconfig, and pseries emulation,
started with "-smp 2" and qemu-system-ppc64:

[ 0.731644][ T1] smp: Bringing up secondary CPUs ...^M
[ 0.750546][ T0] BUG: scheduling while atomic: swapper/1/0/0x00000000^M
[ 0.752119][ T0] no locks held by swapper/1/0.^M
[ 0.752309][ T0] Modules linked in:^M
[ 0.752684][ T0] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.13.0-11855-g77d34a4683b0 #1^M
[ 0.753197][ T0] Call Trace:^M
[ 0.753334][ T0] [c000000008737b20] [c0000000009f9b18] .dump_stack_lvl+0xa4/0x100 (unreliable)^M
[ 0.754224][ T0] [c000000008737bb0] [c000000000190ed0] .__schedule_bug+0xa0/0xe0^M
[ 0.754459][ T0] [c000000008737c30] [c000000001182518] .__schedule+0xc08/0xd90^M
[ 0.754738][ T0] [c000000008737d20] [c000000001182b8c] .schedule_idle+0x2c/0x60^M
[ 0.754945][ T0] [c000000008737d90] [c0000000001a48ec] .do_idle+0x29c/0x3c0^M
[ 0.755145][ T0] [c000000008737e60] [c0000000001a4df0] .cpu_startup_entry+0x30/0x40^M
[ 0.755403][ T0] [c000000008737ee0] [c00000000005ef10] .start_secondary+0x2c0/0x300^M
[ 0.755621][ T0] [c000000008737f90] [c00000000000d254] start_secondary_prolog+0x10/0x14^M
[ 0.764164][ T1] smp: Brought up 1 node, 2 CPUs^M

Guenter

2021-07-07 18:51:26

by Valentin Schneider

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled

On 07/07/21 09:35, Guenter Roeck wrote:
> I think I have it. pseries_defconfig, and pseries emulation,
> started with "-smp 2" and qemu-system-ppc64:
>
> [ 0.731644][ T1] smp: Bringing up secondary CPUs ...^M
> [ 0.750546][ T0] BUG: scheduling while atomic: swapper/1/0/0x00000000^M
> [ 0.752119][ T0] no locks held by swapper/1/0.^M
> [ 0.752309][ T0] Modules linked in:^M
> [ 0.752684][ T0] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.13.0-11855-g77d34a4683b0 #1^M
> [ 0.753197][ T0] Call Trace:^M
> [ 0.753334][ T0] [c000000008737b20] [c0000000009f9b18] .dump_stack_lvl+0xa4/0x100 (unreliable)^M
> [ 0.754224][ T0] [c000000008737bb0] [c000000000190ed0] .__schedule_bug+0xa0/0xe0^M
> [ 0.754459][ T0] [c000000008737c30] [c000000001182518] .__schedule+0xc08/0xd90^M
> [ 0.754738][ T0] [c000000008737d20] [c000000001182b8c] .schedule_idle+0x2c/0x60^M
> [ 0.754945][ T0] [c000000008737d90] [c0000000001a48ec] .do_idle+0x29c/0x3c0^M
> [ 0.755145][ T0] [c000000008737e60] [c0000000001a4df0] .cpu_startup_entry+0x30/0x40^M
> [ 0.755403][ T0] [c000000008737ee0] [c00000000005ef10] .start_secondary+0x2c0/0x300^M
> [ 0.755621][ T0] [c000000008737f90] [c00000000000d254] start_secondary_prolog+0x10/0x14^M
> [ 0.764164][ T1] smp: Brought up 1 node, 2 CPUs^M
>
> Guenter

Hmph, I was about to say I couldn't get that, but after cycling between
different PREEMPT options I finally triggered it, so thanks for that!

Same sha1 as yours, invocation is:

qemu-system-ppc64 vmlinux -smp 2 -nographic -m 1024 -machine pseries,usb=off

with pseries_defconfig + CONFIG_DEBUG_ATOMIC_SLEEP + CONFIG_PREEMPT_VOLUNTARY

Now to dig!

2021-07-08 01:54:51

by Alexey Kardashevskiy

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/core: Initialize the idle task with preemption disabled



On 08/07/2021 03:31, Valentin Schneider wrote:
> On 07/07/21 09:35, Guenter Roeck wrote:
>> I think I have it. pseries_defconfig, and pseries emulation,
>> started with "-smp 2" and qemu-system-ppc64:
>>
>> [ 0.731644][ T1] smp: Bringing up secondary CPUs ...^M
>> [ 0.750546][ T0] BUG: scheduling while atomic: swapper/1/0/0x00000000^M
>> [ 0.752119][ T0] no locks held by swapper/1/0.^M
>> [ 0.752309][ T0] Modules linked in:^M
>> [ 0.752684][ T0] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.13.0-11855-g77d34a4683b0 #1^M
>> [ 0.753197][ T0] Call Trace:^M
>> [ 0.753334][ T0] [c000000008737b20] [c0000000009f9b18] .dump_stack_lvl+0xa4/0x100 (unreliable)^M
>> [ 0.754224][ T0] [c000000008737bb0] [c000000000190ed0] .__schedule_bug+0xa0/0xe0^M
>> [ 0.754459][ T0] [c000000008737c30] [c000000001182518] .__schedule+0xc08/0xd90^M
>> [ 0.754738][ T0] [c000000008737d20] [c000000001182b8c] .schedule_idle+0x2c/0x60^M
>> [ 0.754945][ T0] [c000000008737d90] [c0000000001a48ec] .do_idle+0x29c/0x3c0^M
>> [ 0.755145][ T0] [c000000008737e60] [c0000000001a4df0] .cpu_startup_entry+0x30/0x40^M
>> [ 0.755403][ T0] [c000000008737ee0] [c00000000005ef10] .start_secondary+0x2c0/0x300^M
>> [ 0.755621][ T0] [c000000008737f90] [c00000000000d254] start_secondary_prolog+0x10/0x14^M
>> [ 0.764164][ T1] smp: Brought up 1 node, 2 CPUs^M
>>
>> Guenter
>
> Hmph, I was about to say I couldn't get that, but after cycling between
> different PREEMPT options I finally triggered it, so thanks for that!
>
> Same sha1 as yours, invocation is:
>
> qemu-system-ppc64 vmlinux -smp 2 -nographic -m 1024 -machine pseries,usb=off
>
> with pseries_defconfig + CONFIG_DEBUG_ATOMIC_SLEEP + CONFIG_PREEMPT_VOLUNTARY
>
> Now to dig!

CONFIG_PREEMPT_NOTIFIERS=y
CONFIG_PREEMPT_NONE=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_COUNT=y

CONFIG_DEBUG_ATOMIC_SLEEP=y

is what I have, and qemu cmdline is

qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline \
-nographic \
-vga none \
-enable-kvm \
-m 512M \
-smp 2 \
-kernel ./vmldbg \
-machine pseries


(unrelated) I wonder how/why PREEMPT_NOTIFIERS work when PREEMPT_NONE=y
:-/ I have a crash in a KVM preempt notifier with such config.



--
Alexey