2020-03-04 17:01:04

by Vineeth Remanan Pillai

Subject: [RFC PATCH 03/13] sched: Core-wide rq->lock

From: Peter Zijlstra <[email protected]>

Introduce the basic infrastructure to have a core-wide rq->lock.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
---
kernel/Kconfig.preempt | 6 +++
kernel/sched/core.c | 113 ++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 31 +++++++++++
3 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..577c288e81e5 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -80,3 +80,9 @@ config PREEMPT_COUNT
config PREEMPTION
bool
select PREEMPT_COUNT
+
+config SCHED_CORE
+ bool
+ default y
+ depends on SCHED_SMT
+
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28ba9b56dd8a..ba17ff8a8663 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,70 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;

+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/*
+ * The static-key + stop-machine variable are needed such that:
+ *
+ * spin_lock(rq_lockp(rq));
+ * ...
+ * spin_unlock(rq_lockp(rq));
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+static int __sched_core_stopper(void *data)
+{
+ bool enabled = !!(unsigned long)data;
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ return 0;
+}
+
+static DEFINE_MUTEX(sched_core_mutex);
+static int sched_core_count;
+
+static void __sched_core_enable(void)
+{
+ // XXX verify there are no cookie tasks (yet)
+
+ static_branch_enable(&__sched_core_enabled);
+ stop_machine(__sched_core_stopper, (void *)true, NULL);
+}
+
+static void __sched_core_disable(void)
+{
+ // XXX verify there are no cookie tasks (left)
+
+ stop_machine(__sched_core_stopper, (void *)false, NULL);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ mutex_lock(&sched_core_mutex);
+ if (!sched_core_count++)
+ __sched_core_enable();
+ mutex_unlock(&sched_core_mutex);
+}
+
+void sched_core_put(void)
+{
+ mutex_lock(&sched_core_mutex);
+ if (!--sched_core_count)
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -6400,8 +6464,15 @@ int sched_cpu_activate(unsigned int cpu)
/*
* When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
static_branch_inc_cpuslocked(&sched_smt_present);
+#ifdef CONFIG_SCHED_CORE
+ if (static_branch_unlikely(&__sched_core_enabled)) {
+ rq->core_enabled = true;
+ }
+#endif
+ }
+
#endif
set_cpu_active(cpu, true);

@@ -6447,8 +6518,16 @@ int sched_cpu_deactivate(unsigned int cpu)
/*
* When going down, decrement the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
+#ifdef CONFIG_SCHED_CORE
+ struct rq *rq = cpu_rq(cpu);
+ if (static_branch_unlikely(&__sched_core_enabled)) {
+ rq->core_enabled = false;
+ }
+#endif
static_branch_dec_cpuslocked(&sched_smt_present);
+
+ }
#endif

if (!sched_smp_initialized)
@@ -6473,6 +6552,28 @@ static void sched_rq_cpu_starting(unsigned int cpu)

int sched_cpu_starting(unsigned int cpu)
{
+#ifdef CONFIG_SCHED_CORE
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq, *core_rq = NULL;
+ int i;
+
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+ if (rq->core && rq->core == rq)
+ core_rq = rq;
+ }
+
+ if (!core_rq)
+ core_rq = cpu_rq(cpu);
+
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+
+ WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ rq->core = core_rq;
+ }
+#endif /* CONFIG_SCHED_CORE */
+
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -6501,6 +6602,9 @@ int sched_cpu_dying(unsigned int cpu)
update_max_interval();
nohz_balance_exit_idle(rq);
hrtick_clear(rq);
+#ifdef CONFIG_SCHED_CORE
+ rq->core = NULL;
+#endif
return 0;
}
#endif
@@ -6695,6 +6799,11 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = NULL;
+ rq->core_enabled = 0;
+#endif
}

set_load_weight(&init_task, false);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a8335e3078ab..a3941b2ee29e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -999,6 +999,12 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ unsigned int core_enabled;
+#endif
};

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1026,11 +1032,36 @@ static inline int cpu_of(struct rq *rq)
#endif
}

+#ifdef CONFIG_SCHED_CORE
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
static inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
return &rq->__lock;
}

+#endif /* CONFIG_SCHED_CORE */
+
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);

--
2.17.1
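
(Not part of the patch, just an illustration of the invariant described in
the comment above __sched_core_stopper(): once core scheduling is enabled,
all SMT siblings of a core must resolve rq_lockp() to the same lock. A
hypothetical sanity check, assuming CPUs 0 and 1 are siblings of one core:)

	/* hypothetical: siblings of one core share a single rq lock */
	WARN_ON(rq_lockp(cpu_rq(0)) != rq_lockp(cpu_rq(1)));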


2020-04-01 11:29:53

by Cheng Jian

Subject: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting

When SCHED_CORE is enabled, sched_cpu_starting() uses thread_sibling as
the SMT mask to initialize rq->core, but thread_sibling is only ready for
use after store_cpu_topology():

notify_cpu_starting()
-> sched_cpu_starting() # uses thread_sibling

store_cpu_topology(cpu)
-> update_siblings_masks # sets thread_sibling

Fix this by calling notify_cpu_starting() later, just like x86 does.

Signed-off-by: Cheng Jian <[email protected]>
---
arch/arm64/kernel/smp.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 5407bf5d98ac..a427c14e82af 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -236,13 +236,18 @@ asmlinkage notrace void secondary_start_kernel(void)
cpuinfo_store_cpu();

/*
- * Enable GIC and timers.
+ * Store cpu topology before notify_cpu_starting,
+ * CPUHP_AP_SCHED_STARTING requires SMT topology
+ * been initialized for SCHED_CORE.
*/
- notify_cpu_starting(cpu);
-
store_cpu_topology(cpu);
numa_add_cpu(cpu);

+ /*
+ * Enable GIC and timers.
+ */
+ notify_cpu_starting(cpu);
+
/*
* OK, now it's safe to let the boot CPU continue. Wait for
* the CPU migration code to notice that the CPU is online
--
2.17.1

2020-04-01 13:24:33

by Valentin Schneider

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting


(+LAKML, +Sudeep)

On Wed, Apr 01 2020, Cheng Jian wrote:
> when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
> SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
> the thread_sibling is ready for use.
>
> notify_cpu_starting()
> -> sched_cpu_starting() # use thread_sibling
>
> store_cpu_topology(cpu)
> -> update_siblings_masks # set thread_sibling
>
> Fix this by doing notify_cpu_starting later, just like x86 do.
>

I haven't been following the sched core stuff closely; can't this
rq->core assignment be done in sched_cpu_activate() instead? We already
look at the cpu_smt_mask() in there, and it is valid (we go through the
entirety of secondary_start_kernel() before getting anywhere near
CPUHP_AP_ACTIVE).

I don't think this breaks anything, but without this dependency in
sched_cpu_starting() then there isn't really a reason for this move.
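
(Purely as an illustration of that alternative — assuming the rq->core
setup loop from the RFC's sched_cpu_starting() is simply moved, and
ignoring the rq variable sched_cpu_activate() already declares — it might
look roughly like this:)

#ifdef CONFIG_SCHED_CORE
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	struct rq *rq, *core_rq = NULL;
	int i;

	/* if a sibling already acts as the core rq, reuse it */
	for_each_cpu(i, smt_mask) {
		rq = cpu_rq(i);
		if (rq->core && rq->core == rq)
			core_rq = rq;
	}

	if (!core_rq)
		core_rq = cpu_rq(cpu);

	/* point every sibling at the same core rq */
	for_each_cpu(i, smt_mask) {
		rq = cpu_rq(i);
		WARN_ON_ONCE(rq->core && rq->core != core_rq);
		rq->core = core_rq;
	}
#endif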

> Signed-off-by: Cheng Jian <[email protected]>
> ---
> arch/arm64/kernel/smp.c | 11 ++++++++---
> 1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 5407bf5d98ac..a427c14e82af 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -236,13 +236,18 @@ asmlinkage notrace void secondary_start_kernel(void)
> cpuinfo_store_cpu();
>
> /*
> - * Enable GIC and timers.
> + * Store cpu topology before notify_cpu_starting,
> + * CPUHP_AP_SCHED_STARTING requires SMT topology
> + * been initialized for SCHED_CORE.
> */
> - notify_cpu_starting(cpu);
> -
> store_cpu_topology(cpu);
> numa_add_cpu(cpu);
>
> + /*
> + * Enable GIC and timers.
> + */
> + notify_cpu_starting(cpu);
> +
> /*
> * OK, now it's safe to let the boot CPU continue. Wait for
> * the CPU migration code to notice that the CPU is online

2020-04-06 08:01:52

by Cheng Jian

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting


On 2020/4/1 21:23, Valentin Schneider wrote:
> (+LAKML, +Sudeep)
>
> On Wed, Apr 01 2020, Cheng Jian wrote:
>> when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
>> SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
>> the thread_sibling is ready for use.
>>
>> notify_cpu_starting()
>> -> sched_cpu_starting() # use thread_sibling
>>
>> store_cpu_topology(cpu)
>> -> update_siblings_masks # set thread_sibling
>>
>> Fix this by doing notify_cpu_starting later, just like x86 do.
>>
> I haven't been following the sched core stuff closely; can't this
> rq->core assignment be done in sched_cpu_activate() instead? We already
> look at the cpu_smt_mask() in there, and it is valid (we go through the
> entirety of secondary_start_kernel() before getting anywhere near
> CPUHP_AP_ACTIVE).
>
> I don't think this breaks anything, but without this dependency in
> sched_cpu_starting() then there isn't really a reason for this move.

Yes, it is correct to put the rq->core assignment in sched_cpu_activate().

The cpu_smt_mask is already valid here.


I have made such an attempt on my own branch and it passed testing.


Thank you.


    -- Cheng Jian


2020-04-09 10:00:58

by Sudeep Holla

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting

On Wed, Apr 01, 2020 at 02:23:33PM +0100, Valentin Schneider wrote:
>
> (+LAKML, +Sudeep)
>

Thanks Valentin.

> On Wed, Apr 01 2020, Cheng Jian wrote:
> > when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
> > SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
> > the thread_sibling is ready for use.
> >
> > notify_cpu_starting()
> > -> sched_cpu_starting() # use thread_sibling
> >
> > store_cpu_topology(cpu)
> > -> update_siblings_masks # set thread_sibling
> >
> > Fix this by doing notify_cpu_starting later, just like x86 do.
> >
>
> I haven't been following the sched core stuff closely; can't this
> rq->core assignment be done in sched_cpu_activate() instead? We already
> look at the cpu_smt_mask() in there, and it is valid (we go through the
> entirety of secondary_start_kernel() before getting anywhere near
> CPUHP_AP_ACTIVE).
>

I too came to the same conclusion. Did you see any issues? Or is it
just code inspection for parity with x86?

> I don't think this breaks anything, but without this dependency in
> sched_cpu_starting() then there isn't really a reason for this move.
>

Based on the commit message, I had a quick look at the x86 code and I agree
this shouldn't break anything. However, the commit message doesn't make
complete sense to me, especially the reference to sched_cpu_starting()
while the smt masks are accessed in sched_cpu_activate(). Or am I missing
something here?

--
Regards,
Sudeep

2020-04-09 10:34:40

by Valentin Schneider

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting


On 09/04/20 10:59, Sudeep Holla wrote:
> On Wed, Apr 01, 2020 at 02:23:33PM +0100, Valentin Schneider wrote:
>>
>> (+LAKML, +Sudeep)
>>
>
> Thanks Valentin.
>
>> On Wed, Apr 01 2020, Cheng Jian wrote:
>> > when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
>> > SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
>> > the thread_sibling is ready for use.
>> >
>> > notify_cpu_starting()
>> > -> sched_cpu_starting() # use thread_sibling
>> >
>> > store_cpu_topology(cpu)
>> > -> update_siblings_masks # set thread_sibling
>> >
>> > Fix this by doing notify_cpu_starting later, just like x86 do.
>> >
>>
>> I haven't been following the sched core stuff closely; can't this
>> rq->core assignment be done in sched_cpu_activate() instead? We already
>> look at the cpu_smt_mask() in there, and it is valid (we go through the
>> entirety of secondary_start_kernel() before getting anywhere near
>> CPUHP_AP_ACTIVE).
>>
>
> I too came to same conclusion. Did you see any issues ? Or is it
> just code inspection in parity with x86 ?
>

With mainline this isn't a problem; with the core scheduling stuff there is
an expectation that we can use the SMT masks in sched_cpu_starting().

>> I don't think this breaks anything, but without this dependency in
>> sched_cpu_starting() then there isn't really a reason for this move.
>>
>
> Based on the commit message, had a quick look at x86 code and I agree
> this shouldn't break anything. However the commit message does make
> complete sense to me, especially reference to sched_cpu_starting
> while smt_masks are accessed in sched_cpu_activate. Or am I missing
> to understand something here ?

As stated above, it's not a problem for mainline, and AIUI we can change
the core scheduling bits to only use the SMT mask in sched_cpu_activate()
instead, therefore not requiring any change in the arch code.

I'm not aware of any written rule that the topology masks should be usable
from a given hotplug state upwards, only that right now we need them in
sched_cpu_(de)activate() for SMT scheduling - and that is already working
fine.

So really this should be considered a simple neutral cleanup; I don't
really have any opinion on picking it up or not.

2020-04-09 11:10:24

by Sudeep Holla

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting

On Thu, Apr 09, 2020 at 11:32:12AM +0100, Valentin Schneider wrote:
>
> On 09/04/20 10:59, Sudeep Holla wrote:
> > On Wed, Apr 01, 2020 at 02:23:33PM +0100, Valentin Schneider wrote:
> >>
> >> (+LAKML, +Sudeep)
> >>
> >
> > Thanks Valentin.
> >
> >> On Wed, Apr 01 2020, Cheng Jian wrote:
> >> > when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
> >> > SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
> >> > the thread_sibling is ready for use.
> >> >
> >> > notify_cpu_starting()
> >> > -> sched_cpu_starting() # use thread_sibling
> >> >
> >> > store_cpu_topology(cpu)
> >> > -> update_siblings_masks # set thread_sibling
> >> >
> >> > Fix this by doing notify_cpu_starting later, just like x86 do.
> >> >
> >>
> >> I haven't been following the sched core stuff closely; can't this
> >> rq->core assignment be done in sched_cpu_activate() instead? We already
> >> look at the cpu_smt_mask() in there, and it is valid (we go through the
> >> entirety of secondary_start_kernel() before getting anywhere near
> >> CPUHP_AP_ACTIVE).
> >>
> >
> > I too came to same conclusion. Did you see any issues ? Or is it
> > just code inspection in parity with x86 ?
> >
>
> With mainline this isn't a problem; with the core scheduling stuff there is
> an expectation that we can use the SMT masks in sched_cpu_starting().
>

Ah, OK. I prefer this to be specified in the commit message as it is not
obvious.

> >> I don't think this breaks anything, but without this dependency in
> >> sched_cpu_starting() then there isn't really a reason for this move.
> >>
> >
> > Based on the commit message, had a quick look at x86 code and I agree
> > this shouldn't break anything. However the commit message does make
> > complete sense to me, especially reference to sched_cpu_starting
> > while smt_masks are accessed in sched_cpu_activate. Or am I missing
> > to understand something here ?
>
> As stated above, it's not a problem for mainline, and AIUI we can change
> the core scheduling bits to only use the SMT mask in sched_cpu_activate()
> instead, therefore not requiring any change in the arch code.
>

Either way is fine. If the expectation is already that the SMT masks need
to be set before sched_cpu_starting(), then let us just stick with that.

> I'm not aware of any written rule that the topology masks should be usable
> from a given hotplug state upwards, only that right now we need them in
> sched_cpu_(de)activate() for SMT scheduling - and that is already working
> fine.
>

Sure, we can at least document this as part of the change, even if it is
just in arm64, so that someone need not wonder the same in future.

> So really this should be considering as a simple neutral cleanup; I don't
> really have any opinion on picking it up or not.

I am fine with the change too; it just needs some tweaking in the commit
message.

--
Regards,
Sudeep

2020-04-09 17:55:18

by Joel Fernandes

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting

On Wed, Apr 1, 2020 at 7:27 AM Cheng Jian <[email protected]> wrote:
>
> when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
> SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
> the thread_sibling is ready for use.
>
> notify_cpu_starting()
> -> sched_cpu_starting() # use thread_sibling
>
> store_cpu_topology(cpu)
> -> update_siblings_masks # set thread_sibling
>
> Fix this by doing notify_cpu_starting later, just like x86 do.
>
> Signed-off-by: Cheng Jian <[email protected]>

Just a high-level question, why does core-scheduling matter on ARM64?
Is it for HPC workloads?

Thanks,

- Joel


> ---
> arch/arm64/kernel/smp.c | 11 ++++++++---
> 1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 5407bf5d98ac..a427c14e82af 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -236,13 +236,18 @@ asmlinkage notrace void secondary_start_kernel(void)
> cpuinfo_store_cpu();
>
> /*
> - * Enable GIC and timers.
> + * Store cpu topology before notify_cpu_starting,
> + * CPUHP_AP_SCHED_STARTING requires SMT topology
> + * been initialized for SCHED_CORE.
> */
> - notify_cpu_starting(cpu);
> -
> store_cpu_topology(cpu);
> numa_add_cpu(cpu);
>
> + /*
> + * Enable GIC and timers.
> + */
> + notify_cpu_starting(cpu);
> +
> /*
> * OK, now it's safe to let the boot CPU continue. Wait for
> * the CPU migration code to notice that the CPU is online
> --
> 2.17.1
>

2020-04-10 13:51:03

by Cheng Jian

Subject: Re: [PATCH] sched/arm64: store cpu topology before notify_cpu_starting


On 2020/4/10 1:54, Joel Fernandes wrote:
> On Wed, Apr 1, 2020 at 7:27 AM Cheng Jian <[email protected]> wrote:
>> when SCHED_CORE enabled, sched_cpu_starting() uses thread_sibling as
>> SMT_MASK to initialize rq->core, but only after store_cpu_topology(),
>> the thread_sibling is ready for use.
>>
>> notify_cpu_starting()
>> -> sched_cpu_starting() # use thread_sibling
>>
>> store_cpu_topology(cpu)
>> -> update_siblings_masks # set thread_sibling
>>
>> Fix this by doing notify_cpu_starting later, just like x86 do.
>>
>> Signed-off-by: Cheng Jian <[email protected]>
> Just a high-level question, why does core-scheduling matter on ARM64?
> Is it for HPC workloads?
>
> Thanks,
>
> - Joel

Hi, Joel

I was analyzing the mainline scheduling patches and found this problem.

ARM has some platforms that support SMT, and there is also emulation that
can be used.



Thanks.

--Cheng Jian


2020-04-15 21:26:38

by Peter Zijlstra

Subject: Re: [RFC PATCH 03/13] sched: Core-wide rq->lock

On Wed, Mar 04, 2020 at 04:59:53PM +0000, vpillai wrote:
> @@ -6400,8 +6464,15 @@ int sched_cpu_activate(unsigned int cpu)
> /*
> * When going up, increment the number of cores with SMT present.
> */
> - if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
> + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
> static_branch_inc_cpuslocked(&sched_smt_present);
> +#ifdef CONFIG_SCHED_CORE
> + if (static_branch_unlikely(&__sched_core_enabled)) {
> + rq->core_enabled = true;
> + }
> +#endif
> + }
> +
> #endif
> set_cpu_active(cpu, true);
>
> @@ -6447,8 +6518,16 @@ int sched_cpu_deactivate(unsigned int cpu)
> /*
> * When going down, decrement the number of cores with SMT present.
> */
> - if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
> + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
> +#ifdef CONFIG_SCHED_CORE
> + struct rq *rq = cpu_rq(cpu);
> + if (static_branch_unlikely(&__sched_core_enabled)) {
> + rq->core_enabled = false;
> + }
> +#endif
> static_branch_dec_cpuslocked(&sched_smt_present);
> +
> + }
> #endif
>
> if (!sched_smp_initialized)

Aside from the fact that it's probably much saner to write this as:

rq->core_enabled = static_key_enabled(&__sched_core_enabled);
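
(For concreteness — an illustration, not from the mail — the activate hunk
quoted above would then collapse to roughly:)

	if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
		static_branch_inc_cpuslocked(&sched_smt_present);
#ifdef CONFIG_SCHED_CORE
		/* mirror whatever state the static key is in */
		rq->core_enabled = static_key_enabled(&__sched_core_enabled);
#endif
	}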

I'm fairly sure I didn't write this part. And while I do somewhat see
the point of disabling core scheduling for a core that has only a single
thread on, I wonder why we care.

The thing is, this directly leads to the utter horror-show that is patch
6.

It should be perfectly possible to core schedule a core with only a
single thread on. It might be a tad silly to do, but it beats the heck
out of the trainwreck created here.

So how did this happen?

2020-04-15 21:39:30

by Peter Zijlstra

Subject: Re: [RFC PATCH 03/13] sched: Core-wide rq->lock

On Wed, Mar 04, 2020 at 04:59:53PM +0000, vpillai wrote:
> +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
> +
> +/*
> + * The static-key + stop-machine variable are needed such that:
> + *
> + * spin_lock(rq_lockp(rq));
> + * ...
> + * spin_unlock(rq_lockp(rq));
> + *
> + * ends up locking and unlocking the _same_ lock, and all CPUs
> + * always agree on what rq has what lock.
> + *
> + * XXX entirely possible to selectively enable cores, don't bother for now.
> + */
> +static int __sched_core_stopper(void *data)
> +{
> + bool enabled = !!(unsigned long)data;
> + int cpu;
> +
> + for_each_online_cpu(cpu)
> + cpu_rq(cpu)->core_enabled = enabled;
> +
> + return 0;
> +}
> +
> +static DEFINE_MUTEX(sched_core_mutex);
> +static int sched_core_count;
> +
> +static void __sched_core_enable(void)
> +{
> + // XXX verify there are no cookie tasks (yet)
> +
> + static_branch_enable(&__sched_core_enabled);
> + stop_machine(__sched_core_stopper, (void *)true, NULL);
> +}
> +
> +static void __sched_core_disable(void)
> +{
> + // XXX verify there are no cookie tasks (left)
> +
> + stop_machine(__sched_core_stopper, (void *)false, NULL);
> + static_branch_disable(&__sched_core_enabled);
> +}

> +static inline raw_spinlock_t *rq_lockp(struct rq *rq)
> +{
> + if (sched_core_enabled(rq))
> + return &rq->core->__lock;
> +
> + return &rq->__lock;
> +}

While reading all this again, I realized it's not too hard to get rid of
stop-machine here.

void __raw_rq_lock(struct rq *rq)
{
	raw_spinlock_t *lock;

	for (;;) {
		lock = rq_lockp(rq);

		raw_spin_lock(lock);
		/* rq_lockp() may have been switched while we were spinning */
		if (lock == rq_lockp(rq))
			return;
		raw_spin_unlock(lock);
	}
}

void __sched_core_enable(int core, bool enable)
{
	const struct cpumask *smt_mask;
	int cpu, i = 0;

	smt_mask = cpu_smt_mask(core);

	/* hold every sibling's rq lock while flipping core_enabled */
	for_each_cpu(cpu, smt_mask)
		raw_spin_lock_nested(&cpu_rq(cpu)->__lock, i++);

	for_each_cpu(cpu, smt_mask)
		cpu_rq(cpu)->core_enabled = enable;

	for_each_cpu(cpu, smt_mask)
		raw_spin_unlock(&cpu_rq(cpu)->__lock);
}
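
(The re-check of rq_lockp() after acquiring the lock is what makes the
retry loop safe: __sched_core_enable() flips core_enabled while holding
every sibling's __lock, so a locker that raced with the switch finds the
pointer changed once it gets the lock, drops it and tries again.)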


2020-04-15 22:58:13

by Peter Zijlstra

Subject: Re: [RFC PATCH 03/13] sched: Core-wide rq->lock

On Tue, Apr 14, 2020 at 05:35:07PM -0400, Vineeth Remanan Pillai wrote:
> > Aside from the fact that it's probably much saner to write this as:
> >
> > rq->core_enabled = static_key_enabled(&__sched_core_enabled);
> >
> > I'm fairly sure I didn't write this part. And while I do somewhat see
> > the point of disabling core scheduling for a core that has only a single
> > thread on, I wonder why we care.
> >
> I think this change was to fix some crashes which happened due to
> uninitialized rq->core if a sibling was offline during boot and is
> onlined after coresched was enabled.
>
> https://lwn.net/ml/linux-kernel/[email protected]/
>
> I tried to fix it by initializing coresched members during a cpu online
> and tearing it down on a cpu offline. This was back in v3 and do not
> remember the exact details. I shall revisit this and see if there is a
> better way to fix the race condition above.

Argh, that problem again. So AFAIK booting with maxcpus= is broken in a
whole number of 'interesting' ways. I'm not sure what to do about that,
perhaps we should add a config around that option and make it depend on
CONFIG_BROKEN.

That said; I'm thinking it shouldn't be too hard to fix up the core
state before we add the CPU to the masks, but it will be arch specific.
See speculative_store_bypass_ht_init() for inspiration, but you'll need
to be even earlier, before set_cpu_sibling_map() in smp_callin() on x86
(no clue about other archs).

Even without maxcpus= this can happen when you do physical hotplug and
add a part (or replace one where the new part has more cores than the
old).

The moment core-scheduling is enabled and you're adding unknown
topology, we need to set up state before we publish the mask,... or I
suppose endlessly do: 'smt_mask & active_mask' all over the place :/ In
which case you can indeed do it purely in sched/core.
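
(Illustration only, not from the mail — the 'smt_mask & active_mask' idea
would mean every user of the sibling mask filtering it first, e.g. with a
hypothetical scratch mask:)

	cpumask_t tmp;

	cpumask_and(&tmp, cpu_smt_mask(cpu), cpu_active_mask);
	for_each_cpu(i, &tmp) {
		/* only siblings that are already active have core state set up */
	}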

Hurmph...

2020-04-15 23:35:06

by Vineeth Remanan Pillai

Subject: Re: [RFC PATCH 03/13] sched: Core-wide rq->lock

> Aside from the fact that it's probably much saner to write this as:
>
> rq->core_enabled = static_key_enabled(&__sched_core_enabled);
>
> I'm fairly sure I didn't write this part. And while I do somewhat see
> the point of disabling core scheduling for a core that has only a single
> thread on, I wonder why we care.
>
I think this change was to fix some crashes which happened due to
uninitialized rq->core if a sibling was offline during boot and is
onlined after coresched was enabled.

https://lwn.net/ml/linux-kernel/[email protected]/

I tried to fix it by initializing the coresched members during a cpu
online and tearing them down on a cpu offline. This was back in v3 and I
do not remember the exact details. I shall revisit this and see if there
is a better way to fix the race condition above.

Thanks,
Vineeth