2022-02-14 09:41:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [GIT PULL] sched/urgent for 5.17-rc4

On Mon, Feb 14, 2022 at 09:45:22AM +0100, Peter Zijlstra wrote:
> On Sun, Feb 13, 2022 at 10:02:22AM -0800, Linus Torvalds wrote:
> > On Sun, Feb 13, 2022 at 4:37 AM Borislav Petkov <[email protected]> wrote:
> > >
> > > Tadeusz Struk (1):
> > > sched/fair: Fix fault in reweight_entity
> >
> > I've pulled this, but this really smells bad to me.
> >
> > If set_load_weight() can see a process that hasn't even had the
> > runqueue pointer set yet, then what keeps *others* from the same
> > thing?
>
> Urgh, I think you're right, the moment we enter the pidhash and become
> visible we should be complete. That means the previous commit
> (4ef0c5c6b5ba) is buggered... Let me try and make sense of all that
> cgroup stuff again :-(

Zhang, Tadeusz, TJ, how does this look?

---
include/linux/sched/task.h | 4 ++--
kernel/fork.c | 9 ++++++++-
kernel/sched/core.c | 34 +++++++++++++++++++++-------------
3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index b9198a1b3a84..e84e54d1b490 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b21..05faebafe2b5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2266,6 +2266,13 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_put_pidfd;

+ /*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ */
+ sched_cgroup_fork(p, args);
+
/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
@@ -2376,7 +2383,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);

proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fcf0c180617c..dd97a42b1eee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif

-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
- bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;

@@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);

p->prio = p->normal_prio = p->static_prio;
- set_load_weight(p);
+ set_load_weight(p, false);

/*
* We don't need the reset flag anymore after the fork. It has
@@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)

init_entity_runnable_average(&p->se);

+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}

-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif

+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = autogroup_task_group(p, tg);
+ }
#endif
rseq_migrate(p);
/*
@@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}

+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}

@@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);

p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);

@@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}

/*
@@ -9446,7 +9454,7 @@ void __init sched_init(void)
#endif
}

- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);

/*
* The boot idle thread does lazy MMU switching as well:


2022-02-14 21:25:02

by Tejun Heo

[permalink] [raw]
Subject: Re: [GIT PULL] sched/urgent for 5.17-rc4

Hello, Peter.

On Mon, Feb 14, 2022 at 10:16:57AM +0100, Peter Zijlstra wrote:
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d75a528f7b21..05faebafe2b5 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2266,6 +2266,13 @@ static __latent_entropy struct task_struct *copy_process(
> if (retval)
> goto bad_fork_put_pidfd;
>
> + /*
> + * Now that the cgroups are pinned, re-clone the parent cgroup and put
> + * the new task on the correct runqueue. All this *before* the task
> + * becomes visible.
> + */
> + sched_cgroup_fork(p, args);

Would it be less confusing to comment that this isn't ->can_fork() because
scheduler task_group needs to be initialized for autogroup even when cgroup
is disabled and maybe name it sched_cgroup_can_fork() even if it always
succeeds?

> +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
> {
> unsigned long flags;
> -#ifdef CONFIG_CGROUP_SCHED
> - struct task_group *tg;
> -#endif
>
> + /*
> + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
> + * required yet, but lockdep gets upset if rules are violated.
> + */
> raw_spin_lock_irqsave(&p->pi_lock, flags);
> #ifdef CONFIG_CGROUP_SCHED
> - tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
> - struct task_group, css);
> - p->sched_task_group = autogroup_task_group(p, tg);
> + if (1) {
> + struct task_group *tg;
> + tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
> + struct task_group, css);
> + tg = autogroup_task_group(p, tg);
> + p->sched_task_group = autogroup_task_group(p, tg);
> + }

I suppose the double autogroup_task_group() call is unintentional?

Otherwise, looks good to me. The only requirement from cgroup side is that
the membership should be initialized between ->can_fork() and ->fork()
inclusively, and sans autogroup this would have been done as a part of
->can_fork() so the proposed change makes sense to me.

Thanks.

--
tejun

2022-02-14 21:35:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [GIT PULL] sched/urgent for 5.17-rc4

On Mon, Feb 14, 2022 at 09:17:12AM -1000, Tejun Heo wrote:
> Hello, Peter.
>
> On Mon, Feb 14, 2022 at 10:16:57AM +0100, Peter Zijlstra wrote:
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index d75a528f7b21..05faebafe2b5 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -2266,6 +2266,13 @@ static __latent_entropy struct task_struct *copy_process(
> > if (retval)
> > goto bad_fork_put_pidfd;
> >
> > + /*
> > + * Now that the cgroups are pinned, re-clone the parent cgroup and put
> > + * the new task on the correct runqueue. All this *before* the task
> > + * becomes visible.
> > + */
> > + sched_cgroup_fork(p, args);
>
> Would it be less confusing to comment that this isn't ->can_fork() because
> scheduler task_group needs to be initialized for autogroup even when cgroup
> is disabled and maybe name it sched_cgroup_can_fork() even if it always
> succeeds?

So there are two things that need doing: the re-cloning of the task_group
thing, and also the calling of __set_task_cpu(), which sets up the proper
runqueue links.

The first is CGroup only, and *could* in theory be done in ->can_fork(),
but the second needs to be done unconditionally, and it doesn't make
much sense to split this up.

I actually tried, but it made the patch bigger/uglier -- but maybe I
didn't try hard enough.

> > +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
> > {
> > unsigned long flags;
> > -#ifdef CONFIG_CGROUP_SCHED
> > - struct task_group *tg;
> > -#endif
> >
> > + /*
> > + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
> > + * required yet, but lockdep gets upset if rules are violated.
> > + */
> > raw_spin_lock_irqsave(&p->pi_lock, flags);
> > #ifdef CONFIG_CGROUP_SCHED
> > - tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
> > - struct task_group, css);
> > - p->sched_task_group = autogroup_task_group(p, tg);
> > + if (1) {
> > + struct task_group *tg;
> > + tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
> > + struct task_group, css);
> > + tg = autogroup_task_group(p, tg);
> > + p->sched_task_group = autogroup_task_group(p, tg);
> > + }
>
> I suppose the double autogroup_task_group() call is unintentional?

Yeah, that's a silly fail. Will amend.

> Otherwise, looks good to me. The only requirement from cgroup side is that
> the membership should be initialized between ->can_fork() and ->fork()
> inclusively, and sans autogroup this would have been done as a part of
> ->can_fork() so the proposed change makes sense to me.

Thanks! I suppose I should go write me a Changelog then... assuming it
actually works :-)

2022-02-17 21:32:41

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH] sched: Fix yet more sched_fork() races

On Mon, Feb 14, 2022 at 10:16:57AM +0100, Peter Zijlstra wrote:
> Zhang, Tadeusz, TJ, how does this look?

*sigh* I was hoping for some Tested-by, since I've no idea how to
operate this cgroup stuff properly.

Anyway, full patch below. I'll go stick it in sched/urgent.

---
Subject: sched: Fix yet more sched_fork() races
From: Peter Zijlstra <[email protected]>
Date: Mon, 14 Feb 2022 10:16:57 +0100

Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
race vs syscalls by not placing the task on the runqueue before it
gets exposed through the pidhash.

Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
trying to fix a single instance of this; instead, fix the whole class
of issues, effectively reverting that commit.

Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
Reported-by: Linus Torvalds <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched/task.h | 4 ++--
kernel/fork.c | 13 ++++++++++++-
kernel/sched/core.c | 34 +++++++++++++++++++++-------------
3 files changed, 35 insertions(+), 16 deletions(-)

--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(str
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2266,6 +2266,17 @@ static __latent_entropy struct task_stru
goto bad_fork_put_pidfd;

/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2375,7 +2386,7 @@ static __latent_entropy struct task_stru
write_unlock_irq(&tasklist_lock);

proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1215,9 +1215,8 @@ int tg_nop(struct task_group *tg, void *
}
#endif

-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
- bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;

@@ -4408,7 +4407,7 @@ int sched_fork(unsigned long clone_flags
p->static_prio = NICE_TO_PRIO(0);

p->prio = p->normal_prio = p->static_prio;
- set_load_weight(p);
+ set_load_weight(p, false);

/*
* We don't need the reset flag anymore after the fork. It has
@@ -4426,6 +4425,7 @@ int sched_fork(unsigned long clone_flags

init_entity_runnable_average(&p->se);

+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4441,18 +4441,23 @@ int sched_fork(unsigned long clone_flags
return 0;
}

-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif

+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
#endif
rseq_migrate(p);
/*
@@ -4463,7 +4468,10 @@ void sched_post_fork(struct task_struct
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}

+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}

@@ -6923,7 +6931,7 @@ void set_user_nice(struct task_struct *p
put_prev_task(rq, p);

p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);

@@ -7214,7 +7222,7 @@ static void __setscheduler_params(struct
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}

/*
@@ -9447,7 +9455,7 @@ void __init sched_init(void)
#endif
}

- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);

/*
* The boot idle thread does lazy MMU switching as well:

2022-02-17 23:20:30

by Tadeusz Struk

[permalink] [raw]
Subject: Re: [PATCH] sched: Fix yet more sched_fork() races

On 2/17/22 00:51, Peter Zijlstra wrote:
> On Mon, Feb 14, 2022 at 10:16:57AM +0100, Peter Zijlstra wrote:
>> Zhang, Tadeusz, TJ, how does this look?
>
> *sigh* I was hoping for some Tested-by, since I've no idea how to
> operate this cgroup stuff properly.
>
> Anyway, full patch below. I'll go stick it in sched/urgent.

Just tested it on 5.17.0-rc4 and it looks ok.

Tested-by: Tadeusz Struk <[email protected]>

--
Thanks,
Tadeusz

2022-02-18 06:29:21

by Zhang Qiao

[permalink] [raw]
Subject: Re: [PATCH] sched: Fix yet more sched_fork() races



在 2022/2/17 16:51, Peter Zijlstra 写道:
> On Mon, Feb 14, 2022 at 10:16:57AM +0100, Peter Zijlstra wrote:
>> Zhang, Tadeusz, TJ, how does this look?
>
> *sigh* I was hoping for some Tested-by, since I've no idea how to
> operate this cgroup stuff properly.

Tested this patch successfully.

Tested-by: Zhang Qiao <[email protected]>


>
> Anyway, full patch below. I'll go stick it in sched/urgent.
>
> ---
> Subject: sched: Fix yet more sched_fork() races
> From: Peter Zijlstra <[email protected]>
> Date: Mon, 14 Feb 2022 10:16:57 +0100
>
> Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
> invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
> race vs syscalls by not placing the task on the runqueue before it
> gets exposed through the pidhash.
>
> Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
> trying to fix a single instance of this; instead, fix the whole class
> of issues, effectively reverting that commit.
>
> Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
> Reported-by: Linus Torvalds <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/sched/task.h | 4 ++--
> kernel/fork.c | 13 ++++++++++++-
> kernel/sched/core.c | 34 +++++++++++++++++++++-------------
> 3 files changed, 35 insertions(+), 16 deletions(-)
>
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(str
> extern void init_idle(struct task_struct *idle, int cpu);
>
> extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
> -extern void sched_post_fork(struct task_struct *p,
> - struct kernel_clone_args *kargs);
> +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
> +extern void sched_post_fork(struct task_struct *p);
> extern void sched_dead(struct task_struct *p);
>
> void __noreturn do_task_dead(void);
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2266,6 +2266,17 @@ static __latent_entropy struct task_stru
> goto bad_fork_put_pidfd;
>
> /*
> + * Now that the cgroups are pinned, re-clone the parent cgroup and put
> + * the new task on the correct runqueue. All this *before* the task
> + * becomes visible.
> + *
> + * This isn't part of ->can_fork() because while the re-cloning is
> + * cgroup specific, it unconditionally needs to place the task on a
> + * runqueue.
> + */
> + sched_cgroup_fork(p, args);
> +
> + /*
> * From this point on we must avoid any synchronous user-space
> * communication until we take the tasklist-lock. In particular, we do
> * not want user-space to be able to predict the process start-time by
> @@ -2375,7 +2386,7 @@ static __latent_entropy struct task_stru
> write_unlock_irq(&tasklist_lock);
>
> proc_fork_connector(p);
> - sched_post_fork(p, args);
> + sched_post_fork(p);
> cgroup_post_fork(p, args);
> perf_event_fork(p);
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1215,9 +1215,8 @@ int tg_nop(struct task_group *tg, void *
> }
> #endif
>
> -static void set_load_weight(struct task_struct *p)
> +static void set_load_weight(struct task_struct *p, bool update_load)
> {
> - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
> int prio = p->static_prio - MAX_RT_PRIO;
> struct load_weight *load = &p->se.load;
>
> @@ -4408,7 +4407,7 @@ int sched_fork(unsigned long clone_flags
> p->static_prio = NICE_TO_PRIO(0);
>
> p->prio = p->normal_prio = p->static_prio;
> - set_load_weight(p);
> + set_load_weight(p, false);
>
> /*
> * We don't need the reset flag anymore after the fork. It has
> @@ -4426,6 +4425,7 @@ int sched_fork(unsigned long clone_flags
>
> init_entity_runnable_average(&p->se);
>
> +
> #ifdef CONFIG_SCHED_INFO
> if (likely(sched_info_on()))
> memset(&p->sched_info, 0, sizeof(p->sched_info));
> @@ -4441,18 +4441,23 @@ int sched_fork(unsigned long clone_flags
> return 0;
> }
>
> -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
> +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
> {
> unsigned long flags;
> -#ifdef CONFIG_CGROUP_SCHED
> - struct task_group *tg;
> -#endif
>
> + /*
> + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
> + * required yet, but lockdep gets upset if rules are violated.
> + */
> raw_spin_lock_irqsave(&p->pi_lock, flags);
> #ifdef CONFIG_CGROUP_SCHED
> - tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
> - struct task_group, css);
> - p->sched_task_group = autogroup_task_group(p, tg);
> + if (1) {
> + struct task_group *tg;
> + tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
> + struct task_group, css);
> + tg = autogroup_task_group(p, tg);
> + p->sched_task_group = tg;
> + }
> #endif
> rseq_migrate(p);
> /*
> @@ -4463,7 +4468,10 @@ void sched_post_fork(struct task_struct
> if (p->sched_class->task_fork)
> p->sched_class->task_fork(p);
> raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +}
>
> +void sched_post_fork(struct task_struct *p)
> +{
> uclamp_post_fork(p);
> }
>
> @@ -6923,7 +6931,7 @@ void set_user_nice(struct task_struct *p
> put_prev_task(rq, p);
>
> p->static_prio = NICE_TO_PRIO(nice);
> - set_load_weight(p);
> + set_load_weight(p, true);
> old_prio = p->prio;
> p->prio = effective_prio(p);
>
> @@ -7214,7 +7222,7 @@ static void __setscheduler_params(struct
> */
> p->rt_priority = attr->sched_priority;
> p->normal_prio = normal_prio(p);
> - set_load_weight(p);
> + set_load_weight(p, true);
> }
>
> /*
> @@ -9447,7 +9455,7 @@ void __init sched_init(void)
> #endif
> }
>
> - set_load_weight(&init_task);
> + set_load_weight(&init_task, false);
>
> /*
> * The boot idle thread does lazy MMU switching as well:
>
> .
>

Subject: [tip: sched/urgent] sched: Fix yet more sched_fork() races

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: b1e8206582f9d680cff7d04828708c8b6ab32957
Gitweb: https://git.kernel.org/tip/b1e8206582f9d680cff7d04828708c8b6ab32957
Author: Peter Zijlstra <[email protected]>
AuthorDate: Mon, 14 Feb 2022 10:16:57 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Sat, 19 Feb 2022 11:11:05 +01:00

sched: Fix yet more sched_fork() races

Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
race vs syscalls by not placing the task on the runqueue before it
gets exposed through the pidhash.

Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
trying to fix a single instance of this; instead, fix the whole class
of issues, effectively reverting that commit.

Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
Reported-by: Linus Torvalds <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Tadeusz Struk <[email protected]>
Tested-by: Zhang Qiao <[email protected]>
Tested-by: Dietmar Eggemann <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/sched/task.h | 4 ++--
kernel/fork.c | 13 ++++++++++++-
kernel/sched/core.c | 34 +++++++++++++++++++++-------------
3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index b9198a1..e84e54d 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528..c607d23 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2267,6 +2267,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;

/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2376,7 +2387,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);

proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fcf0c18..9745613 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif

-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
- bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;

@@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);

p->prio = p->normal_prio = p->static_prio;
- set_load_weight(p);
+ set_load_weight(p, false);

/*
* We don't need the reset flag anymore after the fork. It has
@@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)

init_entity_runnable_average(&p->se);

+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}

-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif

+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
#endif
rseq_migrate(p);
/*
@@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}

+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}

@@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);

p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);

@@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}

/*
@@ -9446,7 +9454,7 @@ void __init sched_init(void)
#endif
}

- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);

/*
* The boot idle thread does lazy MMU switching as well: