With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
non-preemptible context look untidy; after the main oops, the kernel prints
a "sleeping function called from invalid context" report because
exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
fixup.
It looks like the same thing applies to profile_task_exit() and
kcov_task_exit().
Fix it by moving the preemption fixup up and the calls to
profile_task_exit() and kcov_task_exit() down.
Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks")
Signed-off-by: Jann Horn <[email protected]>
---
As so often, I have no idea which tree this should go through. tip? mm?
v2: now without adding redundant whitespace...
kernel/exit.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..eb42d49fd99d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -713,8 +713,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -732,6 +736,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -749,13 +763,6 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
base-commit: 9f65ed5fe41ce08ed1cb1f6a950f9ec694c142ad
--
2.25.0.265.gbab2e86ba0-goog
On Thu, 5 Mar 2020 23:06:57 +0100
Jann Horn <[email protected]> wrote:
> With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
> non-preemptible context look untidy; after the main oops, the kernel prints
> a "sleeping function called from invalid context" report because
> exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
> can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
> fixup.
>
> It looks like the same thing applies to profile_task_exit() and
> kcov_task_exit().
>
> Fix it by moving the preemption fixup up and the calls to
> profile_task_exit() and kcov_task_exit() down.
>
> Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks")
> Signed-off-by: Jann Horn <[email protected]>
> ---
> @@ -732,6 +736,16 @@ void __noreturn do_exit(long code)
> */
> set_fs(USER_DS);
>
> + if (unlikely(in_atomic())) {
> + pr_info("note: %s[%d] exited with preempt_count %d\n",
> + current->comm, task_pid_nr(current),
> + preempt_count());
This should be more than a pr_info. It should probably also print the
"Dazed and confused, best to reboot" message, because if something
crashed in a non-preemptible section, it may well be holding a lock that
it will never release, which will soon lead to a deadlock!
-- Steve
> + preempt_count_set(PREEMPT_ENABLED);
> + }
> +
> + profile_task_exit(tsk);
> + kcov_task_exit(tsk);
> +
> ptrace_event(PTRACE_EVENT_EXIT, code);
>
> validate_creds_for_do_exit(tsk);
> @@ -749,13 +763,6 @@ void __noreturn do_exit(long code)
>
> exit_signals(tsk); /* sets PF_EXITING */
>
> - if (unlikely(in_atomic())) {
> - pr_info("note: %s[%d] exited with preempt_count %d\n",
> - current->comm, task_pid_nr(current),
> - preempt_count());
> - preempt_count_set(PREEMPT_ENABLED);
> - }
> -
> /* sync mm's RSS info before statistics gathering */
> if (tsk->mm)
> sync_mm_rss(tsk->mm);
>
> base-commit: 9f65ed5fe41ce08ed1cb1f6a950f9ec694c142ad
On Thu, Mar 5, 2020 at 11:13 PM Steven Rostedt <[email protected]> wrote:
> On Thu, 5 Mar 2020 23:06:57 +0100
> Jann Horn <[email protected]> wrote:
>
> > With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
> > non-preemptible context look untidy; after the main oops, the kernel prints
> > a "sleeping function called from invalid context" report because
> > exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
> > can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
> > fixup.
> >
> > It looks like the same thing applies to profile_task_exit() and
> > kcov_task_exit().
> >
> > Fix it by moving the preemption fixup up and the calls to
> > profile_task_exit() and kcov_task_exit() down.
[...]
> > + if (unlikely(in_atomic())) {
> > + pr_info("note: %s[%d] exited with preempt_count %d\n",
> > + current->comm, task_pid_nr(current),
> > + preempt_count());
>
> This should be more than a pr_info. It should also probably state the
> "Dazed and confused, best to reboot" message.
>
> Because if something crashed in a non preempt section, it may likely be
> holding a lock that it will never release, causing a soon to be deadlock!
I didn't write that code, I'm just moving it around. :P But I guess if
you want, I can change it in the same patch... something like this on
top? Does that look reasonable?
if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
+ pr_emerg("note: %s[%d] exited with preempt_count %d, system might deadlock, please reboot\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
On Thu, 5 Mar 2020 23:30:13 +0100
Jann Horn <[email protected]> wrote:
> On Thu, Mar 5, 2020 at 11:13 PM Steven Rostedt <[email protected]> wrote:
> > On Thu, 5 Mar 2020 23:06:57 +0100
> > Jann Horn <[email protected]> wrote:
> >
> > > With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
> > > non-preemptible context look untidy; after the main oops, the kernel prints
> > > a "sleeping function called from invalid context" report because
> > > exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
> > > can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
> > > fixup.
> > >
> > > It looks like the same thing applies to profile_task_exit() and
> > > kcov_task_exit().
> > >
> > > Fix it by moving the preemption fixup up and the calls to
> > > profile_task_exit() and kcov_task_exit() down.
> [...]
> > > + if (unlikely(in_atomic())) {
> > > + pr_info("note: %s[%d] exited with preempt_count %d\n",
> > > + current->comm, task_pid_nr(current),
> > > + preempt_count());
> >
> > This should be more than a pr_info. It should also probably state the
> > "Dazed and confused, best to reboot" message.
> >
> > Because if something crashed in a non preempt section, it may likely be
> > holding a lock that it will never release, causing a soon to be deadlock!
>
> I didn't write that code, I'm just moving it around. :P But I guess if
Ah, I didn't scroll down enough to see it was just moved.
> you want, I can change it in the same patch... something like this on
> top? Does that look reasonable?
No, an update to the text should be done as a separate patch, as it is a
different type of change.
Thanks,
-- Steve
>
> if (unlikely(in_atomic())) {
> - pr_info("note: %s[%d] exited with preempt_count %d\n",
> + pr_emerg("note: %s[%d] exited with preempt_count %d, system might deadlock, please reboot\n",
> current->comm, task_pid_nr(current),
> preempt_count());
> preempt_count_set(PREEMPT_ENABLED);
On Thu, Mar 5, 2020 at 11:07 PM Jann Horn <[email protected]> wrote:
> With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
> non-preemptible context look untidy; after the main oops, the kernel prints
> a "sleeping function called from invalid context" report because
> exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
> can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
> fixup.
>
> It looks like the same thing applies to profile_task_exit() and
> kcov_task_exit().
>
> Fix it by moving the preemption fixup up and the calls to
> profile_task_exit() and kcov_task_exit() down.
>
> Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks")
> Signed-off-by: Jann Horn <[email protected]>
> ---
> As so often, I have no idea which tree this should go through. tip? mm?
Do the tip folks want to take this, since it's vaguely locking-related
and the fixed commit also came that way? Or should it go through
akpm's tree?
On Tue, Mar 24, 2020 at 10:30:02AM +0100, Jann Horn wrote:
> On Thu, Mar 5, 2020 at 11:07 PM Jann Horn <[email protected]> wrote:
> > With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
> > non-preemptible context look untidy; after the main oops, the kernel prints
> > a "sleeping function called from invalid context" report because
> > exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
> > can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
> > fixup.
> >
> > It looks like the same thing applies to profile_task_exit() and
> > kcov_task_exit().
> >
> > Fix it by moving the preemption fixup up and the calls to
> > profile_task_exit() and kcov_task_exit() down.
> >
> > Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks")
> > Signed-off-by: Jann Horn <[email protected]>
> > ---
> > As so often, I have no idea which tree this should go through. tip? mm?
>
> Do the tip folks want to take this, since it's vaguely locking-related
> and the fixed commit also came that way? Or should it go through
> akpm's tree?
I've picked it up, as it seems to be languishing. Thanks!
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 586b58cac8b4683eb58a1446fbc399de18974e40
Gitweb: https://git.kernel.org/tip/586b58cac8b4683eb58a1446fbc399de18974e40
Author: Jann Horn <[email protected]>
AuthorDate: Thu, 05 Mar 2020 23:06:57 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 30 Apr 2020 20:14:38 +02:00
exit: Move preemption fixup up, move blocking operations down
With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in
non-preemptible context look untidy; after the main oops, the kernel prints
a "sleeping function called from invalid context" report because
exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read()
can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED)
fixup.
It looks like the same thing applies to profile_task_exit() and
kcov_task_exit().
Fix it by moving the preemption fixup up and the calls to
profile_task_exit() and kcov_task_exit() down.
Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks")
Signed-off-by: Jann Horn <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/exit.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index ce2a75b..d56fe51 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);