2020-03-03 11:39:41

by Sergey Senozhatsky

[permalink] [raw]
Subject: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

printk_deferred(), similarly to printk_safe/printk_nmi,
does not immediately attempt to print a new message on
the consoles, avoiding calls into non-reentrant kernel
paths, e.g. scheduler or timekeeping, which potentially
can deadlock the system. Those printk() flavors, instead,
rely on per-CPU flush irq_work to print messages from
safer contexts. For same reasons (recursive scheduler or
timekeeping calls) printk() uses per-CPU irq_work in
order to wake up user space syslog/kmsg readers.

However, only printk_safe/printk_nmi do make sure that
per-CPU areas have been initialised and that it's safe
to modify per-CPU irq_work. This means that, for instance,
should printk_deferred() be invoked "too early", that
is before per-CPU areas are initialised, printk_deferred()
will perform illegal per-CPU access.

Lech Perczak [0] reports that after commit 1b710b1b10ef
("char/random: silence a lockdep splat with printk()")
user-space syslog/kmsg readers are not able to read new
kernel messages. The reason is printk_deferred() being
called too early (as was pointed out by Petr and John).

Fix printk_deferred() and do not queue per-CPU irq_work
before per-CPU areas are initialized.

[0] https://lore.kernel.org/lkml/[email protected]/

Signed-off-by: Sergey Senozhatsky <[email protected]>
Reported-by: Lech Perczak <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Cc: John Ogness <[email protected]>
---
include/linux/printk.h | 5 -----
init/main.c | 1 -
kernel/printk/internal.h | 5 +++++
kernel/printk/printk.c | 34 ++++++++++++++++++++++++++++++++++
kernel/printk/printk_safe.c | 11 +----------
5 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 1e6108b8d15f..e061635e0409 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -202,7 +202,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
-extern void printk_safe_init(void);
extern void printk_safe_flush(void);
extern void printk_safe_flush_on_panic(void);
#else
@@ -269,10 +268,6 @@ static inline void dump_stack(void)
{
}

-static inline void printk_safe_init(void)
-{
-}
-
static inline void printk_safe_flush(void)
{
}
diff --git a/init/main.c b/init/main.c
index e004c340680b..5a3894df3471 100644
--- a/init/main.c
+++ b/init/main.c
@@ -909,7 +909,6 @@ asmlinkage __visible void __init start_kernel(void)
boot_init_stack_canary();

time_init();
- printk_safe_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index c8e6ab689d42..b2b0f526f249 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -23,6 +23,9 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);

+void printk_safe_init(void);
+bool printk_percpu_data_ready(void);
+
#define printk_safe_enter_irqsave(flags) \
do { \
local_irq_save(flags); \
@@ -64,4 +67,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
#define printk_safe_enter_irq() local_irq_disable()
#define printk_safe_exit_irq() local_irq_enable()

+static inline void printk_safe_init(void) { }
+static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ad4606234545..11c646e88cd4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -461,6 +461,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;

+/*
+ * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
+ * per_cpu_areas are initialised. This variable is set to true when
+ * it's safe to access per-CPU data.
+ */
+static bool __printk_percpu_data_ready __read_mostly;
+
+bool printk_percpu_data_ready(void)
+{
+ return __printk_percpu_data_ready;
+}
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -1147,12 +1159,28 @@ static void __init log_buf_add_cpu(void)
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */

+static void __init set_percpu_data_ready(void)
+{
+ printk_safe_init();
+ /* Make sure we set this flag only after printk_safe() init is done */
+ barrier();
+ __printk_percpu_data_ready = true;
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
unsigned int free;

+ /*
+ * Some archs call setup_log_buf() multiple times - first is very
+ * early, e.g. from setup_arch(), and second - when percpu_areas
+ * are initialised.
+ */
+ if (!early)
+ set_percpu_data_ready();
+
if (log_buf != __log_buf)
return;

@@ -3009,6 +3037,9 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {

void wake_up_klogd(void)
{
+ if (!printk_percpu_data_ready())
+ return;
+
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
@@ -3019,6 +3050,9 @@ void wake_up_klogd(void)

void defer_console_output(void)
{
+ if (!printk_percpu_data_ready())
+ return;
+
preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index b4045e782743..d9a659a686f3 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -27,7 +27,6 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready __read_mostly;

#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -51,7 +50,7 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready)
+ if (printk_percpu_data_ready())
irq_work_queue(&s->work);
}

@@ -402,14 +401,6 @@ void __init printk_safe_init(void)
#endif
}

- /*
- * In the highly unlikely event that a NMI were to trigger at
- * this moment. Make sure IRQ work is set up before this
- * variable is set.
- */
- barrier();
- printk_safe_irq_ready = 1;
-
/* Flush pending messages that did not have scheduled IRQ works. */
printk_safe_flush();
}
--
2.25.1


2020-03-04 15:22:37

by Petr Mladek

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On Tue 2020-03-03 20:30:02, Sergey Senozhatsky wrote:
> However, only printk_safe/printk_nmi do make sure that
> per-CPU areas have been initialised and that it's safe
> to modify per-CPU irq_work. This means that, for instance,
> should printk_deferred() be invoked "too early", that
> is before per-CPU areas are initialised, printk_deferred()
> will perform illegal per-CPU access.
>
> Lech Perczak [0] reports that after commit 1b710b1b10ef
> ("char/random: silence a lockdep splat with printk()")
> user-space syslog/kmsg readers are not able to read new
> kernel messages. The reason is printk_deferred() being
> called too early (as was pointed out by Petr and John).
>
> Fix printk_deferred() and do not queue per-CPU irq_work
> before per-CPU areas are initialized.
>
> [0] https://lore.kernel.org/lkml/[email protected]/
>
> Signed-off-by: Sergey Senozhatsky <[email protected]>
> Reported-by: Lech Perczak <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Theodore Ts'o <[email protected]>
> Cc: John Ogness <[email protected]>

Reviewed-by: Petr Mladek <[email protected]>

Now, the question is whether to hurry this fix into 5.6 or if
it could wait for 5.7.

I think that it could wait because 5.6 is not affected by
the particular printk_deferred(). This patch fixes a long-term
generic problem. But I am open for other opinions.

Best Regards,
Petr

2020-03-05 01:31:07

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On (20/03/04 16:21), Petr Mladek wrote:
[..]
> > Fix printk_deferred() and do not queue per-CPU irq_work
> > before per-CPU areas are initialized.
> >
> > [0] https://lore.kernel.org/lkml/[email protected]/
> >
> > Signed-off-by: Sergey Senozhatsky <[email protected]>
> > Reported-by: Lech Perczak <[email protected]>
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Cc: Theodore Ts'o <[email protected]>
> > Cc: John Ogness <[email protected]>
>
> Reviewed-by: Petr Mladek <[email protected]>

Thanks!

> Now, the question is whether to hurry this fix into 5.6 or if
> it could wait for 5.7.
>
> I think that it could wait because 5.6 is not affected by
> the particular printk_deferred(). This patch fixes a long-term
> generic problem. But I am open for other opinions.

Good question. My 5 cents, I would _probably_ push it now. Not
because it fixes any known issues on 5.6, but because we have
a number of LTS kernel (4.19, 4.14, 4.9, 4.4, 3.16) that can be
affected should 1b710b1b10eff9d4 be backported to those kernels.
Which is quite likely, I suspect. The sooner we fix printk_deferred(),
the sooner -stable/LTS picks up the fix, so that we don't have same
regression reports in the future. The regression in question is
pretty hard to track down, git-bisect, perhaps, is the only reasonably
fast way.

-ss

2020-03-05 18:55:37

by Greg KH

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On Thu, Mar 05, 2020 at 10:30:14AM +0900, Sergey Senozhatsky wrote:
> On (20/03/04 16:21), Petr Mladek wrote:
> [..]
> > > Fix printk_deferred() and do not queue per-CPU irq_work
> > > before per-CPU areas are initialized.
> > >
> > > [0] https://lore.kernel.org/lkml/[email protected]/
> > >
> > > Signed-off-by: Sergey Senozhatsky <[email protected]>
> > > Reported-by: Lech Perczak <[email protected]>
> > > Cc: Greg Kroah-Hartman <[email protected]>
> > > Cc: Theodore Ts'o <[email protected]>
> > > Cc: John Ogness <[email protected]>
> >
> > Reviewed-by: Petr Mladek <[email protected]>
>
> Thanks!
>
> > Now, the question is whether to hurry this fix into 5.6 or if
> > it could wait for 5.7.
> >
> > I think that it could wait because 5.6 is not affected by
> > the particular printk_deferred(). This patch fixes a long-term
> > generic problem. But I am open for other opinions.
>
> Good question. My 5 cents, I would _probably_ push it now. Not
> because it fixes any known issues on 5.6, but because we have
> a number of LTS kernel (4.19, 4.14, 4.9, 4.4, 3.16) that can be
> affected should 1b710b1b10eff9d4 be backported to those kernels.

It was backported there, and now has been reverted. So no big rush from
what I can tell.

thanks,

greg k-h

2020-04-01 19:37:01

by Jann Horn

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On Tue, Mar 3, 2020 at 12:30 PM Sergey Senozhatsky
<[email protected]> wrote:
> printk_deferred(), similarly to printk_safe/printk_nmi,
> does not immediately attempt to print a new message on
> the consoles, avoiding calls into non-reentrant kernel
> paths, e.g. scheduler or timekeeping, which potentially
> can deadlock the system. Those printk() flavors, instead,
> rely on per-CPU flush irq_work to print messages from
> safer contexts. For same reasons (recursive scheduler or
> timekeeping calls) printk() uses per-CPU irq_work in
> order to wake up user space syslog/kmsg readers.
>
> However, only printk_safe/printk_nmi do make sure that
> per-CPU areas have been initialised and that it's safe
> to modify per-CPU irq_work. This means that, for instance,
> should printk_deferred() be invoked "too early", that
> is before per-CPU areas are initialised, printk_deferred()
> will perform illegal per-CPU access.
>
> Lech Perczak [0] reports that after commit 1b710b1b10ef
> ("char/random: silence a lockdep splat with printk()")
> user-space syslog/kmsg readers are not able to read new
> kernel messages. The reason is printk_deferred() being
> called too early (as was pointed out by Petr and John).
>
> Fix printk_deferred() and do not queue per-CPU irq_work
> before per-CPU areas are initialized.

I ran into the same issue during some development work, and Sergey
directed me to this patch. It fixes the problem for me. Thanks!

Tested-by: Jann Horn <[email protected]>

2020-04-09 19:45:52

by Simon Kirby

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On Thu, Mar 05, 2020 at 07:53:48PM +0100, Greg Kroah-Hartman wrote:

> On Thu, Mar 05, 2020 at 10:30:14AM +0900, Sergey Senozhatsky wrote:
> > On (20/03/04 16:21), Petr Mladek wrote:
> > [..]
> > > > Fix printk_deferred() and do not queue per-CPU irq_work
> > > > before per-CPU areas are initialized.
> > > >
> > > > [0] https://lore.kernel.org/lkml/[email protected]/
> > > >
> > > > Signed-off-by: Sergey Senozhatsky <[email protected]>
> > > > Reported-by: Lech Perczak <[email protected]>
> > > > Cc: Greg Kroah-Hartman <[email protected]>
> > > > Cc: Theodore Ts'o <[email protected]>
> > > > Cc: John Ogness <[email protected]>
> > >
> > > Reviewed-by: Petr Mladek <[email protected]>
> >
> > Thanks!
> >
> > > Now, the question is whether to hurry this fix into 5.6 or if
> > > it could wait for 5.7.
> > >
> > > I think that it could wait because 5.6 is not affected by
> > > the particular printk_deferred(). This patch fixes a long-term
> > > generic problem. But I am open for other opinions.
> >
> > Good question. My 5 cents, I would _probably_ push it now. Not
> > because it fixes any known issues on 5.6, but because we have
> > a number of LTS kernel (4.19, 4.14, 4.9, 4.4, 3.16) that can be
> > affected should 1b710b1b10eff9d4 be backported to those kernels.
>
> It was backported there, and now has been reverted. So no big rush from
> what I can tell.

This causes "dmesg -w" or "cat /dev/kmsg" to not print new messages after
dumping the current ring. I hit this on v5.5.9, v5.5.15, v5.6.3, and
Linus HEAD. This prints no "hi":

(sleep 1;echo hi > /dev/kmsg)& dmesg -w

...curiously, "strace dmesg -w" shows the data received once ^C is hit.

Jann pointed me to this patch. Applying it or reverting 1b710b1b10eff9d4
does fix it for me. However, Linus HEAD is still broken and, AFAICS,
remains unreverted and unfixed in stable/linux-5.4.y through linux-5.6.y.
It was introduced in 5.6 but backported to those, not just the LTS above.

Simon-

2020-04-10 03:08:13

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On (20/04/09 12:25), Simon Kirby wrote:
> This causes "dmesg -w" or "cat /dev/kmsg" to not print new messages after
> dumping the current ring. I hit this on v5.5.9, v5.5.15, v5.6.3, and
> Linus HEAD. This prints no "hi":
>
> (sleep 1;echo hi > /dev/kmsg)& dmesg -w
>
> ...curiously, "strace dmesg -w" shows the data received once ^C is hit.
>
> Jann pointed me to this patch. Applying it or reverting 1b710b1b10eff9d4
> does fix it for me. However, Linus HEAD is still broken and, AFAICS,
> remains unreverted and unfixed in stable/linux-5.4.y through linux-5.6.y.
> It was introduced in 5.6 but backported to those, not just the LTS above.

I'm trying to land this patch. Give me a moment, I'll come back to
you shortly.

-ss

2020-04-10 23:23:33

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCHv2] printk: queue wake_up_klogd irq_work only if per-CPU areas are ready

On (20/04/10 12:07), Sergey Senozhatsky wrote:
> On (20/04/09 12:25), Simon Kirby wrote:
> > This causes "dmesg -w" or "cat /dev/kmsg" to not print new messages after
> > dumping the current ring. I hit this on v5.5.9, v5.5.15, v5.6.3, and
> > Linus HEAD. This prints no "hi":
>

[..]

> I'm trying to land this patch. Give me a moment, I'll come back to
> you shortly.

Applied. Commit ab6f762f0f53162d Linus' HEAD.

-ss