First, printk is NMI context safe now since the safe printk has been
implemented. The safe printk already has an irqwork to make NMI context
safe.
Second, the NMI irqwork actually does not work if an NMI handler causes a
panic by watchdog timeout. This NMI irqwork has no chance to run in such a
case, while the safe printk will flush its per-cpu buffer before panic.
Signed-off-by: Changbin Du <[email protected]>
---
arch/x86/include/asm/nmi.h | 1 -
arch/x86/kernel/nmi.c | 20 +++++++++-----------
2 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 75ded1d13d98..9d5d949e662e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -41,7 +41,6 @@ struct nmiaction {
struct list_head list;
nmi_handler_t handler;
u64 max_duration;
- struct irq_work irq_work;
unsigned long flags;
const char *name;
};
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e676a9916c49..aa15d4f2340f 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -104,18 +104,22 @@ static int __init nmi_warning_debugfs(void)
}
fs_initcall(nmi_warning_debugfs);
-static void nmi_max_handler(struct irq_work *w)
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
{
- struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
int remainder_ns, decimal_msecs;
- u64 whole_msecs = READ_ONCE(a->max_duration);
+ u64 whole_msecs = READ_ONCE(action->max_duration);
+
+ if (duration < nmi_longest_ns || duration < action->max_duration)
+ return;
+
+ action->max_duration = duration;
remainder_ns = do_div(whole_msecs, (1000 * 1000));
decimal_msecs = remainder_ns / 1000;
printk_ratelimited(KERN_INFO
"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
- a->handler, whole_msecs, decimal_msecs);
+ action->handler, whole_msecs, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs)
@@ -142,11 +146,7 @@ static int nmi_handle(unsigned int type, struct pt_regs *regs)
delta = sched_clock() - delta;
trace_nmi_handler(a->handler, (int)delta, thishandled);
- if (delta < nmi_longest_ns || delta < a->max_duration)
- continue;
-
- a->max_duration = delta;
- irq_work_queue(&a->irq_work);
+ nmi_check_duration(a, delta);
}
rcu_read_unlock();
@@ -164,8 +164,6 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
if (!action->handler)
return -EINVAL;
- init_irq_work(&action->irq_work, nmi_max_handler);
-
raw_spin_lock_irqsave(&desc->lock, flags);
/*
--
2.24.0
Hi, Thomas,
Have you checked this one? I think this can even be considered a fix.
On Wed, Jan 01, 2020 at 03:20:17PM +0800, Changbin Du wrote:
> First, printk is NMI context safe now since the safe printk has been
> implemented. The safe printk already has an irqwork to make NMI context
> safe.
>
> Second, the NMI irqwork actually does not work if a NMI handler causes
> panic by watchdog timeout. This NMI irqwork have no chance to run in such
> case, while the safe printk will flush its per-cpu buffer before panic.
>
> Signed-off-by: Changbin Du <[email protected]>
> ---
> arch/x86/include/asm/nmi.h | 1 -
> arch/x86/kernel/nmi.c | 20 +++++++++-----------
> 2 files changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
> index 75ded1d13d98..9d5d949e662e 100644
> --- a/arch/x86/include/asm/nmi.h
> +++ b/arch/x86/include/asm/nmi.h
> @@ -41,7 +41,6 @@ struct nmiaction {
> struct list_head list;
> nmi_handler_t handler;
> u64 max_duration;
> - struct irq_work irq_work;
> unsigned long flags;
> const char *name;
> };
> diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
> index e676a9916c49..aa15d4f2340f 100644
> --- a/arch/x86/kernel/nmi.c
> +++ b/arch/x86/kernel/nmi.c
> @@ -104,18 +104,22 @@ static int __init nmi_warning_debugfs(void)
> }
> fs_initcall(nmi_warning_debugfs);
>
> -static void nmi_max_handler(struct irq_work *w)
> +static void nmi_check_duration(struct nmiaction *action, u64 duration)
> {
> - struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
> int remainder_ns, decimal_msecs;
> - u64 whole_msecs = READ_ONCE(a->max_duration);
> + u64 whole_msecs = READ_ONCE(action->max_duration);
> +
> + if (duration < nmi_longest_ns || duration < action->max_duration)
> + return;
> +
> + action->max_duration = duration;
>
> remainder_ns = do_div(whole_msecs, (1000 * 1000));
> decimal_msecs = remainder_ns / 1000;
>
> printk_ratelimited(KERN_INFO
> "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
> - a->handler, whole_msecs, decimal_msecs);
> + action->handler, whole_msecs, decimal_msecs);
> }
>
> static int nmi_handle(unsigned int type, struct pt_regs *regs)
> @@ -142,11 +146,7 @@ static int nmi_handle(unsigned int type, struct pt_regs *regs)
> delta = sched_clock() - delta;
> trace_nmi_handler(a->handler, (int)delta, thishandled);
>
> - if (delta < nmi_longest_ns || delta < a->max_duration)
> - continue;
> -
> - a->max_duration = delta;
> - irq_work_queue(&a->irq_work);
> + nmi_check_duration(a, delta);
> }
>
> rcu_read_unlock();
> @@ -164,8 +164,6 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
> if (!action->handler)
> return -EINVAL;
>
> - init_irq_work(&action->irq_work, nmi_max_handler);
> -
> raw_spin_lock_irqsave(&desc->lock, flags);
>
> /*
> --
> 2.24.0
>
--
Cheers,
Changbin Du
Changbin Du <[email protected]> writes:
> First, printk is NMI context safe now since the safe printk has been
> implemented. The safe printk already has an irqwork to make NMI context
> safe.
>
> Second, the NMI irqwork actually does not work if a NMI handler causes
> panic by watchdog timeout. This NMI irqwork have no chance to run in such
> case, while the safe printk will flush its per-cpu buffer before panic.
>
> Signed-off-by: Changbin Du <[email protected]>
Looks about right.
Acked-by: Thomas Gleixner <[email protected]>
On Thu, Jan 09, 2020 at 09:55:51PM +0100, Thomas Gleixner wrote:
> Changbin Du <[email protected]> writes:
>
> > First, printk is NMI context safe now since the safe printk has been
> > implemented. The safe printk already has an irqwork to make NMI context
> > safe.
> >
> > Second, the NMI irqwork actually does not work if a NMI handler causes
> > panic by watchdog timeout. This NMI irqwork have no chance to run in such
> > case, while the safe printk will flush its per-cpu buffer before panic.
> >
> > Signed-off-by: Changbin Du <[email protected]>
>
> Looks about right.
>
> Acked-by: Thomas Gleixner <[email protected]>
I'm wondering why this thing is being moved:
- if (delta < nmi_longest_ns || delta < a->max_duration)
- continue;
into nmi_check_duration() and not remaining where it is?
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Fri, Jan 10, 2020 at 04:13:29PM +0100, Borislav Petkov wrote:
> On Fri, Jan 10, 2020 at 10:05:49PM +0800, Changbin Du wrote:
> > I added a new function nmi_check_duration(), so shouldn't this check be
> > done in that function?
>
> Why should it be done in that function? Your patch is removing irq_work
> - why is it doing additional changes?
>
Just to move all the check code together into a standalone function.
Yes, this does some code refinement after the irqwork is removed, but
I think that is normal.
> > Don't worry about performance, this function will be inlined by
> > compiler.
>
> I'm not worried about that at all.
>
> Btw, why are you sending private mail and not keeping the discussion on
> the mailing list?
>
Oops, I hit the wrong key. I have just added the list back.
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
--
Cheers,
Changbin Du
On Fri, Jan 10, 2020 at 05:34:50PM +0000, Changbin Du wrote:
> Just to move all the check code together and be a standalone function.
> yes, this somewhat does code refining after the irqwork is removed but
> I think it is normal.
But it makes review harder because your patch is removing irq_work,
*nothing* in the commit message is talking about *why* you're doing
that additional change. I'd imagine at the end of the commit message
something like:
"While at it, repurpose the IRQ work callback into a function which
concentrates the NMI duration checking."
This lets a reader know why that additional change is done instead
of going back'n'forth and having to ask you why you're doing this.
Ok?
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Fri, Jan 10, 2020 at 08:58:37PM +0100, Borislav Petkov wrote:
> On Fri, Jan 10, 2020 at 05:34:50PM +0000, Changbin Du wrote:
> > Just to move all the check code together and be a standalone function.
> > yes, this somewhat does code refining after the irqwork is removed but
> > I think it is normal.
>
> But it makes review harder because your patch is removing irq_work,
> *nothing* in the commit message is talking about *why* you're doing
> that additional change. I'd imagine at the end of the commit message
> something like:
>
> "While at it, repurpose the IRQ work callback into a function which
> concentrates the NMI duration checking."
>
> This lets a reader know why that additional change is done instead
> of going back'n'forth and having to ask you why you're doing this.
>
> Ok?
>
sure, and thanks for your suggestion. I will send v2 later.
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
--
Cheers,
Changbin Du