2010-06-12 09:28:38

by Huang, Ying

[permalink] [raw]
Subject: [RFC 1/3] Unified NMI delayed call mechanism

NMI can be triggered even when IRQ is masked. So it is not safe for
NMI handler to call some functions. One solution is to delay the call
via self interrupt, so that the delayed call can be done once the
interrupt is enabled again. This has been implemented in MCE and perf
event. This patch provides a unified version and make it easier for
other NMI semantic handler to take use of the delayed call.

Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/include/asm/entry_arch.h | 1
arch/x86/include/asm/hw_irq.h | 1
arch/x86/include/asm/irq_vectors.h | 5 +
arch/x86/include/asm/nmi.h | 7 ++
arch/x86/kernel/entry_64.S | 3 +
arch/x86/kernel/irqinit.c | 3 +
arch/x86/kernel/traps.c | 104 +++++++++++++++++++++++++++++++++++++
7 files changed, 124 insertions(+)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -65,4 +65,5 @@ BUILD_INTERRUPT(threshold_interrupt,THRE
BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
#endif

+BUILD_INTERRUPT(nmi_delayed_call_interrupt,NMI_DELAYED_CALL_VECTOR)
#endif
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -35,6 +35,7 @@ extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
extern void mce_self_interrupt(void);
+extern void nmi_delayed_call_interrupt(void);

extern void invalidate_interrupt(void);
extern void invalidate_interrupt0(void);
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,11 @@
*/
#define MCE_SELF_VECTOR 0xeb

+/*
+ * Self IPI vector for NMI delayed call
+ */
+#define NMI_DELAYED_CALL_VECTOR 0xe9
+
#define NR_VECTORS 256

#define FPU_IRQ 13
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -75,4 +75,11 @@ void enable_lapic_nmi_watchdog(void);
void stop_nmi(void);
void restart_nmi(void);

+#define NMI_DELAYED_CALL_ID_INVALID -1
+
+typedef void (*nmi_delayed_call_func_t)(void);
+int nmi_delayed_call_register(nmi_delayed_call_func_t func);
+void nmi_delayed_call_unregister(int id);
+void nmi_delayed_call_schedule(int id);
+
#endif /* _ASM_X86_NMI_H */
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1009,6 +1009,9 @@ apicinterrupt MCE_SELF_VECTOR \
mce_self_interrupt smp_mce_self_interrupt
#endif

+apicinterrupt NMI_DELAYED_CALL_VECTOR \
+ nmi_delayed_call_interrupt smp_nmi_delayed_call_interrupt
+
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -212,6 +212,9 @@ static void __init apic_intr_init(void)
#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
#endif
+#if defined(CONFIG_X86_LOCAL_APIC)
+ alloc_intr_gate(NMI_DELAYED_CALL_VECTOR, nmi_delayed_call_interrupt);
+#endif

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -888,3 +888,107 @@ void __init trap_init(void)

x86_init.irqs.trap_init();
}
+
+#define NMI_DELAYED_CALL_ID_MAX 32
+#define NMI_DELAYED_CALL_RESTART_MAX 5
+
+static nmi_delayed_call_func_t nmi_delayed_call_funcs[NMI_DELAYED_CALL_ID_MAX];
+static DEFINE_SPINLOCK(nmi_delayed_call_lock);
+
+static DEFINE_PER_CPU(unsigned long, nmi_delayed_call_pending);
+
+static void nmi_delayed_call_run(void)
+{
+ int cpu, restart = NMI_DELAYED_CALL_RESTART_MAX;
+ unsigned long pending, *ppending;
+ nmi_delayed_call_func_t *pfunc, func;
+
+ cpu = smp_processor_id();
+ ppending = per_cpu_ptr(&nmi_delayed_call_pending, cpu);
+ while (*ppending && restart--) {
+ pending = xchg(ppending, 0);
+ pfunc = nmi_delayed_call_funcs;
+ do {
+ if (pending & 1) {
+ func = *pfunc;
+ if (func)
+ func();
+ }
+ pfunc++;
+ pending >>= 1;
+ } while (pending);
+ }
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+asmlinkage void smp_nmi_delayed_call_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ irq_enter();
+ nmi_delayed_call_run();
+ irq_exit();
+}
+#endif
+
+int nmi_delayed_call_register(nmi_delayed_call_func_t func)
+{
+ unsigned long flags;
+ int i, id = NMI_DELAYED_CALL_ID_INVALID;
+
+ spin_lock_irqsave(&nmi_delayed_call_lock, flags);
+ for (i = 0; i < NMI_DELAYED_CALL_ID_MAX; i++) {
+ if (!nmi_delayed_call_funcs[i]) {
+ nmi_delayed_call_funcs[i] = func;
+ id = i;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&nmi_delayed_call_lock, flags);
+ return id;
+}
+EXPORT_SYMBOL_GPL(nmi_delayed_call_register);
+
+/* Corresponding NMI handler should complete before invoking this
+ * function */
+void nmi_delayed_call_unregister(int id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&nmi_delayed_call_lock, flags);
+ nmi_delayed_call_funcs[id] = NULL;
+ spin_unlock_irqrestore(&nmi_delayed_call_lock, flags);
+}
+EXPORT_SYMBOL_GPL(nmi_delayed_call_unregister);
+
+void nmi_delayed_call_schedule(int id)
+{
+ int cpu;
+
+ if (id == NMI_DELAYED_CALL_ID_INVALID)
+ return;
+ BUG_ON(id < 0 || id >= NMI_DELAYED_CALL_ID_MAX);
+
+ cpu = smp_processor_id();
+ set_bit(id, per_cpu_ptr(&nmi_delayed_call_pending, cpu));
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Without APIC do not schedule */
+ if (!cpu_has_apic)
+ return;
+
+ /*
+ * In nmi we cannot use kernel services safely. Trigger an
+ * self interrupt through the APIC to instead do the
+ * notification after interrupts are reenabled again.
+ */
+ apic->send_IPI_self(NMI_DELAYED_CALL_VECTOR);
+
+ /*
+ * Wait for idle afterwards again so that we don't leave the
+ * APIC in a non idle state because the normal APIC writes
+ * cannot exclude us.
+ */
+ apic_wait_icr_idle();
+#endif
+}
+EXPORT_SYMBOL_GPL(nmi_delayed_call_schedule);


2010-06-12 09:28:50

by Huang, Ying

[permalink] [raw]
Subject: [RFC 2/3] Use unified NMI delayed call mechanism in MCE handler

The original self interrupt mechanism in MCE handler is replaced by
the unified delayed call mechanism.

Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/include/asm/entry_arch.h | 4 --
arch/x86/include/asm/irq_vectors.h | 5 ---
arch/x86/kernel/cpu/mcheck/mce.c | 53 ++++++-------------------------------
arch/x86/kernel/entry_64.S | 5 ---
arch/x86/kernel/irqinit.c | 3 --
5 files changed, 10 insertions(+), 60 deletions(-)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -61,9 +61,5 @@ BUILD_INTERRUPT(thermal_interrupt,THERMA
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif

-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
BUILD_INTERRUPT(nmi_delayed_call_interrupt,NMI_DELAYED_CALL_VECTOR)
#endif
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -121,11 +121,6 @@
#define UV_BAU_MESSAGE 0xea

/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR 0xeb
-
-/*
* Self IPI vector for NMI delayed call
*/
#define NMI_DELAYED_CALL_VECTOR 0xe9
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -480,60 +480,22 @@ static inline void mce_get_rip(struct mc
m->ip = mce_rdmsrl(rip_msr);
}

-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static int mce_delayed_call_id = NMI_DELAYED_CALL_ID_INVALID;
+
+static void __mce_report_event(void)
{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
mce_notify_irq();
mce_schedule_work();
- irq_exit();
}
-#endif

static void mce_report_event(struct pt_regs *regs)
{
if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
- mce_notify_irq();
- /*
- * Triggering the work queue here is just an insurance
- * policy in case the syscall exit notify handler
- * doesn't run soon enough or ends up running on the
- * wrong CPU (can happen when audit sleeps)
- */
- mce_schedule_work();
+ __mce_report_event();
return;
}

-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Trigger an self interrupt
- * through the APIC to instead do the notification
- * after interrupts are reenabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
+ nmi_delayed_call_schedule(mce_delayed_call_id);
}

DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -1731,6 +1693,11 @@ __setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
+ mce_delayed_call_id = nmi_delayed_call_register(__mce_report_event);
+ if (mce_delayed_call_id == NMI_DELAYED_CALL_ID_INVALID)
+ pr_err(
+ "Failed to register mce delayed event reporting function!\n");
+
atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);

mcheck_intel_therm_init();
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1004,11 +1004,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt

-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
apicinterrupt NMI_DELAYED_CALL_VECTOR \
nmi_delayed_call_interrupt smp_nmi_delayed_call_interrupt

--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -209,9 +209,6 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_X86_MCE_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
#if defined(CONFIG_X86_LOCAL_APIC)
alloc_intr_gate(NMI_DELAYED_CALL_VECTOR, nmi_delayed_call_interrupt);
#endif

2010-06-12 09:28:53

by Huang, Ying

[permalink] [raw]
Subject: [RFC 3/3] Use unified NMI delayed call mechanism in perf event NMI handler

The original self interrupt mechanism in perf event NMI handler is
replaced by the unified delayed call mechanism.

Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/include/asm/entry_arch.h | 4 ----
arch/x86/include/asm/irq_vectors.h | 5 -----
arch/x86/kernel/cpu/perf_event.c | 13 ++++++++-----
arch/x86/kernel/entry_64.S | 5 -----
arch/x86/kernel/irqinit.c | 6 ------
5 files changed, 8 insertions(+), 25 deletions(-)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,10 +49,6 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)

-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
-#endif
-
#ifdef CONFIG_X86_THERMAL_VECTOR
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -113,11 +113,6 @@
*/
#define X86_PLATFORM_IPI_VECTOR 0xed

-/*
- * Performance monitoring pending work vector:
- */
-#define LOCAL_PENDING_VECTOR 0xec
-
#define UV_BAU_MESSAGE 0xea

/*
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1160,13 +1160,12 @@ static int x86_pmu_handle_irq(struct pt_
return handled;
}

-void smp_perf_pending_interrupt(struct pt_regs *regs)
+static int perf_delayed_call_id = NMI_DELAYED_CALL_ID_INVALID;
+
+void perf_do_pending(void)
{
- irq_enter();
- ack_APIC_irq();
inc_irq_stat(apic_pending_irqs);
perf_event_do_pending();
- irq_exit();
}

void set_perf_event_pending(void)
@@ -1175,7 +1174,7 @@ void set_perf_event_pending(void)
if (!x86_pmu.apic || !x86_pmu_initialized())
return;

- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+ nmi_delayed_call_schedule(perf_delayed_call_id);
#endif
}

@@ -1363,6 +1362,10 @@ void __init init_hw_perf_events(void)
}
}

+ perf_delayed_call_id = nmi_delayed_call_register(perf_do_pending);
+ if (perf_delayed_call_id == NMI_DELAYED_CALL_ID_INVALID)
+ pr_err("Failed to register NMI delayed call for perf.\n");
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1021,11 +1021,6 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt

-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
-#endif
-
/*
* Exception entry points.
*/
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -223,12 +223,6 @@ static void __init apic_intr_init(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
-# endif
-
#endif
}

2010-06-12 10:26:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


* Huang Ying <[email protected]> wrote:

> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
> handler to call some functions. One solution is to delay the call via self
> interrupt, so that the delayed call can be done once the interrupt is
> enabled again. This has been implemented in MCE and perf event. This patch
> provides a unified version and make it easier for other NMI semantic handler
> to take use of the delayed call.
>
> Signed-off-by: Huang Ying <[email protected]>
> ---
> arch/x86/include/asm/entry_arch.h | 1
> arch/x86/include/asm/hw_irq.h | 1
> arch/x86/include/asm/irq_vectors.h | 5 +
> arch/x86/include/asm/nmi.h | 7 ++
> arch/x86/kernel/entry_64.S | 3 +
> arch/x86/kernel/irqinit.c | 3 +
> arch/x86/kernel/traps.c | 104 +++++++++++++++++++++++++++++++++++++
> 7 files changed, 124 insertions(+)

Instead of introducing this extra intermediate facility please use the same
approach the unified NMI watchdog is using (see latest -tip): a perf event
callback gives all the extra functionality needed.

The MCE code needs to be updated to use that - and then it will be integrated
into the events framework.

Thanks,

Ingo

2010-06-13 01:54:12

by Huang, Ying

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Sat, 2010-06-12 at 18:25 +0800, Ingo Molnar wrote:
> * Huang Ying <[email protected]> wrote:
>
> > NMI can be triggered even when IRQ is masked. So it is not safe for NMI
> > handler to call some functions. One solution is to delay the call via self
> > interrupt, so that the delayed call can be done once the interrupt is
> > enabled again. This has been implemented in MCE and perf event. This patch
> > provides a unified version and make it easier for other NMI semantic handler
> > to take use of the delayed call.
> >
> > Signed-off-by: Huang Ying <[email protected]>
> > ---
> > arch/x86/include/asm/entry_arch.h | 1
> > arch/x86/include/asm/hw_irq.h | 1
> > arch/x86/include/asm/irq_vectors.h | 5 +
> > arch/x86/include/asm/nmi.h | 7 ++
> > arch/x86/kernel/entry_64.S | 3 +
> > arch/x86/kernel/irqinit.c | 3 +
> > arch/x86/kernel/traps.c | 104 +++++++++++++++++++++++++++++++++++++
> > 7 files changed, 124 insertions(+)
>
> Instead of introducing this extra intermediate facility please use the same
> approach the unified NMI watchdog is using (see latest -tip): a perf event
> callback gives all the extra functionality needed.

Sorry, if my understanding is correct, the perf event overflow callback
should be run in NMI context instead of a delayed context (such as IRQ,
soft_irq, process context). That is, the backtrace of
watchdog_overflow_callback should be something as follow:

x86_pmu_handle_irq
perf_event_overflow
__perf_event_overflow
watchdog_overflow_callback

I do not find the delayed mechanism here.

> The MCE code needs to be updated to use that - and then it will be integrated
> into the events framework.

MCE is NMI-like, and there are other NMI users too. I think some of them
will need some kind of delayed call mechanism. In fact, perf itself uses
self-made NMI delayed call mechanism too, I just want to generalize it
for other users too.

Best Regards,
Huang Ying

2010-06-14 03:47:07

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

(2010/06/12 19:25), Ingo Molnar wrote:
>
> * Huang Ying <[email protected]> wrote:
>
>> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
>> handler to call some functions. One solution is to delay the call via self
>> interrupt, so that the delayed call can be done once the interrupt is
>> enabled again. This has been implemented in MCE and perf event. This patch
>> provides a unified version and make it easier for other NMI semantic handler
>> to take use of the delayed call.
>
> Instead of introducing this extra intermediate facility please use the same
> approach the unified NMI watchdog is using (see latest -tip): a perf event
> callback gives all the extra functionality needed.
>
> The MCE code needs to be updated to use that - and then it will be integrated
> into the events framework.

Hi Ingo,

I think this "NMI delayed call mechanism" could be a part of "the events
framework" that we are planning to get in kernel soon. At least APEI will
use NMI to report some hardware events (likely error) to kernel. So I
suppose we will go to have a delayed call as an event handler for APEI.

Generally speaking "event" can occur independently of the situation.
NMI can tell us some of external events, expecting urgent reaction for
the event, but we cannot do everything in NMI context. Or we might have
a sudden urge to generate an internal event while interrupts are disabled.

I agree that generating a self interrupt is reasonable solution.
Note that it could be said that both of "MCE handled (=event log should
be delivered to userland asap)" and "perf events pending (=pending events
should be handled asap)" are kind of internal event that requires urgent
handling in non-NMI kernel context. One question here is why we should
have different vectors for these events that uses same mechanism.

How about calling the vector LOCAL_EVENT_VECTOR or so?
I guess there should be better name if it is possible to inject an event
to other cpus via IPI with this vector...


Thanks,
H.Seto

2010-06-14 13:54:38

by Don Zickus

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Mon, Jun 14, 2010 at 12:45:21PM +0900, Hidetoshi Seto wrote:
> (2010/06/12 19:25), Ingo Molnar wrote:
> >
> > * Huang Ying <[email protected]> wrote:
> >
> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
> >> handler to call some functions. One solution is to delay the call via self
> >> interrupt, so that the delayed call can be done once the interrupt is
> >> enabled again. This has been implemented in MCE and perf event. This patch
> >> provides a unified version and make it easier for other NMI semantic handler
> >> to take use of the delayed call.
> >
> > Instead of introducing this extra intermediate facility please use the same
> > approach the unified NMI watchdog is using (see latest -tip): a perf event
> > callback gives all the extra functionality needed.
> >
> > The MCE code needs to be updated to use that - and then it will be integrated
> > into the events framework.
>
> Hi Ingo,
>
> I think this "NMI delayed call mechanism" could be a part of "the events
> framework" that we are planning to get in kernel soon. At least APEI will
> use NMI to report some hardware events (likely error) to kernel. So I
> suppose we will go to have a delayed call as an event handler for APEI.
>
> Generally speaking "event" can occur independently of the situation.
> NMI can tell us some of external events, expecting urgent reaction for
> the event, but we cannot do everything in NMI context. Or we might have
> a sudden urge to generate an internal event while interrupts are disabled.
>
> I agree that generating a self interrupt is reasonable solution.
> Note that it could be said that both of "MCE handled (=event log should
> be delivered to userland asap)" and "perf events pending (=pending events
> should be handled asap)" are kind of internal event that requires urgent
> handling in non-NMI kernel context. One question here is why we should
> have different vectors for these events that uses same mechanism.

I think the perf event subsytem can log events in NMI context already and
deliver them to userspace when the NMI is done. This is why I think Ingo
wants MCE to be updated to sit on top of the perf event subsytem to avoid
re-invent everything again.

Then again I do not know enough about the MCE stuff to understand what you
mean when an event comes in but you can't handle it in an NMI-safe
context. An example would be helpful.

Cheers,
Don

2010-06-14 14:44:10

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

> I think the perf event subsytem can log events in NMI context already and
> deliver them to userspace when the NMI is done. This is why I think Ingo
> wants MCE to be updated to sit on top of the perf event subsytem to avoid
> re-invent everything again.

perf is not solving the problem this is trying to solve.

> Then again I do not know enough about the MCE stuff to understand what you
> mean when an event comes in but you can't handle it in an NMI-safe
> context. An example would be helpful.

At least for MCE hwpoison recovery needs to sleep and you obviously cannot sleep in
NMI like context. The way it's done is to first do a self interrupt, then do a work queue
wakeup and finally the sleeping operations.

perf does not fit into this because it has no way to process such an event
inside the kernel.

Anyways this just cleans up the existing mechanism to share some code.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-14 15:12:57

by Don Zickus

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Mon, Jun 14, 2010 at 04:44:03PM +0200, Andi Kleen wrote:
> > I think the perf event subsytem can log events in NMI context already and
> > deliver them to userspace when the NMI is done. This is why I think Ingo
> > wants MCE to be updated to sit on top of the perf event subsytem to avoid
> > re-invent everything again.
>
> perf is not solving the problem this is trying to solve.
>
> > Then again I do not know enough about the MCE stuff to understand what you
> > mean when an event comes in but you can't handle it in an NMI-safe
> > context. An example would be helpful.
>
> At least for MCE hwpoison recovery needs to sleep and you obviously cannot sleep in
> NMI like context. The way it's done is to first do a self interrupt, then do a work queue
> wakeup and finally the sleeping operations.
>
> perf does not fit into this because it has no way to process such an event
> inside the kernel.

Ah, makes sense. Thanks for the example.

Cheers,
Don

2010-06-18 09:48:54

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


* Hidetoshi Seto <[email protected]> wrote:

> (2010/06/12 19:25), Ingo Molnar wrote:
> >
> > * Huang Ying <[email protected]> wrote:
> >
> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
> >> handler to call some functions. One solution is to delay the call via self
> >> interrupt, so that the delayed call can be done once the interrupt is
> >> enabled again. This has been implemented in MCE and perf event. This patch
> >> provides a unified version and make it easier for other NMI semantic handler
> >> to take use of the delayed call.
> >
> > Instead of introducing this extra intermediate facility please use the same
> > approach the unified NMI watchdog is using (see latest -tip): a perf event
> > callback gives all the extra functionality needed.
> >
> > The MCE code needs to be updated to use that - and then it will be integrated
> > into the events framework.
>
> Hi Ingo,
>
> I think this "NMI delayed call mechanism" could be a part of "the events
> framework" that we are planning to get in kernel soon. [...]

My request was to make it part of perf events - which is a generic event
logging framework. We dont really need/want a second 'events framework'
as we have one already ;-)

> [...] At least APEI will use NMI to report some hardware events (likely
> error) to kernel. So I suppose we will go to have a delayed call as an
> event handler for APEI.

Yep, that makes sense. I wasnt arguing against the functionality itself, i was
arguing against the illogical layering that limits its utility. By making it
part of perf events it becomes a generic part of that framework and can be
used by anything that deals with events and uses that framework.

Thanks,

Ingo

2010-06-18 10:31:17

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


* Andi Kleen <[email protected]> wrote:

> > I think the perf event subsytem can log events in NMI context already and
> > deliver them to userspace when the NMI is done. This is why I think Ingo
> > wants MCE to be updated to sit on top of the perf event subsytem to avoid
> > re-invent everything again.
>
> perf is not solving the problem this is trying to solve.

That is why i requested to extend the events backend. That will unify more of
the code than the first few steps achieved by these three patches - and offers
the functionality to all code that uses the events framework.

> [...]
>
> perf does not fit into this because it has no way to process such an event
> inside the kernel.

It 'does not fit' into the events backend only if you pretend that it is
impossible or undesirable to have a delayed, in-context callback mechanism
implemented there.

If you look at it more closely you'll notice that in reality it's not only
possible but that it is also a pretty natural fit.

Thanks,

Ingo

2010-06-18 11:34:57

by huang ying

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

Hi, Ingo,

On Fri, Jun 18, 2010 at 5:48 PM, Ingo Molnar <[email protected]> wrote:
>
> * Hidetoshi Seto <[email protected]> wrote:
>
>> (2010/06/12 19:25), Ingo Molnar wrote:
>> >
>> > * Huang Ying <[email protected]> wrote:
>> >
>> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
>> >> handler to call some functions. One solution is to delay the call via self
>> >> interrupt, so that the delayed call can be done once the interrupt is
>> >> enabled again. This has been implemented in MCE and perf event. This patch
>> >> provides a unified version and make it easier for other NMI semantic handler
>> >> to take use of the delayed call.
>> >
>> > Instead of introducing this extra intermediate facility please use the same
>> > approach the unified NMI watchdog is using (see latest -tip): a perf event
>> > callback gives all the extra functionality needed.
>> >
>> > The MCE code needs to be updated to use that - and then it will be integrated
>> > into the events framework.
>>
>> Hi Ingo,
>>
>> I think this "NMI delayed call mechanism" could be a part of "the events
>> framework" that we are planning to get in kernel soon. [...]
>
> My request was to make it part of perf events - which is a generic event
> logging framework. We dont really need/want a second 'events framework'
> as we have one already ;-)

This patchset is simple and straightforward, it is just a delayed
execution mechanism, not another 'events framework'. There are several
other NMI users other than perf, should we integrate all NMI users
into perf framework?

>> [...]  At least APEI will use NMI to report some hardware events (likely
>> error) to kernel.  So I suppose we will go to have a delayed call as an
>> event handler for APEI.
>
> Yep, that makes sense. I wasnt arguing against the functionality itself, i was
> arguing against the illogical layering that limits its utility. By making it
> part of perf events it becomes a generic part of that framework and can be
> used by anything that deals with events and uses that framework.

I think the the 'layering' in the patchset helps instead of 'limits'
its utility. It is designed to be as general as possible, so that it
can be used by both perf and other NMI users. Do you think so?

Best Regards,
Huang Ying

2010-06-18 11:56:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Sat, 2010-06-12 at 17:28 +0800, Huang Ying wrote:
> +#define NMI_DELAYED_CALL_ID_MAX 32
> +#define NMI_DELAYED_CALL_RESTART_MAX 5
> +
> +static nmi_delayed_call_func_t nmi_delayed_call_funcs[NMI_DELAYED_CALL_ID_MAX];
> +static DEFINE_SPINLOCK(nmi_delayed_call_lock);
> +
> +static DEFINE_PER_CPU(unsigned long, nmi_delayed_call_pending);
> +
> +static void nmi_delayed_call_run(void)
> +{
> + int cpu, restart = NMI_DELAYED_CALL_RESTART_MAX;
> + unsigned long pending, *ppending;
> + nmi_delayed_call_func_t *pfunc, func;
> +
> + cpu = smp_processor_id();
> + ppending = per_cpu_ptr(&nmi_delayed_call_pending, cpu);
> + while (*ppending && restart--) {
> + pending = xchg(ppending, 0);
> + pfunc = nmi_delayed_call_funcs;
> + do {
> + if (pending & 1) {
> + func = *pfunc;
> + if (func)
> + func();
> + }
> + pfunc++;
> + pending >>= 1;
> + } while (pending);
> + }
> +}

So aside from the should this be perf or not, the above is utter
gibberish. Whoever came up with this nonsense?

Why not make a work_struct like thing and enqueue it using cmpxchg on a
percpu list, then have the interrupt process them. Read
perf_pending_queue() and __perf_pending_run().

That way you don't need this whole register/id/limit crap.

> +#ifdef CONFIG_X86_LOCAL_APIC

What's the point of the rest of this code if we don't have a lapic?

> +asmlinkage void smp_nmi_delayed_call_interrupt(struct pt_regs *regs)
> +{
> + ack_APIC_irq();
> + irq_enter();

You're missing inc_irq_stat() there.

> + nmi_delayed_call_run();
> + irq_exit();
> +}
> +#endif

2010-06-18 12:25:43

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

> So aside from the should this be perf or not, the above is utter
> gibberish. Whoever came up with this nonsense?

This is pretty much how softirqs (and before them bottom halves) work.
I believe Linus invented that scheme originally back in the early
days of Linux.

It's actually quite simple and works well

-Andi

2010-06-18 12:46:10

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


* huang ying <[email protected]> wrote:

> Hi, Ingo,
>
> On Fri, Jun 18, 2010 at 5:48 PM, Ingo Molnar <[email protected]> wrote:
> >
> > * Hidetoshi Seto <[email protected]> wrote:
> >
> >> (2010/06/12 19:25), Ingo Molnar wrote:
> >> >
> >> > * Huang Ying <[email protected]> wrote:
> >> >
> >> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
> >> >> handler to call some functions. One solution is to delay the call via self
> >> >> interrupt, so that the delayed call can be done once the interrupt is
> >> >> enabled again. This has been implemented in MCE and perf event. This patch
> >> >> provides a unified version and make it easier for other NMI semantic handler
> >> >> to take use of the delayed call.
> >> >
> >> > Instead of introducing this extra intermediate facility please use the same
> >> > approach the unified NMI watchdog is using (see latest -tip): a perf event
> >> > callback gives all the extra functionality needed.
> >> >
> >> > The MCE code needs to be updated to use that - and then it will be integrated
> >> > into the events framework.
> >>
> >> Hi Ingo,
> >>
> >> I think this "NMI delayed call mechanism" could be a part of "the events
> >> framework" that we are planning to get in kernel soon. [...]
> >
> > My request was to make it part of perf events - which is a generic event
> > logging framework. We dont really need/want a second 'events framework' as
> > we have one already ;-)
>
> This patchset is simple and straightforward, [...]

We wouldnt want to add another workqueue or memory allocation mechanism
either, even if it was 'simple and straightforward'. We try to make things
more generally useful.

> [...] it is just a delayed execution mechanism, not another 'events
> framework'. There are several other NMI users other than perf, should we
> integrate all NMI users into perf framework?

We already did so with the NMI watchdog. What other significant NMI event
users do you have in mind?

> >> [...] ??At least APEI will use NMI to report some hardware events (likely
> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
> >> event handler for APEI.
> >
> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
> > was arguing against the illogical layering that limits its utility. By
> > making it part of perf events it becomes a generic part of that framework
> > and can be used by anything that deals with events and uses that
> > framework.
>
> I think the the 'layering' in the patchset helps instead of 'limits' its
> utility. It is designed to be as general as possible, so that it can be used
> by both perf and other NMI users. Do you think so?

What other NMI users do you mean? EDAC/MCE is going to go utilize events as
well (away from the horrible /dev/mcelog interface), the NMI watchdog already
did it and the perf tool obviously does as well. There's a few leftovers like
kcrash which isnt really event centric and i dont think it needs to be
converted.

Thanks,

Ingo

2010-06-18 12:48:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, 2010-06-18 at 14:25 +0200, Andi Kleen wrote:
> > So aside from the should this be perf or not, the above is utter
> > gibberish. Whoever came up with this nonsense?
>
> This is pretty much how softirqs (and before them bottom halves) work.
> I believe Linus invented that scheme originally back in the early
> days of Linux.

Doesn't mean its the right abstraction for this.

> It's actually quite simple and works well

And adds more code than it removes whilst providing a very limited
service.

You generally want to pass more information along anyway, now your
callback function needs to go look for it. Much better to pass a
work_struct like thing around that is contained in the state it needs.

2010-06-18 13:09:29

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

> You generally want to pass more information along anyway, now your
> callback function needs to go look for it. Much better to pass a
> work_struct like thing around that is contained in the state it needs.

But how would you allocate the work queue in an NMI?

If it's only a single instance (like this bit) it can be always put
into a per cpu variable.

-Andi

2010-06-18 13:13:10

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, 2010-06-18 at 15:09 +0200, Andi Kleen wrote:
> > You generally want to pass more information along anyway, now your
> > callback function needs to go look for it. Much better to pass a
> > work_struct like thing around that is contained in the state it needs.
>
> But how would you allocate the work queue in an NMI?
>
> If it's only a single instance (like this bit) it can be always put
> into a per cpu variable.

Pre-allocate. For the perf-event stuff we use the perf_event allocated
at creation time. But yeah, per-cpu storage also works.

2010-06-18 13:23:35

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, Jun 18, 2010 at 03:12:49PM +0200, Peter Zijlstra wrote:
> On Fri, 2010-06-18 at 15:09 +0200, Andi Kleen wrote:
> > > You generally want to pass more information along anyway, now your
> > > callback function needs to go look for it. Much better to pass a
> > > work_struct like thing around that is contained in the state it needs.
> >
> > But how would you allocate the work queue in an NMI?
> >
> > If it's only a single instance (like this bit) it can be always put
> > into a per cpu variable.
>
> Pre-allocate. For the perf-event stuff we use the perf_event allocated
> at creation time. But yeah, per-cpu storage also works.

So you could just preallocate the bits instead ?

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-18 13:25:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, 2010-06-18 at 15:23 +0200, Andi Kleen wrote:
> On Fri, Jun 18, 2010 at 03:12:49PM +0200, Peter Zijlstra wrote:
> > On Fri, 2010-06-18 at 15:09 +0200, Andi Kleen wrote:
> > > > You generally want to pass more information along anyway, now your
> > > > callback function needs to go look for it. Much better to pass a
> > > > work_struct like thing around that is contained in the state it needs.
> > >
> > > But how would you allocate the work queue in an NMI?
> > >
> > > If it's only a single instance (like this bit) it can be always put
> > > into a per cpu variable.
> >
> > Pre-allocate. For the perf-event stuff we use the perf_event allocated
> > at creation time. But yeah, per-cpu storage also works.
>
> So you could just preallocate the bits instead ?

You mean the bits in your function array? Those are limited to 32 and
you'd need a secondary lookup to match them to your data object, not
very useful.

2010-06-18 13:40:45

by huang ying

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, Jun 18, 2010 at 8:45 PM, Ingo Molnar <[email protected]> wrote:
>> >> [...] ??At least APEI will use NMI to report some hardware events (likely
>> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
>> >> event handler for APEI.
>> >
>> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
>> > was arguing against the illogical layering that limits its utility. By
>> > making it part of perf events it becomes a generic part of that framework
>> > and can be used by anything that deals with events and uses that
>> > framework.
>>
>> I think the the 'layering' in the patchset helps instead of 'limits' its
>> utility. It is designed to be as general as possible, so that it can be used
>> by both perf and other NMI users. Do you think so?
>
> What other NMI users do you mean? EDAC/MCE is going to go utilize events as
> well (away from the horrible /dev/mcelog interface), the NMI watchdog already
> did it and the perf tool obviously does as well. There's a few leftovers like
> kcrash which isnt really event centric and i dont think it needs to be
> converted.

But why not just make it more general? It does not hurt anyone including perf.

Best Regards,
Huang Ying

2010-06-18 14:35:52

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


* huang ying <[email protected]> wrote:

> On Fri, Jun 18, 2010 at 8:45 PM, Ingo Molnar <[email protected]> wrote:
> >> >> [...] ??At least APEI will use NMI to report some hardware events (likely
> >> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
> >> >> event handler for APEI.
> >> >
> >> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
> >> > was arguing against the illogical layering that limits its utility. By
> >> > making it part of perf events it becomes a generic part of that framework
> >> > and can be used by anything that deals with events and uses that
> >> > framework.
> >>
> >> I think the the 'layering' in the patchset helps instead of 'limits' its
> >> utility. It is designed to be as general as possible, so that it can be used
> >> by both perf and other NMI users. Do you think so?
> >
> > What other NMI users do you mean? EDAC/MCE is going to go utilize events
> > as well (away from the horrible /dev/mcelog interface), the NMI watchdog
> > already did it and the perf tool obviously does as well. There's a few
> > leftovers like kcrash which isnt really event centric and i dont think it
> > needs to be converted.
>
> But why not just make it more general? It does not hurt anyone including
> perf.

Because it's not actually more generic that way - just look at the code. It's
x86 specific, plus it ties it to NMI delivery while the concept of delayed
execution has nothing to do with NMIs.

Ingo

2010-06-18 14:38:09

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


* Andi Kleen <[email protected]> wrote:

> > So aside from the should this be perf or not, the above is utter
> > gibberish. Whoever came up with this nonsense?
>
> This is pretty much how softirqs (and before them bottom halves) work.
> I believe Linus invented that scheme originally back in the early
> days of Linux.

Nope - the softirq code was written by Alexey Kuznetsov and David S. Miller.
But it's irrelevant, because:

> It's actually quite simple and works well

... as Peter said the last thing we want is yet another softirq vector.

Thanks,

Ingo

2010-06-18 15:16:26

by huang ying

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, Jun 18, 2010 at 10:35 PM, Ingo Molnar <[email protected]> wrote:
>
> * huang ying <[email protected]> wrote:
>
>> On Fri, Jun 18, 2010 at 8:45 PM, Ingo Molnar <[email protected]> wrote:
>> >> >> [...] ??At least APEI will use NMI to report some hardware events (likely
>> >> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
>> >> >> event handler for APEI.
>> >> >
>> >> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
>> >> > was arguing against the illogical layering that limits its utility. By
>> >> > making it part of perf events it becomes a generic part of that framework
>> >> > and can be used by anything that deals with events and uses that
>> >> > framework.
>> >>
>> >> I think the the 'layering' in the patchset helps instead of 'limits' its
>> >> utility. It is designed to be as general as possible, so that it can be used
>> >> by both perf and other NMI users. Do you think so?
>> >
>> > What other NMI users do you mean? EDAC/MCE is going to go utilize events
>> > as well (away from the horrible /dev/mcelog interface), the NMI watchdog
>> > already did it and the perf tool obviously does as well. There's a few
>> > leftovers like kcrash which isnt really event centric and i dont think it
>> > needs to be converted.
>>
>> But why not just make it more general? It does not hurt anyone including
>> perf.
>
> Because it's not actually more generic that way - just look at the code. It's
> x86 specific, plus it ties it to NMI delivery while the concept of delayed
> execution has nothing to do with NMIs.

soft_irq is a delayed mechanism for IRQ, a self interrupt can be a
delayed mechanism for NMI. If we can make soft_irq NMI-safe, we can
use soft_irq as a backup of self interrupt (for systems without APIC
and maybe for other architectures).

Best Regards,
Huang Ying

2010-06-18 15:31:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, 2010-06-18 at 23:16 +0800, huang ying wrote:
>
> soft_irq is a delayed mechanism for IRQ,

No its not.

> a self interrupt can be a
> delayed mechanism for NMI. If we can make soft_irq NMI-safe,

No you can't.

> we can
> use soft_irq as a backup of self interrupt (for systems without APIC
> and maybe for other architectures).

Whatever would you want to do that for.

2010-06-19 01:51:05

by huang ying

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Fri, Jun 18, 2010 at 11:31 PM, Peter Zijlstra <[email protected]> wrote:
> On Fri, 2010-06-18 at 23:16 +0800, huang ying wrote:
>>
>> soft_irq is a delayed mechanism for IRQ,
>
> No its not.

Why? What do you think soft_irq is for?

>>  a self interrupt can be a
>> delayed mechanism for NMI. If we can make soft_irq NMI-safe,
>
> No you can't.
>
>> we can
>> use soft_irq as a backup of self interrupt (for systems without APIC
>> and maybe for other architectures).
>
> Whatever would you want to do that for.

Best Regards,
Huang Ying

2010-06-19 08:02:15

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

> > we can
> > use soft_irq as a backup of self interrupt (for systems without APIC
> > and maybe for other architectures).
>
> Whatever would you want to do that for.

The idea is that the work would be done latest on the next
timer interrupt as a fallback if APIC is not available.

That's what mce does already and it's probably approbiate
for most other users too.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-19 10:54:10

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism


( Ugh: what's this new fad of you not quoting the name of the person who
wrote a mail? It makes multi-level quotes utterly unreadable as it's not
clear who wrote what. You should also respect others by quoting their names. )

* Andi Kleen <[email protected]> wrote:

> > > we can use soft_irq as a backup of self interrupt (for systems without
> > > APIC and maybe for other architectures).
> >
> > Whatever would you want to do that for.
>
> The idea is that the work would be done latest on the next timer interrupt
> as a fallback if APIC is not available.

Abusing the timer irq for that is an exceedingly ugly and unacceptable design,
as machine check events have nothing to do with timers. (That approach is also
buggy because it inserts an arbitrary delay - which could be rather long on
nohz.)

This kind of messy, ad-hoc piggybacking only creates unmaintainable code in
the long run.

> That's what mce does already and it's probably approbiate for most other
> users too.

Hell no, the unfortunate and unclean practices of the MCE code must not be
propagated elsewhere.

The proper, generic approach would be to enable softirq notifications (on x86)
from NMI contexts as well (it's actually possible without overhead), and to
extend user return notifiers with the logical next step: nmi return notifiers.
If presented in such a form then those could use softirqs for atomic callbacks
and per cpu kthreads for sleepable callbacks, etc.

Ingo

2010-06-19 14:07:44

by huang ying

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Sat, Jun 19, 2010 at 6:53 PM, Ingo Molnar <[email protected]> wrote:
> The proper, generic approach would be to enable softirq notifications (on x86)
> from NMI contexts as well (it's actually possible without overhead),

Yes. I will do that. And I think self interrupt can be used as the
short-cut for soft_irq if available. The next soft_irq may be too late
if there is too few interrupts.

> and to
> extend user return notifiers with the logical next step: nmi return notifiers.
> If presented in such a form then those could use softirqs for atomic callbacks
> and per cpu kthreads for sleepable callbacks, etc.

NMI return notifiers fired in soft_irq?

Best Regards,
Huang Ying

2010-06-19 14:17:27

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

> Nope - the softirq code was written by Alexey Kuznetsov and David S. Miller.

It was based on the design of the previous bottom halves in case you
missed it. I believe BHs were there from nearly the beginning.

(softirqs were basically just per cpu bottom halves, otherwise
being very similar)

-Andi

2010-06-19 14:24:11

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC 1/3] Unified NMI delayed call mechanism

On Sat, Jun 19, 2010 at 12:53:52PM +0200, Ingo Molnar wrote:
>
> ( Ugh: what's this new fad of you not quoting the name of the person who
> wrote a mail? It makes multi-level quotes utterly unreadable as it's not
> clear who wrote what. You should also respect others by quoting their names. )

To give you an excuse to flame of course. I know that's
your favourite pastime and I'm always happy to make you happy.

>
> Abusing the timer irq for that is an exceedingly ugly and unacceptable design,
> as machine check events have nothing to do with timers. (That approach is also
The rationale is that this nearly never happens in practice anyways
(the operations that trigger it do near always only happen with APICs)
AFAIK that's true for all the proposed users.

For very rare and obscure cases like this in my experience
the simplest possible code is the best,
that makes it most likely it actually works when it's needed.
I do not know of any simpler way to implement this.

Of course if there's evidence it's actually common that would
need to be revisited.

-Andi
--
[email protected] -- Speaking for myself only.