2002-10-23 20:08:28

by Corey Minyard

Subject: [PATCH] NMI request/release, version 4

diff -ur linux.orig/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c
--- linux.orig/arch/i386/kernel/i386_ksyms.c Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/i386_ksyms.c Tue Oct 22 12:19:59 2002
@@ -90,6 +90,9 @@
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);

+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
#ifdef CONFIG_DEBUG_IOVIRT
EXPORT_SYMBOL(__io_virt_debug);
#endif
diff -ur linux.orig/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux.orig/arch/i386/kernel/irq.c Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/irq.c Tue Oct 22 12:08:20 2002
@@ -131,6 +131,8 @@
* Generic, controller-independent functions:
*/

+extern void nmi_append_user_names(struct seq_file *p);
+
int show_interrupts(struct seq_file *p, void *v)
{
int i, j;
@@ -166,6 +168,8 @@
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
p += seq_printf(p, "%10u ", nmi_count(j));
+ seq_printf(p, " ");
+ nmi_append_user_names(p);
seq_putc(p, '\n');
#if CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
diff -ur linux.orig/arch/i386/kernel/nmi.c linux/arch/i386/kernel/nmi.c
--- linux.orig/arch/i386/kernel/nmi.c Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi.c Wed Oct 23 13:57:55 2002
@@ -20,6 +20,7 @@
#include <linux/interrupt.h>
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>

#include <asm/smp.h>
#include <asm/mtrr.h>
@@ -102,6 +103,17 @@
return 0;
}

+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_watchdog_handler.link),
+ .dev_name = "nmi_watchdog",
+ .dev_id = NULL,
+ .handler = nmi_watchdog_tick,
+ .priority = 255, /* We want to be relatively high priority. */
+};
+
static int __init setup_nmi_watchdog(char *str)
{
int nmi;
@@ -110,6 +122,12 @@

if (nmi >= NMI_INVALID)
return 0;
+
+ if (request_nmi(&nmi_watchdog_handler) != 0) {
+ /* Couldn't add a watchdog handler, give up. */
+ return 0;
+ }
+
if (nmi == NMI_NONE)
nmi_watchdog = nmi;
/*
@@ -350,7 +368,7 @@
alert_counter[i] = 0;
}

-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs)
{

/*
@@ -401,4 +419,6 @@
}
wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
}
+
+ return NOTIFY_OK;
}
diff -ur linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- linux.orig/arch/i386/kernel/traps.c Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/traps.c Wed Oct 23 14:58:09 2002
@@ -23,6 +23,10 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
+#include <linux/notifier.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>

#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -485,21 +489,139 @@
printk("Do you have a strange power saving mode enabled?\n");
}

+/*
+ * A list of handlers for NMIs. This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in. If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE. NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner. A
+ * spinlock protects the list from multiple writers. When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an rcu. When the rcu runs, it is guaranteed that nothing in
+ * the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * To free the list item, we use an rcu. The rcu-function will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
+{
+ struct nmi_handler *handler = arg;
+
+ INIT_LIST_HEAD(&(handler->link));
+ wmb();
+ wake_up(&(handler->wait));
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h = NULL;
+
+ if (!list_empty(&(handler->link)))
+ return EBUSY;
+
+ spin_lock(&nmi_handler_lock);
+
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->priority <= handler->priority)
+ break;
+ }
+
+ if (curr_h)
+ /* list_add_rcu takes care of memory barrier */
+ list_add_rcu(&(handler->link), curr_h->link.prev);
+ else
+ list_add_rcu(&(handler->link), &nmi_handler_list);
+
+ spin_unlock(&nmi_handler_lock);
+ return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+ wait_queue_t q_ent;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nmi_handler_lock, flags);
+ list_del_rcu(&(handler->link));
+
+ /* Wait for handler to finish being freed. This can't be
+ interrupted; we must wait until it has finished. */
+ init_waitqueue_head(&(handler->wait));
+ init_waitqueue_entry(&q_ent, current);
+ add_wait_queue(&(handler->wait), &q_ent);
+ call_rcu(&(handler->rcu), free_nmi_handler, handler);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (list_empty(&(handler->link)))
+ break;
+ spin_unlock_irqrestore(&nmi_handler_lock, flags);
+ schedule();
+ spin_lock_irqsave(&nmi_handler_lock, flags);
+ }
+ remove_wait_queue(&(handler->wait), &q_ent);
+ spin_unlock_irqrestore(&nmi_handler_lock, flags);
+}
+
+static int call_nmi_handlers(struct pt_regs * regs)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+ int handled = 0;
+ int val;
+
+ __list_for_each_rcu(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ val = curr_h->handler(curr_h->dev_id, regs);
+ switch (val & ~NOTIFY_STOP_MASK) {
+ case NOTIFY_OK:
+ handled = 1;
+ break;
+
+ case NOTIFY_DONE:
+ default:
+ break;
+ }
+ if (val & NOTIFY_STOP_MASK)
+ break;
+ }
+ return handled;
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+
+ spin_lock(&nmi_handler_lock);
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->dev_name)
+ p += seq_printf(p, " %s", curr_h->dev_name);
+ }
+ spin_unlock(&nmi_handler_lock);
+}
+
static void default_do_nmi(struct pt_regs * regs)
{
unsigned char reason = inb(0x61);

if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
/*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
+ * Check the handler list to see if anyone can handle this
+ * nmi.
*/
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
+ if (call_nmi_handlers(regs))
return;
- }
-#endif
+
unknown_nmi_error(reason, regs);
return;
}
diff -ur linux.orig/include/asm-i386/apic.h linux/include/asm-i386/apic.h
--- linux.orig/include/asm-i386/apic.h Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
extern void setup_boot_APIC_clock (void);
extern void setup_secondary_APIC_clock (void);
extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
extern int APIC_init_uniprocessor (void);
extern void disable_APIC_timer(void);
extern void enable_APIC_timer(void);
diff -ur linux.orig/include/asm-i386/irq.h linux/include/asm-i386/irq.h
--- linux.orig/include/asm-i386/irq.h Mon Oct 21 13:24:41 2002
+++ linux/include/asm-i386/irq.h Wed Oct 23 14:15:59 2002
@@ -12,6 +12,7 @@

#include <linux/config.h>
#include <linux/sched.h>
+#include <linux/rcupdate.h>
/* include comes from machine specific directory */
#include "irq_vectors.h"

@@ -27,5 +28,36 @@
#ifdef CONFIG_X86_LOCAL_APIC
#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
#endif
+
+
+/**
+ * Register a handler to get called when an NMI occurs. If the
+ * handler actually handles the NMI, it should return NOTIFY_OK. If
+ * it did not handle the NMI, it should return NOTIFY_DONE. It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified. Note that this is a slow-path
+ * handler, if you have a fast-path handler there's another tie-in
+ * for you. See the oprofile code.
+ */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+ struct list_head link; /* You must init this before use. */
+
+ char *dev_name;
+ void *dev_id;
+ int (*handler)(void *dev_id, struct pt_regs *regs);
+ int priority; /* Handlers called in priority order. */
+
+ /* Don't mess with anything below here. */
+
+ struct rcu_head rcu;
+ wait_queue_head_t wait;
+};
+
+int request_nmi(struct nmi_handler *handler);
+
+/* Release will block until the handler is completely free. */
+void release_nmi(struct nmi_handler *handler);

#endif /* _ASM_IRQ_H */


Attachments:
linux-nmi-v4.diff (8.89 kB)
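
To see how the exported interface is meant to be used, here is a minimal
sketch of a module registering and releasing a handler. Only struct
nmi_handler, request_nmi(), release_nmi() and the NOTIFY_* return values
come from the patch; the my_* names are made up purely for illustration.

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/notifier.h>
#include <asm/irq.h>

static int my_nmi_tick(void *dev_id, struct pt_regs *regs)
{
	/* Decide whether this NMI was ours.  Return NOTIFY_OK if it was
	   handled, NOTIFY_DONE if not; or in NOTIFY_STOP_MASK to keep
	   lower-priority handlers from being called. */
	return NOTIFY_DONE;
}

static struct nmi_handler my_nmi_handler =
{
	.link     = LIST_HEAD_INIT(my_nmi_handler.link),
	.dev_name = "my_driver",
	.dev_id   = NULL,
	.handler  = my_nmi_tick,
	.priority = 0,	/* called after higher-priority handlers */
};

static int __init my_init(void)
{
	/* request_nmi() fails if this handler is already registered
	   (its list link is not empty). */
	if (request_nmi(&my_nmi_handler) != 0)
		return -EBUSY;
	return 0;
}

static void __exit my_exit(void)
{
	/* Blocks until no CPU can still be executing the handler. */
	release_nmi(&my_nmi_handler);
}

module_init(my_init);
module_exit(my_exit);

The nmi_watchdog conversion in arch/i386/kernel/nmi.c above follows the
same pattern, using priority 255 so it is called ahead of other handlers.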

2002-10-23 20:49:30

by Dipankar Sarma

Subject: Re: [PATCH] NMI request/release, version 4

Well, I haven't looked at the whole patch yet, but some quick
responses -

On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
> I have noticed that the rcu callback can be delayed a long time,
> sometimes up to 3 seconds. That seems odd. I have verified that the
> delay happens there.

That kind of latency is really bad. Could you please check the latency
with this change in kernel/rcupdate.c -

void rcu_check_callbacks(int cpu, int user)
{
	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= 1))
+	    (idle_cpu(cpu) && !in_softirq() &&
+	     hardirq_count() <= (1 << HARDIRQ_SHIFT)))
		RCU_qsctr(cpu)++;
	tasklet_schedule(&RCU_tasklet(cpu));

After local_irq_count() went away, the idle CPU check was broken
and that meant that if you had an idle CPU, it could hold up RCU
grace period completion.


> +void release_nmi(struct nmi_handler *handler)
> +{
> + wait_queue_t q_ent;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&nmi_handler_lock, flags);
> + list_del_rcu(&(handler->link));
> +
> + /* Wait for handler to finish being freed. This can't be
> + interrupted, we must wait until it finished. */
> + init_waitqueue_head(&(handler->wait));
> + init_waitqueue_entry(&q_ent, current);
> + add_wait_queue(&(handler->wait), &q_ent);
> + call_rcu(&(handler->rcu), free_nmi_handler, handler);
> + for (;;) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + if (list_empty(&(handler->link)))
> + break;
> + spin_unlock_irqrestore(&nmi_handler_lock, flags);
> + schedule();
> + spin_lock_irqsave(&nmi_handler_lock, flags);
> + }
> + remove_wait_queue(&(handler->wait), &q_ent);
> + spin_unlock_irqrestore(&nmi_handler_lock, flags);
> +}

It might just be simpler to use completions instead -

call_rcu(&(handler->rcu), free_nmi_handler, handler);
init_completion(&handler->completion);
spin_unlock_irqrestore(&nmi_handler_lock, flags);
wait_for_completion(&handler->completion);

and do

complete(&handler->completion);

in the RCU callback.
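
Put together, a release_nmi() along those lines might look something like
this (just a sketch of the suggestion, not the posted patch; it assumes
struct nmi_handler carries a struct completion from <linux/completion.h>
in place of the wait queue):

/* Runs as the RCU callback, i.e. after a grace period, so no CPU can
   still be executing this handler from the NMI path. */
static void free_nmi_handler(void *arg)
{
	struct nmi_handler *handler = arg;

	INIT_LIST_HEAD(&(handler->link));
	complete(&(handler->completion));
}

void release_nmi(struct nmi_handler *handler)
{
	unsigned long flags;

	spin_lock_irqsave(&nmi_handler_lock, flags);
	list_del_rcu(&(handler->link));
	call_rcu(&(handler->rcu), free_nmi_handler, handler);
	init_completion(&(handler->completion));
	spin_unlock_irqrestore(&nmi_handler_lock, flags);

	/* Sleep until the callback has run; after that the handler
	   structure can safely be freed or reused. */
	wait_for_completion(&(handler->completion));
}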

Also, now I think your original idea of "Don't do this!" :) was right.
Waiting until an nmi handler is seen unlinked could make a task
block for a long time if another task re-installs it. You should
probably just fail installation of a busy handler (checked under
lock).


Thanks
Dipankar

2002-10-23 21:47:04

by Corey Minyard

Subject: Re: [PATCH] NMI request/release, version 4

Dipankar Sarma wrote:

>Well, I haven't looked at the whole patch yet, but some quick
>responses -
>
>On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
>
>
>>I have noticed that the rcu callback can be delayed a long time,
>>sometimes up to 3 seconds. That seems odd. I have verified that the
>>delay happens there.
>>
>>
>
>That kind of latency is really bad. Could you please check the latency
>with this change in kernel/rcupdate.c -
>
> void rcu_check_callbacks(int cpu, int user)
> {
> if (user ||
>- (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= 1))
>+ (idle_cpu(cpu) && !in_softirq() &&
>+ hardirq_count() <= (1 << HARDIRQ_SHIFT)))
> RCU_qsctr(cpu)++;
> tasklet_schedule(&RCU_tasklet(cpu));
>
>After local_irq_count() went away, the idle CPU check was broken
>and that meant that if you had an idle CPU, it could hold up RCU
>grace period completion.
>
Ah, much better. That seems to fix it.

>It might just be simpler to use completions instead -
>
> call_rcu(&(handler->rcu), free_nmi_handler, handler);
> init_completion(&handler->completion);
> spin_unlock_irqrestore(&nmi_handler_lock, flags);
> wait_for_completion(&handler->completion);
>
>and do
>
> complete(&handler->completion);
>
>in the the RCU callback.
>
I was working under the assumption that the spinlocks were needed. But
now I see that there are spinlocks in wait_for_completion. You did get
init_completion() and call_rcu() backwards; they would need to be in the
opposite order, I think.

>Also, now I think your original idea of "Don't do this!" :) was right.
>Waiting until an nmi handler is seen unlinked could make a task
>block for a long time if another task re-installs it. You should
>probably just fail installation of a busy handler (checked under
>lock).
>
Since just about all of these will be in modules at unload time, I'm
thinking that the way it is now is better. Otherwise, someone will mess
it up. IMHO, it's much more likely that someone doesn't handle the
callback correctly than that someone reuses the handler before the call
that releases it returns. I'd prefer to leave it the way it is now.

-Corey

2002-10-24 07:40:03

by Dipankar Sarma

Subject: Re: [PATCH] NMI request/release, version 4

On Wed, Oct 23, 2002 at 04:53:34PM -0500, Corey Minyard wrote:
> >After local_irq_count() went away, the idle CPU check was broken
> >and that meant that if you had an idle CPU, it could hold up RCU
> >grace period completion.
> >
> Ah, much better. That seems to fix it.

Great! Do you have any latency numbers ? Just curious.

>
> >It might just be simpler to use completions instead -
> >
> > call_rcu(&(handler->rcu), free_nmi_handler, handler);
> > init_completion(&handler->completion);
> > spin_unlock_irqrestore(&nmi_handler_lock, flags);
> > wait_for_completion(&handler->completion);
> >
> >and do
> >
> > complete(&handler->completion);
> >
> >in the the RCU callback.
> >
> I was working under the assumption that the spinlocks were needed. But
> now I see that there are spinlocks in wait_for_completion. You did get
> init_completion() and call_rcu() backwards, they would need to be the
> opposite order, I think.

AFAICS, the ordering of call_rcu() and init_completion() should not matter
because the CPU that is executing them would not have gone
through a quiescent state and thus the RCU callback cannot happen.
The callback becomes possible only after a context switch in
wait_for_completion().
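
In code terms, a sketch with the same locking as in the patch, just to
make the quiescent-state argument explicit:

	spin_lock_irqsave(&nmi_handler_lock, flags);
	list_del_rcu(&(handler->link));

	/* In either order: while this CPU holds the lock it cannot
	   schedule, so it cannot pass through a quiescent state, the
	   grace period started by call_rcu() cannot complete, and the
	   callback cannot run yet. */
	call_rcu(&(handler->rcu), free_nmi_handler, handler);
	init_completion(&(handler->completion));

	spin_unlock_irqrestore(&nmi_handler_lock, flags);

	/* Only once we block here (a context switch, hence a quiescent
	   state on this CPU) can the grace period complete and the
	   callback call complete(). */
	wait_for_completion(&(handler->completion));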


>
> >Also, now I think your original idea of "Don't do this!" :) was right.
> >Waiting until an nmi handler is seen unlinked could make a task
> >block for a long time if another task re-installs it. You should
> >probably just fail installation of a busy handler (checked under
> >lock).
> >
> Since just about all of these will be in modules at unload time, I'm
> thinking that the way it is now is better. Otherwise, someone will mess
> it up. IMHO, it's much more likely that someone doesn't handle the
> callback correctly than someone reused the value before the call that
> releases it returns. I'd prefer to leave it the way it is now.

Oh, I didn't mean the part about waiting in release_nmi() until
the release is complete (RCU callback done). That is absolutely
necessary. I was talking about looping until the handler is
not busy any more. I think it is safe to just do a wait_for_completion()
and return in release_nmi().

Thanks
Dipankar

2002-10-24 07:49:01

by Dipankar Sarma

Subject: Re: [PATCH] NMI request/release, version 4

Ok, some more comments -

On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
> +void release_nmi(struct nmi_handler *handler)
> +{
> + wait_queue_t q_ent;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&nmi_handler_lock, flags);
> + list_del_rcu(&(handler->link));
> +
> + /* Wait for handler to finish being freed. This can't be
> + interrupted, we must wait until it finished. */
> + init_waitqueue_head(&(handler->wait));
> + init_waitqueue_entry(&q_ent, current);
> + add_wait_queue(&(handler->wait), &q_ent);
> + call_rcu(&(handler->rcu), free_nmi_handler, handler);
> + for (;;) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + if (list_empty(&(handler->link)))
> + break;
> + spin_unlock_irqrestore(&nmi_handler_lock, flags);
> + schedule();
> + spin_lock_irqsave(&nmi_handler_lock, flags);
> + }
> + remove_wait_queue(&(handler->wait), &q_ent);
> + spin_unlock_irqrestore(&nmi_handler_lock, flags);
> +}

Can release_nmi() be done from irq context? If not, I don't see
why spin_lock_irqsave() is required here. If it can be called
from irq context, then I can't see how you can schedule()
(or wait_for_completion() for that matter :)).

Thanks
Dipankar

2002-10-24 12:59:00

by Corey Minyard

Subject: Re: [PATCH] NMI request/release, version 4

Dipankar Sarma wrote:

>Ok, some more comments -
>
>On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
>
>
>Can release_nmi() be done from irq context ? If not, I don't see
>why spin_lock_irqsave() is required here. If it can be called
>from irq context, then I can't see how you can schedule()
>(or wait_for_completion() for that matter :)).
>
I was originally using nmi_handler_lock in the RCU callback (which runs
at interrupt level). You have to turn off interrupts if someone else
can claim the lock at interrupt level. However, I'm not any more, so it
can go away.

-Corey

2002-10-24 13:01:50

by Corey Minyard

Subject: Re: [PATCH] NMI request/release, version 4

Dipankar Sarma wrote:

>On Wed, Oct 23, 2002 at 04:53:34PM -0500, Corey Minyard wrote:
>
>
>>>After local_irq_count() went away, the idle CPU check was broken
>>>and that meant that if you had an idle CPU, it could hold up RCU
>>>grace period completion.
>>>
>>Ah, much better. That seems to fix it.
>>
>>
>Great! Do you have any latency numbers ? Just curious.
>
Unfortunately not. 3 seconds is well within the realm of human
observation with printk.

>>>It might just be simpler to use completions instead -
>>>
>>> call_rcu(&(handler->rcu), free_nmi_handler, handler);
>>> init_completion(&handler->completion);
>>> spin_unlock_irqrestore(&nmi_handler_lock, flags);
>>> wait_for_completion(&handler->completion);
>>>
>>>and do
>>>
>>> complete(&handler->completion);
>>>
>>>in the the RCU callback.
>>>
>>>
>>>
>>I was working under the assumption that the spinlocks were needed. But
>>now I see that there are spinlocks in wait_for_completion. You did get
>>init_completion() and call_rcu() backwards, they would need to be the
>>opposite order, I think.
>>
>>
>
>AFAICS, the ordering of call_rcu() and init_completion should not matter
>because the CPU that is executing them would not have gone
>through a quiescent state and thus the RCU callback cannot happen.
>Only after a context swtich in wait_for_completion(), the callback
>is possible.
>
Yes, I think you are right. I'm still not used to the RCUs :-).

-Corey