2010-06-24 03:05:18

by Huang, Ying

Subject: [RFC 1/5] Make soft_irq NMI safe

NMI can be triggered even when IRQs are masked, so many kernel services
cannot be used in an NMI handler. An NMI handler therefore needs to be
able to trigger operations in other contexts such as IRQ and process
context. To support this, this patch makes the soft_irq mechanism
usable from NMI handlers.
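
The essence of the change, condensed from the interrupt.h and
kernel/softirq.c hunks below (the fetch_and_clear_pending() wrapper is
only for illustration; the patch open-codes the xchg()):
local_softirq_pending() now yields a pointer to the per-CPU pending
mask, so the raise path becomes an atomic set_bit() that is safe in NMI
context, and __do_softirq() fetches and clears the mask with a single
xchg() while interrupts are enabled.

/* Raise path: an atomic RMW on the pending mask instead of a plain |=,
 * so an NMI that interrupts a half-finished read-modify-write of the
 * same word can no longer cause a lost or corrupted bit.
 */
#define __raise_softirq_preempt_off(nr) \
	do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0)

/* Consume path in __do_softirq(): grab and clear the whole mask in one
 * atomic operation instead of set_softirq_pending(0) under IRQ-off.
 */
static inline __u32 fetch_and_clear_pending(void)
{
	return xchg(local_softirq_pending(), 0);
}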

Signed-off-by: Huang Ying <[email protected]>
---
arch/ia64/include/asm/hardirq.h | 2 -
arch/powerpc/include/asm/hardirq.h | 2 -
arch/powerpc/kernel/irq.c | 2 -
arch/s390/include/asm/hardirq.h | 2 -
arch/s390/kernel/irq.c | 2 -
arch/sh/kernel/irq.c | 2 -
arch/sparc/include/asm/hardirq_64.h | 2 -
arch/sparc/kernel/irq_64.c | 2 -
arch/x86/include/asm/hardirq.h | 7 ----
arch/x86/kernel/irq_32.c | 2 -
arch/x86/kernel/irq_64.c | 2 -
block/blk-iopoll.c | 6 +--
block/blk-softirq.c | 6 +--
include/linux/interrupt.h | 10 +----
include/linux/irq_cpustat.h | 2 -
kernel/hrtimer.c | 4 +-
kernel/softirq.c | 61 +++++++++++++++++-------------------
kernel/time/tick-sched.c | 6 +--
net/core/dev.c | 14 ++++----
19 files changed, 62 insertions(+), 74 deletions(-)

--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -18,7 +18,7 @@

#define __ARCH_IRQ_STAT 1

-#define local_softirq_pending() (local_cpu_data->softirq_pending)
+#define local_softirq_pending() (&local_cpu_data->softirq_pending)

extern void __iomem *ipi_base_addr;

--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -16,7 +16,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpust

#define __ARCH_IRQ_STAT

-#define local_softirq_pending() __get_cpu_var(irq_stat).__softirq_pending
+#define local_softirq_pending() (&__get_cpu_var(irq_stat).__softirq_pending)

static inline void ack_bad_irq(unsigned int irq)
{
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -505,7 +505,7 @@ void do_softirq(void)

local_irq_save(flags);

- if (local_softirq_pending())
+ if (*local_softirq_pending())
do_softirq_onstack();

local_irq_restore(flags);
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,7 @@
#include <linux/interrupt.h>
#include <asm/lowcore.h>

-#define local_softirq_pending() (S390_lowcore.softirq_pending)
+#define local_softirq_pending() (&S390_lowcore.softirq_pending)

#define __ARCH_IRQ_STAT
#define __ARCH_HAS_DO_SOFTIRQ
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -70,7 +70,7 @@ asmlinkage void do_softirq(void)

local_irq_save(flags);

- if (local_softirq_pending()) {
+ if (*local_softirq_pending()) {
/* Get current stack pointer. */
asm volatile("la %0,0(15)" : "=a" (old));
/* Check against async. stack address range. */
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -212,7 +212,7 @@ asmlinkage void do_softirq(void)

local_irq_save(flags);

- if (local_softirq_pending()) {
+ if (*local_softirq_pending()) {
curctx = current_thread_info();
irqctx = softirq_ctx[smp_processor_id()];
irqctx->tinfo.task = curctx->task;
--- a/arch/sparc/include/asm/hardirq_64.h
+++ b/arch/sparc/include/asm/hardirq_64.h
@@ -10,7 +10,7 @@

#define __ARCH_IRQ_STAT
#define local_softirq_pending() \
- (local_cpu_data().__softirq_pending)
+ (&local_cpu_data().__softirq_pending)

void ack_bad_irq(unsigned int irq);

--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -770,7 +770,7 @@ void do_softirq(void)

local_irq_save(flags);

- if (local_softirq_pending()) {
+ if (*local_softirq_pending()) {
void *orig_sp, *sp = softirq_stack[smp_processor_id()];

sp += THREAD_SIZE - 192 - STACK_BIAS;
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -37,12 +37,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpust

#define inc_irq_stat(member) percpu_inc(irq_stat.member)

-#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
-
-#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
+#define local_softirq_pending() (&__get_cpu_var(irq_stat.__softirq_pending))

extern void ack_bad_irq(unsigned int irq);

--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -168,7 +168,7 @@ asmlinkage void do_softirq(void)

local_irq_save(flags);

- if (local_softirq_pending()) {
+ if (*local_softirq_pending()) {
curctx = current_thread_info();
irqctx = __get_cpu_var(softirq_ctx);
irqctx->tinfo.task = curctx->task;
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -74,7 +74,7 @@ asmlinkage void do_softirq(void)
return;

local_irq_save(flags);
- pending = local_softirq_pending();
+ pending = *local_softirq_pending();
/* Switch to interrupt stack */
if (pending) {
call_softirq();
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -36,7 +36,7 @@ void blk_iopoll_sched(struct blk_iopoll

local_irq_save(flags);
list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ __raise_softirq_preempt_off(BLOCK_IOPOLL_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_iopoll_sched);
@@ -132,7 +132,7 @@ static void blk_iopoll_softirq(struct so
}

if (rearm)
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ __raise_softirq_preempt_off(BLOCK_IOPOLL_SOFTIRQ);

local_irq_enable();
}
@@ -202,7 +202,7 @@ static int __cpuinit blk_iopoll_cpu_noti
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
&__get_cpu_var(blk_cpu_iopoll));
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ __raise_softirq_preempt_off(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}

--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -47,7 +47,7 @@ static void trigger_softirq(void *data)
list_add_tail(&rq->csd.list, list);

if (list->next == &rq->csd.list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ raise_softirq_preempt_off(BLOCK_SOFTIRQ);

local_irq_restore(flags);
}
@@ -90,7 +90,7 @@ static int __cpuinit blk_cpu_notify(stru
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_done, cpu),
&__get_cpu_var(blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ raise_softirq_preempt_off(BLOCK_SOFTIRQ);
local_irq_enable();
}

@@ -134,7 +134,7 @@ do_local:
* hasn't run yet.
*/
if (list->next == &req->csd.list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ raise_softirq_preempt_off(BLOCK_SOFTIRQ);
} else if (raise_blk_irq(ccpu, req))
goto do_local;

--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -347,11 +347,6 @@ static inline int disable_irq_wake(unsig
}
#endif /* CONFIG_GENERIC_HARDIRQS */

-#ifndef __ARCH_SET_SOFTIRQ_PENDING
-#define set_softirq_pending(x) (local_softirq_pending() = (x))
-#define or_softirq_pending(x) (local_softirq_pending() |= (x))
-#endif
-
/* Some architectures might implement lazy enabling/disabling of
* interrupts. In some cases, such as stop_machine, we might want
* to ensure that after a local_irq_disable(), interrupts have
@@ -402,8 +397,9 @@ asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void);
extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
-extern void raise_softirq_irqoff(unsigned int nr);
+#define __raise_softirq_preempt_off(nr) \
+ do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0)
+extern void raise_softirq_preempt_off(unsigned int nr);
extern void raise_softirq(unsigned int nr);
extern void wakeup_softirqd(void);

--- a/include/linux/irq_cpustat.h
+++ b/include/linux/irq_cpustat.h
@@ -23,7 +23,7 @@ extern irq_cpustat_t irq_stat[]; /* def

/* arch independent irq_stat fields */
#define local_softirq_pending() \
- __IRQ_STAT(smp_processor_id(), __softirq_pending)
+ (&__IRQ_STAT(smp_processor_id(), __softirq_pending))

/* arch dependent irq_stat fields */
#define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -695,10 +695,10 @@ static inline int hrtimer_enqueue_reprog
if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
if (wakeup) {
raw_spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_softirq_preempt_off(HRTIMER_SOFTIRQ);
raw_spin_lock(&base->cpu_base->lock);
} else
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ __raise_softirq_preempt_off(HRTIMER_SOFTIRQ);

return 1;
}
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -155,7 +155,7 @@ static inline void _local_bh_enable_ip(u
*/
sub_preempt_count(SOFTIRQ_OFFSET - 1);

- if (unlikely(!in_interrupt() && local_softirq_pending()))
+ if (unlikely(!in_interrupt() && *local_softirq_pending()))
do_softirq();

dec_preempt_count();
@@ -191,22 +191,22 @@ EXPORT_SYMBOL(local_bh_enable_ip);
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
- __u32 pending;
+ __u32 pending, *ppending;
int max_restart = MAX_SOFTIRQ_RESTART;
int cpu;

- pending = local_softirq_pending();
+ ppending = local_softirq_pending();
account_system_vtime(current);

__local_bh_disable((unsigned long)__builtin_return_address(0));
lockdep_softirq_enter();

cpu = smp_processor_id();
-restart:
- /* Reset the pending bitmask before enabling irqs */
- set_softirq_pending(0);

local_irq_enable();
+restart:
+ /* Reset the pending bitmask before enabling irqs */
+ pending = xchg(ppending, 0);

h = softirq_vec;

@@ -233,13 +233,12 @@ restart:
pending >>= 1;
} while (pending);

- local_irq_disable();
-
- pending = local_softirq_pending();
- if (pending && --max_restart)
+ if (*ppending && --max_restart)
goto restart;

- if (pending)
+ local_irq_disable();
+
+ if (*ppending)
wakeup_softirqd();

lockdep_softirq_exit();
@@ -260,7 +259,7 @@ asmlinkage void do_softirq(void)

local_irq_save(flags);

- pending = local_softirq_pending();
+ pending = *local_softirq_pending();

if (pending)
__do_softirq();
@@ -299,7 +298,7 @@ void irq_exit(void)
account_system_vtime(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
- if (!in_interrupt() && local_softirq_pending())
+ if (!in_interrupt() && *local_softirq_pending())
invoke_softirq();

rcu_irq_exit();
@@ -312,11 +311,11 @@ void irq_exit(void)
}

/*
- * This function must run with irqs disabled!
+ * This function must run with preempt disabled!
*/
-inline void raise_softirq_irqoff(unsigned int nr)
+inline void raise_softirq_preempt_off(unsigned int nr)
{
- __raise_softirq_irqoff(nr);
+ __raise_softirq_preempt_off(nr);

/*
* If we're in an interrupt or softirq, we're done
@@ -333,11 +332,9 @@ inline void raise_softirq_irqoff(unsigne

void raise_softirq(unsigned int nr)
{
- unsigned long flags;
-
- local_irq_save(flags);
- raise_softirq_irqoff(nr);
- local_irq_restore(flags);
+ preempt_disable();
+ raise_softirq_preempt_off(nr);
+ preempt_enable();
}

void open_softirq(int nr, void (*action)(struct softirq_action *))
@@ -365,7 +362,7 @@ void __tasklet_schedule(struct tasklet_s
t->next = NULL;
*__get_cpu_var(tasklet_vec).tail = t;
__get_cpu_var(tasklet_vec).tail = &(t->next);
- raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ raise_softirq_preempt_off(TASKLET_SOFTIRQ);
local_irq_restore(flags);
}

@@ -379,7 +376,7 @@ void __tasklet_hi_schedule(struct taskle
t->next = NULL;
*__get_cpu_var(tasklet_hi_vec).tail = t;
__get_cpu_var(tasklet_hi_vec).tail = &(t->next);
- raise_softirq_irqoff(HI_SOFTIRQ);
+ raise_softirq_preempt_off(HI_SOFTIRQ);
local_irq_restore(flags);
}

@@ -391,7 +388,7 @@ void __tasklet_hi_schedule_first(struct

t->next = __get_cpu_var(tasklet_hi_vec).head;
__get_cpu_var(tasklet_hi_vec).head = t;
- __raise_softirq_irqoff(HI_SOFTIRQ);
+ __raise_softirq_preempt_off(HI_SOFTIRQ);
}

EXPORT_SYMBOL(__tasklet_hi_schedule_first);
@@ -426,7 +423,7 @@ static void tasklet_action(struct softir
t->next = NULL;
*__get_cpu_var(tasklet_vec).tail = t;
__get_cpu_var(tasklet_vec).tail = &(t->next);
- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ __raise_softirq_preempt_off(TASKLET_SOFTIRQ);
local_irq_enable();
}
}
@@ -461,7 +458,7 @@ static void tasklet_hi_action(struct sof
t->next = NULL;
*__get_cpu_var(tasklet_hi_vec).tail = t;
__get_cpu_var(tasklet_hi_vec).tail = &(t->next);
- __raise_softirq_irqoff(HI_SOFTIRQ);
+ __raise_softirq_preempt_off(HI_SOFTIRQ);
local_irq_enable();
}
}
@@ -561,7 +558,7 @@ static void __local_trigger(struct call_

/* Trigger the softirq only if the list was previously empty. */
if (head->next == &cp->list)
- raise_softirq_irqoff(softirq);
+ raise_softirq_preempt_off(softirq);
}

#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
@@ -659,7 +656,7 @@ static int __cpuinit remote_softirq_cpu_

local_head = &__get_cpu_var(softirq_work_list[i]);
list_splice_init(head, local_head);
- raise_softirq_irqoff(i);
+ raise_softirq_preempt_off(i);
}
local_irq_enable();
}
@@ -698,7 +695,7 @@ static int run_ksoftirqd(void * __bind_c

while (!kthread_should_stop()) {
preempt_disable();
- if (!local_softirq_pending()) {
+ if (!*local_softirq_pending()) {
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -706,7 +703,7 @@ static int run_ksoftirqd(void * __bind_c

__set_current_state(TASK_RUNNING);

- while (local_softirq_pending()) {
+ while (*local_softirq_pending()) {
/* Preempt disable stops cpu going offline.
If already offline, we'll be on wrong CPU:
don't process */
@@ -781,7 +778,7 @@ static void takeover_tasklets(unsigned i
per_cpu(tasklet_vec, cpu).head = NULL;
per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
}
- raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ raise_softirq_preempt_off(TASKLET_SOFTIRQ);

if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
*__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
@@ -789,7 +786,7 @@ static void takeover_tasklets(unsigned i
per_cpu(tasklet_hi_vec, cpu).head = NULL;
per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
}
- raise_softirq_irqoff(HI_SOFTIRQ);
+ raise_softirq_preempt_off(HI_SOFTIRQ);

local_irq_enable();
}
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -304,12 +304,12 @@ void tick_nohz_stop_sched_tick(int inidl
if (need_resched())
goto end;

- if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+ if (unlikely(*local_softirq_pending() && cpu_online(cpu))) {
static int ratelimit;

if (ratelimit < 10) {
printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
+ (unsigned int) *local_softirq_pending());
ratelimit++;
}
goto end;
@@ -453,7 +453,7 @@ void tick_nohz_stop_sched_tick(int inidl
tick_do_update_jiffies64(ktime_get());
cpumask_clear_cpu(cpu, nohz_cpu_mask);
}
- raise_softirq_irqoff(TIMER_SOFTIRQ);
+ raise_softirq_preempt_off(TIMER_SOFTIRQ);
out:
ts->next_jiffies = next_jiffies;
ts->last_jiffies = last_jiffies;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1564,7 +1564,7 @@ static inline void __netif_reschedule(st
q->next_sched = NULL;
*sd->output_queue_tailp = q;
sd->output_queue_tailp = &q->next_sched;
- raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ raise_softirq_preempt_off(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}

@@ -1585,7 +1585,7 @@ void dev_kfree_skb_irq(struct sk_buff *s
sd = &__get_cpu_var(softnet_data);
skb->next = sd->completion_queue;
sd->completion_queue = skb;
- raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ raise_softirq_preempt_off(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
}
@@ -2218,7 +2218,7 @@ static inline void ____napi_schedule(str
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ __raise_softirq_preempt_off(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS
@@ -2397,7 +2397,7 @@ static int rps_ipi_queued(struct softnet
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;

- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ __raise_softirq_preempt_off(NET_RX_SOFTIRQ);
return 1;
}
#endif /* CONFIG_RPS */
@@ -2506,7 +2506,7 @@ int netif_rx_ni(struct sk_buff *skb)

preempt_disable();
err = netif_rx(skb);
- if (local_softirq_pending())
+ if (*local_softirq_pending())
do_softirq();
preempt_enable();

@@ -3532,7 +3532,7 @@ out:

softnet_break:
sd->time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ __raise_softirq_preempt_off(NET_RX_SOFTIRQ);
goto out;
}

@@ -5670,7 +5670,7 @@ static int dev_cpu_callback(struct notif
oldsd->output_queue_tailp = &oldsd->output_queue;
}

- raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ raise_softirq_preempt_off(NET_TX_SOFTIRQ);
local_irq_enable();

/* Process offline CPU's input_pkt_queue */


2010-06-24 03:05:22

by Huang, Ying

Subject: [RFC 4/5] x86, Use NMI return notifier in MCE

Use the general NMI return notifier mechanism to replace the self
interrupt previously used in the MCE handler.

Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/include/asm/entry_arch.h | 4 --
arch/x86/include/asm/irq_vectors.h | 5 ---
arch/x86/kernel/cpu/mcheck/mce.c | 50 +++++--------------------------------
arch/x86/kernel/entry_64.S | 5 ---
arch/x86/kernel/irqinit.c | 3 --
5 files changed, 7 insertions(+), 60 deletions(-)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -61,8 +61,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMA
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif

-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
#endif
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -121,11 +121,6 @@
#define UV_BAU_MESSAGE 0xea

/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR 0xeb
-
-/*
* Self IPI vector for NMI return notifier
*/
#define NMI_RETURN_NOTIFIER_VECTOR 0xe9
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -480,60 +480,24 @@ static inline void mce_get_rip(struct mc
m->ip = mce_rdmsrl(rip_msr);
}

-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void __mce_report_event(struct nmi_return_notifier *nrn)
{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
mce_notify_irq();
mce_schedule_work();
- irq_exit();
}
-#endif
+
+static DEFINE_PER_CPU(struct nmi_return_notifier, mce_nrn) = {
+ .on_nmi_return = __mce_report_event,
+};

static void mce_report_event(struct pt_regs *regs)
{
if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
- mce_notify_irq();
- /*
- * Triggering the work queue here is just an insurance
- * policy in case the syscall exit notify handler
- * doesn't run soon enough or ends up running on the
- * wrong CPU (can happen when audit sleeps)
- */
- mce_schedule_work();
+ __mce_report_event(NULL);
return;
}

-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Trigger an self interrupt
- * through the APIC to instead do the notification
- * after interrupts are reenabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
+ nmi_return_notifier_schedule(&__get_cpu_var(mce_nrn));
}

DEFINE_PER_CPU(unsigned, mce_poll_count);
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1004,11 +1004,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt

-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
#ifdef CONFIG_X86_LOCAL_APIC
apicinterrupt NMI_RETURN_NOTIFIER_VECTOR \
nmi_return_notifier_interrupt smp_nmi_return_notifier_interrupt
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -209,9 +209,6 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_X86_MCE_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
#ifdef CONFIG_X86_LOCAL_APIC
alloc_intr_gate(NMI_RETURN_NOTIFIER_VECTOR, nmi_return_notifier_interrupt);
#endif

2010-06-24 03:05:20

by Huang, Ying

Subject: [RFC 2/5] NMI return notifier

Many kernel services cannot be used in an NMI handler, so the NMI
handler needs a mechanism to perform these operations in other
contexts such as IRQ and process context.

This patch implements such a mechanism based on soft_irq, in a similar
way to the user return notifier. A new soft_irq named
NMI_RETURN_NOTIFIER_SOFTIRQ is defined, and a lock-less singly linked
list holds the functions to be called from the soft_irq after the NMI
handler has returned.
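
As an illustration only (none of these names are part of the patch), an
NMI handler would use the new API roughly as follows, doing nothing in
NMI context besides queueing the per-CPU notifier:

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/nmi.h>

/* Runs later from the NMI_RETURN_NOTIFIER_SOFTIRQ action, where normal
 * kernel services are available again.
 */
static void example_on_nmi_return(struct nmi_return_notifier *nrn)
{
	pr_info("handling work deferred from the NMI handler\n");
}

static DEFINE_PER_CPU(struct nmi_return_notifier, example_nrn) = {
	.on_nmi_return = example_on_nmi_return,
};

/* NMI context: record the event in NMI-safe storage, then schedule. */
static void example_nmi_handler(void)
{
	nmi_return_notifier_schedule(&__get_cpu_var(example_nrn));
}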

Signed-off-by: Huang Ying <[email protected]>
---
include/linux/interrupt.h | 1
include/linux/nmi.h | 11 +++++
kernel/Makefile | 2 -
kernel/nmi.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/softirq.c | 2 +
5 files changed, 101 insertions(+), 1 deletion(-)
create mode 100644 kernel/nmi.c

--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -374,6 +374,7 @@ enum
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
+ NMI_RETURN_NOTIFIER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */

NR_SOFTIRQS
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,4 +47,15 @@ static inline bool trigger_all_cpu_backt
}
#endif

+struct nmi_return_notifier {
+ void (*on_nmi_return)(struct nmi_return_notifier *nrn);
+ void *data;
+ struct nmi_return_notifier *next;
+};
+
+void nmi_return_notifier_schedule(struct nmi_return_notifier *nrn);
+void fire_nmi_return_notifiers(void);
+struct softirq_action;
+void nmi_return_notifier_action(struct softirq_action *a);
+
#endif
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
- async.o range.o
+ async.o range.o nmi.o
obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
obj-y += groups.o

--- /dev/null
+++ b/kernel/nmi.c
@@ -0,0 +1,86 @@
+/*
+ * nmi.c
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+#include <linux/percpu.h>
+
+#define NMI_RETURN_NOTIFIER_TAIL ((struct nmi_return_notifier *)-1UL)
+
+static DEFINE_PER_CPU(struct nmi_return_notifier *, nmi_return_notifier_head) =
+ NMI_RETURN_NOTIFIER_TAIL;
+
+/*
+ * Some architectures can use this function to trigger the soft_irq, for
+ * example via self interrupt.
+ */
+void __weak arch_nmi_return_notifier_schedule(struct nmi_return_notifier *nrn)
+{
+}
+
+/*
+ * Schedule a notification after current CPU returns from NMI handler.
+ * Must be called in atomic context. The notifier will be called in
+ * IRQ or soft_irq context.
+ *
+ * This function is based on perf_pending_queue().
+ */
+void nmi_return_notifier_schedule(struct nmi_return_notifier *nrn)
+{
+ struct nmi_return_notifier **head;
+
+ if (cmpxchg(&nrn->next, NULL, NMI_RETURN_NOTIFIER_TAIL) != NULL)
+ return;
+
+ head = &get_cpu_var(nmi_return_notifier_head);
+
+ do {
+ nrn->next = *head;
+ } while (cmpxchg(head, nrn->next, nrn) != nrn->next);
+
+ raise_softirq_preempt_off(NMI_RETURN_NOTIFIER_SOFTIRQ);
+
+ arch_nmi_return_notifier_schedule(nrn);
+
+ put_cpu_var(nmi_return_notifier_head);
+}
+EXPORT_SYMBOL_GPL(nmi_return_notifier_schedule);
+
+void fire_nmi_return_notifiers(void)
+{
+ struct nmi_return_notifier *nrn, *list;
+
+ list = xchg(&__get_cpu_var(nmi_return_notifier_head),
+ NMI_RETURN_NOTIFIER_TAIL);
+ while (list != NMI_RETURN_NOTIFIER_TAIL) {
+ nrn = list;
+ list = list->next;
+ nrn->next = NULL;
+ nrn->on_nmi_return(nrn);
+ }
+}
+EXPORT_SYMBOL_GPL(fire_nmi_return_notifiers);
+
+void nmi_return_notifier_action(struct softirq_action *a)
+{
+ fire_nmi_return_notifiers();
+}
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,7 @@
#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/tick.h>
+#include <linux/nmi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -687,6 +688,7 @@ void __init softirq_init(void)

open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
+ open_softirq(NMI_RETURN_NOTIFIER_SOFTIRQ, nmi_return_notifier_action);
}

static int run_ksoftirqd(void * __bind_cpu)

2010-06-24 03:05:32

by Huang, Ying

Subject: [RFC 5/5] Use NMI return notifier in perf pending

Use the general NMI return notifier mechanism to replace the self
interrupt previously used for perf pending work.

Known issue: it is not clear how to handle the SPARC architecture.

Signed-off-by: Huang Ying <[email protected]>
---
arch/alpha/include/asm/perf_event.h | 1
arch/arm/include/asm/perf_event.h | 12 -----
arch/parisc/include/asm/perf_event.h | 1
arch/powerpc/kernel/time.c | 5 --
arch/s390/include/asm/perf_event.h | 3 -
arch/sh/include/asm/perf_event.h | 5 --
arch/sparc/include/asm/perf_event.h | 2
arch/x86/include/asm/entry_arch.h | 4 -
arch/x86/include/asm/hardirq.h | 1
arch/x86/include/asm/irq_vectors.h | 5 --
arch/x86/kernel/cpu/perf_event.c | 19 --------
arch/x86/kernel/entry_64.S | 5 --
arch/x86/kernel/irq.c | 5 --
arch/x86/kernel/irqinit.c | 6 --
include/linux/perf_event.h | 11 ----
kernel/perf_event.c | 81 +++++------------------------------
kernel/timer.c | 2
17 files changed, 15 insertions(+), 153 deletions(-)

--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -2,7 +2,6 @@
#define __ASM_ALPHA_PERF_EVENT_H

/* Alpha only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }

#define PERF_EVENT_INDEX_OFFSET 0

--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
#ifndef __ARM_PERF_EVENT_H__
#define __ARM_PERF_EVENT_H__

-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts enabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
/* ARM performance counters start from 1 (in the cp15 accesses) so use the
* same indexes here for consistency. */
#define PERF_EVENT_INDEX_OFFSET 1
--- a/arch/parisc/include/asm/perf_event.h
+++ b/arch/parisc/include/asm/perf_event.h
@@ -2,6 +2,5 @@
#define __ASM_PARISC_PERF_EVENT_H

/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }

#endif /* __ASM_PARISC_PERF_EVENT_H */
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -635,11 +635,6 @@ void timer_interrupt(struct pt_regs * re

calculate_steal_time();

- if (test_perf_event_pending()) {
- clear_perf_event_pending();
- perf_event_do_pending();
- }
-
#ifdef CONFIG_PPC_ISERIES
if (firmware_has_feature(FW_FEATURE_ISERIES))
get_lppaca()->int_dword.fields.decr_int = 0;
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -4,7 +4,4 @@
* Copyright 2009 Martin Schwidefsky, IBM Corporation.
*/

-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
-
#define PERF_EVENT_INDEX_OFFSET 0
--- a/arch/sh/include/asm/perf_event.h
+++ b/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,6 @@ extern int register_sh_pmu(struct sh_pmu
extern int reserve_pmc_hardware(void);
extern void release_pmc_hardware(void);

-static inline void set_perf_event_pending(void)
-{
- /* Nothing to see here, move along. */
-}
-
#define PERF_EVENT_INDEX_OFFSET 0

#endif /* __ASM_SH_PERF_EVENT_H */
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -1,8 +1,6 @@
#ifndef __ASM_SPARC_PERF_EVENT_H
#define __ASM_SPARC_PERF_EVENT_H

-extern void set_perf_event_pending(void);
-
#define PERF_EVENT_INDEX_OFFSET 0

#ifdef CONFIG_PERF_EVENTS
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,10 +49,6 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)

-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
-#endif
-
#ifdef CONFIG_X86_THERMAL_VECTOR
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -15,7 +15,6 @@ typedef struct {
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
- unsigned int apic_pending_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -113,11 +113,6 @@
*/
#define X86_PLATFORM_IPI_VECTOR 0xed

-/*
- * Performance monitoring pending work vector:
- */
-#define LOCAL_PENDING_VECTOR 0xec
-
#define UV_BAU_MESSAGE 0xea

/*
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1160,25 +1160,6 @@ static int x86_pmu_handle_irq(struct pt_
return handled;
}

-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_event_do_pending();
- irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
void perf_events_lapic_init(void)
{
if (!x86_pmu.apic || !x86_pmu_initialized())
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1023,11 +1023,6 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt

-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
-#endif
-
/*
* Exception entry points.
*/
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -73,10 +73,6 @@ static int show_other_interrupts(struct
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
- seq_printf(p, "%*s: ", prec, "PND");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
- seq_printf(p, " Performance pending work\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -192,7 +188,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_nmi_return_notifier_irqs;
sum += irq_stats(cpu)->apic_perf_irqs;
- sum += irq_stats(cpu)->apic_pending_irqs;
#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -223,12 +223,6 @@ static void __init apic_intr_init(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
-# endif
-
#endif
}

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -484,6 +484,7 @@ struct perf_guest_info_callbacks {
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
+#include <linux/nmi.h>
#include <asm/atomic.h>
#include <asm/local.h>

@@ -608,11 +609,6 @@ struct perf_mmap_data {
void *data_pages[0];
};

-struct perf_pending_entry {
- struct perf_pending_entry *next;
- void (*func)(struct perf_pending_entry *);
-};
-
struct perf_sample_data;

typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -719,7 +715,7 @@ struct perf_event {
int pending_wakeup;
int pending_kill;
int pending_disable;
- struct perf_pending_entry pending;
+ struct nmi_return_notifier pending;

atomic_t event_limit;

@@ -831,8 +827,6 @@ extern void perf_event_task_tick(struct
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
extern void perf_event_print_debug(void);
extern void __perf_disable(void);
extern bool __perf_enable(void);
@@ -1031,7 +1025,6 @@ perf_event_task_tick(struct task_struct
static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
-static inline void perf_event_do_pending(void) { }
static inline void perf_event_print_debug(void) { }
static inline void perf_disable(void) { }
static inline void perf_enable(void) { }
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2829,15 +2829,17 @@ void perf_event_wakeup(struct perf_event
*
* Handle the case where we need to wakeup up from NMI (or rq->lock) context.
*
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
+ * The NMI bit means we cannot possibly take locks. Therefore, use
+ * nmi_return_notifier.
*/

-static void perf_pending_event(struct perf_pending_entry *entry)
+static void perf_pending_event(struct nmi_return_notifier *nrn)
{
- struct perf_event *event = container_of(entry,
+ struct perf_event *event = container_of(nrn,
struct perf_event, pending);

+ nrn->data = NULL;
+
if (event->pending_disable) {
event->pending_disable = 0;
__perf_event_disable(event);
@@ -2849,59 +2851,12 @@ static void perf_pending_event(struct pe
}
}

-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
- PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
- void (*func)(struct perf_pending_entry *))
+static void perf_pending_queue(struct nmi_return_notifier *nrn,
+ void (*func)(struct nmi_return_notifier *))
{
- struct perf_pending_entry **head;
-
- if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
- return;
-
- entry->func = func;
-
- head = &get_cpu_var(perf_pending_head);
-
- do {
- entry->next = *head;
- } while (cmpxchg(head, entry->next, entry) != entry->next);
-
- set_perf_event_pending();
-
- put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
- struct perf_pending_entry *list;
- int nr = 0;
-
- list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
- while (list != PENDING_TAIL) {
- void (*func)(struct perf_pending_entry *);
- struct perf_pending_entry *entry = list;
-
- list = list->next;
-
- func = entry->func;
- entry->next = NULL;
- /*
- * Ensure we observe the unqueue before we issue the wakeup,
- * so that we won't be waiting forever.
- * -- see perf_not_pending().
- */
- smp_wmb();
-
- func(entry);
- nr++;
- }
-
- return nr;
+ nrn->on_nmi_return = func;
+ nrn->data = nrn;
+ nmi_return_notifier_schedule(nrn);
}

static inline int perf_not_pending(struct perf_event *event)
@@ -2911,15 +2866,10 @@ static inline int perf_not_pending(struc
* need to wait.
*/
get_cpu();
- __perf_pending_run();
+ fire_nmi_return_notifiers();
put_cpu();

- /*
- * Ensure we see the proper queue state before going to sleep
- * so that we do not miss the wakeup. -- see perf_pending_handle()
- */
- smp_rmb();
- return event->pending.next == NULL;
+ return event->pending.data == NULL;
}

static void perf_pending_sync(struct perf_event *event)
@@ -2927,11 +2877,6 @@ static void perf_pending_sync(struct per
wait_event(event->waitq, perf_not_pending(event));
}

-void perf_event_do_pending(void)
-{
- __perf_pending_run();
-}
-
/*
* Callchain support -- arch specific
*/
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,6 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
-#include <linux/perf_event.h>
#include <linux/sched.h>
#include <linux/slab.h>

@@ -1264,7 +1263,6 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
printk_tick();
- perf_event_do_pending();
scheduler_tick();
run_posix_cpu_timers(p);
}

2010-06-24 03:05:53

by Huang, Ying

Subject: [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier

soft_irq is used to run the NMI return notifiers, but the delay between
the soft_irq being raised in the NMI handler and the soft_irq actually
running can be quite long. To solve this, a self-interrupt IPI is used
to trigger the soft_irq earlier.
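
The handler itself (excerpted from the traps.c hunk below, with
comments added) does no real work: its only purpose is that irq_exit()
sees the softirq raised by the NMI handler and runs it immediately,
instead of leaving it to wait for the next unrelated interrupt or
timer tick.

asmlinkage void smp_nmi_return_notifier_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	irq_enter();
	inc_irq_stat(apic_nmi_return_notifier_irqs);
	irq_exit();	/* pending softirqs, i.e. our notifiers, run here */
}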

Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/include/asm/hardirq.h | 1 +
arch/x86/include/asm/hw_irq.h | 1 +
arch/x86/include/asm/irq_vectors.h | 5 +++++
arch/x86/kernel/entry_64.S | 5 +++++
arch/x86/kernel/irq.c | 7 +++++++
arch/x86/kernel/irqinit.c | 3 +++
arch/x86/kernel/traps.c | 29 +++++++++++++++++++++++++++++
7 files changed, 51 insertions(+)

--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -11,6 +11,7 @@ typedef struct {
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
+ unsigned int apic_nmi_return_notifier_irqs;
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -35,6 +35,7 @@ extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
extern void mce_self_interrupt(void);
+extern void nmi_return_notifier_interrupt(void);

extern void invalidate_interrupt(void);
extern void invalidate_interrupt0(void);
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,11 @@
*/
#define MCE_SELF_VECTOR 0xeb

+/*
+ * Self IPI vector for NMI return notifier
+ */
+#define NMI_RETURN_NOTIFIER_VECTOR 0xe9
+
#define NR_VECTORS 256

#define FPU_IRQ 13
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1009,6 +1009,11 @@ apicinterrupt MCE_SELF_VECTOR \
mce_self_interrupt smp_mce_self_interrupt
#endif

+#ifdef CONFIG_X86_LOCAL_APIC
+apicinterrupt NMI_RETURN_NOTIFIER_VECTOR \
+ nmi_return_notifier_interrupt smp_nmi_return_notifier_interrupt
+#endif
+
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,6 +63,12 @@ static int show_other_interrupts(struct
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "NRN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_nmi_return_notifier_irqs);
+ seq_printf(p, " NMI return notifier interrupts\n");
+
seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
@@ -184,6 +190,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_nmi_return_notifier_irqs;
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_pending_irqs;
#endif
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -212,6 +212,9 @@ static void __init apic_intr_init(void)
#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ alloc_intr_gate(NMI_RETURN_NOTIFIER_VECTOR, nmi_return_notifier_interrupt);
+#endif

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -888,3 +888,32 @@ void __init trap_init(void)

x86_init.irqs.trap_init();
}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+asmlinkage void smp_nmi_return_notifier_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ irq_enter();
+ inc_irq_stat(apic_nmi_return_notifier_irqs);
+ irq_exit();
+}
+
+void arch_nmi_return_notifier_schedule(struct nmi_return_notifier *nrn)
+{
+ /* Without an APIC, hope the next soft_irq is not too late */
+ if (!cpu_has_apic)
+ return;
+
+ /*
+ * Use a self interrupt to trigger the soft_irq for NMI return
+ * notifiers
+ */
+ apic->send_IPI_self(NMI_RETURN_NOTIFIER_VECTOR);
+
+ /* Wait for idle afterward so that we don't leave the APIC in
+ * a non idle state because the normal APIC writes cannot
+ * exclude us.
+ */
+ apic_wait_icr_idle();
+}
+#endif

2010-06-24 06:00:32

by Peter Zijlstra

Subject: Re: [RFC 5/5] Use NMI return notifier in perf pending

On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> Use general NMI return notifier mechanism to replace the self
> interrupt used in perf pending.
>
> Known issue: don't know how to deal with SPARC architecture.

NAK, I want and need hardirq context for the callback.

2010-06-24 06:03:34

by Peter Zijlstra

Subject: Re: [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier

On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> soft_irq is used to run NMI return notifiers. But it may be quite long
> from soft_irq is raised in NMI handler to soft_irq is run actually. To
> solve the issue, a self interrupt IPI is used to trigger the soft_irq
> earlier.

This all is just gross hackery...

wth is wrong with doing what I proposed earlier and using a
work_struct-like callback mechanism?

2010-06-24 06:09:44

by Peter Zijlstra

Subject: Re: [RFC 1/5] Make soft_irq NMI safe

On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> +#define __raise_softirq_preempt_off(nr) \
> + do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0)

So that is the reason for that insane local_softirq_pending()
definition?

Quite revolting.

2010-06-24 06:36:06

by Peter Zijlstra

Subject: [RFC][PATCH] irq_work


Something like this, but filled out with some arch code that does the
self-IPI and calls irq_work_run(), should do.

No need to molest the softirq code, no need for limited vectors of any
kind.

Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/irq_callback.h | 13 ++++++++
kernel/irq_callback.c | 66 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 79 insertions(+)

Index: linux-2.6/include/linux/irq_callback.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/irq_callback.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_IRQ_CALLBACK_H
+#define _LINUX_IRQ_CALLBACK_H
+
+struct irq_work {
+ struct irq_work *next;
+ void (*func)(struct irq_work *);
+};
+
+int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *));
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_CALLBACK_H */
Index: linux-2.6/kernel/irq_callback.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/irq_callback.c
@@ -0,0 +1,66 @@
+
+#include <linux/irq_callback.h>
+
+#define CALLBACK_TAIL ((struct irq_work *)-1UL)
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list) = {
+ CALLBACK_TAIL,
+};
+
+int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+ struct irq_work **head;
+
+ if (cmpxchg(&entry->next, NULL, CALLBACK_TAIL) != NULL)
+ return 0;
+
+ entry->func = func;
+
+ head = &get_cpu_var(irq_work_list);
+
+ do {
+ entry->next = *head;
+ } while (cmpxchg(head, entry->next, entry) != entry->next);
+
+ if (entry->next == CALLBACK_TAIL)
+ arch_self_ipi();
+
+ put_cpu_var(irq_work_list);
+ return 1;
+}
+
+void irq_work_run(void)
+{
+ struct irq_work *list;
+
+ list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
+ while (list != CALLBACK_TAIL) {
+ struct irq_work *entry = list;
+
+ list = list->next;
+ entry->func(entry);
+
+ entry->next = NULL;
+ /*
+ * matches the mb in cmpxchg() in irq_work_queue()
+ */
+ smp_wmb();
+ }
+}
+
+static int irq_work_pending(struct irq_work *entry)
+{
+ /*
+ * matches the wmb in irq_work_run
+ */
+ smp_rmb();
+ return entry->next != NULL;
+}
+
+void irq_work_sync(struct irq_work *entry)
+{
+ WARN_ON_ONCE(irqs_disabled());
+
+ while (irq_work_pending(entry))
+ cpu_relax();
+}
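
A rough sketch of the missing arch glue described above, for x86 with a
local APIC; the vector name, handler name and example callback are
assumptions for illustration, not part of the posted patch:

/* Self-IPI, so the queued work runs as soon as the NMI returns. */
void arch_self_ipi(void)
{
	if (cpu_has_apic)
		apic->send_IPI_self(IRQ_WORK_VECTOR);	/* assumed vector */
}

/* Handler for that vector: run the callbacks in hardirq context. */
asmlinkage void smp_irq_work_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	irq_enter();
	irq_work_run();
	irq_exit();
}

/* Usage from NMI context, e.g. a PMI or MCE handler (illustrative). */
static struct irq_work example_work;

static void example_func(struct irq_work *work)
{
	/* hardirq context: may take locks, wake up tasks, etc. */
}

static void example_nmi_handler(void)
{
	irq_work_queue(&example_work, example_func);
}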

2010-06-24 06:43:14

by Huang, Ying

Subject: Re: [RFC][PATCH] irq_work

Hi, Peter,

On Thu, 2010-06-24 at 14:35 +0800, Peter Zijlstra wrote:
> Something like this, but filled out with some arch code that does the
> self-ipi and calls irq_work_run() should do.
>
> No need to molest the softirq code, no need for limited vectors of any
> kind.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> include/linux/irq_callback.h | 13 ++++++++
> kernel/irq_callback.c | 66 +++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 79 insertions(+)
>
> Index: linux-2.6/include/linux/irq_callback.h
> ===================================================================
> --- /dev/null
> +++ linux-2.6/include/linux/irq_callback.h
> @@ -0,0 +1,13 @@
> +#ifndef _LINUX_IRQ_CALLBACK_H
> +#define _LINUX_IRQ_CALLBACK_H
> +
> +struct irq_work {
> + struct irq_work *next;
> + void (*func)(struct irq_work *);
> +};
> +
> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *));
> +void irq_work_run(void);
> +void irq_work_sync(struct irq_work *entry);
> +
> +#endif /* _LINUX_IRQ_CALLBACK_H */
> Index: linux-2.6/kernel/irq_callback.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/kernel/irq_callback.c
> @@ -0,0 +1,66 @@
> +
> +#include <linux/irq_callback.h>
> +
> +#define CALLBACK_TAIL ((struct irq_work *)-1UL)
> +
> +static DEFINE_PER_CPU(struct irq_work *, irq_work_list) = {
> + CALLBACK_TAIL,
> +};
> +
> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *))
> +{
> + struct irq_work **head;
> +
> + if (cmpxchg(&entry->next, NULL, CALLBACK_TAIL) != NULL)
> + return 0;
> +
> + entry->func = func;
> +
> + head = &get_cpu_var(irq_work_list);
> +
> + do {
> + entry->next = *head;
> + } while (cmpxchg(head, entry->next, entry) != entry->next);
> +
> + if (entry->next == CALLBACK_TAIL)
> + arch_self_ipi();
> +
> + put_cpu_var(irq_work_list);
> + return 1;
> +}
> +
> +void irq_work_run(void)
> +{
> + struct irq_work *list;
> +
> + list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> + while (list != CALLBACK_TAIL) {
> + struct irq_work *entry = list;
> +
> + list = list->next;
> + entry->func(entry);
> +
> + entry->next = NULL;
> + /*
> + * matches the mb in cmpxchg() in irq_work_queue()
> + */
> + smp_wmb();
> + }
> +}
> +
> +static int irq_work_pending(struct irq_work *entry)
> +{
> + /*
> + * matches the wmb in irq_work_run
> + */
> + smp_rmb();
> + return entry->next != NULL;
> +}
> +
> +void irq_work_sync(struct irq_work *entry)
> +{
> + WARN_ON_ONCE(irqs_disabled());
> +
> + while (irq_work_pending(entry))
> + cpu_relax();
> +}

In fact I use exactly the same method in my patches, just triggered
with soft_irq instead of IRQ. Please take a look at
nmi_return_notifier_schedule in

[RFC 2/5] NMI return notifier

Best Regards,
Huang Ying

2010-06-24 06:46:04

by Huang, Ying

Subject: Re: [RFC 1/5] Make soft_irq NMI safe

On Thu, 2010-06-24 at 14:09 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> > +#define __raise_softirq_preempt_off(nr) \
> > + do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0)
>
> So that is the reason for that insane local_softirq_pending()
> definition?

I need to change local_softirq_pending() for this and the xchg() in
__do_softirq().

Best Regards,
Huang Ying

2010-06-24 06:48:00

by Peter Zijlstra

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> Hi, Peter,

> I fact I uses exactly the similar method in my patches, just trigger it
> with soft_irq instead of IRQ. Please take a look at
> nmi_return_notifier_schedule in

But then why still use softirq? Once you have this it's completely
useless.

2010-06-24 06:50:46

by Huang, Ying

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:47 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> > Hi, Peter,
>
> > I fact I uses exactly the similar method in my patches, just trigger it
> > with soft_irq instead of IRQ. Please take a look at
> > nmi_return_notifier_schedule in
>
> But then why still use softirq? Once you have this its completely
> useless.

Some systems have no self interrupt, for example systems without an
APIC. We need to provide a fallback for them; soft_irq can help here.

Best Regards,
Huang Ying

2010-06-24 06:58:27

by Peter Zijlstra

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:50 +0800, Huang Ying wrote:
> On Thu, 2010-06-24 at 14:47 +0800, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> > > Hi, Peter,
> >
> > > I fact I uses exactly the similar method in my patches, just trigger it
> > > with soft_irq instead of IRQ. Please take a look at
> > > nmi_return_notifier_schedule in
> >
> > But then why still use softirq? Once you have this its completely
> > useless.
>
> Some systems have no self interrupt, for example the system without
> APIC. We need to provide a fallback for them. soft_irq can help here.

So there are systems that don't have self-IPI but do have NMI context?

Can't we run the callbacks from the tick or something for such legacy
muck? I really don't like the whole softirq mess.

2010-06-24 07:04:36

by Huang, Ying

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:58 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 14:50 +0800, Huang Ying wrote:
> > On Thu, 2010-06-24 at 14:47 +0800, Peter Zijlstra wrote:
> > > On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> > > > Hi, Peter,
> > >
> > > > I fact I uses exactly the similar method in my patches, just trigger it
> > > > with soft_irq instead of IRQ. Please take a look at
> > > > nmi_return_notifier_schedule in
> > >
> > > But then why still use softirq? Once you have this its completely
> > > useless.
> >
> > Some systems have no self interrupt, for example the system without
> > APIC. We need to provide a fallback for them. soft_irq can help here.
>
> So there's systems that don't have self-ipi but do have NMI context?

Yes. NMI has been there since the 8259 age.

> Can't we run the callbacks from the tick or something for such legacy
> muck? I really don't like the whole softirq mess.

That is possible, but on a NO_HZ system we have no tick to rely on.
soft_irq is better here, because it will be triggered by any interrupt.

Best Regards,
Huang Ying

2010-06-24 07:20:02

by Peter Zijlstra

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 15:04 +0800, Huang Ying wrote:

> Yes. NMI is there from 8259 age.

But do we really care about such systems?

> That is possible. But in NO_HZ system, we have no tick to rely on.

Of course you have, you can delay the NO_HZ state when there are pending
callbacks, that's all of 1 line.

> soft_irq is better here, because it will be triggered for any interrupt.

Well, you can do the callbacks from irq_exit() as well, that's no
problem.

2010-06-24 07:27:36

by Huang, Ying

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 15:19 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 15:04 +0800, Huang Ying wrote:
>
> > Yes. NMI is there from 8259 age.
>
> But do we really care about such systems?
>
> > That is possible. But in NO_HZ system, we have no tick to rely on.
>
> Of course you have, you can delay the NO_HZ state when there's pending
> callbacks, that's all of 1 line.
>
> > soft_irq is better here, because it will be triggered for any interrupt.
>
> Well, you can do the callbacks from irq_exit() as well, that's no
> problem.

I think it is not a good idea to add overhead in such a hot path if the
overhead can be avoided.

Best Regards,
Huang Ying

2010-06-24 07:32:14

by Peter Zijlstra

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 15:27 +0800, Huang Ying wrote:
> On Thu, 2010-06-24 at 15:19 +0800, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 15:04 +0800, Huang Ying wrote:
> >
> > > Yes. NMI is there from 8259 age.
> >
> > But do we really care about such systems?
> >
> > > That is possible. But in NO_HZ system, we have no tick to rely on.
> >
> > Of course you have, you can delay the NO_HZ state when there's pending
> > callbacks, that's all of 1 line.
> >
> > > soft_irq is better here, because it will be triggered for any interrupt.
> >
> > Well, you can do the callbacks from irq_exit() as well, that's no
> > problem.
>
> I think it is not a good idea to add overhead in such a hot path if the
> overhead can be avoided.

True, but I really don't like the softirq thing, and I really don't care
about !APIC machines, I probably couldn't buy one if I wanted to, and it's
not like we have good MCE support for them now, so who cares.

2010-06-24 10:00:38

by Andi Kleen

Subject: Re: [RFC 4/5] x86, Use NMI return notifier in MCE

Huang Ying <[email protected]> writes:

Hi Ying,

> {
> if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
> - mce_notify_irq();
> - /*
> - * Triggering the work queue here is just an insurance
> - * policy in case the syscall exit notify handler
> - * doesn't run soon enough or ends up running on the
> - * wrong CPU (can happen when audit sleeps)
> - */
> - mce_schedule_work();
> + __mce_report_event(NULL);

Do we still handle the CPU switch case correctly?

The backend handler needs to run on the same CPU to process the
per-CPU MCE pfns.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 10:27:28

by Andi Kleen

Subject: Re: [RFC][PATCH] irq_work

> True, but I really don't like the softirq thing, and I really don't care
> about !APIC machines, I probably couldn't buy one if I wanted to and its
> not like we have good MCE support for them now, so who cares.

In theory you can run a machine with good MCE support in non-APIC single
CPU mode. It wouldn't make much sense, but you could do it.

Anyway, I don't think we need a lot of effort to handle this case,
but it would be better not to explicitly break it either.

That's why the timer fallback in the original code was fine; this
basically never happens, and even if there is a 5s delay from tickless
that's fine.

-Andi
--
[email protected] -- Speaking for myself only.

2010-06-24 10:30:42

by Peter Zijlstra

Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 12:27 +0200, Andi Kleen wrote:
> > True, but I really don't like the softirq thing, and I really don't care
> > about !APIC machines, I probably couldn't buy one if I wanted to and its
> > not like we have good MCE support for them now, so who cares.
>
> In theory you can run a machine with good MCE support in non APIC single
> CPU mode. It wouldn't make much sense, but you could do it.
>
> Anyways, I don't think we need a lot of effort to handle this case,
> but it would be better to not explicitely break it either.
>
> That's why the timer fallback in the original code was fine, this
> basically never happens and even if there is a 5s delay from tickless
> that's fine.

Right, in that case I would very much prefer the simpler thing I
proposed over all this softirq stuff, we can have the tick process the
callbacks for really broken hardware (perf_events doesn't care since
without a lapic there's no pmi anyway).

2010-06-24 10:52:58

by Andi Kleen

Subject: Re: [RFC][PATCH] irq_work

> Right, in that case I would very much prefer the simpler thing I
> proposed over all this softirq stuff, we can have the tick process the
> callbacks for really broken hardware (perf_events doesn't care since
> without a lapic there's no pmi anyway).

Ying's approach will work I think.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 10:58:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 12:52 +0200, Andi Kleen wrote:
> > Right, in that case I would very much prefer the simpler thing I
> > proposed over all this softirq stuff, we can have the tick process the
> > callbacks for really broken hardware (perf_events doesn't care since
> > without a lapic there's no pmi anyway).
>
> Ying's approach will work I think.

Right, except that I really dislike it, it touches far too much code for
no particular reason.

And I really want hardirq context for perf callbacks, some code actually
relies on it (I used to have the fallback in the timer softirq and that
broke things at some point).

So I'm really opposed to all the softirq molestation as I see no reason
to do that at all.

2010-06-24 11:08:33

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

> And I really want hardirq context for perf callbacks, some code actually
> relies on it (I used to have the fallback in the timer softirq and that

Surely that could be fixed? *requiring* hard irq context sounds weird.

> broke thing at some point).

I have one case that needs to sleep (but only when interrupting user code).
The key thing in it really is to switch stacks back to the process.

-andi
--
[email protected] -- Speaking for myself only.

2010-06-24 11:11:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 13:08 +0200, Andi Kleen wrote:
> > And I really want hardirq context for perf callbacks, some code actually
> > relies on it (I used to have the fallback in the timer softirq and that
>
> Surely that could be fixed? *requiring* hard irq context sounds weird.

possibly, but there is no reason whatsoever to use softirq here.

> > broke thing at some point).
>
> I have one case that needs to sleep (but only when interrupting user code)
> They key thing in it really is to switch stacks back to process.

softirq can't sleep either, you need a trampoline anyway.

2010-06-24 11:20:44

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 01:10:52PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:08 +0200, Andi Kleen wrote:
> > > And I really want hardirq context for perf callbacks, some code actually
> > > relies on it (I used to have the fallback in the timer softirq and that
> >
> > Surely that could be fixed? *requiring* hard irq context sounds weird.
>
> possibly, but there is no reason what so ever to use softirq here.

Ok so going back to the original self-irq patchkit. Unfortunately the other
reviewer hated that. How to get out of that deadlock?


> > > broke thing at some point).
> >
> > I have one case that needs to sleep (but only when interrupting user code)
> > They key thing in it really is to switch stacks back to process.
>
> softirq can't sleep either, you need a trampoline anywa

Not true, when you interrupt ring 3 it can sleep. You just need to make
sure to run on the right stack and fix up any irq counters.

Anyway, this can be solved in a different way too; it would just fit
in there as well.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 11:23:51

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work


* Peter Zijlstra <[email protected]> wrote:

> On Thu, 2010-06-24 at 13:08 +0200, Andi Kleen wrote:
> > > And I really want hardirq context for perf callbacks, some code actually
> > > relies on it (I used to have the fallback in the timer softirq and that
> >
> > Surely that could be fixed? *requiring* hard irq context sounds weird.
>
> possibly, but there is no reason what so ever to use softirq here.
>
> > > broke thing at some point).
> >
> > I have one case that needs to sleep (but only when interrupting user code)
> > They key thing in it really is to switch stacks back to process.
>
> softirq can't sleep either, you need a trampoline anyway.

What might make sense is to offer two types of callbacks: one that is
immediate whenever an event triggers - and another that is sleepable and is
executed from process context.

Having an intermediate softirq level might be over-design indeed.

Thanks,

Ingo

2010-06-24 11:33:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > softirq can't sleep either, you need a trampoline anywa
>
> Not true, when you interrupt ring 3 it can sleep. You just need to make
> sure to run on the right stack and fix up any irq counters.

But that is not softirq. That would be something like what faults do,
but we don't have anything else that does that.

2010-06-24 11:35:00

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 13:23 +0200, Ingo Molnar wrote:

> What might make sense is to offer two types of callbacks: one that is
> immediate whenever an event triggers - and another that is sleepable and is
> executed from process context.

Trouble is waking that thread, you cannot wake tasks from NMI context,
so whatever you do, you'll end up with a trampoline.

You could of course offer that trampoline nicely packaged, but I'm not
sure that's really worth the effort.

2010-06-24 11:42:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> Ok so going back to the original self-irq patchkit. Unfortunately the other
> reviewer hated that. How to get out of that deadlock?

Well, I didn't like your original patch either.

What's wrong with going with the patch I posted today? (aside from me
getting the barriers slightly wrong and not doing the arch
implementation).

2010-06-24 11:55:28

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 01:33:24PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > > softirq can't sleep either, you need a trampoline anywa
> >
> > Not true, when you interrupt ring 3 it can sleep. You just need to make
> > sure to run on the right stack and fix up any irq counters.
>
> But that is not softirq. That would be something like what faults do,

Yes not a classical one, but can use the same infrastructure.

> but we don't have anything else that does that.

Actually we do: audit in syscalls and scheduling in interrupts and signals
all work this way. Probably more; at some point adding more code to this
path was very popular.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 11:57:37

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > but we don't have anything else that does that.
>
> Actually we do, audit in syscalls and scheduling in interrupts and signals
> all work this way. Probably more at some point adding more code to this
> path was very popular.

That's the return to user path, nothing to do with softirqs. Add a TIF
flag and call your function there.

2010-06-24 11:58:41

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 01:42:11PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > Ok so going back to the original self-irq patchkit. Unfortunately the other
> > reviewer hated that. How to get out of that deadlock?
>
> Well, I didn't like your original patch either.
>
> What's wrong with going with the patch I posted today? (aside from me
> getting the barriers slightly wrong and not doing the arch
> implementation).

Well it would need to work.

Also I personally didn't see the point of the irq items list, because
there's no good way to dynamically allocate entries in an NMI, so the window
would always be "fixed size" anyway and you could just as well use
per-CPU data.

That's why for simple self irq I preferred Ying's original patch.

-Andi
--
[email protected] -- Speaking for myself only.

2010-06-24 12:02:30

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 01:57:29PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > > but we don't have anything else that does that.
> >
> > Actually we do, audit in syscalls and scheduling in interrupts and signals
> > all work this way. Probably more at some point adding more code to this
> > path was very popular.
>
> That's the return to user path, nothing to do with softirqs. Add a TIF
> flag and call your function there.

It does that, but there are some cases where it's not enough.

-Andi
--
[email protected] -- Speaking for myself only.

2010-06-24 12:02:56

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 13:58 +0200, Andi Kleen wrote:
> On Thu, Jun 24, 2010 at 01:42:11PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > > Ok so going back to the original self-irq patchkit. Unfortunately the other
> > > reviewer hated that. How to get out of that deadlock?
> >
> > Well, I didn't like your original patch either.
> >
> > What's wrong with going with the patch I posted today? (aside from me
> > getting the barriers slightly wrong and not doing the arch
> > implementation).
>
> Well it would need to work.

Look at kernel/perf_event.c:perf_pending_queue()/__perf_pending_run()

> Also I personally didn't see the point of the irq items list because
> there's no good way to dynamically allocate it in a NMI, so the window
> would be always "fixed size" anyways and you could as well just use
> per cpu data.
>
> That's why for simple self irq I preferred Ying's original patch.

I already told you that I have an irq_work in every perf_event structure
(it's called perf_pending_entry); I cannot register an id for each
perf_event because:
1) there's potentially more than 32 of them
2) I'd need an id->perf_event map which is a waste of time



2010-06-24 12:18:14

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:02 +0200, Andi Kleen wrote:
> On Thu, Jun 24, 2010 at 01:57:29PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > > > but we don't have anything else that does that.
> > >
> > > Actually we do, audit in syscalls and scheduling in interrupts and signals
> > > all work this way. Probably more at some point adding more code to this
> > > path was very popular.
> >
> > That's the return to user path, nothing to do with softirqs. Add a TIF
> > flag and call your function there.
>
> It does that, but there are some cases where it's not enough.

care to expand on that?

2010-06-24 12:35:48

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work


* Peter Zijlstra <[email protected]> wrote:

> On Thu, 2010-06-24 at 13:23 +0200, Ingo Molnar wrote:
>
> > What might make sense is to offer two types of callbacks: one that is
> > immediate whenever an event triggers - and another that is sleepable and is
> > executed from process context.
>
> Trouble is waking that thread, you cannot wake tasks from NMI context,
> so whatever you do, you'll end up with a trampoline.
>
> You could of course offer that trampoline nicely packaged, but I'm not
> sure that's really worth the effort.

Right, so there's basically three clean solutions to the 'sleepable callback'
problem, in order of amount of state that needs to be passed to it:

- State-less (or idempotent) events/callbacks: use a hardirq callback to wake
up a well-known process context.

- If we want the task that generates an event to execute a sleeping callback:
use a TIF flag and state in the task itself to pass along the info.

- In the most generic case, if there's arbitrary target task and arbitrary
state that needs to be queued, then to achieve sleepable callbacks the
following solution can be used: the task allocates a perf ring-buffer and
uses a TIF flag to trigger consumption of it.

All memory allocation, wakeup, etc. is handled already by the regular perf
events and ring-buffer codepaths.

No special, open-coded trampolining needed - the ring-buffer is the trampoline
and the ring-buffer consumer can key off the events it receives. (and there
can be multiple consumers of the same event source so we can have in-task
kernel based action combined with a user-space daemon that get an event stream
as well.)

All of these solutions use the fact that perf events are a generic event
framework. If there's any missing details somewhere then fixes/enhancements
can be added - right now our in-kernel event consumers are simple. But the
design is sound.

And none of these solutions involves the incestuous low-level raping of
softirqs.

Thanks,

Ingo
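
A minimal sketch of the first, state-less option (a hard-IRQ callback that only wakes a well-known kernel thread, which then does the sleepable work in process context); the mce_recovery_* names are hypothetical and this is not code from the thread:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *mce_recovery_task;

/* hard-IRQ context (e.g. the self-IPI handler): just kick the thread */
static void mce_recovery_kick(void)
{
        if (mce_recovery_task)
                wake_up_process(mce_recovery_task);
}

/* process context: free to sleep, take mutexes, call hwpoison, ... */
static int mce_recovery_thread(void *unused)
{
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
                __set_current_state(TASK_RUNNING);
                /* drain the queued error records and run recovery here */
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

static int __init mce_recovery_init(void)
{
        mce_recovery_task = kthread_run(mce_recovery_thread, NULL, "mce_recover");
        return IS_ERR(mce_recovery_task) ? PTR_ERR(mce_recovery_task) : 0;
}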

2010-06-24 12:38:11

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 02:18:07PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 14:02 +0200, Andi Kleen wrote:
> > On Thu, Jun 24, 2010 at 01:57:29PM +0200, Peter Zijlstra wrote:
> > > On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > > > > but we don't have anything else that does that.
> > > >
> > > > Actually we do, audit in syscalls and scheduling in interrupts and signals
> > > > all work this way. Probably more at some point adding more code to this
> > > > path was very popular.
> > >
> > > That's the return to user path, nothing to do with softirqs. Add a TIF
> > > flag and call your function there.
> >
> > It does that, but there are some cases where it's not enough.
>
> care to expand on that?

This is for execution context error recovery.

TIF works for user space, but it's a bit ugly because it requires adding
more data to the task_struct, because CPUs can change. The sleepable
softirq would have avoided that (that's not a show stopper).

The other case was to recover from a *_user() error in the kernel.
I originally had some fancy code for preemptive kernels that
exploited the fact that you could sleep here

(it doesn't work for non-preemptive kernels unfortunately, because we can't
know whether locks are held and some *_user calls are expected to
never sleep).

But there were still ugly special cases for switching stacks,
and the sleepable softirqs could have avoided that.

Anyway, the latter is not fatal either, but it would have been
nice to solve that one.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 13:02:37

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 02:35:37PM +0200, Ingo Molnar wrote:
> All of these solutions use the fact that perf events are a generic event
> framework. If there's any missing details somewhere then fixes/enhancements
> can be added - right now our in-kernel event consumers are simple. But the
> design is sound.

One immediate problem that comes to mind with the proposal
is that if the event is of a type that cannot be dropped (e.g. an error
that needs to be handled) then a shared ring buffer cannot guarantee that
it won't be dropped.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 13:20:23

by Borislav Petkov

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

From: Andi Kleen <[email protected]>
Date: Thu, Jun 24, 2010 at 03:02:34PM +0200

> On Thu, Jun 24, 2010 at 02:35:37PM +0200, Ingo Molnar wrote:
> > All of these solutions use the fact that perf events are a generic event
> > framework. If there's any missing details somewhere then fixes/enhancements
> > can be added - right now our in-kernel event consumers are simple. But the
> > design is sound.
>
> One immediate problem that comes to mind with the proposal
> is that if the event is of a type that cannot be dropped (e.g. an error
> that needs to be handled) then a shared ring buffer cannot guarantee that.

If it's a critical error you do all the handling in the kernel and you
don't need task context at all, no? Can you give an example of such an
error?

--
Regards/Gruss,
Boris.


2010-06-24 13:33:26

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

> If its a critical error you do all the handling in the kernel and you

I assume you mean in MCE. And the answer is no.

MCE generally can only panic or log, everything else
needs other contexts.

> don't need task context at all, no?

Process context is needed for various recovery schemes, all
that need to sleep for example.

-Andi
--
[email protected] -- Speaking for myself only.

2010-06-24 13:43:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work


Andi,

* Andi Kleen <[email protected]> wrote:

> > If its a critical error you do all the handling in the kernel and you

Sidenote: could you please stop doing this special new email style of cutting
out the portion from your emails which shows whom you replied to?

Doing that:

- Shows disrespect towards the person you are replying to. If you feel the
content is worth quoting then you should pay respect to the person who
wrote that email as well, and quote his name.

- Makes the thread flow much harder to follow for me, as I don't see from the
mail text whom you replied to.

Thanks,

Ingo

2010-06-24 13:46:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work


* Andi Kleen <[email protected]> wrote:

> > If its a critical error you do all the handling in the kernel and you
>
> I assume you mean in MCE. And the answer is no.
>
> MCE generally can only panic or log, everything else
> needs other contexts.
>
> > don't need task context at all, no?
>
> Process context is needed for various recovery schemes, all
> that need to sleep for example.

Please, as Peter and Boris asked you already, quote a concrete, specific
example:

'Specific event X occurs, kernel wants/needs to do Y. This cannot be done
via the suggested method due to Z.'

Your generic arguments look wrong (to the extent they are specified), and it
makes it much easier and faster to address your points if you don't blur them
with vagueness.

Thanks,

Ingo

2010-06-24 14:01:46

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

> Please, as Peter and Boris asked you already, quote a concrete, specific
> example:

It was already in my answer to Peter.

>
> 'Specific event X occurs, kernel wants/needs to do Y. This cannot be done
> via the suggested method due to Z.'
>
> Your generic arguments look wrong (to the extent they are specified) and it
> makes it much easier and faster to address your points if you dont blur them
> by vagaries.

It's one of the fundamental properties of recoverable errors.

Error happens.
Machine check or NMI or other exception happens.
That exception runs on the exception stack.
The error is not fatal, but recoverable.
For example you want to kill a process or call hwpoison or do some other
recovery action. These generally have to sleep to do anything
interesting.
You cannot do the sleeping on the exception stack, so you push it to
another context.

Now just because an error is recoverable doesn't mean it's not critical
(I think that was the mistake Boris made). If you don't do something
(like killing or recovery) you could end up in a loop or consume
corrupted data or something else bad.

So the error has to have a fail-safe path from detection to handling.

That's quite different from logging or performance counting etc.
where dropping events on overload is normal and expected.

Normally this can only be done by using dedicated resources.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-24 15:39:56

by Borislav Petkov

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

From: Andi Kleen <[email protected]>
Date: Thu, Jun 24, 2010 at 10:01:43AM -0400

> > Please, as Peter and Boris asked you already, quote a concrete, specific
> > example:
>
> It was already in my answer to Peter.
>
> >
> > 'Specific event X occurs, kernel wants/needs to do Y. This cannot be done
> > via the suggested method due to Z.'
> >
> > Your generic arguments look wrong (to the extent they are specified) and it
> > makes it much easier and faster to address your points if you dont blur them
> > by vagaries.
>
> It's one of the fundamental properties of recoverable errors.
>
> Error happens.
> Machine check or NMI or other exception happens.
> That exception runs on the exception stack
> The error is not fatal, but recoverable.
> For example you want to kill a process or call hwpoison or do some other
> recovery action. These generally have to sleep to do anything
> interesting.
> You cannot do the sleeping on the exception stack, so you push it to
> another context.
>
> Now just because an error is recoverable doesn't mean it's not critical
> (I think that was the mistake Boris made).

It wasn't a mistake - I was simply trying to lure you into giving a more
concrete example so that we all land on the same page and we know what
the heck you/we/all are talking about.

> If you don't do something
> (like killing or recovery) you could end up in a loop or consume
> corrupted data or something else bad.
>
> So the error has to have a fail safe path from detection to handling.

So we are talking about a more involved and "could-sleep" error
recovery.

> That's quite different from logging or performance counting etc.
> where dropping events on overload is normal and expected.

So I went back and reread the whole thread, and correct me if I'm
wrong, but the whole run-softirq-after-NMI idea has one use case for now -
"could-sleep" error handling for MCEs _only_ on x86. So you're changing
a bunch of generic and x86 kernel code just for error handling. Hmm,
that's kind of a big hammer in my book.

A slimmer solution is a much better way to go, IMHO. I think Peter said
something about irq_exit(), which should be just fine.

But AFAICT an arch-specific solution would be even better, e.g.
if you call into your deferred-work helper from paranoid_exit in
<arch/x86/kernel/entry_64.S>. I.e., something like

#ifdef CONFIG_X86_MCE
        testl $_TIF_NEED_POST_NMI,%ebx
        jnz do_post_nmi_work
#endif

Or, even slimmer, rewrite the paranoidzeroentry to an MCE-specific variant
which does the added functionality. But that wouldn't be extensible if
other entities want post-NMI work later.

--
Regards/Gruss,
Boris.


2010-06-24 16:09:44

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, Jun 24, 2010 at 05:41:24PM +0200, Borislav Petkov wrote:
> > If you don't do something
> > (like killing or recovery) you could end up in a loop or consume
> > corrupted data or something else bad.
> >
> > So the error has to have a fail safe path from detection to handling.
>
> So we are talking about a more involved and "could-sleep" error
> recovery.

That's one case, there are other too.

>
> > That's quite different from logging or performance counting etc.
> > where dropping events on overload is normal and expected.
>
> So I went back and reread the whole thread, and correct me if I'm
> wrong but the whole run softirq after NMI has one use case for now -
> "could-sleep" error handling for MCEs _only_ on x86. So you're changing

Nope, there are multiple use cases. Today it's background MCE
and possibly perf, if it ever decides to share code
with the rest of the kernel instead of wanting to be the Bork of Linux.
Future ones would be more MCE errors and also non-MCE errors like NMIs.

> a bunch of generic and x86 kernel code just for error handling. Hmm,
> that's a kinda big hammer in my book.

Actually no, it would just make the current code slightly cleaner
and somewhat more general. But for most cases it works without it.
>
> A slimmer solution is a much better way to go, IMHO. I think Peter said
> something about irq_exit(), which should be just fine.

The "slimmer solution" is there, but it has some limitations.
I merely said that softirqs would be useful for solving these limitations
(but are not strictly needed)

Anyways slimmer solution was even originally proposed,
just some of the earlier review proposed softirqs instead.
So Ying posts softirqs and then he gets now flamed for posting
softirqs. Overall there wasn't much consistency in the suggestions,
three different reviewers suggested three incompatible approaches.

Anyways if there are no softirqs that's fine too, the error
handler can probably live with not having that.

> But AFAICT an arch-specific solution would be even better, e.g.
> if you call into your deferred work helper from paranoid_exit in
> <arch/x86/kernel/entry_64.S>. I.e, something like

Yes, that helps for part of the error handling (in fact this
has been implemented), but that does not solve the self-interrupt
problem, which requires delaying until the next cli.

-Andi
--
[email protected] -- Speaking for myself only.

2010-06-25 02:12:47

by Huang, Ying

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:35 +0800, Peter Zijlstra wrote:
> Something like this, but filled out with some arch code that does the
> self-ipi and calls irq_work_run() should do.
>
> No need to molest the softirq code, no need for limited vectors of any
> kind.

Now, as far as my understanding goes, a hard-IRQ-based solution is
acceptable to everyone.

Ingo and Andi,

Do you agree?

> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> include/linux/irq_callback.h | 13 ++++++++
> kernel/irq_callback.c | 66 +++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 79 insertions(+)
>
> Index: linux-2.6/include/linux/irq_callback.h
> ===================================================================
> --- /dev/null
> +++ linux-2.6/include/linux/irq_callback.h
> @@ -0,0 +1,13 @@
> +#ifndef _LINUX_IRQ_CALLBACK_H
> +#define _LINUX_IRQ_CALLBACK_H
> +
> +struct irq_work {
> + struct irq_work *next;
> + void (*func)(struct irq_work *);
> +};

It would be better to add a "void *data" field in this struct, to allow the
same function to be used for multiple struct irq_work instances.

And I think IRQ is an implementation detail here, so irq_work is
probably not a good name. Would nmi_return_notifier or nmi_callback be better?

> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *));
> +void irq_work_run(void);
> +void irq_work_sync(struct irq_work *entry);
> +
> +#endif /* _LINUX_IRQ_CALLBACK_H */
> Index: linux-2.6/kernel/irq_callback.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/kernel/irq_callback.c
> @@ -0,0 +1,66 @@
> +
> +#include <linux/irq_callback.h>
> +
> +#define CALLBACK_TAIL ((struct irq_work *)-1UL)
> +
> +static DEFINE_PER_CPU(struct irq_work *, irq_work_list) = {
> + CALLBACK_TAIL,
> +};
> +
> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *))
> +{
> + struct irq_work **head;
> +
> + if (cmpxchg(&entry->next, NULL, CALLBACK_TAIL) != NULL)
> + return 0;
> +
> + entry->func = func;
> +
> + head = &get_cpu_var(irq_work_list);
> +
> + do {
> + entry->next = *head;
> + } while (cmpxchg(head, entry->next, entry) != entry->next);
> +
> + if (entry->next == CALLBACK_TAIL)
> + arch_self_ipi();
> +
> + put_cpu_var(irq_work_list);
> + return 1;
> +}
> +
> +void irq_work_run(void)
> +{
> + struct irq_work *list;
> +
> + list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> + while (list != CALLBACK_TAIL) {
> + struct irq_work *entry = list;
> +
> + list = list->next;
> + entry->func(entry);
> +
> + entry->next = NULL;

entry->next = NULL should be put before entry->func(entry), so that we
will not lose a notification from an NMI. And maybe check irq_work_list
several times to make sure nothing is lost.
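
A sketch of the reordering being suggested here, based on the irq_work_run() quoted above (barriers elided, names unchanged):

void irq_work_run(void)
{
        struct irq_work *list;

        list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
        while (list != CALLBACK_TAIL) {
                struct irq_work *entry = list;

                list = list->next;
                /* release the entry first: an NMI during func() can re-queue it */
                entry->next = NULL;
                entry->func(entry);
        }
}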

> + /*
> + * matches the mb in cmpxchg() in irq_work_queue()
> + */
> + smp_wmb();
> + }
> +}

I don't know why we need smp_wmb() here and smp_rmb() in
irq_work_pending(). The smp_<x>mb() in the original perf_pending_xxx code is
not necessary either, because smp_<x>mb() is already invoked in
wake_up_process() and __wait_event().

> +static int irq_work_pending(struct irq_work *entry)
> +{
> + /*
> + * matches the wmb in irq_work_run
> + */
> + smp_rmb();
> + return entry->next != NULL;
> +}
> +
> +void irq_work_sync(struct irq_work *entry)
> +{
> + WARN_ON_ONCE(irqs_disabled());
> +
> + while (irq_work_pending(entry))
> + cpu_relax();
> +}

If we move entry->next = NULL earlier in irq_work_run(), we need another
flag here to signify that entry->func is running.

Best Regards,
Huang Ying

2010-06-25 08:17:01

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
>
> It is better to add "void *data" field in this struct to allow same
> function can be used for multiple struct irq_work.

No, simply do:

struct my_foo {
        struct irq_work work;
        /* my extra data */
};

void my_func(struct irq_work *work)
{
        struct my_foo *foo = container_of(work, struct my_foo, work);

        /* tada! */
}
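
As a usage sketch of that embedding (hypothetical names, assuming the two-argument irq_work_queue() from the patch quoted earlier in this thread):

#include <linux/irq_callback.h>
#include <linux/percpu.h>

struct mce_work {
        struct irq_work work;
        unsigned long pfn;              /* example payload */
};

static DEFINE_PER_CPU(struct mce_work, mce_work);

static void mce_work_func(struct irq_work *work)
{
        struct mce_work *mw = container_of(work, struct mce_work, work);

        /* hard-IRQ context: hand mw->pfn to the recovery machinery */
}

/* called from the NMI/MCE handler */
static void mce_report(unsigned long pfn)
{
        struct mce_work *mw = &__get_cpu_var(mce_work);

        mw->pfn = pfn;
        irq_work_queue(&mw->work, mce_work_func);
}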


> And I think IRQ is the implementation detail here, so irq_work is
> probably not a good name. nmi_return_notifier or nmi_callback is better?

Well, it's run in hard-irq context, so it's an irq work. There's nothing
that says it can only be used from NMI context.

> > +void irq_work_run(void)
> > +{
> > + struct irq_work *list;
> > +
> > + list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> > + while (list != CALLBACK_TAIL) {
> > + struct irq_work *entry = list;
> > +
> > + list = list->next;
> > + entry->func(entry);
> > +
> > + entry->next = NULL;
>
> entry->next = NULL should be put before entry->func(entry), so that we
> will not lose a notification from NMI. And maybe check irq_work_list for
> several times to make sure nothing is lost.

But then _sync() will return before it's done executing.

I think clearing after the function is done executing is the only sane
semantics (and yes, I should fix the current perf code).

You can always miss an NMI since it can always happen before the
callback gets done, and allowing another enqueue before the callback is
complete is asking for trouble.

> > + /*
> > + * matches the mb in cmpxchg() in irq_work_queue()
> > + */
> > + smp_wmb();
> > + }
> > +}
>
> I don't know why we need smp_wmb() here and smp_rmb() in
> irq_work_pending(). The smp_<x>mb() in original perf_pending_xxx code is
> not necessary too. Because smp_<x>mb is invoked in wake_up_process() and
> __wait_event() already.

The smp_wmb() wants to be before ->next = NULL; so that all writes are
completed before we release the entry. To that same effect _sync() and
_queue need the (r)mb.

2010-06-25 09:08:09

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Fri, Jun 25, 2010 at 10:12:43AM +0800, Huang Ying wrote:
> On Thu, 2010-06-24 at 14:35 +0800, Peter Zijlstra wrote:
> > Something like this, but filled out with some arch code that does the
> > self-ipi and calls irq_work_run() should do.
> >
> > No need to molest the softirq code, no need for limited vectors of any
> > kind.
>
> Now, as far as my understanding goes, hard IRQ based solution is
> acceptable for everyone.
>
> Ingo and Andi,
>
> Do you agree?

Yes that's fine for me. The error handling issues can be solved in other
ways too.

-Andi

2010-06-25 09:17:23

by Huang, Ying

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> >
> > It is better to add "void *data" field in this struct to allow same
> > function can be used for multiple struct irq_work.
>
> No, simply do:
>
> struct my_foo {
> struct irq_work work;
> /* my extra data */
> }
>
> void my_func(struct irq_work *work)
> {
> struct my_foo *foo = container_of(work, struct my_foo, work);
>
> /* tada! */
> }

Yes, this works too. But adding a "void *data" field is helpful if you do
not embed struct irq_work into another struct.

> > And I think IRQ is the implementation detail here, so irq_work is
> > probably not a good name. nmi_return_notifier or nmi_callback is better?
>
> Well, its ran in hard-irq context, so its an irq work. There's nothing
> that says it can only be used from NMI context.

It may be run in other contexts on some systems (without an APIC). And I
don't think it is useful to anything except the NMI handler. I think this is a
choice between naming after the implementation and naming after the purpose.

> > > +void irq_work_run(void)
> > > +{
> > > + struct irq_work *list;
> > > +
> > > + list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> > > + while (list != CALLBACK_TAIL) {
> > > + struct irq_work *entry = list;
> > > +
> > > + list = list->next;
> > > + entry->func(entry);
> > > +
> > > + entry->next = NULL;
> >
> > entry->next = NULL should be put before entry->func(entry), so that we
> > will not lose a notification from NMI. And maybe check irq_work_list for
> > several times to make sure nothing is lost.
>
> But then _sync() will return before its done executing.

We can use another flag to signify whether it is executing, for example
bit 0 of entry->next.
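
A rough sketch of such a flag encoding (the IRQ_WORK_RUNNING name is hypothetical, not from the patch; it builds on the struct irq_work quoted above):

#include <linux/irq_callback.h>
#include <linux/types.h>

#define IRQ_WORK_RUNNING        1UL

/* mark the entry as executing while its list linkage is already cleared */
static void irq_work_set_running(struct irq_work *entry)
{
        entry->next = (struct irq_work *)IRQ_WORK_RUNNING;
}

/* _sync() could spin on this instead of on ->next != NULL */
static bool irq_work_is_running(struct irq_work *entry)
{
        return (unsigned long)entry->next & IRQ_WORK_RUNNING;
}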

> I think clearing after the function is done executing is the only sane
> semantics (and yes, I should fix the current perf code).
>
> You can always miss an NMI since it can always happen before the
> callback gets done, and allowing another enqueue before the callback is
> complete is asking for trouble.

If we move entry->next = NULL before entry->func(entry), we will not
miss the NMI. Can you show how to miss it in this way?

> > > + /*
> > > + * matches the mb in cmpxchg() in irq_work_queue()
> > > + */
> > > + smp_wmb();
> > > + }
> > > +}
> >
> > I don't know why we need smp_wmb() here and smp_rmb() in
> > irq_work_pending(). The smp_<x>mb() in original perf_pending_xxx code is
> > not necessary too. Because smp_<x>mb is invoked in wake_up_process() and
> > __wait_event() already.
>
> The smp_wmb() wants to be before ->next = NULL; so that all writes are
> completed before we release the entry. To that same effect _sync() and
> _queue need the (r)mb.

Put that way, it is reasonable.

Best Regards,
Huang Ying

2010-06-25 09:23:36

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

2010/6/25 Huang Ying <[email protected]>:
> On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
>> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
>> >
>> > It is better to add "void *data" field in this struct to allow same
>> > function can be used for multiple struct irq_work.
>>
>> No, simply do:
>>
>> struct my_foo {
>> ? struct irq_work work;
>> ? /* my extra data */
>> }
>>
>> void my_func(struct irq_work *work)
>> {
>> ? struct my_foo *foo = container_of(work, struct my_foo, work);
>>
>> ? /* tada! */
>> }
>
> Yes. This works too. But Adding "void *data" field is helpful if you do
> not embed struct irq_work into another struct.



That's what makes most sense. If you can use work->data to point to foo, then
you can also do the opposite. Now the best is to pick the choice that
gives you a real type and type checking, and not an error-prone and
obfuscated void *.

This is the way things are done in the kernel: struct work_struct, struct list_head,
struct rcu_head, etc. are all embedded into a container, so that we can
use container_of.

2010-06-25 09:30:36

by Huang, Ying

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Fri, 2010-06-25 at 17:23 +0800, Frederic Weisbecker wrote:
> 2010/6/25 Huang Ying <[email protected]>:
> > On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
> >> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> >> >
> >> > It is better to add "void *data" field in this struct to allow same
> >> > function can be used for multiple struct irq_work.
> >>
> >> No, simply do:
> >>
> >> struct my_foo {
> >> struct irq_work work;
> >> /* my extra data */
> >> }
> >>
> >> void my_func(struct irq_work *work)
> >> {
> >> struct my_foo *foo = container_of(work, struct my_foo, work);
> >>
> >> /* tada! */
> >> }
> >
> > Yes. This works too. But Adding "void *data" field is helpful if you do
> > not embed struct irq_work into another struct.
>
>
> That's what makes most sense. If you use work->data to put foo, then
> you can also do the opposite. Now the best is to pick the choice that
> gives you a real type and a typechecking, and not an error-prone and
> obfuscated void *
>
> This is the way things are made in the kernel. struct work_struct, struct list,
> struct rcu_head, etc... are all embedded into a container, so that we can
> use container_of.

container_of doesn't have full type checking either.

Best Regards,
Huang Ying

2010-06-25 09:31:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Fri, 2010-06-25 at 17:17 +0800, Huang Ying wrote:
> On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
> > On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> > >
> > > It is better to add "void *data" field in this struct to allow same
> > > function can be used for multiple struct irq_work.
> >
> > No, simply do:
> >
> > struct my_foo {
> > struct irq_work work;
> > /* my extra data */
> > }
> >
> > void my_func(struct irq_work *work)
> > {
> > struct my_foo *foo = container_of(work, struct my_foo, work);
> >
> > /* tada! */
> > }
>
> Yes. This works too. But Adding "void *data" field is helpful if you do
> not embed struct irq_work into another struct.

No, embedding is the normal way we do this in the kernel.

> > > And I think IRQ is the implementation detail here, so irq_work is
> > > probably not a good name. nmi_return_notifier or nmi_callback is better?
> >
> > Well, its ran in hard-irq context, so its an irq work. There's nothing
> > that says it can only be used from NMI context.
>
> It may be run in other contexts on some system (without APIC).

I would consider that a BUG. Use a random IRQ source to process the
callbacks on these broken platforms.

> And I
> don't think it is useful to others except NMI handler. I think this is a
> choice between naming after implementation and purpose.

There is, although I'm sure people will yell at me for even proposing
this: you can raise the IPI from an IRQ-disabled section to get
something done right after it.
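
For example (a hedged sketch; my_work, my_func and some_critical_path are made-up names, using the two-argument irq_work_queue() from the patch quoted earlier):

#include <linux/irq_callback.h>
#include <linux/irqflags.h>

static struct irq_work my_work;

static void my_func(struct irq_work *work)
{
        /* runs in hard-IRQ context from the self-IPI */
}

static void some_critical_path(void)
{
        unsigned long flags;

        local_irq_save(flags);
        /* ... work that must not be interrupted ... */
        irq_work_queue(&my_work, my_func);      /* raises the self-IPI */
        local_irq_restore(flags);               /* my_func() runs right after this */
}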

> We can use another flag to signify whether it is executing. For example
> the bit 0 of entry->next.

There's no point.

> > I think clearing after the function is done executing is the only sane
> > semantics (and yes, I should fix the current perf code).
> >
> > You can always miss an NMI since it can always happen before the
> > callback gets done, and allowing another enqueue before the callback is
> > complete is asking for trouble.
>
> If we move entry->next = NULL before entry->func(entry), we will not
> miss the NMI. Can you show how to miss it in this way?

<NMI>
...
irq_work_queue(&my_work, func);
...
<EOI>
<IPI>
irq_work_run()

<NMI>
irq_work_queue(&my_work, func); <FAIL>
<EOI>

my_func.next = NULL;
<EOI>

Really not that hard. Now imagine wrapping irq_work in some state and
reusing that state while the function is still running...

2010-06-25 09:44:49

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

2010/6/25 Huang Ying <[email protected]>:
> On Fri, 2010-06-25 at 17:23 +0800, Frederic Weisbecker wrote:
>> 2010/6/25 Huang Ying <[email protected]>:
>> > On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
>> >> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
>> >> >
>> >> > It is better to add "void *data" field in this struct to allow same
>> >> > function can be used for multiple struct irq_work.
>> >>
>> >> No, simply do:
>> >>
>> >> struct my_foo {
>> >> ? struct irq_work work;
>> >> ? /* my extra data */
>> >> }
>> >>
>> >> void my_func(struct irq_work *work)
>> >> {
>> >> ? struct my_foo *foo = container_of(work, struct my_foo, work);
>> >>
>> >> ? /* tada! */
>> >> }
>> >
>> > Yes. This works too. But Adding "void *data" field is helpful if you do
>> > not embed struct irq_work into another struct.
>>
>>
>> That's what makes most sense. If you use work->data to put foo, then
>> you can also do the opposite. Now the best is to pick the choice that
>> gives you a real type and a typechecking, and not an error-prone and
>> obfuscated void *
>>
>> This is the way things are made in the kernel. struct work_struct, struct list,
>> struct rcu_head, etc... are all embedded into a container, so that we can
>> use container_of.
>
> container_of has no full type checking too.


You're right. There is nothing that guarantees B is contained in A;
I mean the code is supposed
to provide this guarantee, but not the type.

That said, it's much more proper than playing with a void *data, besides the
fact that kernel developers
will quickly understand what you are doing if you use such a scheme, as
they are used to it.

2010-06-25 10:39:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Thu, 2010-06-24 at 14:38 +0200, Andi Kleen wrote:
> The sleepable
> soft irq would have avoided that (that's not a show stopper)

I'm still not convinced sleepable softirq is a workable thing.

Softirqs:
A) are non-preemptible
B) are per-cpu because of A
C) can be run from ksoftirqd context
D) generic kernel infrastructure with identical semantics on all archs

If you were to make something like a sleepable softirq, you'd lose A
(by definition), B (sleepable implies migratable to cpus_allowed) and
possibly D (unless you want to touch all architectures).

Now from your 'requirements':

> I have one case that needs to sleep (but only when interrupting user code)

> TIF works for user space, but it's a bit ugly because it requires adding
> more data to the task_struct because CPUs can change.

Which I read as:

1) needs to run in the task context of the task that got 'interrupted'
2) needs to stay on the cpu it got interrupted on.

So C is out of the window too, at which point there's nothing resembling
softirqs left.

To boot, x86_64 runs softirqs from the hardirq stack:

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
push %rbp
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
incl PER_CPU_VAR(irq_count)
cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
END(call_softirq)

Also, -rt has something that could be considered sleepable softirqs,
although we call them preemptible softirqs. It runs all softirqs from
CPU-bound kthreads, which again doesn't match your requirements.

So no, I don't think your idea of sleepable softirqs is sound.

2010-06-25 11:58:53

by huang ying

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work

On Fri, Jun 25, 2010 at 5:30 PM, Peter Zijlstra <[email protected]> wrote:
>> > You can always miss an NMI since it can always happen before the
>> > callback gets done, and allowing another enqueue before the callback is
>> > complete is asking for trouble.
>>
>> If we move entry->next = NULL before entry->func(entry), we will not
>> miss the NMI. Can you show how to miss it in this way?
>
> <NMI>
>  ...
>  irq_work_queue(&my_work, func);
>  ...
> <EOI>
> <IPI>
>  irq_work_run()
>
>  <NMI>
>    irq_work_queue(&my_work, func); <FAIL>
>  <EOI>
>
>   my_func.next = NULL;

entry->func() should follow here. You can collect all the information
(maybe some data in a ring buffer) from the NMI handler in entry->func().
But if you place entry->next = NULL after entry->func(), you will really lose
an NMI notification and the information from the NMI handler.

> <EOI>

> Really not that hard. Now imagine wrapping irq_work in some state and
> you reusing the state while the function is still running..

So I suggest using another flag to signify that the function is running, to
distinguish the two cases.

Best Regards,
Huang Ying

2010-06-25 18:30:47

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH] irq_work -v2

Utterly untested, and you really need to visit all the touched architectures
if you want to make it useful.

---
Subject: irq_work: generic hard-irq context callbacks

In order to support other NMI-context users that want to run things from
hard-IRQ context, extract the perf_event callback mechanism.

Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/alpha/include/asm/perf_event.h | 9 -
arch/frv/lib/perf_event.c | 19 ---
arch/parisc/include/asm/perf_event.h | 7 -
arch/s390/include/asm/perf_event.h | 10 -
linux-2.6/arch/alpha/Kconfig | 1
linux-2.6/arch/arm/Kconfig | 1
linux-2.6/arch/arm/include/asm/perf_event.h | 12 -
linux-2.6/arch/arm/kernel/perf_event.c | 4
linux-2.6/arch/frv/Kconfig | 1
linux-2.6/arch/parisc/Kconfig | 1
linux-2.6/arch/powerpc/Kconfig | 1
linux-2.6/arch/powerpc/kernel/time.c | 42 +++---
linux-2.6/arch/s390/Kconfig | 1
linux-2.6/arch/sh/Kconfig | 1
linux-2.6/arch/sh/include/asm/perf_event.h | 7 -
linux-2.6/arch/sparc/Kconfig | 2
linux-2.6/arch/sparc/include/asm/perf_event.h | 4
linux-2.6/arch/sparc/kernel/pcr.c | 8 -
linux-2.6/arch/x86/Kconfig | 1
linux-2.6/arch/x86/include/asm/entry_arch.h | 4
linux-2.6/arch/x86/include/asm/hw_irq.h | 2
linux-2.6/arch/x86/kernel/Makefile | 1
linux-2.6/arch/x86/kernel/cpu/perf_event.c | 19 ---
linux-2.6/arch/x86/kernel/entry_64.S | 4
linux-2.6/arch/x86/kernel/irq_work.c | 26 ++++
linux-2.6/arch/x86/kernel/irqinit.c | 4
linux-2.6/include/linux/irq_work.h | 20 +++
linux-2.6/include/linux/perf_event.h | 11 -
linux-2.6/init/Kconfig | 8 +
linux-2.6/kernel/Makefile | 2
linux-2.6/kernel/irq_work.c | 157 ++++++++++++++++++++++++++
linux-2.6/kernel/perf_event.c | 102 ----------------
linux-2.6/kernel/timer.c | 4
33 files changed, 266 insertions(+), 230 deletions(-)

Index: linux-2.6/include/linux/irq_work.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/irq_work.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_IRQ_WORK_H
+#define _LINUX_IRQ_WORK_H
+
+struct irq_work {
+ struct irq_work *next;
+ void (*func)(struct irq_work *);
+};
+
+static inline
+void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+ entry->next = NULL;
+ entry->func = func;
+}
+
+int irq_work_queue(struct irq_work *entry);
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_WORK_H */
Index: linux-2.6/kernel/irq_work.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/irq_work.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+
+/*
+ * An entry can be in one of four states:
+ *
+ * free NULL, 0 -> {claimed} : free to be used
+ * claimed NULL, 3 -> {pending} : claimed to be enqueued
+ * pending next, 3 -> {busy} : queued, pending callback
+ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+
+#define IRQ_WORK_PENDING 1UL
+#define IRQ_WORK_BUSY 2UL
+#define IRQ_WORK_FLAGS 3UL
+
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+ return (unsigned long)entry->next & flags;
+}
+
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+ unsigned long next = (unsigned long)entry->next;
+ next &= ~IRQ_WORK_FLAGS;
+ return (struct irq_work *)next;
+}
+
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+ unsigned long next = (unsigned long)entry;
+ next |= flags;
+ return (struct irq_work *)next;
+}
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+ unsigned long flags;
+
+ do {
+ flags = (unsigned long)entry->next;
+ if (flags & IRQ_WORK_PENDING)
+ return false;
+ } while (cmpxchg(&entry->next, flags, flags | IRQ_WORK_FLAGS) != flags);
+
+ return true;
+}
+
+
+void __weak arch_irq_work_raise(void)
+{
+ /*
+ * Lame architectures will get the timer tick callback
+ */
+}
+
+/*
+ * Queue the entry and raise the IPI if needed.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+ struct irq_work **head;
+
+ head = &get_cpu_var(irq_work_list);
+
+ do {
+ /*
+ * Can assign non-atomic because we keep the flags set.
+ */
+ entry->next = next_flags(*head, IRQ_WORK_FLAGS);
+ } while (cmpxchg(head, entry->next, entry) != entry->next);
+
+ /*
+ * The list was empty, raise self-interrupt to start processing.
+ */
+ if (!irq_work_next(entry))
+ arch_irq_work_raise();
+
+ put_cpu_var(irq_work_list);
+}
+
+/*
+ * Enqueue the irq_work @entry, returns true on success, failure when the
+ * @entry was already enqueued by someone else.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+ if (!irq_work_claim(entry)) {
+ /*
+ * Already enqueued, can't do!
+ */
+ return false;
+ }
+
+ __irq_work_queue(entry);
+ return true;
+}
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+ struct irq_work *list;
+
+ BUG_ON(!in_irq());
+ BUG_ON(!irqs_disabled());
+
+ list = xchg(&__get_cpu_var(irq_work_list), NULL);
+ while (list != NULL) {
+ struct irq_work *entry = list;
+
+ list = irq_work_next(list);
+
+ /*
+ * Clear the PENDING bit, after this point the @entry
+ * can be re-used.
+ */
+ entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+ entry->func(entry);
+ /*
+ * Clear the BUSY bit and return to the free state if
+ * no-one else claimed it meanwhile.
+ */
+ cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+ }
+}
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+ WARN_ON_ONCE(irqs_disabled());
+
+ while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+ cpu_relax();
+}
Index: linux-2.6/arch/alpha/Kconfig
===================================================================
--- linux-2.6.orig/arch/alpha/Kconfig
+++ linux-2.6/arch/alpha/Kconfig
@@ -9,6 +9,7 @@ config ALPHA
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_SYSCALL_WRAPPERS
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_DMA_ATTRS
help
Index: linux-2.6/arch/alpha/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/alpha/include/asm/perf_event.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_ALPHA_PERF_EVENT_H
-#define __ASM_ALPHA_PERF_EVENT_H
-
-/* Alpha only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
-#endif /* __ASM_ALPHA_PERF_EVENT_H */
Index: linux-2.6/arch/arm/Kconfig
===================================================================
--- linux-2.6.orig/arch/arm/Kconfig
+++ linux-2.6/arch/arm/Kconfig
@@ -22,6 +22,7 @@ config ARM
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_KERNEL_LZMA
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
help
Index: linux-2.6/arch/arm/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/arm/include/asm/perf_event.h
+++ linux-2.6/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
#ifndef __ARM_PERF_EVENT_H__
#define __ARM_PERF_EVENT_H__

-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts enabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
/* ARM performance counters start from 1 (in the cp15 accesses) so use the
* same indexes here for consistency. */
#define PERF_EVENT_INDEX_OFFSET 1
Index: linux-2.6/arch/frv/Kconfig
===================================================================
--- linux-2.6.orig/arch/frv/Kconfig
+++ linux-2.6/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
default y
select HAVE_IDE
select HAVE_ARCH_TRACEHOOK
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS

config ZONE_DMA
Index: linux-2.6/arch/frv/lib/perf_event.c
===================================================================
--- linux-2.6.orig/arch/frv/lib/perf_event.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance event handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells ([email protected])
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_event.h>
-
-/*
- * mark the performance event as pending
- */
-void set_perf_event_pending(void)
-{
-}
Index: linux-2.6/arch/parisc/Kconfig
===================================================================
--- linux-2.6.orig/arch/parisc/Kconfig
+++ linux-2.6/arch/parisc/Kconfig
@@ -16,6 +16,7 @@ config PARISC
select RTC_DRV_GENERIC
select INIT_ALL_POSSIBLE
select BUG
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select GENERIC_ATOMIC64 if !64BIT
help
Index: linux-2.6/arch/parisc/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/parisc/include/asm/perf_event.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_PARISC_PERF_EVENT_H
-#define __ASM_PARISC_PERF_EVENT_H
-
-/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
-
-#endif /* __ASM_PARISC_PERF_EVENT_H */
Index: linux-2.6/arch/powerpc/Kconfig
===================================================================
--- linux-2.6.orig/arch/powerpc/Kconfig
+++ linux-2.6/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
select HAVE_OPROFILE
select HAVE_SYSCALL_WRAPPERS if PPC64
select GENERIC_ATOMIC64 if PPC32
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_REGS_AND_STACK_ACCESS_API

Index: linux-2.6/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/time.c
+++ linux-2.6/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
#include <linux/posix-timers.h>
#include <linux/irq.h>
#include <linux/delay.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
#include <asm/trace.h>

#include <asm/io.h>
@@ -532,60 +532,60 @@ void __init iSeries_time_init_early(void
}
#endif /* CONFIG_PPC_ISERIES */

-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK

/*
* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
*/
#ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
+static inline unsigned long test_irq_work_pending(void)
{
unsigned long x;

asm volatile("lbz %0,%1(13)"
: "=r" (x)
- : "i" (offsetof(struct paca_struct, perf_event_pending)));
+ : "i" (offsetof(struct paca_struct, irq_work_pending)));
return x;
}

-static inline void set_perf_event_pending_flag(void)
+static inline void set_irq_work_pending_flag(void)
{
asm volatile("stb %0,%1(13)" : :
"r" (1),
- "i" (offsetof(struct paca_struct, perf_event_pending)));
+ "i" (offsetof(struct paca_struct, irq_work_pending)));
}

-static inline void clear_perf_event_pending(void)
+static inline void clear_irq_work_pending(void)
{
asm volatile("stb %0,%1(13)" : :
"r" (0),
- "i" (offsetof(struct paca_struct, perf_event_pending)));
+ "i" (offsetof(struct paca_struct, irq_work_pending)));
}

#else /* 32-bit */

-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);

-#define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending() __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending() __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0

#endif /* 32 vs 64 bit */

-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
{
preempt_disable();
- set_perf_event_pending_flag();
+ set_irq_work_pending_flag();
set_dec(1);
preempt_enable();
}

-#else /* CONFIG_PERF_EVENTS */
+#else /* CONFIG_IRQ_WORK */

-#define test_perf_event_pending() 0
-#define clear_perf_event_pending()
+#define test_irq_work_pending() 0
+#define clear_irq_work_pending()

-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */

/*
* For iSeries shared processors, we have to let the hypervisor
@@ -635,9 +635,9 @@ void timer_interrupt(struct pt_regs * re

calculate_steal_time();

- if (test_perf_event_pending()) {
- clear_perf_event_pending();
- perf_event_do_pending();
+ if (test_irq_work_pending()) {
+ clear_irq_work_pending();
+ irq_work_run();
}

#ifdef CONFIG_PPC_ISERIES
Index: linux-2.6/arch/s390/Kconfig
===================================================================
--- linux-2.6.orig/arch/s390/Kconfig
+++ linux-2.6/arch/s390/Kconfig
@@ -98,6 +98,7 @@ config S390
select HAVE_KVM if 64BIT
select HAVE_ARCH_TRACEHOOK
select INIT_ALL_POSSIBLE
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
Index: linux-2.6/arch/s390/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/s390/include/asm/perf_event.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Performance event support - s390 specific definitions.
- *
- * Copyright 2009 Martin Schwidefsky, IBM Corporation.
- */
-
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
-
-#define PERF_EVENT_INDEX_OFFSET 0
Index: linux-2.6/arch/sh/Kconfig
===================================================================
--- linux-2.6.orig/arch/sh/Kconfig
+++ linux-2.6/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
select HAVE_ARCH_TRACEHOOK
select HAVE_DMA_API_DEBUG
select HAVE_DMA_ATTRS
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_KERNEL_GZIP
Index: linux-2.6/arch/sh/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/sh/include/asm/perf_event.h
+++ linux-2.6/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu
extern int reserve_pmc_hardware(void);
extern void release_pmc_hardware(void);

-static inline void set_perf_event_pending(void)
-{
- /* Nothing to see here, move along. */
-}
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
#endif /* __ASM_SH_PERF_EVENT_H */
Index: linux-2.6/arch/sparc/Kconfig
===================================================================
--- linux-2.6.orig/arch/sparc/Kconfig
+++ linux-2.6/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
select ARCH_WANT_OPTIONAL_GPIOLIB
select RTC_CLASS
select RTC_DRV_M48T59
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_DMA_ATTRS
@@ -52,6 +53,7 @@ config SPARC64
select RTC_DRV_BQ4802
select RTC_DRV_SUN4V
select RTC_DRV_STARFIRE
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC

Index: linux-2.6/arch/sparc/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/sparc/include/asm/perf_event.h
+++ linux-2.6/arch/sparc/include/asm/perf_event.h
@@ -1,10 +1,6 @@
#ifndef __ASM_SPARC_PERF_EVENT_H
#define __ASM_SPARC_PERF_EVENT_H

-extern void set_perf_event_pending(void);
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
#ifdef CONFIG_PERF_EVENTS
#include <asm/ptrace.h>

Index: linux-2.6/arch/sparc/kernel/pcr.c
===================================================================
--- linux-2.6.orig/arch/sparc/kernel/pcr.c
+++ linux-2.6/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
#include <linux/init.h>
#include <linux/irq.h>

-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
#include <linux/ftrace.h>

#include <asm/pil.h>
@@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(i

old_regs = set_irq_regs(regs);
irq_enter();
-#ifdef CONFIG_PERF_EVENTS
- perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+ irq_work_run();
#endif
irq_exit();
set_irq_regs(old_regs);
}

-void set_perf_event_pending(void)
+void arch_irq_work_raise(void)
{
set_softint(1 << PIL_DEFERRED_PCR_WORK);
}
Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS_NMI
select ANON_INODES
select HAVE_ARCH_KMEMCHECK
Index: linux-2.6/arch/x86/include/asm/entry_arch.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/entry_arch.h
+++ linux-2.6/arch/x86/include/asm/entry_arch.h
@@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)

-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt,LOCAL_PENDING_VECTOR)
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
Index: linux-2.6/arch/x86/include/asm/hw_irq.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hw_irq.h
+++ linux-2.6/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
extern void apic_timer_interrupt(void);
extern void x86_platform_ipi(void);
extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);

extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
Index: linux-2.6/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/kernel/Makefile
+++ linux-2.6/arch/x86/kernel/Makefile
@@ -33,6 +33,7 @@ obj-y := process_$(BITS).o signal.o en
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
obj-y += setup.o x86_init.o i8259.o irqinit.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
Index: linux-2.6/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_event.c
@@ -1170,25 +1170,6 @@ static int x86_pmu_handle_irq(struct pt_
return handled;
}

-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_event_do_pending();
- irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
void perf_events_lapic_init(void)
{
if (!x86_pmu.apic || !x86_pmu_initialized())
Index: linux-2.6/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/entry_64.S
+++ linux-2.6/arch/x86/kernel/entry_64.S
@@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt

-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
+ irq_work_interrupt smp_irq_work_interrupt
#endif

/*
Index: linux-2.6/arch/x86/kernel/irq_work.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/kernel/irq_work.c
@@ -0,0 +1,26 @@
+
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_pending_irqs);
+ irq_work_run();
+ irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (!cpu_has_apic)
+ return;
+
+ apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+ apic_wait_icr_idle();
+#endif
+}
+
+
Index: linux-2.6/arch/x86/kernel/irqinit.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit.c
+++ linux-2.6/arch/x86/kernel/irqinit.c
@@ -225,8 +225,8 @@ static void __init apic_intr_init(void)
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);

/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# ifdef CONFIG_IRQ_WORK
+ alloc_intr_gate(LOCAL_PENDING_VECTOR, irq_work_interrupt);
# endif

#endif
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -486,6 +486,7 @@ struct perf_guest_info_callbacks {
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
+#include <linux/irq_work.h>
#include <asm/atomic.h>
#include <asm/local.h>

@@ -629,11 +630,6 @@ struct perf_buffer {
void *data_pages[0];
};

-struct perf_pending_entry {
- struct perf_pending_entry *next;
- void (*func)(struct perf_pending_entry *);
-};
-
struct perf_sample_data;

typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -741,7 +737,7 @@ struct perf_event {
int pending_wakeup;
int pending_kill;
int pending_disable;
- struct perf_pending_entry pending;
+ struct irq_work pending;

atomic_t event_limit;

@@ -853,8 +849,6 @@ extern void perf_event_task_tick(struct
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
extern void perf_event_print_debug(void);
extern void __perf_disable(void);
extern bool __perf_enable(void);
@@ -1028,7 +1022,6 @@ perf_event_task_tick(struct task_struct
static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
-static inline void perf_event_do_pending(void) { }
static inline void perf_event_print_debug(void) { }
static inline void perf_disable(void) { }
static inline void perf_enable(void) { }
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -21,6 +21,13 @@ config CONSTRUCTORS
depends on !UML
default y

+config HAVE_IRQ_WORK
+ bool
+
+config IRQ_WORK
+ bool
+ depends on HAVE_IRQ_WORK
+
menu "General setup"

config EXPERIMENTAL
@@ -983,6 +990,7 @@ config PERF_EVENTS
default y if (PROFILING || PERF_COUNTERS)
depends on HAVE_PERF_EVENTS
select ANON_INODES
+ select IRQ_WORK
help
Enable kernel support for various performance events provided
by software and hardware.
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile
+++ linux-2.6/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
endif

obj-$(CONFIG_FREEZER) += freezer.o
@@ -101,6 +102,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
Index: linux-2.6/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/kernel/perf_event.c
+++ linux-2.6/kernel/perf_event.c
@@ -1880,12 +1880,11 @@ static void free_event_rcu(struct rcu_he
kfree(event);
}

-static void perf_pending_sync(struct perf_event *event);
static void perf_buffer_put(struct perf_buffer *buffer);

static void free_event(struct perf_event *event)
{
- perf_pending_sync(event);
+ irq_work_sync(&event->pending);

if (!event->parent) {
atomic_dec(&nr_events);
@@ -2829,15 +2828,6 @@ void perf_event_wakeup(struct perf_event
}
}

-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
static void perf_pending_event(struct perf_pending_entry *entry)
{
struct perf_event *event = container_of(entry,
@@ -2854,89 +2844,6 @@ static void perf_pending_event(struct pe
}
}

-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
- PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
- void (*func)(struct perf_pending_entry *))
-{
- struct perf_pending_entry **head;
-
- if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
- return;
-
- entry->func = func;
-
- head = &get_cpu_var(perf_pending_head);
-
- do {
- entry->next = *head;
- } while (cmpxchg(head, entry->next, entry) != entry->next);
-
- set_perf_event_pending();
-
- put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
- struct perf_pending_entry *list;
- int nr = 0;
-
- list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
- while (list != PENDING_TAIL) {
- void (*func)(struct perf_pending_entry *);
- struct perf_pending_entry *entry = list;
-
- list = list->next;
-
- func = entry->func;
- entry->next = NULL;
- /*
- * Ensure we observe the unqueue before we issue the wakeup,
- * so that we won't be waiting forever.
- * -- see perf_not_pending().
- */
- smp_wmb();
-
- func(entry);
- nr++;
- }
-
- return nr;
-}
-
-static inline int perf_not_pending(struct perf_event *event)
-{
- /*
- * If we flush on whatever cpu we run, there is a chance we don't
- * need to wait.
- */
- get_cpu();
- __perf_pending_run();
- put_cpu();
-
- /*
- * Ensure we see the proper queue state before going to sleep
- * so that we do not miss the wakeup. -- see perf_pending_handle()
- */
- smp_rmb();
- return event->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_event *event)
-{
- wait_event(event->waitq, perf_not_pending(event));
-}
-
-void perf_event_do_pending(void)
-{
- __perf_pending_run();
-}
-
/*
* Callchain support -- arch specific
*/
@@ -2996,8 +2903,7 @@ static void perf_output_wakeup(struct pe

if (handle->nmi) {
handle->event->pending_wakeup = 1;
- perf_pending_queue(&handle->event->pending,
- perf_pending_event);
+ irq_work_queue(&handle->event->pending);
} else
perf_event_wakeup(handle->event);
}
@@ -3988,8 +3894,7 @@ static int __perf_event_overflow(struct
event->pending_kill = POLL_HUP;
if (nmi) {
event->pending_disable = 1;
- perf_pending_queue(&event->pending,
- perf_pending_event);
+ irq_work_queue(&event->pending);
} else
perf_event_disable(event);
}
@@ -4841,6 +4746,7 @@ perf_event_alloc(struct perf_event_attr
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
init_waitqueue_head(&event->waitq);
+ irq_work_init(&event->pending, perf_pending_event);

mutex_init(&event->mmap_mutex);

Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -37,7 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
#include <linux/sched.h>
#include <linux/slab.h>

@@ -1260,7 +1260,7 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
printk_tick();
- perf_event_do_pending();
+ irq_work_run();
scheduler_tick();
run_posix_cpu_timers(p);
}
Index: linux-2.6/arch/arm/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/arch/arm/kernel/perf_event.c
+++ linux-2.6/arch/arm/kernel/perf_event.c
@@ -1045,7 +1045,7 @@ armv6pmu_handle_irq(int irq_num,
* platforms that can have the PMU interrupts raised as a PMI, this
* will not work.
*/
- perf_event_do_pending();
+ irq_work_run();

return IRQ_HANDLED;
}
@@ -2021,7 +2021,7 @@ static irqreturn_t armv7pmu_handle_irq(i
* platforms that can have the PMU interrupts raised as a PMI, this
* will not work.
*/
- perf_event_do_pending();
+ irq_work_run();

return IRQ_HANDLED;
}
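
To make the new interface concrete, here is a minimal usage sketch assembled from the calls that appear in the patch above (irq_work_init(), irq_work_queue(), irq_work_run(), irq_work_sync(), arch_irq_work_raise()). The my_* names are placeholders and the exact prototypes are assumptions based on this RFC, not a final API; irq_work_run() itself is invoked by arch code from the self-interrupt or the timer tick, not by the user.

#include <linux/irq_work.h>

/* Callback; per the run loop quoted below, it is handed the irq_work itself. */
static void my_work_func(struct irq_work *work)
{
        /* Runs later from IRQ context; the quoted run loop calls it with IRQs disabled. */
}

static struct irq_work my_work;

static void my_init(void)
{
        /* Bind the entry to its callback once, before it can ever be queued. */
        irq_work_init(&my_work, my_work_func);
}

static void my_nmi_handler(void)
{
        /*
         * Safe from NMI context: queueing is a lockless cmpxchg list push
         * plus, at most, arch_irq_work_raise() to trigger a self-interrupt.
         */
        irq_work_queue(&my_work);
}

static void my_teardown(void)
{
        /* Wait until a queued entry has finished before freeing it. */
        irq_work_sync(&my_work);
}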

2010-06-25 19:30:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Fri, Jun 25, 2010 at 08:30:25PM +0200, Peter Zijlstra wrote:

I'm not sure what all the logic for entry enqueued by someone
else is good for? Is that for the case you don't have enough
entries preallocated and you share them with someone else?

Normally if the sharing is per cpu that would be difficult
to recover from because if it's due to a nest situation (for example)
you would deadlock.

For me it would seem simpler to simply not share.

> + struct irq_work *list;
> +
> + BUG_ON(!in_irq());
> + BUG_ON(!irqs_disabled());
> +
> + list = xchg(&__get_cpu_var(irq_work_list), NULL);
> + while (list != NULL) {
> + struct irq_work *entry = list;
> +
> + list = irq_work_next(list);
> +
> + /*
> + * Clear the PENDING bit, after this point the @entry
> + * can be re-used.
> + */
> + entry->next = next_flags(NULL, IRQ_WORK_BUSY);
> + entry->func(entry);

Needs a compiler memory barrier here, I think.

> + /*
> + * Clear the BUSY bit and return to the free state if
> + * no-one else claimed it meanwhile.
> + */
> + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
> + }
> +}

-Andi

2010-06-25 19:40:05

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Fri, 2010-06-25 at 21:30 +0200, Andi Kleen wrote:
>
> I'm not sure what all the logic for entry enqueued by someone
> else is good for? Is that for the case you don't have enough
> entries preallocated and you share them with someone else?
>
> Normally if the sharing is per cpu that would be difficult
> to recover from because if it's due to a nest situation (for example)
> you would deadlock.
>
> For me it would seem simpler to simply not share.

perf has two different reasons for the callback, what I do is set the
state and enqueue, if it's already enqueued the pending callback will
handle both.

It's cheaper than having two callback structures per event.

We can expose the claim/enqueue thing separately so that users can
choose.
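
As a rough sketch of the pattern described here, tied to the pending_wakeup/pending_disable fields and the single struct irq_work pending member visible in the perf hunks of the patch above: the overflow path sets a reason flag and queues the one entry, and the single callback services whatever reasons have accumulated. This is illustrative only, not the actual perf code; the function bodies are simplified and overflow_from_nmi() is a made-up helper.

/* NMI side: record why we need the callback, then (re)queue the one entry. */
static void overflow_from_nmi(struct perf_event *event, bool disable)
{
        if (disable)
                event->pending_disable = 1;
        else
                event->pending_wakeup = 1;

        /*
         * If the entry is already pending this is effectively a no-op;
         * the in-flight callback will observe whichever flags are set
         * by the time it runs, so nothing is lost.
         */
        irq_work_queue(&event->pending);
}

/* IRQ side: one callback handles every accumulated reason. */
static void perf_pending_event(struct irq_work *entry)
{
        struct perf_event *event = container_of(entry, struct perf_event, pending);

        if (event->pending_disable) {
                event->pending_disable = 0;
                perf_event_disable(event);
        }
        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }
}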

2010-06-25 19:47:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Fri, 2010-06-25 at 21:30 +0200, Andi Kleen wrote:
> > + entry->next = next_flags(NULL, IRQ_WORK_BUSY);
> > + entry->func(entry);
>
> Needs compiler memory barrier here I think.
>
> > + /*
> > + * Clear the BUSY bit and return to the free state if
> > + * no-one else claimed it meanwhile.
> > + */
> > + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
> > + }

Both the (indirect) function call and the cmpxchg imply a compiler
barrier.
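
For readers following the code, an annotated reading of the same loop body, showing where those implied barriers sit (this is commentary on the quoted code, not a proposed change):

        /* Drop PENDING, keep BUSY: the entry may be re-claimed from here on. */
        entry->next = next_flags(NULL, IRQ_WORK_BUSY);

        /*
         * Opaque indirect call: the compiler cannot cache or reorder
         * accesses to memory the callee might touch (including
         * entry->next) across it, so it already acts as a barrier.
         */
        entry->func(entry);

        /*
         * cmpxchg() implies a full memory barrier (and therefore a
         * compiler barrier), so the BUSY -> free transition cannot be
         * hoisted above the callback either.
         */
        cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);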

2010-06-25 19:49:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Fri, 2010-06-25 at 21:39 +0200, Peter Zijlstra wrote:
> On Fri, 2010-06-25 at 21:30 +0200, Andi Kleen wrote:
> >
> > I'm not sure what all the logic for entry enqueued by someone
> > else is good for? Is that for the case you don't have enough
> > entries preallocated and you share them with someone else?
> >
> > Normally if the sharing is per cpu that would be difficult
> > to recover from because if it's due to a nest situation (for example)
> > you would deadlock.
> >
> > For me it would seem simpler to simply not share.
>
> perf has two different reasons for the callback, what I do is set the
> state and enqueue, if it's already enqueued the pending callback will
> handle both.
>
> It's cheaper than having two callback structures per event.
>
> We can expose the claim/enqueue thing separately so that users can
> choose.

Also, it's possible the PMI hits again before the IRQ callback has a
chance to happen.

2010-06-25 22:29:59

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

> perf has two different reasons for the callback, what I do is set the
> state and enqueue, if it's already enqueued the pending callback will
> handle both.
>
> It's cheaper than having two callback structures per event.

Again it sounds like you just need a bit...
>
> We can expose the claim/enqueue thing separately so that users can
> choose.

Yes it would be good to separate that, because I doubt other users
will require similar hacks.

-Andi

--
[email protected] -- Speaking for myself only.

2010-06-26 01:26:09

by huang ying

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Sat, Jun 26, 2010 at 2:30 AM, Peter Zijlstra <[email protected]> wrote:
> +
> +static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
> +
> +/*
> + * Claim the entry so that no one else will poke at it.
> + */
> +static bool irq_work_claim(struct irq_work *entry)
> +{
> +       unsigned long flags;
> +
> +       do {
> +               flags = (unsigned long)entry->next;
> +               if (flags & IRQ_WORK_PENDING)
> +                       return false;
> +       } while (cmpxchg(&entry->next, flags, flags | IRQ_WORK_FLAGS) != flags);
> +
> +       return true;
> +}
> +
> +
> +void __weak arch_irq_work_raise(void)
> +{
> +       /*
> +        * Lame architectures will get the timer tick callback
> +        */
> +}
> +
> +/*
> + * Queue the entry and raise the IPI if needed.
> + */
> +static void __irq_work_queue(struct irq_work *entry)
> +{
> +       struct irq_work **head;
> +
> +       head = &get_cpu_var(irq_work_list);
> +
> +       do {
> +               /*
> +                * Can assign non-atomic because we keep the flags set.
> +                */
> +               entry->next = next_flags(*head, IRQ_WORK_FLAGS);
> +       } while (cmpxchg(head, entry->next, entry) != entry->next);

(*head & IRQ_WORK_FLAGS) == 0, but (entry->next & IRQ_WORK_FLAGS) ==
IRQ_WORK_FLAGS, so the cmpxchg will never succeed.

> +
> +       /*
> +        * The list was empty, raise self-interrupt to start processing.
> +        */
> +       if (!irq_work_next(entry))
> +               arch_irq_work_raise();
> +
> +       put_cpu_var(irq_work_list);
> +}

Best Regards,
Huang Ying
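
The mismatch Huang points out is between the flag bits OR'ed into the compare value and the raw pointer actually stored in the list head. One way to fix it is to compare against the head value that was read, and only put the flagged copy into entry->next; a sketch along those lines, reusing the helpers from the quoted patch (the local variable names are illustrative, and this is not necessarily how the final version resolved it):

static void __irq_work_queue(struct irq_work *entry)
{
        struct irq_work *head, **headp;

        headp = &get_cpu_var(irq_work_list);

        do {
                head = *headp;  /* raw, unflagged head pointer */
                /* Can assign non-atomically because we keep the flags set. */
                entry->next = next_flags(head, IRQ_WORK_FLAGS);
                /* Compare against the value we read, not the flagged copy. */
        } while (cmpxchg(headp, head, entry) != head);

        /* The list was empty: raise the self-interrupt to start processing. */
        if (!head)
                arch_irq_work_raise();

        put_cpu_var(irq_work_list);
}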

2010-06-26 08:37:04

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Sat, 2010-06-26 at 00:29 +0200, Andi Kleen wrote:

> Yes it would be good to separate that, because I doubt other users
> will require similar hacks.

You're such a constructive critic..

I would think every NMI user would need them since NMI can interrupt at
any time, and if you have a limited number of irq_work structs (like 1
per cpu) you'll end up wanting to enqueue an already enqueued one
at some point.

2010-06-26 10:08:52

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Sat, Jun 26, 2010 at 10:36:45AM +0200, Peter Zijlstra wrote:
> On Sat, 2010-06-26 at 00:29 +0200, Andi Kleen wrote:
>
> > Yes it would be good to separate that, because I doubt other users
> > will require similar hacks.
>
> You're such a constructive critic..

Well, I'm only adapting to your tone (FWIW I thought your original
description of Ying's patches was bordering on unfair, not quoting
the words back to you). I also find it interesting when
people who always dish out with full hands are quite sensitive themselves...

But yes, we can agree not to use such a tone, if that's a mutual agreement.

> I would think every NMI user would need them since NMI can interrupt at
> any time, and if you have a limited number of irq_work structs (like 1
> per cpu) you'll end up with wanting to enqueue an already enqueued one
> at some point.

You could just as well drop the excess event. In fact it surprises me that you
don't simply do that in perf. The state should be in the PMU registers
anyways, so you'll pick it up from there (and if you get NMIs so quickly that
you cannot process them, you have to eventually throttle by dropping anyways).

With the reuse methodology you end up with the same problem anyways, it
just shifts it slightly.

For fatal NMIs it's more like: if the error is fatal then the NMI handler
will stop, and if it's non-fatal it can be dropped on overload.
For overload situations there needs to be a dropping mechanism; spinning
is not OK because the current owner might be on your
own CPU.

Some of the other errors cannot be dropped, but these need other mechanisms
anyways.

-Andi
--
[email protected] -- Speaking for myself only.

2010-06-26 10:32:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] irq_work -v2

On Sat, 2010-06-26 at 12:08 +0200, Andi Kleen wrote:
> You could as well drop the excessive event. In fact it surprises me that you
> don't simply do that in perf. The state should be in the PMU registers
> anyways, so you'll pick it up from there (and if you get NMIs as quickly that
> you cannot process them you have to eventually throttle by dropping anyways)

I'm not quite seeing what you mean; the PMU state is reset on PMI, it
doesn't know about previous overflows, and it most certainly doesn't
know whether, for the previous event, we had to wake the buffer consumers.

For non-uniform events (like basically everything but
cycles/instructions) you can get high bursts of PMIs; only when the rate
stays too high will we eventually throttle, but we can (and should) deal
with short periods of high-rate PMIs.

The PMU is stopped during the PMI; we write out data to the buffer and
possibly queue a callback to wake the consumers and/or de-schedule the
event.

By setting the pending op state and enqueueing the callback we
ensure the pending op isn't lost/ignored. If there was already one in
flight it will pick up our pending state; if not, we just queued a new
one.

In fact Huang argued for something very similar, so I assumed he
actually needed this too.
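
Concretely, the coalescing Peter describes falls out of the claim step quoted earlier in the thread: a second enqueue attempt while the entry is still pending simply backs off, and the reason flags set before the attempt are picked up by the run that is already on its way. A sketch of the queueing entry point under that assumption (not the verbatim patch code):

void irq_work_queue(struct irq_work *entry)
{
        /*
         * irq_work_claim() atomically sets PENDING|BUSY; if PENDING was
         * already set, the entry is still queued (or about to be run) and
         * the state we set before calling this will be seen by that run,
         * so there is nothing more to do.
         */
        if (!irq_work_claim(entry))
                return;

        __irq_work_queue(entry);
}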