2010-01-18 13:44:43

by Xiao Guangrong

Subject: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

It only disables preemption in perf_swevent_get_recursion_context(),
so it can't avoid the race between hard-irq and NMI.

In this patch, we use an atomic operation to avoid the race and shrink
cpu_ctx->recursion; it also means this path no longer needs to disable
preemption.

Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/perf_event.h | 2 +-
kernel/perf_event.c | 14 ++++----------
2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c6f812e..8474ab0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -730,7 +730,7 @@ struct perf_cpu_context {
*
* task, softirq, irq, nmi context
*/
- int recursion[4];
+ unsigned long recursion;
};

struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eae6ff6..77ef16e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3921,7 +3921,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,

int perf_swevent_get_recursion_context(void)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
int rctx;

if (in_nmi())
@@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
else
rctx = 0;

- if (cpuctx->recursion[rctx]) {
- put_cpu_var(perf_cpu_context);
+ if (test_and_set_bit(rctx, &cpuctx->recursion))
return -1;
- }
-
- cpuctx->recursion[rctx]++;
- barrier();

return rctx;
}
@@ -3948,9 +3943,8 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
void perf_swevent_put_recursion_context(int rctx)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- barrier();
- cpuctx->recursion[rctx]--;
- put_cpu_var(perf_cpu_context);
+
+ clear_bit(rctx, &cpuctx->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);

--
1.6.1.2


2010-01-18 13:47:01

by Xiao Guangrong

Subject: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
operate on the event profile buffer and clean up the redundant code.

Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/ftrace_event.h | 9 +++-
include/trace/ftrace.h | 48 +++-----------------
kernel/trace/trace_event_profile.c | 60 +++++++++++++++++++++++--
kernel/trace/trace_kprobe.c | 86 ++++-------------------------------
kernel/trace/trace_syscalls.c | 71 ++++-------------------------
5 files changed, 87 insertions(+), 187 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 3ca9485..5ab4b81 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,9 +137,6 @@ struct ftrace_event_call {

#define FTRACE_MAX_PROFILE_SIZE 2048

-extern char *perf_trace_buf;
-extern char *perf_trace_buf_nmi;
-
#define MAX_FILTER_PRED 32
#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */

@@ -194,6 +191,12 @@ extern void ftrace_profile_disable(int event_id);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
+extern void *
+ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
+ unsigned long *irq_flags);
+extern void
+ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
+ u64 count, unsigned long irq_flags);
#endif

#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5b46cf9..1007302 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -755,22 +755,12 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
proto) \
{ \
struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
- extern int perf_swevent_get_recursion_context(void); \
- extern void perf_swevent_put_recursion_context(int rctx); \
- extern void perf_tp_event(int, u64, u64, void *, int); \
struct ftrace_raw_##call *entry; \
u64 __addr = 0, __count = 1; \
unsigned long irq_flags; \
- struct trace_entry *ent; \
int __entry_size; \
int __data_size; \
- char *trace_buf; \
- char *raw_data; \
- int __cpu; \
int rctx; \
- int pc; \
- \
- pc = preempt_count(); \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
@@ -780,42 +770,16 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \
"profile buffer not large enough")) \
return; \
- \
- local_irq_save(irq_flags); \
- \
- rctx = perf_swevent_get_recursion_context(); \
- if (rctx < 0) \
- goto end_recursion; \
- \
- __cpu = smp_processor_id(); \
- \
- if (in_nmi()) \
- trace_buf = rcu_dereference(perf_trace_buf_nmi); \
- else \
- trace_buf = rcu_dereference(perf_trace_buf); \
- \
- if (!trace_buf) \
- goto end; \
- \
- raw_data = per_cpu_ptr(trace_buf, __cpu); \
- \
- *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
- entry = (struct ftrace_raw_##call *)raw_data; \
- ent = &entry->ent; \
- tracing_generic_entry_update(ent, irq_flags, pc); \
- ent->type = event_call->id; \
- \
+ entry = (struct ftrace_raw_##call *)ftrace_profile_buf_begin( \
+ __entry_size, event_call->id, &rctx, &irq_flags); \
+ if (!entry) \
+ return; \
tstruct \
\
{ assign; } \
\
- perf_tp_event(event_call->id, __addr, __count, entry, \
- __entry_size); \
- \
-end: \
- perf_swevent_put_recursion_context(rctx); \
-end_recursion: \
- local_irq_restore(irq_flags); \
+ ftrace_profile_buf_end(entry, __entry_size, rctx, __addr, \
+ __count, irq_flags); \
}

#undef DEFINE_EVENT
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573..f0fa16b 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,11 +9,8 @@
#include "trace.h"


-char *perf_trace_buf;
-EXPORT_SYMBOL_GPL(perf_trace_buf);
-
-char *perf_trace_buf_nmi;
-EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;

typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;

@@ -120,3 +117,56 @@ void ftrace_profile_disable(int event_id)
}
mutex_unlock(&event_mutex);
}
+
+void *ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
+ unsigned long *irq_flags)
+{
+ struct trace_entry *entry;
+ char *trace_buf, *raw_data;
+ int pc, cpu;
+
+ pc = preempt_count();
+
+ /* Protect the per cpu buffer, begin the rcu read side */
+ local_irq_save(*irq_flags);
+
+ *rctxp = perf_swevent_get_recursion_context();
+ if (*rctxp < 0)
+ goto err_recursion;
+
+ cpu = smp_processor_id();
+
+ if (in_nmi())
+ trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ else
+ trace_buf = rcu_dereference(perf_trace_buf);
+
+ if (!trace_buf)
+ goto err;
+
+ raw_data = per_cpu_ptr(trace_buf, cpu);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ entry = (struct trace_entry *)raw_data;
+ tracing_generic_entry_update(entry, *irq_flags, pc);
+ entry->type = type;
+
+ return raw_data;
+err:
+ perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+ local_irq_restore(*irq_flags);
+ return NULL;
+}
+
+void ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
+ u64 count, unsigned long irq_flags)
+{
+ struct trace_entry *entry = raw_data;
+
+ perf_tp_event(entry->type, addr, count, raw_data, size);
+ perf_swevent_put_recursion_context(rctx);
+ local_irq_restore(irq_flags);
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aa19b6a..6d478c1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1223,14 +1223,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;

- pc = preempt_count();
__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1238,45 +1234,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
"profile buffer not large enough"))
return 0;

- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;

- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ip, 1, entry, size);

-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_profile_buf_end(entry, size, rctx, entry->ip, 1, irq_flags);

return 0;
}
@@ -1288,14 +1255,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;

- pc = preempt_count();
__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1303,46 +1266,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
"profile buffer not large enough"))
return 0;

- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kretprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;

- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ret_ip, 1, entry, size);

-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_profile_buf_end(entry, size, rctx, entry->ret_ip, 1, irq_flags);

return 0;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f6b0712..fb40e91 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
unsigned long flags;
- char *trace_buf;
- char *raw_data;
int syscall_nr;
int rctx;
int size;
- int cpu;

syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
"profile buffer not large enough"))
return;

- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ rec = (struct syscall_trace_enter *)ftrace_profile_buf_begin(size,
+ sys_data->enter_event->id, &rctx, &flags);
+ if (!rec)
+ return;

- rec = (struct syscall_trace_enter *) raw_data;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->enter_event->id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_profile_buf_end(rec, size, rctx, 0, 1, flags);
}

int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
struct syscall_trace_exit *rec;
unsigned long flags;
int syscall_nr;
- char *trace_buf;
- char *raw_data;
int rctx;
int size;
- int cpu;

syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
"exit event has grown above profile buffer size"))
return;

- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-
- rec = (struct syscall_trace_exit *)raw_data;
+ rec = (struct syscall_trace_exit *)ftrace_profile_buf_begin(size,
+ sys_data->exit_event->id, &rctx, &flags);
+ if (!rec)
+ return;

- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->exit_event->id;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);

- perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_profile_buf_end(rec, size, rctx, 0, 1, flags);
}

int prof_sysexit_enable(struct ftrace_event_call *call)
--
1.6.1.2

2010-01-18 13:48:23

by Xiao Guangrong

Subject: [PATCH 3/3] tracing/kprobe: cleanup unused return value of function

Those functions' return values are meaningless.

Signed-off-by: Xiao Guangrong <[email protected]>
---
kernel/trace/trace_kprobe.c | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6d478c1..5869819 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
};

/* Kprobe handler */
-static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct kprobe_trace_entry *entry;
@@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;

entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -972,11 +972,11 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)

if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
- return 0;
+ return ;
}

/* Kretprobe handler */
-static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -995,7 +995,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;

entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -1007,7 +1007,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);

- return 0;
+ return ;
}

/* Event entry printers */
@@ -1217,7 +1217,7 @@ static int set_print_fmt(struct trace_probe *tp)
#ifdef CONFIG_PERF_EVENTS

/* Kprobe profile handler */
-static __kprobes int kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_profile_func(struct kprobe *kp,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1232,11 +1232,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;

entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;

entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
@@ -1245,11 +1245,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,

ftrace_profile_buf_end(entry, size, rctx, entry->ip, 1, irq_flags);

- return 0;
+ return ;
}

/* Kretprobe profile handler */
-static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1264,11 +1264,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;

entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;

entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
@@ -1278,7 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,

ftrace_profile_buf_end(entry, size, rctx, entry->ret_ip, 1, irq_flags);

- return 0;
+ return ;
}

static int probe_profile_enable(struct ftrace_event_call *call)
--
1.6.1.2

2010-01-18 13:56:15

by Peter Zijlstra

Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

On Mon, 2010-01-18 at 21:42 +0800, Xiao Guangrong wrote:
> It only disables preemption in perf_swevent_get_recursion_context(),
> so it can't avoid the race between hard-irq and NMI.
>
> In this patch, we use an atomic operation to avoid the race and shrink
> cpu_ctx->recursion; it also means this path no longer needs to disable
> preemption.

Uhm why?

This patch looks terminally broken

2010-01-18 16:17:34

by Masami Hiramatsu

Subject: Re: [PATCH 3/3] tracing/kprobe: cleanup unused return value of function

Xiao Guangrong wrote:
> Those functions' return values are meaningless.

Right, now these functions are called from dispatchers,
and return values are wasted.
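
For reference, the call site looks roughly like this (recalled from
kernel/trace/trace_kprobe.c and simplified, so please treat it as a
sketch rather than the exact code):

static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);

	if (tp->flags & TP_FLAG_TRACE)
		kprobe_trace_func(kp, regs);
#ifdef CONFIG_PERF_EVENTS
	if (tp->flags & TP_FLAG_PROFILE)
		kprobe_profile_func(kp, regs);
#endif
	return 0;	/* the handlers' return values are never looked at */
}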

>
> Signed-off-by: Xiao Guangrong <[email protected]>

Acked-by: Masami Hiramatsu <[email protected]>

Thanks!

> ---
> kernel/trace/trace_kprobe.c | 28 ++++++++++++++--------------
> 1 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 6d478c1..5869819 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
> };
>
> /* Kprobe handler */
> -static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> +static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
> struct kprobe_trace_entry *entry;
> @@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
> irq_flags, pc);
> if (!event)
> - return 0;
> + return ;
>
> entry = ring_buffer_event_data(event);
> entry->nargs = tp->nr_args;
> @@ -972,11 +972,11 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
>
> if (!filter_current_check_discard(buffer, call, entry, event))
> trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
> - return 0;
> + return ;
> }
>
> /* Kretprobe handler */
> -static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> +static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
> struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> @@ -995,7 +995,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
> irq_flags, pc);
> if (!event)
> - return 0;
> + return ;
>
> entry = ring_buffer_event_data(event);
> entry->nargs = tp->nr_args;
> @@ -1007,7 +1007,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> if (!filter_current_check_discard(buffer, call, entry, event))
> trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
>
> - return 0;
> + return ;
> }
>
> /* Event entry printers */
> @@ -1217,7 +1217,7 @@ static int set_print_fmt(struct trace_probe *tp)
> #ifdef CONFIG_PERF_EVENTS
>
> /* Kprobe profile handler */
> -static __kprobes int kprobe_profile_func(struct kprobe *kp,
> +static __kprobes void kprobe_profile_func(struct kprobe *kp,
> struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
> @@ -1232,11 +1232,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
> size -= sizeof(u32);
> if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
> "profile buffer not large enough"))
> - return 0;
> + return ;
>
> entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
> if (!entry)
> - return 0;
> + return ;
>
> entry->nargs = tp->nr_args;
> entry->ip = (unsigned long)kp->addr;
> @@ -1245,11 +1245,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
>
> ftrace_profile_buf_end(entry, size, rctx, entry->ip, 1, irq_flags);
>
> - return 0;
> + return ;
> }
>
> /* Kretprobe profile handler */
> -static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
> +static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
> struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> @@ -1264,11 +1264,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
> size -= sizeof(u32);
> if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
> "profile buffer not large enough"))
> - return 0;
> + return ;
>
> entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
> if (!entry)
> - return 0;
> + return ;
>
> entry->nargs = tp->nr_args;
> entry->func = (unsigned long)tp->rp.kp.addr;
> @@ -1278,7 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
>
> ftrace_profile_buf_end(entry, size, rctx, entry->ret_ip, 1, irq_flags);
>
> - return 0;
> + return ;
> }
>
> static int probe_profile_enable(struct ftrace_event_call *call)

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2010-01-18 16:22:35

by Masami Hiramatsu

Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

Xiao Guangrong wrote:
> Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
> operate on the event profile buffer and clean up the redundant code.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
[...]

> diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
> index 9e25573..f0fa16b 100644
> --- a/kernel/trace/trace_event_profile.c
> +++ b/kernel/trace/trace_event_profile.c
> @@ -9,11 +9,8 @@
> #include "trace.h"
>
>
> -char *perf_trace_buf;
> -EXPORT_SYMBOL_GPL(perf_trace_buf);
> -
> -char *perf_trace_buf_nmi;
> -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
> +static char *perf_trace_buf;
> +static char *perf_trace_buf_nmi;
>
> typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
>
> @@ -120,3 +117,56 @@ void ftrace_profile_disable(int event_id)
> }
> mutex_unlock(&event_mutex);
> }
> +
> +void *ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
> + unsigned long *irq_flags)
> +{
> + struct trace_entry *entry;
> + char *trace_buf, *raw_data;
> + int pc, cpu;
> +
> + pc = preempt_count();
> +
> + /* Protect the per cpu buffer, begin the rcu read side */
> + local_irq_save(*irq_flags);
> +
> + *rctxp = perf_swevent_get_recursion_context();
> + if (*rctxp < 0)
> + goto err_recursion;
> +
> + cpu = smp_processor_id();
> +
> + if (in_nmi())
> + trace_buf = rcu_dereference(perf_trace_buf_nmi);
> + else
> + trace_buf = rcu_dereference(perf_trace_buf);
> +
> + if (!trace_buf)
> + goto err;
> +
> + raw_data = per_cpu_ptr(trace_buf, cpu);
> +
> + /* zero the dead bytes from align to not leak stack to user */
> + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
> +
> + entry = (struct trace_entry *)raw_data;
> + tracing_generic_entry_update(entry, *irq_flags, pc);
> + entry->type = type;
> +
> + return raw_data;
> +err:
> + perf_swevent_put_recursion_context(*rctxp);
> +err_recursion:
> + local_irq_restore(*irq_flags);
> + return NULL;
> +}
> +
> +void ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
> + u64 count, unsigned long irq_flags)
> +{
> + struct trace_entry *entry = raw_data;
> +
> + perf_tp_event(entry->type, addr, count, raw_data, size);
> + perf_swevent_put_recursion_context(rctx);
> + local_irq_restore(irq_flags);
> +}

Hmm, could you make these inline functions or add __kprobes?
Because they are called from kprobes, we don't want to probe
a function that is itself called from kprobes handlers.

(IMHO, from the viewpoint of performance, inline-function
could be better.)
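
Something like this for the tail helper, for instance (just a sketch of
what I mean, based on the code in this patch, not a tested change):

/*
 * The small tail helper becomes a static inline (e.g. in
 * include/linux/ftrace_event.h), so there is nothing for kprobes to
 * hit; the bigger ftrace_profile_buf_begin() can stay out of line, but
 * its definition would get the __kprobes annotation so it lands in
 * .kprobes.text and cannot itself be probed.
 */
static inline void
ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
		       u64 count, unsigned long irq_flags)
{
	struct trace_entry *entry = raw_data;

	perf_tp_event(entry->type, addr, count, raw_data, size);
	perf_swevent_put_recursion_context(rctx);
	local_irq_restore(irq_flags);
}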

Thank you,

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2010-01-18 16:41:35

by Frederic Weisbecker

Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

On Mon, Jan 18, 2010 at 09:42:34PM +0800, Xiao Guangrong wrote:
> It only disables preemption in perf_swevent_get_recursion_context(),
> so it can't avoid the race between hard-irq and NMI.
>
> In this patch, we use an atomic operation to avoid the race and shrink
> cpu_ctx->recursion; it also means this path no longer needs to disable
> preemption.
>
> Signed-off-by: Xiao Guangrong <[email protected]>



I don't understand what is racy in what we have currently.



> int perf_swevent_get_recursion_context(void)
> {
> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> int rctx;
>
> if (in_nmi())
> @@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
> else
> rctx = 0;
>
> - if (cpuctx->recursion[rctx]) {
> - put_cpu_var(perf_cpu_context);
> + if (test_and_set_bit(rctx, &cpuctx->recursion))
> return -1;



This looks broken. We don't call back perf_swevent_put_recursion_context
in fail case, so the bit won't ever be cleared once we recurse.

2010-01-18 17:11:52

by Frederic Weisbecker

Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

On Mon, Jan 18, 2010 at 09:44:56PM +0800, Xiao Guangrong wrote:
> Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
> operate on the event profile buffer and clean up the redundant code.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---


Looks good, nice and desired cleanup, thanks!

Acked-by: Frederic Weisbecker <[email protected]>

2010-01-18 17:27:58

by Frederic Weisbecker

Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

On Mon, Jan 18, 2010 at 11:21:46AM -0500, Masami Hiramatsu wrote:
> Xiao Guangrong wrote:
> > Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
> > operate on the event profile buffer and clean up the redundant code.
> >
> > Signed-off-by: Xiao Guangrong <[email protected]>
> [...]
>
> > diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
> > index 9e25573..f0fa16b 100644
> > --- a/kernel/trace/trace_event_profile.c
> > +++ b/kernel/trace/trace_event_profile.c
> > @@ -9,11 +9,8 @@
> > #include "trace.h"
> >
> >
> > -char *perf_trace_buf;
> > -EXPORT_SYMBOL_GPL(perf_trace_buf);
> > -
> > -char *perf_trace_buf_nmi;
> > -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
> > +static char *perf_trace_buf;
> > +static char *perf_trace_buf_nmi;
> >
> > typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
> >
> > @@ -120,3 +117,56 @@ void ftrace_profile_disable(int event_id)
> > }
> > mutex_unlock(&event_mutex);
> > }
> > +
> > +void *ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
> > + unsigned long *irq_flags)
> > +{
> > + struct trace_entry *entry;
> > + char *trace_buf, *raw_data;
> > + int pc, cpu;
> > +
> > + pc = preempt_count();
> > +
> > + /* Protect the per cpu buffer, begin the rcu read side */
> > + local_irq_save(*irq_flags);
> > +
> > + *rctxp = perf_swevent_get_recursion_context();
> > + if (*rctxp < 0)
> > + goto err_recursion;
> > +
> > + cpu = smp_processor_id();
> > +
> > + if (in_nmi())
> > + trace_buf = rcu_dereference(perf_trace_buf_nmi);
> > + else
> > + trace_buf = rcu_dereference(perf_trace_buf);
> > +
> > + if (!trace_buf)
> > + goto err;
> > +
> > + raw_data = per_cpu_ptr(trace_buf, cpu);
> > +
> > + /* zero the dead bytes from align to not leak stack to user */
> > + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
> > +
> > + entry = (struct trace_entry *)raw_data;
> > + tracing_generic_entry_update(entry, *irq_flags, pc);
> > + entry->type = type;
> > +
> > + return raw_data;
> > +err:
> > + perf_swevent_put_recursion_context(*rctxp);
> > +err_recursion:
> > + local_irq_restore(*irq_flags);
> > + return NULL;
> > +}
> > +
> > +void ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
> > + u64 count, unsigned long irq_flags)
> > +{
> > + struct trace_entry *entry = raw_data;
> > +
> > + perf_tp_event(entry->type, addr, count, raw_data, size);
> > + perf_swevent_put_recursion_context(rctx);
> > + local_irq_restore(irq_flags);
> > +}
>
> Hmm, could you make these inline functions or add __kprobes?
> Because they are called from kprobes, we don't want to probe
> a function that is itself called from kprobes handlers.
>
> (IMHO, from the viewpoint of performance, inline-function
> could be better.)
>
> Thank you,



Yeah, maybe inlining ftrace_profile_buf_end would be better.
But we shouldn't inline ftrace_profile_buf_begin() I guess,
considering its size.

While at it, maybe let's choose more verbose names
like

ftrace_profile_buf_fill() and ftrace_profile_buf_submit().

Also, profile is a bit of a misnomer. Not a problem since
ftrace_profile_templ_##call() is already a misnomer, but
we should start a bit of a rename. Sometimes, perf only
profiles trace events as counters and sometimes it records
the raw samples too.

So, as more generic names, I would suggest:

ftrace_perf_buf_fill() and ftrace_perf_buf_submit().

At least it's unambiguous, I hope...

2010-01-18 17:49:57

by Masami Hiramatsu

Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

Frederic Weisbecker wrote:
> On Mon, Jan 18, 2010 at 11:21:46AM -0500, Masami Hiramatsu wrote:
>> Hmm, could you make these inline functions or add __kprobes?
>> Because they are called from kprobes, we don't want to probe
>> a function that is itself called from kprobes handlers.
>>
>> (IMHO, from the viewpoint of performance, inline-function
>> could be better.)
>>
>> Thank you,
>
>
>
> Yeah, maybe inlining ftrace_profile_buf_end would be better.
> But we shouldn't inline ftrace_profile_buf_begin() I guess,
> considering its size.

Indeed, especially for events...

> While at it, maybe let's choose more verbose names
> like
>
> ftrace_profile_buf_fill() and ftrace_profile_buf_submit().
>
> Also, profile is a bit of a misnomer. Not a problem since
> ftrace_profile_templ_##call() is already a misnomer, but
> we should start a bit of a rename. Sometimes, perf only
> profiles trace events as counters and sometimes it records
> the raw samples too.
>
> So, as more generic names, I would suggest:
>
> ftrace_perf_buf_fill() and ftrace_perf_buf_submit().


The actual buffer filling is done in the profile handlers,
so I think ftrace_perf_buf_prepare() may be better :-)
ftrace_perf_buf_submit() is good to me :-)

Thank you,


--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2010-01-18 18:02:55

by Frederic Weisbecker

Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

On Mon, Jan 18, 2010 at 12:48:48PM -0500, Masami Hiramatsu wrote:
> > So, as more generic names, I would suggest:
> >
> > ftrace_perf_buf_fill() and ftrace_perf_buf_submit().
>
>
> The actual buffer filling is done in the profile handlers,
> so I think ftrace_perf_buf_prepare() may be better :-)
> ftrace_perf_buf_submit() is good to me :-)


Oh you're right :)

2010-01-19 01:21:50

by Xiao Guangrong

Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()



Frederic Weisbecker wrote:
> On Mon, Jan 18, 2010 at 09:42:34PM +0800, Xiao Guangrong wrote:
>> It only disables preemption in perf_swevent_get_recursion_context(),
>> so it can't avoid the race between hard-irq and NMI.
>>
>> In this patch, we use an atomic operation to avoid the race and shrink
>> cpu_ctx->recursion; it also means this path no longer needs to disable
>> preemption.
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>
>
>
> I don't understand what is racy in what we have currently.
>

It's because hard-irqs (an interrupt can be handled with interrupts
enabled) and NMIs can nest, for example:

int perf_swevent_get_recursion_context(void)
{
......
if (cpuctx->recursion[rctx]) {
put_cpu_var(perf_cpu_context);
return -1;
}

/*
* Another interrupt handler/NMI will re-enter here if it
* happens, making the recursion value chaotic
*/
cpuctx->recursion[rctx]++;
......
}

>
>
>> int perf_swevent_get_recursion_context(void)
>> {
>> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
>> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
>> int rctx;
>>
>> if (in_nmi())
>> @@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
>> else
>> rctx = 0;
>>
>> - if (cpuctx->recursion[rctx]) {
>> - put_cpu_var(perf_cpu_context);
>> + if (test_and_set_bit(rctx, &cpuctx->recursion))
>> return -1;
>
>
>
> This looks broken. We don't call back perf_swevent_put_recursion_context
> in fail case, so the bit won't ever be cleared once we recurse.
>

Um, I think we can't clear the bit in this fail case; consider the
sequence below:

path A:                                 path B:

atomic set bit
      |
      |                                 set bit but find the bit already set
      V                                      |
handle SW event                              |
      |                                      V
      V                                 exit and not clear the bit
atomic clear bit

After A and B, the bit is still zero
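
In terms of the actual bit operations in the patch, the interleaving I
mean is (just a sketch, not real code):

/*
 *   path A                            path B (re-enters with the same rctx)
 *
 *   test_and_set_bit(rctx) == 0
 *     -> proceeds, owns the bit
 *                                     test_and_set_bit(rctx) == 1
 *                                       -> returns -1 and bails without
 *                                          touching the bit (it still
 *                                          belongs to A)
 *   ... handle the software event ...
 *   clear_bit(rctx)
 *     -> the bit is zero again
 */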

Right? :-)

Thanks,
Xiao

2010-01-19 01:28:23

by Xiao Guangrong

Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation



Masami Hiramatsu wrote:
> Frederic Weisbecker wrote:
>> On Mon, Jan 18, 2010 at 11:21:46AM -0500, Masami Hiramatsu wrote:
>>> Hmm, could you make these inline functions or add __kprobes?
>>> Because they are called from kprobes, we don't want to probe
>>> a function that is itself called from kprobes handlers.
>>>
>>> (IMHO, from the viewpoint of performance, inline-function
>>> could be better.)
>>>
>>> Thank you,
>>
>>
>> Yeah, maybe inlining ftrace_profile_buf_end would be better.
>> But we shouldn't inline ftrace_profile_buf_begin() I guess,
>> considering its size.
>
> Indeed, especially for events...
>

Thanks Masami and Frederic, I'll fix it in the next version.

>> While at it, maybe let's choose more verbose names
>> like
>>
>> ftrace_profile_buf_fill() and ftrace_profile_buf_submit().
>>
>> Also, profile is a bit of a misnomer. Not a problem since
>> ftrace_profile_templ_##call() is already a misnomer, but
>> we should start a bit of a rename. Sometimes, perf only
>> profiles trace events as counters and sometimes it records
>> the raw samples too.
>>
>> So, as more generic names, I would suggest:
>>
>> ftrace_perf_buf_fill() and ftrace_perf_buf_submit().
>
>
> The actual buffer filling is done in the profile handlers,
> so I think ftrace_perf_buf_prepare() may be better :-)
> ftrace_perf_buf_submit() is good to me :-)
>

OK, I'll rename those functions and add __kprobes for ftrace_perf_buf_prepare();
I guess I can add your Acked-by :-)

Thanks,
Xiao

2010-01-19 07:39:00

by Xiao Guangrong

Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

Hi Peter,

Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 21:42 +0800, Xiao Guangrong wrote:
>> It only disables preemption in perf_swevent_get_recursion_context(),
>> so it can't avoid the race between hard-irq and NMI.
>>
>> In this patch, we use an atomic operation to avoid the race and shrink
>> cpu_ctx->recursion; it also means this path no longer needs to disable
>> preemption.
>
> Uhm why?
>
> This patch looks terminally broken

Please see my explanation in another mail:
http://lkml.org/lkml/2010/1/18/501

Thanks,
Xiao

2010-01-19 08:40:06

by Xiao Guangrong

Subject: [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context()

It only disables preemption in perf_swevent_get_recursion_context(),
so it can't avoid the race between hard-irq and NMI: since they can
nest, they may re-enter this path and make the recursion value chaotic.

In this patch, we use an atomic operation to avoid the race and shrink
cpu_ctx->recursion; it also means this path no longer needs to disable
preemption.

Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/perf_event.h | 2 +-
kernel/perf_event.c | 14 ++++----------
2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c6f812e..8474ab0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -730,7 +730,7 @@ struct perf_cpu_context {
*
* task, softirq, irq, nmi context
*/
- int recursion[4];
+ unsigned long recursion;
};

struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eae6ff6..77ef16e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3921,7 +3921,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,

int perf_swevent_get_recursion_context(void)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
int rctx;

if (in_nmi())
@@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
else
rctx = 0;

- if (cpuctx->recursion[rctx]) {
- put_cpu_var(perf_cpu_context);
+ if (test_and_set_bit(rctx, &cpuctx->recursion))
return -1;
- }
-
- cpuctx->recursion[rctx]++;
- barrier();

return rctx;
}
@@ -3948,9 +3943,8 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
void perf_swevent_put_recursion_context(int rctx)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- barrier();
- cpuctx->recursion[rctx]--;
- put_cpu_var(perf_cpu_context);
+
+ clear_bit(rctx, &cpuctx->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);

--
1.6.1.2

2010-01-19 08:41:15

by Peter Zijlstra

Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

On Tue, 2010-01-19 at 15:36 +0800, Xiao Guangrong wrote:
> Hi Peter,
>
> Peter Zijlstra wrote:
> > On Mon, 2010-01-18 at 21:42 +0800, Xiao Guangrong wrote:
> >> It only disables preemption in perf_swevent_get_recursion_context(),
> >> so it can't avoid the race between hard-irq and NMI.
> >>
> >> In this patch, we use an atomic operation to avoid the race and shrink
> >> cpu_ctx->recursion; it also means this path no longer needs to disable
> >> preemption.
> >
> > Uhm why?
> >
> > This patch looks terminally broken
>
> Please see my explanation in another mail:
> http://lkml.org/lkml/2010/1/18/501

Still, it's not going to happen; we need those 4 recursion contexts.
Otherwise we could not receive a software event from an IRQ while we are
processing a software event from process context, etc.
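
Roughly, the idea is this (a userspace sketch for illustration only, not
the kernel code):

enum { RCTX_TASK, RCTX_SOFTIRQ, RCTX_IRQ, RCTX_NMI, RCTX_MAX };

static int recursion[RCTX_MAX];

/* one slot per context level: an event raised from IRQ context is not
 * rejected just because one is already in flight in task context, while
 * a second event in the *same* context is detected and dropped.
 */
static int get_recursion_context(int rctx)
{
	if (recursion[rctx])
		return -1;		/* same context re-entered: bail */
	recursion[rctx]++;
	return rctx;
}

static void put_recursion_context(int rctx)
{
	recursion[rctx]--;
}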

2010-01-19 08:41:45

by Xiao Guangrong

Subject: [PATCH 2/3 v2] perf_event: cleanup for event profile buffer operation

Introduce ftrace_perf_buf_prepare() and ftrace_perf_buf_submit() to
operate on the event profile buffer and clean up the redundant code.

Changelog v1->v2:
- Rename the functions to address Masami's and Frederic's suggestion
- Add __kprobes for ftrace_perf_buf_prepare() and make
ftrace_perf_buf_submit() inline, to address Masami's suggestion
- Export ftrace_perf_buf_prepare() since modules will use it

Acked-by: Frederic Weisbecker <[email protected]>
Acked-by: Masami Hiramatsu <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/ftrace_event.h | 18 ++++++-
include/trace/ftrace.h | 48 +++-----------------
kernel/trace/trace_event_profile.c | 52 +++++++++++++++++++--
kernel/trace/trace_kprobe.c | 86 ++++-------------------------------
kernel/trace/trace_syscalls.c | 71 ++++-------------------------
5 files changed, 88 insertions(+), 187 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 3ca9485..6b7c444 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -5,6 +5,7 @@
#include <linux/trace_seq.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/perf_event.h>

struct trace_array;
struct tracer;
@@ -137,9 +138,6 @@ struct ftrace_event_call {

#define FTRACE_MAX_PROFILE_SIZE 2048

-extern char *perf_trace_buf;
-extern char *perf_trace_buf_nmi;
-
#define MAX_FILTER_PRED 32
#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */

@@ -194,6 +192,20 @@ extern void ftrace_profile_disable(int event_id);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
+extern void *
+ftrace_perf_buf_prepare(int size, unsigned short type, int *rctxp,
+ unsigned long *irq_flags);
+
+static inline void
+ftrace_perf_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+ u64 count, unsigned long irq_flags)
+{
+ struct trace_entry *entry = raw_data;
+
+ perf_tp_event(entry->type, addr, count, raw_data, size);
+ perf_swevent_put_recursion_context(rctx);
+ local_irq_restore(irq_flags);
+}
#endif

#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5b46cf9..fb2c5bd 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -755,22 +755,12 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
proto) \
{ \
struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
- extern int perf_swevent_get_recursion_context(void); \
- extern void perf_swevent_put_recursion_context(int rctx); \
- extern void perf_tp_event(int, u64, u64, void *, int); \
struct ftrace_raw_##call *entry; \
u64 __addr = 0, __count = 1; \
unsigned long irq_flags; \
- struct trace_entry *ent; \
int __entry_size; \
int __data_size; \
- char *trace_buf; \
- char *raw_data; \
- int __cpu; \
int rctx; \
- int pc; \
- \
- pc = preempt_count(); \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
@@ -780,42 +770,16 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \
"profile buffer not large enough")) \
return; \
- \
- local_irq_save(irq_flags); \
- \
- rctx = perf_swevent_get_recursion_context(); \
- if (rctx < 0) \
- goto end_recursion; \
- \
- __cpu = smp_processor_id(); \
- \
- if (in_nmi()) \
- trace_buf = rcu_dereference(perf_trace_buf_nmi); \
- else \
- trace_buf = rcu_dereference(perf_trace_buf); \
- \
- if (!trace_buf) \
- goto end; \
- \
- raw_data = per_cpu_ptr(trace_buf, __cpu); \
- \
- *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
- entry = (struct ftrace_raw_##call *)raw_data; \
- ent = &entry->ent; \
- tracing_generic_entry_update(ent, irq_flags, pc); \
- ent->type = event_call->id; \
- \
+ entry = (struct ftrace_raw_##call *)ftrace_perf_buf_prepare( \
+ __entry_size, event_call->id, &rctx, &irq_flags); \
+ if (!entry) \
+ return; \
tstruct \
\
{ assign; } \
\
- perf_tp_event(event_call->id, __addr, __count, entry, \
- __entry_size); \
- \
-end: \
- perf_swevent_put_recursion_context(rctx); \
-end_recursion: \
- local_irq_restore(irq_flags); \
+ ftrace_perf_buf_submit(entry, __entry_size, rctx, __addr, \
+ __count, irq_flags); \
}

#undef DEFINE_EVENT
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573..f0d6930 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
*/

#include <linux/module.h>
+#include <linux/kprobes.h>
#include "trace.h"


-char *perf_trace_buf;
-EXPORT_SYMBOL_GPL(perf_trace_buf);
-
-char *perf_trace_buf_nmi;
-EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;

typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;

@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
}
mutex_unlock(&event_mutex);
}
+
+__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
+ int *rctxp, unsigned long *irq_flags)
+{
+ struct trace_entry *entry;
+ char *trace_buf, *raw_data;
+ int pc, cpu;
+
+ pc = preempt_count();
+
+ /* Protect the per cpu buffer, begin the rcu read side */
+ local_irq_save(*irq_flags);
+
+ *rctxp = perf_swevent_get_recursion_context();
+ if (*rctxp < 0)
+ goto err_recursion;
+
+ cpu = smp_processor_id();
+
+ if (in_nmi())
+ trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ else
+ trace_buf = rcu_dereference(perf_trace_buf);
+
+ if (!trace_buf)
+ goto err;
+
+ raw_data = per_cpu_ptr(trace_buf, cpu);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ entry = (struct trace_entry *)raw_data;
+ tracing_generic_entry_update(entry, *irq_flags, pc);
+ entry->type = type;
+
+ return raw_data;
+err:
+ perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+ local_irq_restore(*irq_flags);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aa19b6a..2714d23 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1223,14 +1223,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;

- pc = preempt_count();
__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1238,45 +1234,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
"profile buffer not large enough"))
return 0;

- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;

- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ip, 1, entry, size);

-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);

return 0;
}
@@ -1288,14 +1255,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;

- pc = preempt_count();
__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1303,46 +1266,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
"profile buffer not large enough"))
return 0;

- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kretprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;

- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ret_ip, 1, entry, size);

-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);

return 0;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f6b0712..6cce6a8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
unsigned long flags;
- char *trace_buf;
- char *raw_data;
int syscall_nr;
int rctx;
int size;
- int cpu;

syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
"profile buffer not large enough"))
return;

- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+ sys_data->enter_event->id, &rctx, &flags);
+ if (!rec)
+ return;

- rec = (struct syscall_trace_enter *) raw_data;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->enter_event->id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
}

int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
struct syscall_trace_exit *rec;
unsigned long flags;
int syscall_nr;
- char *trace_buf;
- char *raw_data;
int rctx;
int size;
- int cpu;

syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
"exit event has grown above profile buffer size"))
return;

- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-
- rec = (struct syscall_trace_exit *)raw_data;
+ rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+ sys_data->exit_event->id, &rctx, &flags);
+ if (!rec)
+ return;

- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->exit_event->id;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);

- perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
}

int prof_sysexit_enable(struct ftrace_event_call *call)
--
1.6.1.2

2010-01-19 08:43:31

by Xiao Guangrong

Subject: [PATCH 3/3 v2] tracing/kprobe: cleanup unused return value of function

Those functions' return values are meaningless.

Acked-by: Masami Hiramatsu <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
kernel/trace/trace_kprobe.c | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2714d23..de2cf85 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
};

/* Kprobe handler */
-static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct kprobe_trace_entry *entry;
@@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;

entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -972,11 +972,11 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)

if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
- return 0;
+ return ;
}

/* Kretprobe handler */
-static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -995,7 +995,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;

entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -1007,7 +1007,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);

- return 0;
+ return ;
}

/* Event entry printers */
@@ -1217,7 +1217,7 @@ static int set_print_fmt(struct trace_probe *tp)
#ifdef CONFIG_PERF_EVENTS

/* Kprobe profile handler */
-static __kprobes int kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_profile_func(struct kprobe *kp,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1232,11 +1232,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;

entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;

entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
@@ -1245,11 +1245,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,

ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);

- return 0;
+ return ;
}

/* Kretprobe profile handler */
-static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1264,11 +1264,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;

entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;

entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
@@ -1278,7 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,

ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);

- return 0;
+ return ;
}

static int probe_profile_enable(struct ftrace_event_call *call)
--
1.6.1.2
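
For context on why the return values were meaningless: these handlers are
only called from the kprobe/kretprobe dispatchers in the same file, which
ignore the result and always return 0 to the kprobes core themselves. A rough
sketch from memory (the flag names are an assumption and may differ):

static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);

	if (tp->flags & TP_FLAG_TRACE)
		kprobe_trace_func(kp, regs);	/* now void */
#ifdef CONFIG_PERF_EVENTS
	if (tp->flags & TP_FLAG_PROFILE)
		kprobe_profile_func(kp, regs);	/* now void */
#endif
	return 0;	/* this is the only value the kprobes core sees */
}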

2010-01-19 08:46:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

On Tue, 2010-01-19 at 09:19 +0800, Xiao Guangrong wrote:
>
>
> It's because hard-irq(we can handle interruption with interruption enabled)
> and NMI are nested, for example:
>
> int perf_swevent_get_recursion_context(void)
> {
> ......
> if (cpuctx->recursion[rctx]) {
> put_cpu_var(perf_cpu_context);
> return -1;
> }
>
> /*
> * Another interruption handler/NMI will re-enter there if it
> * happed, it make the recursion value chaotic
> */
> cpuctx->recursion[rctx]++;
> ......
> }

This doesn't make any sense at all. If IRQs are nested (bad bad bad),
then the nested one will still end up with the same recursion context,
detect the recursion, and bail out. So what's the problem?

NMIs are not allowed to nest, ever.

2010-01-19 08:46:53

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context()

On Tue, 2010-01-19 at 16:37 +0800, Xiao Guangrong wrote:
> It only disable preemption in perf_swevent_get_recursion_context(),
> it can't avoid race of hard-irq and NMI since they are nested that
> will re-enter this path and make the recursion value chaotic
>
> In this patch, we use atomic operation to avoid it and reduce
> cpu_ctx->recursion size, it also make this patch no need disable
> preemption

NAK

2010-01-19 08:58:21

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()

On Tue, Jan 19, 2010 at 09:19:43AM +0800, Xiao Guangrong wrote:
>
>
> Frederic Weisbecker wrote:
> > On Mon, Jan 18, 2010 at 09:42:34PM +0800, Xiao Guangrong wrote:
> >> It only disable preemption in perf_swevent_get_recursion_context()
> >> it can't avoid race of hard-irq and NMI
> >>
> >> In this patch, we use atomic operation to avoid it and reduce
> >> cpu_ctx->recursion size, it also make this patch no need diable
> >> preemption
> >>
> >> Signed-off-by: Xiao Guangrong <[email protected]>
> >
> >
> >
> > I don't understand what is racy in what we have currently.
> >
>
> It's because hard-irq(we can handle interruption with interruption enabled)
> and NMI are nested, for example:
>
> int perf_swevent_get_recursion_context(void)
> {
> ......
> if (cpuctx->recursion[rctx]) {
> put_cpu_var(perf_cpu_context);
> return -1;
> }
>
> /*
> * Another interruption handler/NMI will re-enter there if it
> * happed, it make the recursion value chaotic
> */
> cpuctx->recursion[rctx]++;
> ......




I still don't understand the problem.

It's not a fight between different cpus; it's a local per-cpu fight.

NMIs can't nest inside other NMIs, but a hardirq can nest inside another
hardirq; we don't care much about those, though.
So let's imagine the following sequence, a fight between nested
hardirqs:

cpuctx->recursion[irq] initially = 0

Interrupt (level 0):

if (cpuctx->recursion[rctx]) {
put_cpu_var(perf_cpu_context);
return -1;
}

Interrupt (level 1):


cpuctx->recursion[rctx]++; // = 1

...
do something
...
cpuctx->recursion[rctx]--; // = 0

End Interrupt (level 1)

cpuctx->recursion[rctx]++; // = 1

...
do something
...
cpuctx->recursion[rctx]--; // = 0

End interrupt (level 0)

Another sequence could be: interrupt level 0 has already incremented
the recursion counter and we are interrupted by irq level 1, which then
won't be able to get the recursion context. But that's not a big deal,
I think.
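
In code form, the same guarantee, as a simplified sketch of the existing
get/put pair (hypothetical helper names; the fields follow the quoted code):

/*
 * Simplified sketch (hypothetical helper names; cpuctx->recursion[]
 * is the per-cpu array from the quoted code).  Because hardirqs on one
 * cpu strictly nest, a handler that interrupts us anywhere in here runs
 * its own ++/-- pair to completion before we resume, so the value we
 * left behind is exactly the value we find again.
 */
static int get_recursion(struct perf_cpu_context *cpuctx, int rctx)
{
	if (cpuctx->recursion[rctx])	/* an outer level already owns it */
		return -1;

	cpuctx->recursion[rctx]++;	/* this level owns it now */
	barrier();
	return rctx;
}

static void put_recursion(struct perf_cpu_context *cpuctx, int rctx)
{
	barrier();
	cpuctx->recursion[rctx]--;	/* balanced with the increment above */
}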


> > This looks broken. We don't call back perf_swevent_put_recursion_context
> > in fail case, so the bit won't ever be cleared once we recurse.
> >
>
> Um, i think we can't clear the bit in this fail case, consider below
> sequence:
>
> path A: path B
>
> set bit but find the bit already set
> atomic set bit |
> | |
> V |
> handle SW event |
> | V
> V exit and not clear the bit
> atomic clear bit
>
> After A and B, the bit is still zero
>
> Right? :-)


Ah indeed, it will be cleared by the interrupted path.
I still don't understand what this patch brings us though.
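
The rule that makes the quoted interleaving come out right: only the path
whose atomic set succeeded may clear the bit, and a nested path that finds it
already set bails out without touching it. A generic sketch with hypothetical
helper names:

static int enter_context(unsigned long *recursion, int rctx)
{
	if (test_and_set_bit(rctx, recursion))
		return -1;	/* an outer path owns the bit: leave it alone */
	return rctx;		/* we own it; our matching exit clears it */
}

static void exit_context(unsigned long *recursion, int rctx)
{
	clear_bit(rctx, recursion);	/* only the owner ever gets here */
}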

2010-01-19 09:00:12

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

On Tue, Jan 19, 2010 at 09:26:10AM +0800, Xiao Guangrong wrote:
> > Actual filling buffer is done in the profile handlers,
> > so I think ftrace_perf_buf_prepare() may be better :-)
> > ftrace_perf_buf_submit is good to me:-)
> >
>
> OK, i'll rename those functions and add __kprobe for ftrace_perf_buf_prepare(),
> i guess i can add you Acked-by :-)


You can add mine yeah (couldn't tell for Masami).

2010-01-19 09:09:01

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context()



Peter Zijlstra wrote:
> On Tue, 2010-01-19 at 16:37 +0800, Xiao Guangrong wrote:
>> It only disable preemption in perf_swevent_get_recursion_context(),
>> it can't avoid race of hard-irq and NMI since they are nested that
>> will re-enter this path and make the recursion value chaotic
>>
>> In this patch, we use atomic operation to avoid it and reduce
>> cpu_ctx->recursion size, it also make this patch no need disable
>> preemption
>
> NAK
>

Sorry, I forgot that nested hard-irqs run sequentially and do not pollute
the recursion value; thanks for pointing that out.

Hi Ingo,

Please ignore this patch; it does not affect the other 2 patches in this
patchset.

Thanks,
Xiao

2010-01-19 09:11:48

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()



Frederic Weisbecker wrote:

>
> I still don't understand the problem.
>
> It's not like a fight between different cpus, it's a local per cpu
> fight.
>
> NMIs can't nest other NMIs but hardirq can nest another hardirqs,
> we don't care much about these though.
> So let's imagine the following sequence, a fight between nested
> hardirqs:
>
> cpuctx->recursion[irq] initially = 0
>
> Interrupt (level 0):
>
> if (cpuctx->recursion[rctx]) {
> put_cpu_var(perf_cpu_context);
> return -1;
> }
>
> Interrupt (level 1):
>
>
> cpuctx->recursion[rctx]++; // = 1
>
> ...
> do something
> ...
> cpuctx->recursion[rctx]--; // = 0
>
> End Interrupt (level 1)
>
> cpuctx->recursion[rctx]++; // = 1
>
> ...
> do something
> ...
> cpuctx->recursion[rctx]--; // = 0
>
> End interrupt (level 0)
>
> Another sequence could be Interrupt level 0 has
> already incremented recursion and we are interrupted by
> irq level 1 which then won't be able to get the recursion
> context. But that's not a big deal I think.
>

Thanks Frederic, I forgot about this behaviour of nested hard-irqs :-(

Sorry for disturbing you all

- Xiao

2010-01-19 14:24:36

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation

Frederic Weisbecker wrote:
> On Tue, Jan 19, 2010 at 09:26:10AM +0800, Xiao Guangrong wrote:
>>> Actual filling buffer is done in the profile handlers,
>>> so I think ftrace_perf_buf_prepare() may be better :-)
>>> ftrace_perf_buf_submit is good to me:-)
>>>
>>
>> OK, i'll rename those functions and add __kprobe for ftrace_perf_buf_prepare(),
>> i guess i can add you Acked-by :-)
>
>
> You can add mine yeah (couldn't tell for Masami).

Sure, add mine too, because the other parts look good to me. :-)


--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]