Subject: [PATCH 00/11] tracing: bugfix and kprobe-based dynamic event updates

Hi,

Here is a series of updating kprobe-based dynamic event for
supporting soft-disabling mode and multiple buffers.
This also has some bugfixes of ftrace which I found
including an important deadlock fix.

I think uprobe-based events can also support multi-buffer
and soft-disabling as I did in this series.

---

Masami Hiramatsu (11):
[BUGFIX] tracing: Return 0 if event_enable_func succeeded
[BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock
ftrace: Cleanup regex_lock and ftrace_lock around hash updating
[BUGFIX] tracing/kprobes: Fix to increment return event probe hit-count
tracing: Indicate enabled soft-mode in enable file
[BUGFIX] tracing: Modify soft-mode only if no other referrer
[TRIVIAL] tracing/kprobes: Use bool for retprobe checker
tracing/kprobes: Increment probe hit-count even if it is used by perf
tracing/kprobes: Pass trace_probe directly from dispatcher
tracing/kprobes: Support ftrace_event_file base multibuffer
tracing/kprobes: Support soft-mode disabling


include/linux/ftrace.h | 1
include/linux/ftrace_event.h | 1
kernel/kprobes.c | 1
kernel/trace/ftrace.c | 82 +++++++-----
kernel/trace/trace_events.c | 19 ++-
kernel/trace/trace_kprobe.c | 289 ++++++++++++++++++++++++++++++++++--------
6 files changed, 305 insertions(+), 88 deletions(-)

--
Masami Hiramatsu <[email protected]>
IT Management Research Dept. and Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory


Subject: [PATCH 01/11] [BUGFIX] tracing: Return 0 if event_enable_func succeeded

Return 0 instead of the number of activated ftrace if
event_enable_func succeeded and return an error code if failed,
beacuse 0 is success code at caller (ftrace_regex_write).

Without this fix, writing enable_event trigger on set_ftrace_filter
always doesn't work, since event_enable_func returns 1 to
ftrace_regex_write, it consumes 1 byte and pass input string
without the first character again. This makes event_enable_func
fail and disables event entry.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
---
kernel/trace/trace_events.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53582e9..44ac836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2061,8 +2061,11 @@ event_enable_func(struct ftrace_hash *hash,
if (ret < 0)
goto out_put;
ret = register_ftrace_function_probe(glob, ops, data);
- if (!ret)
+ if (!ret) {
+ ret = -ENOENT;
goto out_disable;
+ } else
+ ret = 0;
out:
mutex_unlock(&event_mutex);
return ret;

Subject: [PATCH 06/11] [BUGFIX] tracing: Modify soft-mode only if no other referrer

Modify soft-mode flag only if no other soft-mode referrer
(currently only the ftrace triggers) by using a reference
counter in each ftrace_event_file.

Without this fix, adding and removing several different
enable/disable_event triggers on the same event clear
soft-mode bit from the ftrace_event_file. This also
happens with a typo of glob on setting triggers.

e.g.

# echo vfs_symlink:enable_event:net:netif_rx > set_ftrace_filter
# cat events/net/netif_rx/enable
0*
# echo typo_func:enable_event:net:netif_rx > set_ftrace_filter
# cat events/net/netif_rx/enable
0
# cat set_ftrace_filter
#### all functions enabled ####
vfs_symlink:enable_event:net:netif_rx:unlimited

As above, we still have a trigger, but soft-mode is gone.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: David Sharp <[email protected]>
Cc: Hiraku Toyooka <[email protected]>
Cc: Tom Zanussi <[email protected]>
---
include/linux/ftrace_event.h | 1 +
kernel/trace/trace_events.c | 12 ++++++++++--
2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 34e00fb..4372658 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -293,6 +293,7 @@ struct ftrace_event_file {
* caching and such. Which is mostly OK ;-)
*/
unsigned long flags;
+ atomic_t sm_ref; /* soft-mode reference counter */
};

#define __TRACE_EVENT_FLAGS(name, value) \
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cc3c130..deda9ae 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -251,7 +251,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
switch (enable) {
case 0:
/*
- * When soft_disable is set and enable is cleared, we want
+ * When soft_disable is set and enable is cleared, the sm_ref
+ * reference counter is decremented. If it reaches 0, we want
* to clear the SOFT_DISABLED flag but leave the event in the
* state that it was. That is, if the event was enabled and
* SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
@@ -263,6 +264,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
* "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
*/
if (soft_disable) {
+ if (atomic_dec_return(&file->sm_ref) > 0)
+ break;
disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
} else
@@ -291,8 +294,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
*/
if (!soft_disable)
clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
- else
+ else {
+ if (atomic_inc_return(&file->sm_ref) > 1)
+ break;
set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ }

if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {

@@ -1540,6 +1546,7 @@ __trace_add_new_event(struct ftrace_event_call *call,

file->event_call = call;
file->tr = tr;
+ atomic_set(&file->sm_ref, 0);
list_add(&file->list, &tr->events);

return event_create_dir(tr->event_dir, file, id, enable, filter, format);
@@ -1562,6 +1569,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,

file->event_call = call;
file->tr = tr;
+ atomic_set(&file->sm_ref, 0);
list_add(&file->list, &tr->events);

return 0;

Subject: [PATCH 07/11] [TRIVIAL] tracing/kprobes: Use bool for retprobe checker

Use bool instead of int for kretprobe checker.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
---
kernel/trace/trace_kprobe.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6928633..0b7386a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -46,7 +46,7 @@ struct trace_probe {
(sizeof(struct probe_arg) * (n)))


-static __kprobes int trace_probe_is_return(struct trace_probe *tp)
+static __kprobes bool trace_probe_is_return(struct trace_probe *tp)
{
return tp->rp.handler != NULL;
}

Subject: [PATCH 08/11] tracing/kprobes: Increment probe hit-count even if it is used by perf

Increment probe hit-count for profiling even if it is used
by perf tool. Same thing has already done in trace_uprobe.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
---
kernel/trace/trace_kprobe.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 0b7386a..6e86fbb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -733,8 +733,6 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;

- tp->nhit++;
-
local_save_flags(irq_flags);
pc = preempt_count();

@@ -767,8 +765,6 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;

- tp->nhit++;
-
local_save_flags(irq_flags);
pc = preempt_count();

@@ -1075,6 +1071,8 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);

+ tp->nhit++;
+
if (tp->flags & TP_FLAG_TRACE)
kprobe_trace_func(kp, regs);
#ifdef CONFIG_PERF_EVENTS
@@ -1089,6 +1087,8 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);

+ tp->nhit++;
+
if (tp->flags & TP_FLAG_TRACE)
kretprobe_trace_func(ri, regs);
#ifdef CONFIG_PERF_EVENTS

Subject: [PATCH 11/11] tracing/kprobes: Support soft-mode disabling

Support soft-mode disabling on kprobe-based dynamic events.
Soft-disabling is just ignoring recording if the soft disabled
flag is set.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
---
kernel/trace/trace_kprobe.c | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index fee865d..636d45f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -845,6 +845,9 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,

WARN_ON(call != ftrace_file->event_call);

+ if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();

@@ -893,6 +896,9 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,

WARN_ON(call != ftrace_file->event_call);

+ if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();

Subject: [PATCH 09/11] tracing/kprobes: Pass trace_probe directly from dispatcher

Pass the pointer of struct trace_probe directly from probe
dispatcher to handlers. This removes redundant container_of
macro uses. Same thing has already done in trace_uprobe.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
---
kernel/trace/trace_kprobe.c | 33 ++++++++++++++++-----------------
1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6e86fbb..9ca44fc 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -723,9 +723,9 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
}

/* Kprobe handler */
-static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void
+kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
@@ -745,7 +745,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
return;

entry = ring_buffer_event_data(event);
- entry->ip = (unsigned long)kp->addr;
+ entry->ip = (unsigned long)tp->rp.kp.addr;
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);

if (!filter_current_check_discard(buffer, call, entry, event))
@@ -754,10 +754,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
}

/* Kretprobe handler */
-static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+static __kprobes void
+kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
@@ -973,10 +973,9 @@ static int set_print_fmt(struct trace_probe *tp)
#ifdef CONFIG_PERF_EVENTS

/* Kprobe profile handler */
-static __kprobes void kprobe_perf_func(struct kprobe *kp,
- struct pt_regs *regs)
+static __kprobes void
+kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
@@ -995,7 +994,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
if (!entry)
return;

- entry->ip = (unsigned long)kp->addr;
+ entry->ip = (unsigned long)tp->rp.kp.addr;
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);

@@ -1005,10 +1004,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
}

/* Kretprobe profile handler */
-static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+static __kprobes void
+kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
@@ -1074,10 +1073,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
tp->nhit++;

if (tp->flags & TP_FLAG_TRACE)
- kprobe_trace_func(kp, regs);
+ kprobe_trace_func(tp, regs);
#ifdef CONFIG_PERF_EVENTS
if (tp->flags & TP_FLAG_PROFILE)
- kprobe_perf_func(kp, regs);
+ kprobe_perf_func(tp, regs);
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
@@ -1090,10 +1089,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
tp->nhit++;

if (tp->flags & TP_FLAG_TRACE)
- kretprobe_trace_func(ri, regs);
+ kretprobe_trace_func(tp, ri, regs);
#ifdef CONFIG_PERF_EVENTS
if (tp->flags & TP_FLAG_PROFILE)
- kretprobe_perf_func(ri, regs);
+ kretprobe_perf_func(tp, ri, regs);
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}

Subject: [PATCH 10/11] tracing/kprobes: Support ftrace_event_file base multibuffer

Support multi-buffer on kprobe-based dynamic events by
using ftrace_event_file.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
---
kernel/trace/trace_kprobe.c | 250 +++++++++++++++++++++++++++++++++++++------
1 file changed, 214 insertions(+), 36 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9ca44fc..fee865d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -27,7 +27,6 @@
/**
* Kprobe event core functions
*/
-
struct trace_probe {
struct list_head list;
struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -36,6 +35,7 @@ struct trace_probe {
const char *symbol; /* symbol name */
struct ftrace_event_class class;
struct ftrace_event_call call;
+ struct ftrace_event_file **files;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
@@ -183,12 +183,57 @@ static struct trace_probe *find_trace_probe(const char *event,
return NULL;
}

-/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
-static int enable_trace_probe(struct trace_probe *tp, int flag)
+static int trace_probe_nr_files(struct trace_probe *tp)
+{
+ struct ftrace_event_file **file = tp->files;
+ int ret = 0;
+
+ if (file)
+ while (*(file++))
+ ret++;
+
+ return ret;
+}
+
+static DEFINE_MUTEX(probe_enable_lock);
+
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int
+enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
{
int ret = 0;

- tp->flags |= flag;
+ mutex_lock(&probe_enable_lock);
+
+ if (file) {
+ struct ftrace_event_file **new, **old = tp->files;
+ int n = trace_probe_nr_files(tp);
+
+ /* 1 is for new one and 1 is for stopper */
+ new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
+ GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ memcpy(new, old, n * sizeof(struct ftrace_event_file *));
+ new[n] = file;
+ /* The last one keeps a NULL */
+
+ rcu_assign_pointer(tp->files, new);
+ tp->flags |= TP_FLAG_TRACE;
+
+ if (old) {
+ /* Make sure the probe is done with old files */
+ synchronize_sched();
+ kfree(old);
+ }
+ } else
+ tp->flags |= TP_FLAG_PROFILE;
+
if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
!trace_probe_has_gone(tp)) {
if (trace_probe_is_return(tp))
@@ -197,19 +242,83 @@ static int enable_trace_probe(struct trace_probe *tp, int flag)
ret = enable_kprobe(&tp->rp.kp);
}

+ out_unlock:
+ mutex_unlock(&probe_enable_lock);
+
return ret;
}

-/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
-static void disable_trace_probe(struct trace_probe *tp, int flag)
+static int
+trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
+{
+ int i;
+
+ if (tp->files) {
+ for (i = 0; tp->files[i]; i++)
+ if (tp->files[i] == file)
+ return i;
+ }
+
+ return -1;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int
+disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
{
- tp->flags &= ~flag;
+ int ret = 0;
+
+ mutex_lock(&probe_enable_lock);
+
+ if (file) {
+ struct ftrace_event_file **new, **old = tp->files;
+ int n = trace_probe_nr_files(tp);
+ int i, j;
+
+ if (n == 0 || trace_probe_file_index(tp, file) < 0) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (n == 1) { /* Remove the last file */
+ tp->flags &= ~TP_FLAG_TRACE;
+ new = NULL;
+ } else {
+ new = kzalloc(n * sizeof(struct ftrace_event_file *),
+ GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* This copy & check loop copies the NULL stopper too */
+ for (i = 0, j = 0; j < n && i < n + 1; i++)
+ if (old[i] != file)
+ new[j++] = old[i];
+ }
+
+ rcu_assign_pointer(tp->files, new);
+
+ /* Make sure the probe is done with old files */
+ synchronize_sched();
+ kfree(old);
+ } else
+ tp->flags &= ~TP_FLAG_PROFILE;
+
if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
if (trace_probe_is_return(tp))
disable_kretprobe(&tp->rp);
else
disable_kprobe(&tp->rp.kp);
}
+
+ out_unlock:
+ mutex_unlock(&probe_enable_lock);
+
+ return ret;
}

/* Internal register function - just handle k*probes and flags */
@@ -724,7 +833,8 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,

/* Kprobe handler */
static __kprobes void
-kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
+__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
+ struct ftrace_event_file *ftrace_file)
{
struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
@@ -733,14 +843,17 @@ kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;

+ WARN_ON(call != ftrace_file->event_call);
+
local_save_flags(irq_flags);
pc = preempt_count();

dsize = __get_data_size(tp, regs);
size = sizeof(*entry) + tp->size + dsize;

- event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
- size, irq_flags, pc);
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type,
+ size, irq_flags, pc);
if (!event)
return;

@@ -753,10 +866,23 @@ kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
irq_flags, pc, regs);
}

+static __kprobes void
+kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
+{
+ struct ftrace_event_file **file = tp->files;
+
+ /* Note: preempt is already disabled around the kprobe handler */
+ while (*file) {
+ __kprobe_trace_func(tp, regs, *file);
+ file++;
+ }
+}
+
/* Kretprobe handler */
static __kprobes void
-kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
- struct pt_regs *regs)
+__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+ struct pt_regs *regs,
+ struct ftrace_event_file *ftrace_file)
{
struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
@@ -765,14 +891,17 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;

+ WARN_ON(call != ftrace_file->event_call);
+
local_save_flags(irq_flags);
pc = preempt_count();

dsize = __get_data_size(tp, regs);
size = sizeof(*entry) + tp->size + dsize;

- event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
- size, irq_flags, pc);
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type,
+ size, irq_flags, pc);
if (!event)
return;

@@ -786,6 +915,19 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
irq_flags, pc, regs);
}

+static __kprobes void
+kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct ftrace_event_file **file = tp->files;
+
+ /* Note: preempt is already disabled around the kprobe handler */
+ while (*file) {
+ __kretprobe_trace_func(tp, ri, regs, *file);
+ file++;
+ }
+}
+
/* Event entry printers */
enum print_line_t
print_kprobe_event(struct trace_iterator *iter, int flags,
@@ -1041,20 +1183,19 @@ int kprobe_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
struct trace_probe *tp = (struct trace_probe *)event->data;
+ struct ftrace_event_file *file = data;

switch (type) {
case TRACE_REG_REGISTER:
- return enable_trace_probe(tp, TP_FLAG_TRACE);
+ return enable_trace_probe(tp, file);
case TRACE_REG_UNREGISTER:
- disable_trace_probe(tp, TP_FLAG_TRACE);
- return 0;
+ return disable_trace_probe(tp, file);

#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return enable_trace_probe(tp, TP_FLAG_PROFILE);
+ return enable_trace_probe(tp, NULL);
case TRACE_REG_PERF_UNREGISTER:
- disable_trace_probe(tp, TP_FLAG_PROFILE);
- return 0;
+ return disable_trace_probe(tp, NULL);
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
case TRACE_REG_PERF_ADD:
@@ -1190,11 +1331,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
return a1 + a2 + a3 + a4 + a5 + a6;
}

+static struct ftrace_event_file *
+find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list)
+ if (file->event_call == &tp->call)
+ return file;
+
+ return NULL;
+}
+
static __init int kprobe_trace_self_tests_init(void)
{
int ret, warn = 0;
int (*target)(int, int, int, int, int, int);
struct trace_probe *tp;
+ struct ftrace_event_file *file;

target = kprobe_trace_selftest_target;

@@ -1204,31 +1358,43 @@ static __init int kprobe_trace_self_tests_init(void)
"$stack $stack0 +0($stack)",
create_trace_probe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on probing function entry.\n");
+ pr_warn("error on probing function entry.\n");
warn++;
} else {
/* Enable trace point */
tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting new probe.\n");
+ pr_warn("error on getting new probe.\n");
warn++;
- } else
- enable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tp, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ enable_trace_probe(tp, file);
+ }
}

ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
"$retval", create_trace_probe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on probing function return.\n");
+ pr_warn("error on probing function return.\n");
warn++;
} else {
/* Enable trace point */
tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting new probe.\n");
+ pr_warn("error on getting 2nd new probe.\n");
warn++;
- } else
- enable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tp, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ enable_trace_probe(tp, file);
+ }
}

if (warn)
@@ -1239,27 +1405,39 @@ static __init int kprobe_trace_self_tests_init(void)
/* Disable trace points before removing it */
tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting test probe.\n");
+ pr_warn("error on getting test probe.\n");
warn++;
- } else
- disable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tp, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ disable_trace_probe(tp, file);
+ }

tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting 2nd test probe.\n");
+ pr_warn("error on getting 2nd test probe.\n");
warn++;
- } else
- disable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tp, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ disable_trace_probe(tp, file);
+ }

ret = traceprobe_command("-:testprobe", create_trace_probe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on deleting a probe.\n");
+ pr_warn("error on deleting a probe.\n");
warn++;
}

ret = traceprobe_command("-:testprobe2", create_trace_probe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on deleting a probe.\n");
+ pr_warn("error on deleting a probe.\n");
warn++;
}

Subject: [PATCH 05/11] tracing: Indicate enabled soft-mode in enable file

Indicate enabled soft-mode event as "1*" in "enable" file
for each event, because it can be soft-disabled when disable_event
trigger is hit.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
---
kernel/trace/trace_events.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 44ac836..cc3c130 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -623,6 +623,8 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
if (file->flags & FTRACE_EVENT_FL_ENABLED) {
if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
buf = "0*\n";
+ else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ buf = "1*\n";
else
buf = "1\n";
} else

Subject: [PATCH 03/11] ftrace: Cleanup regex_lock and ftrace_lock around hash updating

Cleanup regex_lock and ftrace_lock locking points around
ftrace_ops hash update code.

The new rule is that regex_lock protects ops->*_hash
read-update-write code for each ftrace_ops. Usually,
hash update is done by following sequence.

1. allocate a new local hash and copy the original hash.
2. update the local hash.
3. move(actually, copy) back the local hash to ftrace_ops.
4. update ftrace entries if needed.
5. release the local hash.

This makes regex_lock to protect #1-#4, and ftrace_lock
to protect #3 and #4.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
---
kernel/trace/ftrace.c | 59 +++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f29b3d..0575b3f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2644,28 +2644,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
return -ENOMEM;
}

+ iter->ops = ops;
+ iter->flags = flag;
+
+ mutex_lock(&ops->regex_lock);
+
if (flag & FTRACE_ITER_NOTRACE)
hash = ops->notrace_hash;
else
hash = ops->filter_hash;

- iter->ops = ops;
- iter->flags = flag;
-
if (file->f_mode & FMODE_WRITE) {
- mutex_lock(&ftrace_lock);
iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
- mutex_unlock(&ftrace_lock);
-
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_unlock;
}
}

- mutex_lock(&ops->regex_lock);
-
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
ftrace_filter_reset(iter->hash);
@@ -2685,6 +2683,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
} else
file->private_data = iter;
+
+ out_unlock:
mutex_unlock(&ops->regex_lock);

return ret;
@@ -2999,7 +2999,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
if (WARN_ON(not))
return -EINVAL;

- mutex_lock(&ftrace_lock);
+ mutex_lock(&trace_probe_ops.regex_lock);

hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash) {
@@ -3057,14 +3057,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,

} while_for_each_ftrace_rec();

+ mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
if (ret < 0)
count = ret;

__enable_ftrace_function_probe();
+ mutex_unlock(&ftrace_lock);

out_unlock:
- mutex_unlock(&ftrace_lock);
+ mutex_unlock(&trace_probe_ops.regex_lock);
free_ftrace_hash(hash);

return count;
@@ -3104,7 +3106,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
return;
}

- mutex_lock(&ftrace_lock);
+ mutex_lock(&trace_probe_ops.regex_lock);

hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
@@ -3142,6 +3144,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
list_add(&entry->free_list, &free_list);
}
}
+ mutex_lock(&ftrace_lock);
__disable_ftrace_function_probe();
/*
* Remove after the disable is called. Otherwise, if the last
@@ -3153,9 +3156,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
list_del(&entry->free_list);
ftrace_free_entry(entry);
}
+ mutex_unlock(&ftrace_lock);

out_unlock:
- mutex_unlock(&ftrace_lock);
+ mutex_unlock(&trace_probe_ops.regex_lock);
free_ftrace_hash(hash);
}

@@ -3271,11 +3275,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
} else
iter = file->private_data;

- mutex_lock(&iter->ops->regex_lock);
-
- ret = -ENODEV;
if (unlikely(ftrace_disabled))
- goto out_unlock;
+ return -ENODEV;
+
+ /* iter->hash is a local copy, so we don't need regex_lock */

parser = &iter->parser;
read = trace_get_user(parser, ubuf, cnt, ppos);
@@ -3286,13 +3289,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
parser->idx, enable);
trace_parser_clear(parser);
if (ret)
- goto out_unlock;
+ goto out;
}

ret = read;
-out_unlock:
- mutex_unlock(&iter->ops->regex_lock);
-
+ out:
return ret;
}

@@ -3344,16 +3345,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
if (unlikely(ftrace_disabled))
return -ENODEV;

+ mutex_lock(&ops->regex_lock);
+
if (enable)
orig_hash = &ops->filter_hash;
else
orig_hash = &ops->notrace_hash;

hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
- if (!hash)
- return -ENOMEM;
+ if (!hash) {
+ ret = -ENOMEM;
+ goto out_regex_unlock;
+ }

- mutex_lock(&ops->regex_lock);
if (reset)
ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
@@ -3566,8 +3570,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
} else
iter = file->private_data;

- mutex_lock(&iter->ops->regex_lock);
-
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
parser->buffer[parser->idx] = 0;
@@ -3576,6 +3578,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)

trace_parser_put(parser);

+ mutex_lock(&iter->ops->regex_lock);
+
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);

@@ -3593,10 +3597,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)

mutex_unlock(&ftrace_lock);
}
+
+ mutex_unlock(&iter->ops->regex_lock);
free_ftrace_hash(iter->hash);
kfree(iter);

- mutex_unlock(&iter->ops->regex_lock);
return 0;
}

Subject: [PATCH 04/11] [BUGFIX] tracing/kprobes: Fix to increment return event probe hit-count

Fix to increment probe hit-count for function return event.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
---
kernel/trace/trace_kprobe.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1865d5f..6928633 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -767,6 +767,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;

+ tp->nhit++;
+
local_save_flags(irq_flags);
pc = preempt_count();

Subject: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

Fix a deadlock on ftrace_regex_lock which happens when setting
an enable_event trigger on dynamic kprobe event as below.

----
sh-2.05b# echo p vfs_symlink > kprobe_events
sh-2.05b# echo vfs_symlink:enable_event:kprobes:p_vfs_symlink_0 > set_ftrace_filter

=============================================
[ INFO: possible recursive locking detected ]
3.9.0+ #35 Not tainted
---------------------------------------------
sh/72 is trying to acquire lock:
(ftrace_regex_lock){+.+.+.}, at: [<ffffffff810ba6c1>] ftrace_set_hash+0x81/0x1f0

but task is already holding lock:
(ftrace_regex_lock){+.+.+.}, at: [<ffffffff810b7cbd>] ftrace_regex_write.isra.29.part.30+0x3d/0x220

other info that might help us debug this:
Possible unsafe locking scenario:

CPU0
----
lock(ftrace_regex_lock);
lock(ftrace_regex_lock);

*** DEADLOCK ***
----

To fix that, this introduces a finer regex_lock for each ftrace_ops.
ftrace_regex_lock seems that a big lock which protect all
filter/notrace_hash operation, but it doesn't need to be a global
lock after supporting multiple ftrace_ops because each ftrace_ops
has its own filter/notrace_hash.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
---
include/linux/ftrace.h | 1 +
kernel/kprobes.c | 1 +
kernel/trace/ftrace.c | 43 +++++++++++++++++++++++++++----------------
3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f83e17a..4ba3a6e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -110,6 +110,7 @@ struct ftrace_ops {
#ifdef CONFIG_DYNAMIC_FTRACE
struct ftrace_hash *notrace_hash;
struct ftrace_hash *filter_hash;
+ struct mutex regex_lock;
#endif
};

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0..7f307e8 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -934,6 +934,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS,
+ .regex_lock = __MUTEX_INITIALIZER(kprobe_ftrace_ops.regex_lock),
};
static int kprobe_ftrace_enabled;

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8a5c017..3f29b3d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -64,9 +64,17 @@

#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)

+#ifdef CONFIG_DYNAMIC_FTRACE
+#define INIT_REGEX_LOCK(opsname) \
+ .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
+#else
+#define INIT_REGEX_LOCK(opsname)
+#endif
+
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ INIT_REGEX_LOCK(ftrace_list_end)
};

/* ftrace_enabled is a method to turn ftrace on or off */
@@ -908,6 +916,7 @@ static void unregister_ftrace_profiler(void)
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ INIT_REGEX_LOCK(ftrace_profile_ops)
};

static int register_ftrace_profiler(void)
@@ -1104,10 +1113,9 @@ static struct ftrace_ops global_ops = {
.notrace_hash = EMPTY_HASH,
.filter_hash = EMPTY_HASH,
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ INIT_REGEX_LOCK(global_ops)
};

-static DEFINE_MUTEX(ftrace_regex_lock);
-
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -2656,7 +2664,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
}

- mutex_lock(&ftrace_regex_lock);
+ mutex_lock(&ops->regex_lock);

if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
@@ -2677,7 +2685,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
} else
file->private_data = iter;
- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&ops->regex_lock);

return ret;
}
@@ -2910,6 +2918,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
+ INIT_REGEX_LOCK(trace_probe_ops)
};

static int ftrace_probe_registered;
@@ -3256,18 +3265,18 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
if (!cnt)
return 0;

- mutex_lock(&ftrace_regex_lock);
-
- ret = -ENODEV;
- if (unlikely(ftrace_disabled))
- goto out_unlock;
-
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
iter = m->private;
} else
iter = file->private_data;

+ mutex_lock(&iter->ops->regex_lock);
+
+ ret = -ENODEV;
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
+
parser = &iter->parser;
read = trace_get_user(parser, ubuf, cnt, ppos);

@@ -3282,7 +3291,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,

ret = read;
out_unlock:
- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&iter->ops->regex_lock);

return ret;
}
@@ -3344,7 +3353,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
if (!hash)
return -ENOMEM;

- mutex_lock(&ftrace_regex_lock);
+ mutex_lock(&ops->regex_lock);
if (reset)
ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
@@ -3366,7 +3375,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
mutex_unlock(&ftrace_lock);

out_regex_unlock:
- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&ops->regex_lock);

free_ftrace_hash(hash);
return ret;
@@ -3551,14 +3560,14 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
int filter_hash;
int ret;

- mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
iter = m->private;
-
seq_release(inode, file);
} else
iter = file->private_data;

+ mutex_lock(&iter->ops->regex_lock);
+
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
parser->buffer[parser->idx] = 0;
@@ -3587,7 +3596,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
free_ftrace_hash(iter->hash);
kfree(iter);

- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&iter->ops->regex_lock);
return 0;
}

@@ -4127,6 +4136,7 @@ void __init ftrace_init(void)
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ INIT_REGEX_LOCK(global_ops)
};

static int __init ftrace_nodyn_init(void)
@@ -4182,6 +4192,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops control_ops = {
.func = ftrace_ops_control_func,
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ INIT_REGEX_LOCK(control_ops)
};

static inline void

2013-05-09 14:31:48

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 01/11] [BUGFIX] tracing: Return 0 if event_enable_func succeeded

On Thu, 2013-05-09 at 14:44 +0900, Masami Hiramatsu wrote:
> Return 0 instead of the number of activated ftrace if
> event_enable_func succeeded and return an error code if failed,
> beacuse 0 is success code at caller (ftrace_regex_write).
>
> Without this fix, writing enable_event trigger on set_ftrace_filter
> always doesn't work, since event_enable_func returns 1 to
> ftrace_regex_write, it consumes 1 byte and pass input string
> without the first character again. This makes event_enable_func
> fail and disables event entry.
>
> Signed-off-by: Masami Hiramatsu <[email protected]>
> Cc: Steven Rostedt <[email protected]>
> Cc: Frederic Weisbecker <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Tom Zanussi <[email protected]>
> ---
> kernel/trace/trace_events.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 53582e9..44ac836 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -2061,8 +2061,11 @@ event_enable_func(struct ftrace_hash *hash,
> if (ret < 0)
> goto out_put;
> ret = register_ftrace_function_probe(glob, ops, data);
> - if (!ret)
> + if (!ret) {
> + ret = -ENOENT;
> goto out_disable;
> + } else
> + ret = 0;

I think you meant:

if (ret < 0)
goto out_disable;
ret = 0;

Otherwise, I don't see how you fixed anything, as you still return error
if ret is something other than zero.

Or am I missing something?

-- Steve

> out:
> mutex_unlock(&event_mutex);
> return ret;

2013-05-09 14:47:12

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

On Thu, 2013-05-09 at 14:44 +0900, Masami Hiramatsu wrote:
> Fix a deadlock on ftrace_regex_lock which happens when setting
> an enable_event trigger on dynamic kprobe event as below.
>
> ----
> sh-2.05b# echo p vfs_symlink > kprobe_events
> sh-2.05b# echo vfs_symlink:enable_event:kprobes:p_vfs_symlink_0 > set_ftrace_filter
>
> =============================================
> [ INFO: possible recursive locking detected ]
> 3.9.0+ #35 Not tainted
> ---------------------------------------------
> sh/72 is trying to acquire lock:
> (ftrace_regex_lock){+.+.+.}, at: [<ffffffff810ba6c1>] ftrace_set_hash+0x81/0x1f0
>
> but task is already holding lock:
> (ftrace_regex_lock){+.+.+.}, at: [<ffffffff810b7cbd>] ftrace_regex_write.isra.29.part.30+0x3d/0x220
>
> other info that might help us debug this:
> Possible unsafe locking scenario:
>
> CPU0
> ----
> lock(ftrace_regex_lock);
> lock(ftrace_regex_lock);

Ouch! I'm surprised I didn't trigger this in my tests. I have lockdep
enabled, and I did run kprobe testing. I'll have to look at how this was
missed :-/

>
> *** DEADLOCK ***
> ----
>
> To fix that, this introduces a finer regex_lock for each ftrace_ops.
> ftrace_regex_lock seems that a big lock which protect all
> filter/notrace_hash operation, but it doesn't need to be a global
> lock after supporting multiple ftrace_ops because each ftrace_ops
> has its own filter/notrace_hash.

OK, I'll test this patch out and see how it goes. I first need to see
how it broke without this patch.

Thanks,

-- Steve

2013-05-09 15:11:50

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 01/11] [BUGFIX] tracing: Return 0 if event_enable_func succeeded

On Thu, 2013-05-09 at 10:31 -0400, Steven Rostedt wrote:

> > diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> > index 53582e9..44ac836 100644
> > --- a/kernel/trace/trace_events.c
> > +++ b/kernel/trace/trace_events.c
> > @@ -2061,8 +2061,11 @@ event_enable_func(struct ftrace_hash *hash,
> > if (ret < 0)
> > goto out_put;
> > ret = register_ftrace_function_probe(glob, ops, data);
> > - if (!ret)
> > + if (!ret) {
> > + ret = -ENOENT;
> > goto out_disable;
> > + } else
> > + ret = 0;
>
> I think you meant:
>
> if (ret < 0)
> goto out_disable;
> ret = 0;
>
> Otherwise, I don't see how you fixed anything, as you still return error
> if ret is something other than zero.
>
> Or am I missing something?

Yeah, this needs a comment. register_ftrace_function_probe() returns
the number of functions enabled, but if that is zero (or less), then it
should fail. We still need to check for less than ret.

I'll fix this one up.

Thanks!

-- Steve

2013-05-09 15:21:49

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 01/11] [BUGFIX] tracing: Return 0 if event_enable_func succeeded

On Thu, 2013-05-09 at 14:44 +0900, Masami Hiramatsu wrote:
> Return 0 instead of the number of activated ftrace if
> event_enable_func succeeded and return an error code if failed,
> beacuse 0 is success code at caller (ftrace_regex_write).
>
> Without this fix, writing enable_event trigger on set_ftrace_filter
> always doesn't work, since event_enable_func returns 1 to
> ftrace_regex_write, it consumes 1 byte and pass input string
> without the first character again. This makes event_enable_func
> fail and disables event entry.
>

Ah, this actually fixes two bugs! :-)

A typo will be considered success, but it also sends back to the user
that it only wrote one byte!

-- Steve

2013-05-09 15:41:08

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

On Thu, 2013-05-09 at 10:47 -0400, Steven Rostedt wrote:

> > sh-2.05b# echo p vfs_symlink > kprobe_events
> > sh-2.05b# echo vfs_symlink:enable_event:kprobes:p_vfs_symlink_0 > set_ftrace_filter
> >

> Ouch! I'm surprised I didn't trigger this in my tests. I have lockdep
> enabled, and I did run kprobe testing. I'll have to look at how this was
> missed :-/

OK, now I know why I missed this. My tests never enabled a kprobe event.
I'll have to update them to do so.

Thanks,

-- Steve

2013-05-09 16:27:44

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

On Thu, 2013-05-09 at 14:44 +0900, Masami Hiramatsu wrote:

> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 8a5c017..3f29b3d 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -64,9 +64,17 @@
>
> #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
>
> +#ifdef CONFIG_DYNAMIC_FTRACE
> +#define INIT_REGEX_LOCK(opsname) \
> + .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
> +#else
> +#define INIT_REGEX_LOCK(opsname)
> +#endif
> +
> static struct ftrace_ops ftrace_list_end __read_mostly = {
> .func = ftrace_stub,
> .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
> + INIT_REGEX_LOCK(ftrace_list_end)
> };
>

We probably should have a better way to initialize this. As there are 26
ftrace_ops currently in the kernel (and this patch doesn't cover all of
them). Maybe have the first time its registered to initialize it.

I think adding the INIT_REGEX_LOCK() all over the place is rather ugly.

-- Steve

2013-05-09 16:34:39

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

On Thu, 2013-05-09 at 12:27 -0400, Steven Rostedt wrote:

> We probably should have a better way to initialize this. As there are 26
> ftrace_ops currently in the kernel (and this patch doesn't cover all of
> them). Maybe have the first time its registered to initialize it.

Crap, but it can be used before that. Hmm, I guess all ftrace functions
will need to check that flag first. We do something similar for rt_mutex
in -rt.

-- Steve


2013-05-09 17:08:35

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

On Thu, 2013-05-09 at 12:34 -0400, Steven Rostedt wrote:
> On Thu, 2013-05-09 at 12:27 -0400, Steven Rostedt wrote:
>
> > We probably should have a better way to initialize this. As there are 26
> > ftrace_ops currently in the kernel (and this patch doesn't cover all of
> > them). Maybe have the first time its registered to initialize it.
>
> Crap, but it can be used before that. Hmm, I guess all ftrace functions
> will need to check that flag first. We do something similar for rt_mutex
> in -rt.

I added this on top of your patch. I kept the INIT_REGEX_LOCK as it's
only local to ftrace.c and wont spread further. Also, the
ftrace_list_end ftrace_ops is just a place holder (needed for race
conditions that can have function tracers call its stub), so it does not
need to be initialized. If anything tries to grab its mutex, that's a
bug anyway.

What do you think?

-- Steve

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4ba3a6e..99d0fbc 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -90,6 +90,8 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
* not set this, then the ftrace infrastructure will add recursion
* protection for the caller.
* STUB - The ftrace_ops is just a place holder.
+ * INITIALIZED - The ftrace_ops has already been initialized (first use time
+ * register_ftrace_function() is called, it will initialized the ops)
*/
enum {
FTRACE_OPS_FL_ENABLED = 1 << 0,
@@ -100,6 +102,7 @@ enum {
FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5,
FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6,
FTRACE_OPS_FL_STUB = 1 << 7,
+ FTRACE_OPS_FL_INITIALIZED = 1 << 8,
};

struct ftrace_ops {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7f307e8..3fed7f0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -934,7 +934,6 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS,
- .regex_lock = __MUTEX_INITIALIZER(kprobe_ftrace_ops.regex_lock),
};
static int kprobe_ftrace_enabled;

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ec83928..827f2fe 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -74,7 +74,6 @@
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
- INIT_REGEX_LOCK(ftrace_list_end)
};

/* ftrace_enabled is a method to turn ftrace on or off */
@@ -139,6 +138,16 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
while (likely(op = rcu_dereference_raw((op)->next)) && \
unlikely((op) != &ftrace_list_end))

+static inline void ftrace_ops_init(struct ftrace_ops *ops)
+{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
+ mutex_init(&ops->regex_lock);
+ ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
+}
+
/**
* ftrace_nr_registered_ops - return number of ops registered
*
@@ -915,7 +924,7 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
INIT_REGEX_LOCK(ftrace_profile_ops)
};

@@ -1112,7 +1121,7 @@ static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.notrace_hash = EMPTY_HASH,
.filter_hash = EMPTY_HASH,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
INIT_REGEX_LOCK(global_ops)
};

@@ -1255,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)

void ftrace_free_filter(struct ftrace_ops *ops)
{
+ ftrace_ops_init(ops);
free_ftrace_hash(ops->filter_hash);
free_ftrace_hash(ops->notrace_hash);
}
@@ -2632,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct ftrace_hash *hash;
int ret = 0;

+ ftrace_ops_init(ops);
+
if (unlikely(ftrace_disabled))
return -ENODEV;

@@ -2918,6 +2930,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
INIT_REGEX_LOCK(trace_probe_ops)
};

@@ -3401,6 +3414,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
+ ftrace_ops_init(ops);
return ftrace_set_addr(ops, ip, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
@@ -3425,6 +3439,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
{
+ ftrace_ops_init(ops);
return ftrace_set_regex(ops, buf, len, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter);
@@ -3443,6 +3458,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
{
+ ftrace_ops_init(ops);
return ftrace_set_regex(ops, buf, len, reset, 0);
}
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
@@ -3533,6 +3549,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
{
char *func;

+ ftrace_ops_init(ops);
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -4135,7 +4153,7 @@ void __init ftrace_init(void)

static struct ftrace_ops global_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
INIT_REGEX_LOCK(global_ops)
};

@@ -4190,8 +4208,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
}

static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .func = ftrace_ops_control_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
INIT_REGEX_LOCK(control_ops)
};

@@ -4550,6 +4568,8 @@ int register_ftrace_function(struct ftrace_ops *ops)
{
int ret = -1;

+ ftrace_ops_init(ops);
+
mutex_lock(&ftrace_lock);

ret = __register_ftrace_function(ops);

2013-05-09 17:12:19

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 03/11] ftrace: Cleanup regex_lock and ftrace_lock around hash updating

On Thu, 2013-05-09 at 14:44 +0900, Masami Hiramatsu wrote:
> Cleanup regex_lock and ftrace_lock locking points around
> ftrace_ops hash update code.
>
> The new rule is that regex_lock protects ops->*_hash
> read-update-write code for each ftrace_ops. Usually,
> hash update is done by following sequence.
>
> 1. allocate a new local hash and copy the original hash.
> 2. update the local hash.
> 3. move(actually, copy) back the local hash to ftrace_ops.
> 4. update ftrace entries if needed.
> 5. release the local hash.
>
> This makes regex_lock to protect #1-#4, and ftrace_lock
> to protect #3 and #4.

Patch looks good, but I don't see how ftrace_lock protects #3. The two
things that I see ftrace_lock protecting is the update to the ftrace
entries, and the adding and removing ftrace_ops to the ftrace_ops_list.

-- Steve

>
> Signed-off-by: Masami Hiramatsu <[email protected]>
> Cc: Steven Rostedt <[email protected]>
> Cc: Frederic Weisbecker <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Tom Zanussi <[email protected]>
> ---

2013-05-09 22:09:09

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 03/11] ftrace: Cleanup regex_lock and ftrace_lock around hash updating

On Thu, 2013-05-09 at 13:12 -0400, Steven Rostedt wrote:
> On Thu, 2013-05-09 at 14:44 +0900, Masami Hiramatsu wrote:
> > Cleanup regex_lock and ftrace_lock locking points around
> > ftrace_ops hash update code.
> >
> > The new rule is that regex_lock protects ops->*_hash
> > read-update-write code for each ftrace_ops. Usually,
> > hash update is done by following sequence.
> >
> > 1. allocate a new local hash and copy the original hash.
> > 2. update the local hash.
> > 3. move(actually, copy) back the local hash to ftrace_ops.
> > 4. update ftrace entries if needed.
> > 5. release the local hash.
> >
> > This makes regex_lock to protect #1-#4, and ftrace_lock
> > to protect #3 and #4.
>
> Patch looks good, but I don't see how ftrace_lock protects #3. The two
> things that I see ftrace_lock protecting is the update to the ftrace
> entries, and the adding and removing ftrace_ops to the ftrace_ops_list.

OK, yeah, the move does an implicit entries update. Thus it does need
the lock. I'll update the change log to imply that.

-- Steve

Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

(2013/05/10 2:08), Steven Rostedt wrote:
> On Thu, 2013-05-09 at 12:34 -0400, Steven Rostedt wrote:
>> On Thu, 2013-05-09 at 12:27 -0400, Steven Rostedt wrote:
>>
>>> We probably should have a better way to initialize this. As there are 26
>>> ftrace_ops currently in the kernel (and this patch doesn't cover all of
>>> them). Maybe have the first time its registered to initialize it.

Oh, I missed many of them :(

>> Crap, but it can be used before that. Hmm, I guess all ftrace functions
>> will need to check that flag first. We do something similar for rt_mutex
>> in -rt.
>
> I added this on top of your patch. I kept the INIT_REGEX_LOCK as it's
> only local to ftrace.c and wont spread further. Also, the
> ftrace_list_end ftrace_ops is just a place holder (needed for race
> conditions that can have function tracers call its stub), so it does not
> need to be initialized. If anything tries to grab its mutex, that's a
> bug anyway.
>
> What do you think?

Hmm, would we really need to have the additional flag?
I mean, do we better force ftrace user to use ftrace_ops_init before
calling such functions as mutex itself does?

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

2013-05-10 13:38:05

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/11] [BUGFIX] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

On Fri, 2013-05-10 at 10:40 +0900, Masami Hiramatsu wrote:

> Hmm, would we really need to have the additional flag?
> I mean, do we better force ftrace user to use ftrace_ops_init before
> calling such functions as mutex itself does?

It will be hard to get right, and I don't like the macro to initialize
it all over the place. Having the check before its used contained just
in ftrace.c seems to work. None of the functions are hot paths so it's
not like its slowing anything down.

Here's what I did:

>From f04f24fb7e48d446bd89a01c6056571f25972511 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <[email protected]>
Date: Thu, 9 May 2013 14:44:17 +0900
Subject: [PATCH] ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

Fix a deadlock on ftrace_regex_lock which happens when setting
an enable_event trigger on dynamic kprobe event as below.

----
sh-2.05b# echo p vfs_symlink > kprobe_events
sh-2.05b# echo vfs_symlink:enable_event:kprobes:p_vfs_symlink_0 > set_ftrace_filter

=============================================
[ INFO: possible recursive locking detected ]
3.9.0+ #35 Not tainted
---------------------------------------------
sh/72 is trying to acquire lock:
(ftrace_regex_lock){+.+.+.}, at: [<ffffffff810ba6c1>] ftrace_set_hash+0x81/0x1f0

but task is already holding lock:
(ftrace_regex_lock){+.+.+.}, at: [<ffffffff810b7cbd>] ftrace_regex_write.isra.29.part.30+0x3d/0x220

other info that might help us debug this:
Possible unsafe locking scenario:

CPU0
----
lock(ftrace_regex_lock);
lock(ftrace_regex_lock);

*** DEADLOCK ***
----

To fix that, this introduces a finer regex_lock for each ftrace_ops.
ftrace_regex_lock is too big of a lock which protects all
filter/notrace_hash operations, but it doesn't need to be a global
lock after supporting multiple ftrace_ops because each ftrace_ops
has its own filter/notrace_hash.

Link: http://lkml.kernel.org/r/20130509054417.30398.84254.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tom Zanussi <[email protected]>
Signed-off-by: Masami Hiramatsu <[email protected]>
[ Added initialization flag and automate mutex initialization for
non ftrace.c ftrace_probes. ]
Signed-off-by: Steven Rostedt <[email protected]>
---
include/linux/ftrace.h | 4 ++
kernel/trace/ftrace.c | 73 ++++++++++++++++++++++++++++++++++--------------
2 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f83e17a..99d0fbc 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -90,6 +90,8 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
* not set this, then the ftrace infrastructure will add recursion
* protection for the caller.
* STUB - The ftrace_ops is just a place holder.
+ * INITIALIZED - The ftrace_ops has already been initialized (first use time
+ * register_ftrace_function() is called, it will initialized the ops)
*/
enum {
FTRACE_OPS_FL_ENABLED = 1 << 0,
@@ -100,6 +102,7 @@ enum {
FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5,
FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6,
FTRACE_OPS_FL_STUB = 1 << 7,
+ FTRACE_OPS_FL_INITIALIZED = 1 << 8,
};

struct ftrace_ops {
@@ -110,6 +113,7 @@ struct ftrace_ops {
#ifdef CONFIG_DYNAMIC_FTRACE
struct ftrace_hash *notrace_hash;
struct ftrace_hash *filter_hash;
+ struct mutex regex_lock;
#endif
};

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d85a0ad..827f2fe 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -64,6 +64,13 @@

#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)

+#ifdef CONFIG_DYNAMIC_FTRACE
+#define INIT_REGEX_LOCK(opsname) \
+ .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
+#else
+#define INIT_REGEX_LOCK(opsname)
+#endif
+
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
@@ -131,6 +138,16 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
while (likely(op = rcu_dereference_raw((op)->next)) && \
unlikely((op) != &ftrace_list_end))

+static inline void ftrace_ops_init(struct ftrace_ops *ops)
+{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
+ mutex_init(&ops->regex_lock);
+ ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
+}
+
/**
* ftrace_nr_registered_ops - return number of ops registered
*
@@ -907,7 +924,8 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(ftrace_profile_ops)
};

static int register_ftrace_profiler(void)
@@ -1103,11 +1121,10 @@ static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.notrace_hash = EMPTY_HASH,
.filter_hash = EMPTY_HASH,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(global_ops)
};

-static DEFINE_MUTEX(ftrace_regex_lock);
-
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -1247,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)

void ftrace_free_filter(struct ftrace_ops *ops)
{
+ ftrace_ops_init(ops);
free_ftrace_hash(ops->filter_hash);
free_ftrace_hash(ops->notrace_hash);
}
@@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct ftrace_hash *hash;
int ret = 0;

+ ftrace_ops_init(ops);
+
if (unlikely(ftrace_disabled))
return -ENODEV;

@@ -2656,7 +2676,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
}

- mutex_lock(&ftrace_regex_lock);
+ mutex_lock(&ops->regex_lock);

if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
@@ -2677,7 +2697,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
} else
file->private_data = iter;
- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&ops->regex_lock);

return ret;
}
@@ -2910,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(trace_probe_ops)
};

static int ftrace_probe_registered;
@@ -3256,18 +3278,18 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
if (!cnt)
return 0;

- mutex_lock(&ftrace_regex_lock);
-
- ret = -ENODEV;
- if (unlikely(ftrace_disabled))
- goto out_unlock;
-
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
iter = m->private;
} else
iter = file->private_data;

+ mutex_lock(&iter->ops->regex_lock);
+
+ ret = -ENODEV;
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
+
parser = &iter->parser;
read = trace_get_user(parser, ubuf, cnt, ppos);

@@ -3282,7 +3304,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,

ret = read;
out_unlock:
- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&iter->ops->regex_lock);

return ret;
}
@@ -3344,7 +3366,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
if (!hash)
return -ENOMEM;

- mutex_lock(&ftrace_regex_lock);
+ mutex_lock(&ops->regex_lock);
if (reset)
ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
@@ -3366,7 +3388,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
mutex_unlock(&ftrace_lock);

out_regex_unlock:
- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&ops->regex_lock);

free_ftrace_hash(hash);
return ret;
@@ -3392,6 +3414,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
+ ftrace_ops_init(ops);
return ftrace_set_addr(ops, ip, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
@@ -3416,6 +3439,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
{
+ ftrace_ops_init(ops);
return ftrace_set_regex(ops, buf, len, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter);
@@ -3434,6 +3458,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
{
+ ftrace_ops_init(ops);
return ftrace_set_regex(ops, buf, len, reset, 0);
}
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
@@ -3524,6 +3549,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
{
char *func;

+ ftrace_ops_init(ops);
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -3551,14 +3578,14 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
int filter_hash;
int ret;

- mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
iter = m->private;
-
seq_release(inode, file);
} else
iter = file->private_data;

+ mutex_lock(&iter->ops->regex_lock);
+
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
parser->buffer[parser->idx] = 0;
@@ -3587,7 +3614,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
free_ftrace_hash(iter->hash);
kfree(iter);

- mutex_unlock(&ftrace_regex_lock);
+ mutex_unlock(&iter->ops->regex_lock);
return 0;
}

@@ -4126,7 +4153,8 @@ void __init ftrace_init(void)

static struct ftrace_ops global_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(global_ops)
};

static int __init ftrace_nodyn_init(void)
@@ -4180,8 +4208,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
}

static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .func = ftrace_ops_control_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(control_ops)
};

static inline void
@@ -4539,6 +4568,8 @@ int register_ftrace_function(struct ftrace_ops *ops)
{
int ret = -1;

+ ftrace_ops_init(ops);
+
mutex_lock(&ftrace_lock);

ret = __register_ftrace_function(ops);
--
1.7.3.4