This patchset adds filter support for perf counter, so not all
profile events are recorded but only those match the filters
we set.
An example:
#./perf record -f -e irq:irq_handler_entry:irq==18:record
or
#./perf record -f -e irq:irq_handler_entry:irq==18 -R
^C
# ./perf trace
version = 0.5
perf-4303 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
---
include/linux/ftrace_event.h | 19 +++-
include/linux/perf_counter.h | 1 +
include/linux/syscalls.h | 14 ++-
include/trace/ftrace.h | 10 +-
kernel/perf_counter.c | 40 ++++++-
kernel/trace/trace.h | 9 +-
kernel/trace/trace_event_profile.c | 18 +++
kernel/trace/trace_events_filter.c | 247 +++++++++++++++++++++---------------
kernel/trace/trace_syscalls.c | 9 +-
tools/perf/builtin-record.c | 12 ++
tools/perf/util/parse-events.c | 48 ++++++-
tools/perf/util/parse-events.h | 1 +
12 files changed, 303 insertions(+), 125 deletions(-)
Change:
for_each_pred
for_each_subsystem
To:
for_each_subsystem
for_each_pred
This change also prepares for the next patch.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/trace/trace.h | 1 -
kernel/trace/trace_events_filter.c | 124 +++++++++++++-----------------------
2 files changed, 45 insertions(+), 80 deletions(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ea7e0bc..2b47eba 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -790,7 +790,6 @@ struct event_filter {
int n_preds;
struct filter_pred **preds;
char *filter_string;
- bool no_reset;
};
struct event_subsystem {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fb..f9afbdf 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -458,14 +458,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
return 0;
}
-enum {
- FILTER_DISABLE_ALL,
- FILTER_INIT_NO_RESET,
- FILTER_SKIP_NO_RESET,
-};
-
-static void filter_free_subsystem_preds(struct event_subsystem *system,
- int flag)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
{
struct ftrace_event_call *call;
@@ -476,14 +469,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
if (strcmp(call->system, system->name) != 0)
continue;
- if (flag == FILTER_INIT_NO_RESET) {
- call->filter->no_reset = false;
- continue;
- }
-
- if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
- continue;
-
filter_disable_preds(call);
remove_filter_string(call->filter);
}
@@ -657,44 +642,6 @@ add_pred_fn:
return 0;
}
-static int filter_add_subsystem_pred(struct filter_parse_state *ps,
- struct event_subsystem *system,
- struct filter_pred *pred,
- char *filter_string,
- bool dry_run)
-{
- struct ftrace_event_call *call;
- int err = 0;
- bool fail = true;
-
- list_for_each_entry(call, &ftrace_events, list) {
-
- if (!call->define_fields)
- continue;
-
- if (strcmp(call->system, system->name))
- continue;
-
- if (call->filter->no_reset)
- continue;
-
- err = filter_add_pred(ps, call, pred, dry_run);
- if (err)
- call->filter->no_reset = true;
- else
- fail = false;
-
- if (!dry_run)
- replace_filter_string(call->filter, filter_string);
- }
-
- if (fail) {
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- return err;
- }
- return 0;
-}
-
static void parse_init(struct filter_parse_state *ps,
struct filter_op *ops,
char *infix_string)
@@ -1048,8 +995,7 @@ static int check_preds(struct filter_parse_state *ps)
return 0;
}
-static int replace_preds(struct event_subsystem *system,
- struct ftrace_event_call *call,
+static int replace_preds(struct ftrace_event_call *call,
struct filter_parse_state *ps,
char *filter_string,
bool dry_run)
@@ -1096,11 +1042,7 @@ static int replace_preds(struct event_subsystem *system,
add_pred:
if (!pred)
return -ENOMEM;
- if (call)
- err = filter_add_pred(ps, call, pred, false);
- else
- err = filter_add_subsystem_pred(ps, system, pred,
- filter_string, dry_run);
+ err = filter_add_pred(ps, call, pred, dry_run);
filter_free_pred(pred);
if (err)
return err;
@@ -1111,6 +1053,44 @@ add_pred:
return 0;
}
+static int replace_system_preds(struct event_subsystem *system,
+ struct filter_parse_state *ps,
+ char *filter_string)
+{
+ struct ftrace_event_call *call;
+ int err;
+ bool fail = true;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ if (!call->define_fields)
+ continue;
+
+ if (strcmp(call->system, system->name) != 0)
+ continue;
+
+ /* try to see if the filter can be applied */
+ err = replace_preds(call, ps, filter_string, true);
+ if (err)
+ continue;
+
+ /* really apply the filter */
+ filter_disable_preds(call);
+ err = replace_preds(call, ps, filter_string, false);
+ if (err)
+ filter_disable_preds(call);
+ else
+ replace_filter_string(call->filter, filter_string);
+ fail = false;
+ }
+
+ if (fail) {
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return err;
+ }
+ return 0;
+}
+
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
int err;
@@ -1145,7 +1125,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
goto out;
}
- err = replace_preds(NULL, call, ps, filter_string, false);
+ err = replace_preds(call, ps, filter_string, false);
if (err)
append_filter_err(ps, call->filter);
@@ -1173,7 +1153,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out_unlock;
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
+ filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
mutex_unlock(&event_mutex);
return 0;
@@ -1193,23 +1173,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out;
}
- filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
-
- /* try to see the filter can be applied to which events */
- err = replace_preds(system, NULL, ps, filter_string, true);
- if (err) {
- append_filter_err(ps, system->filter);
- goto out;
- }
-
- filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
-
- /* really apply the filter to the events */
- err = replace_preds(system, NULL, ps, filter_string, false);
- if (err) {
+ err = replace_system_preds(system, ps, filter_string);
+ if (err)
append_filter_err(ps, system->filter);
- filter_free_subsystem_preds(system, 2);
- }
out:
filter_opstack_clear(ps);
--
1.6.3
- add ftrace_profile_set_filter(), to set filter for a profile event
- filter is enabled when profile probe is registered
- filter is disabled when profile probe is unregistered
- in ftrace_profile_##call(), record events only when
filter_match_preds() returns 1
Signed-off-by: Li Zefan <[email protected]>
---
include/linux/ftrace_event.h | 19 +++++-
include/trace/ftrace.h | 10 ++-
kernel/trace/trace.h | 8 ++-
kernel/trace/trace_event_profile.c | 18 +++++
kernel/trace/trace_events_filter.c | 135 ++++++++++++++++++++++++++++--------
5 files changed, 156 insertions(+), 34 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 23f7179..44a7183 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -129,6 +129,10 @@ struct ftrace_event_call {
void *mod;
void *data;
+#ifdef CONFIG_EVENT_PROFILE
+ int profile_filter_active;
+ struct event_filter *profile_filter;
+#endif
atomic_t profile_count;
int (*profile_enable)(struct ftrace_event_call *);
void (*profile_disable)(struct ftrace_event_call *);
@@ -138,12 +142,25 @@ struct ftrace_event_call {
#define MAX_FILTER_STR_VAL 128
extern void destroy_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_match_preds(struct event_filter *filter, void *rec);
extern int filter_current_check_discard(struct ring_buffer *buffer,
struct ftrace_event_call *call,
void *rec,
struct ring_buffer_event *event);
+#ifdef CONFIG_EVENT_PROFILE
+extern void destroy_profile_preds(struct ftrace_event_call *call);
+
+static inline int
+profile_filter_check(struct ftrace_event_call *call, void *rec)
+{
+ if (likely(!call->profile_filter_active) ||
+ filter_match_preds(call->profile_filter, rec))
+ return 1;
+ return 0;
+}
+#endif
+
enum {
FILTER_OTHER = 0,
FILTER_STATIC_STRING,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 308bafd..da95201 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -417,8 +417,11 @@ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
\
static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
{ \
- if (atomic_add_negative(-1, &event_call->profile_count)) \
+ if (atomic_add_negative(-1, &event_call->profile_count)) { \
unregister_trace_##call(ftrace_profile_##call); \
+ tracepoint_synchronize_unregister(); \
+ destroy_profile_preds(event_call); \
+ } \
}
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -742,8 +745,9 @@ static void ftrace_profile_##call(proto) \
\
{ assign; } \
\
- perf_tpcounter_event(event_call->id, __addr, __count, entry,\
- __entry_size); \
+ if (profile_filter_check(event_call, entry)) \
+ perf_tpcounter_event(event_call->id, __addr, __count, \
+ entry, __entry_size); \
} while (0); \
\
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2b47eba..751f996 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -817,6 +817,11 @@ struct filter_pred {
int pop_n;
};
+#ifdef CONFIG_EVENT_PROFILE
+extern int apply_profile_filter(struct ftrace_event_call *call,
+ char *filter_string);
+#endif
+
extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
extern int apply_event_filter(struct ftrace_event_call *call,
@@ -832,7 +837,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+ if (unlikely(call->filter_active)
+ && !filter_match_preds(call->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb..ec6d4b0 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -37,3 +37,21 @@ void ftrace_profile_disable(int event_id)
}
mutex_unlock(&event_mutex);
}
+
+int ftrace_profile_set_filter(int event_id, char *filter)
+{
+ struct ftrace_event_call *event;
+ int ret = -EINVAL;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(event, &ftrace_events, list) {
+ if (event->id == event_id) {
+ ret = apply_profile_filter(event, filter);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f9afbdf..1ab36b7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -210,9 +210,8 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
}
/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
{
- struct event_filter *filter = call->filter;
int match, top = 0, val1 = 0, val2 = 0;
int stack[MAX_FILTER_PRED];
struct filter_pred *pred;
@@ -385,14 +384,10 @@ static void filter_disable_preds(struct ftrace_event_call *call)
filter->preds[i]->fn = filter_pred_none;
}
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
{
- struct event_filter *filter = call->filter;
int i;
- if (!filter)
- return;
-
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (filter->preds[i])
filter_free_pred(filter->preds[i]);
@@ -400,21 +395,27 @@ void destroy_preds(struct ftrace_event_call *call)
kfree(filter->preds);
kfree(filter->filter_string);
kfree(filter);
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+ if (!call->filter)
+ return;
+
+ __free_preds(call->filter);
call->filter = NULL;
+ call->filter_active = 0;
}
-static int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
{
struct event_filter *filter;
struct filter_pred *pred;
int i;
- if (call->filter)
- return 0;
-
- filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- if (!call->filter)
- return -ENOMEM;
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
filter->n_preds = 0;
@@ -430,12 +431,23 @@ static int init_preds(struct ftrace_event_call *call)
filter->preds[i] = pred;
}
- return 0;
+ return filter;
oom:
- destroy_preds(call);
+ __free_preds(filter);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int init_preds(struct ftrace_event_call *call)
+{
+ if (call->filter)
+ return 0;
- return -ENOMEM;
+ call->filter_active = 0;
+ call->filter = __alloc_preds();
+ if (IS_ERR(call->filter))
+ return PTR_ERR(call->filter);
+ return 0;
}
static int init_subsystem_preds(struct event_subsystem *system)
@@ -476,10 +488,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
static int filter_add_pred_fn(struct filter_parse_state *ps,
struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_pred *pred,
filter_pred_fn_t fn)
{
- struct event_filter *filter = call->filter;
int idx, err;
if (filter->n_preds == MAX_FILTER_PRED) {
@@ -494,7 +506,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
return err;
filter->n_preds++;
- call->filter_active = 1;
return 0;
}
@@ -570,6 +581,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
static int filter_add_pred(struct filter_parse_state *ps,
struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_pred *pred,
bool dry_run)
{
@@ -638,7 +650,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
add_pred_fn:
if (!dry_run)
- return filter_add_pred_fn(ps, call, pred, fn);
+ return filter_add_pred_fn(ps, call, filter, pred, fn);
return 0;
}
@@ -996,6 +1008,7 @@ static int check_preds(struct filter_parse_state *ps)
}
static int replace_preds(struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_parse_state *ps,
char *filter_string,
bool dry_run)
@@ -1042,7 +1055,7 @@ static int replace_preds(struct ftrace_event_call *call,
add_pred:
if (!pred)
return -ENOMEM;
- err = filter_add_pred(ps, call, pred, dry_run);
+ err = filter_add_pred(ps, call, filter, pred, dry_run);
filter_free_pred(pred);
if (err)
return err;
@@ -1058,10 +1071,12 @@ static int replace_system_preds(struct event_subsystem *system,
char *filter_string)
{
struct ftrace_event_call *call;
+ struct event_filter *filter;
int err;
bool fail = true;
list_for_each_entry(call, &ftrace_events, list) {
+ filter = call->filter;
if (!call->define_fields)
continue;
@@ -1070,17 +1085,19 @@ static int replace_system_preds(struct event_subsystem *system,
continue;
/* try to see if the filter can be applied */
- err = replace_preds(call, ps, filter_string, true);
+ err = replace_preds(call, filter, ps, filter_string, true);
if (err)
continue;
/* really apply the filter */
filter_disable_preds(call);
- err = replace_preds(call, ps, filter_string, false);
+ err = replace_preds(call, filter, ps, filter_string, false);
if (err)
filter_disable_preds(call);
- else
- replace_filter_string(call->filter, filter_string);
+ else {
+ call->filter_active = 1;
+ replace_filter_string(filter, filter_string);
+ }
fail = false;
}
@@ -1094,7 +1111,6 @@ static int replace_system_preds(struct event_subsystem *system,
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
int err;
-
struct filter_parse_state *ps;
mutex_lock(&event_mutex);
@@ -1125,10 +1141,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
goto out;
}
- err = replace_preds(call, ps, filter_string, false);
+ err = replace_preds(call, call->filter, ps, filter_string, false);
if (err)
append_filter_err(ps, call->filter);
-
+ else
+ call->filter_active = 1;
out:
filter_opstack_clear(ps);
postfix_clear(ps);
@@ -1143,7 +1160,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
int err;
-
struct filter_parse_state *ps;
mutex_lock(&event_mutex);
@@ -1187,3 +1203,64 @@ out_unlock:
return err;
}
+#ifdef CONFIG_EVENT_PROFILE
+
+void destroy_profile_preds(struct ftrace_event_call *call)
+{
+ if (!call->profile_filter)
+ return;
+
+ __free_preds(call->profile_filter);
+ call->profile_filter = NULL;
+ call->profile_filter_active = 0;
+}
+EXPORT_SYMBOL_GPL(destroy_profile_preds);
+
+static int init_profile_preds(struct ftrace_event_call *call)
+{
+ if (call->profile_filter)
+ return 0;
+
+ call->profile_filter_active = 0;
+
+ call->profile_filter = __alloc_preds();
+ if (IS_ERR(call->profile_filter))
+ return PTR_ERR(call->profile_filter);
+ return 0;
+}
+
+/* Should be called with event_mutex held */
+int apply_profile_filter(struct ftrace_event_call *call, char *filter_string)
+{
+ int err;
+ struct filter_parse_state *ps;
+
+ err = init_profile_preds(call);
+ if (err)
+ return err;
+
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ return err;
+
+ parse_init(ps, filter_ops, filter_string);
+ err = filter_parse(ps);
+ if (err)
+ goto out;
+
+ err = replace_preds(call, call->profile_filter, ps,
+ filter_string, false);
+ if (!err)
+ call->profile_filter_active = 1;
+
+out:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+
+ return err;
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
+
--
1.6.3
This makes syscall profile events filter-able.
Signed-off-by: Li Zefan <[email protected]>
---
include/linux/syscalls.h | 14 ++++++++++----
kernel/trace/trace_syscalls.c | 9 +++++++--
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a8e3782..510d941 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -103,30 +103,36 @@ struct perf_counter_attr;
static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \
{ \
int ret = 0; \
- if (!atomic_inc_return(&event_enter_##sname.profile_count)) \
+ if (!atomic_inc_return(&event_call->profile_count)) \
ret = reg_prof_syscall_enter("sys"#sname); \
return ret; \
} \
\
static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
{ \
- if (atomic_add_negative(-1, &event_enter_##sname.profile_count)) \
+ if (atomic_add_negative(-1, &event_call->profile_count)) { \
unreg_prof_syscall_enter("sys"#sname); \
+ tracepoint_synchronize_unregister(); \
+ destroy_profile_preds(event_call); \
+ } \
}
#define TRACE_SYS_EXIT_PROFILE(sname) \
static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \
{ \
int ret = 0; \
- if (!atomic_inc_return(&event_exit_##sname.profile_count)) \
+ if (!atomic_inc_return(&event_call->profile_count)) \
ret = reg_prof_syscall_exit("sys"#sname); \
return ret; \
} \
\
static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
{ \
- if (atomic_add_negative(-1, &event_exit_##sname.profile_count)) \
+ if (atomic_add_negative(-1, &event_call->profile_count)) { \
unreg_prof_syscall_exit("sys"#sname); \
+ tracepoint_synchronize_unregister(); \
+ destroy_profile_preds(event_call); \
+ } \
}
#define TRACE_SYS_ENTER_PROFILE_INIT(sname) \
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3..c2faf98 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -414,7 +414,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+
+ if (profile_filter_check(sys_data->enter_event, &rec))
+ perf_tpcounter_event(sys_data->enter_id, 0, 1,
+ rec, size);
} while(0);
}
@@ -476,7 +479,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
rec.nr = syscall_nr;
rec.ret = syscall_get_return_value(current, regs);
- perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+ if (profile_filter_check(sys_data->exit_event, &rec))
+ perf_tpcounter_event(sys_data->exit_id, 0, 1,
+ &rec, sizeof(rec));
}
int reg_prof_syscall_exit(char *name)
--
1.6.3
Allow to set profile filter via ioctl.
Signed-off-by: Li Zefan <[email protected]>
---
include/linux/perf_counter.h | 1 +
kernel/perf_counter.c | 40 +++++++++++++++++++++++++++++++++++++---
2 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 972f90d..7a750b3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -217,6 +217,7 @@ struct perf_counter_attr {
#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
+#define PERF_COUNTER_IOC_SET_FILTER _IOW('$', 6, char *)
enum perf_counter_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e0d91fd..3fd65e8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1988,7 +1988,9 @@ unlock:
return ret;
}
-int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+static int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+static int perf_counter_set_filter(struct perf_counter *counter,
+ void __user *arg);
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -2016,6 +2018,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_COUNTER_IOC_SET_OUTPUT:
return perf_counter_set_output(counter, arg);
+ case PERF_COUNTER_IOC_SET_FILTER:
+ return perf_counter_set_filter(counter, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -3964,6 +3969,7 @@ EXPORT_SYMBOL_GPL(perf_tpcounter_event);
extern int ftrace_profile_enable(int);
extern void ftrace_profile_disable(int);
+extern int ftrace_profile_set_filter(int, char *);
static void tp_perf_counter_destroy(struct perf_counter *counter)
{
@@ -3988,12 +3994,40 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
return &perf_ops_generic;
}
+
+static int perf_counter_set_filter(struct perf_counter *counter,
+ void __user *arg)
+{
+ char *filter_str;
+ int ret;
+
+ if (counter->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ ret = ftrace_profile_set_filter(counter->attr.config, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
#else
+
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
return NULL;
}
-#endif
+
+static int perf_counter_set_filter(struct perf_counter *counter,
+ void __user *arg)
+{
+ return -EINVAL;
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
@@ -4246,7 +4280,7 @@ err_size:
goto out;
}
-int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+static int perf_counter_set_output(struct perf_counter *counter, int output_fd)
{
struct perf_counter *output_counter = NULL;
struct file *output_file = NULL;
--
1.6.3
The name length of some trace events is longer than 30, like
sys_enter_sched_get_priority_max, ext4_mb_discard_preallocations.
Signed-off-by: Li Zefan <[email protected]>
---
tools/perf/util/parse-events.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index a587d41..892d931 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -139,7 +139,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
(strcmp(evt_dirent.d_name, "..")) && \
(!tp_event_has_id(&sys_dirent, &evt_dirent)))
-#define MAX_EVENT_LENGTH 30
+#define MAX_EVENT_LENGTH 40
int valid_debugfs_mount(const char *debugfs)
{
-- 1.6.3
#./perf record -f -e irq:irq_handler_entry:irq==18:record
or
#./perf record -f -e irq:irq_handler_entry:irq==18 -R
^C
# ./perf trace
version = 0.5
perf-4303 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
init-0 ... irq_handler_entry: irq=18 handler=eth0
The usage is not changed if filter is not used:
#./perf record -f -e irq:irq_handler_entry:record
Signed-off-by: Li Zefan <[email protected]>
---
tools/perf/builtin-record.c | 12 ++++++++++
tools/perf/util/parse-events.c | 46 +++++++++++++++++++++++++++++++++------
tools/perf/util/parse-events.h | 1 +
3 files changed, 52 insertions(+), 7 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 99a12fe..7749982 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -369,9 +369,11 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int
static void create_counter(int counter, int cpu, pid_t pid)
{
+ char *filter = filters[counter];
struct perf_counter_attr *attr = attrs + counter;
struct perf_header_attr *h_attr;
int track = !counter; /* only the first counter needs these */
+ int ret;
struct {
u64 count;
u64 time_enabled;
@@ -485,6 +487,16 @@ try_again:
exit(-1);
}
+ if (attr->type == PERF_TYPE_TRACEPOINT && filter != NULL) {
+ ret = ioctl(fd[nr_cpu][counter],
+ PERF_COUNTER_IOC_SET_FILTER, filter);
+ if (ret) {
+ error("failed to set filter with %d (%s)\n", errno,
+ strerror(errno));
+ exit(-1);
+ }
+ }
+
ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
}
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 892d931..a7d2ef6 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -10,6 +10,7 @@
int nr_counters;
struct perf_counter_attr attrs[MAX_COUNTERS];
+char *filters[MAX_COUNTERS];
struct event_symbol {
u8 type;
@@ -405,15 +406,26 @@ parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
return 1;
}
+static int tracepoint_event_set_flags(const char *flags,
+ struct perf_counter_attr *attr)
+{
+ if (!strncmp(flags, "record", strlen(flags))) {
+ attr->sample_type |= PERF_SAMPLE_RAW;
+ return 1;
+ }
+ return 0;
+}
+
static int parse_tracepoint_event(const char **strp,
struct perf_counter_attr *attr)
{
+ char sys_name[MAX_EVENT_LENGTH];
const char *evt_name;
+ char *filter;
char *flags;
- char sys_name[MAX_EVENT_LENGTH];
char id_buf[4];
int fd;
- unsigned int sys_length, evt_length;
+ unsigned int sys_length, evt_length, filter_length;
u64 id;
char evt_path[MAXPATHLEN];
@@ -433,13 +445,33 @@ static int parse_tracepoint_event(const char **strp,
evt_name = evt_name + 1;
flags = strchr(evt_name, ':');
- if (flags) {
- *flags = '\0';
- flags++;
- if (!strncmp(flags, "record", strlen(flags)))
- attr->sample_type |= PERF_SAMPLE_RAW;
+ if (!flags)
+ goto next;
+
+ *flags++ = '\0';
+ if (tracepoint_event_set_flags(flags, attr))
+ goto next;
+
+ filter = flags;
+
+ flags = strchr(filter, ':');
+ if (!flags)
+ filter_length = strlen(filter);
+ else {
+ filter_length = flags++ - filter;
+ if (!tracepoint_event_set_flags(flags, attr))
+ return 0;
+ }
+
+ filters[nr_counters] = malloc(filter_length + 1);
+ if (!filters[nr_counters]) {
+ fprintf(stderr, "Not enough memory to hold filter string\n");
+ exit(-1);
}
+ strncpy(filters[nr_counters], filter, filter_length);
+ filters[nr_counters][filter_length] = '\0';
+next:
evt_length = strlen(evt_name);
if (evt_length >= MAX_EVENT_LENGTH)
return 0;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 60704c1..0ae146e 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -17,6 +17,7 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
extern int nr_counters;
extern struct perf_counter_attr attrs[MAX_COUNTERS];
+extern char *filters[MAX_COUNTERS];
extern const char *event_name(int ctr);
extern const char *__event_name(int type, u64 config);
-- 1.6.3
On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
> Allow to set profile filter via ioctl.
Hrm,.. not at all sure about this.. what are the ABI implications?
* Peter Zijlstra <[email protected]> wrote:
> On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
> > Allow to set profile filter via ioctl.
>
> Hrm,.. not at all sure about this.. what are the ABI implications?
I think the ABI should be fine if it's always a sub-set of C syntax.
That would be C expressions initially. Hm?
Ingo
On Mon, 2009-09-07 at 18:48 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
> > > Allow to set profile filter via ioctl.
> >
> > Hrm,.. not at all sure about this.. what are the ABI implications?
>
> I think the ABI should be fine if it's always a sub-set of C syntax.
> That would be C expressions initially. Hm?
Right, so I've no clue what filter expressions look like, and the
changelog doesn't help us at all. It doesn't mention that it's a
well-considered decision to henceforth freeze the expression syntax.
Of course, since filters so far only work with tracepoint things, and
since you can only come by tracepoint things through debugfs, and since
anything debugfs is basically a free-for-all ABI-less world, we might be
good, but then this is a very ill-defined ioctl() indeed.
So please, consider this well -- there might not be a second chance.
On Mon, Sep 07, 2009 at 04:11:57PM +0800, Li Zefan wrote:
> This patchset adds filter support for perf counter, so not all
> profile events are recorded but only those match the filters
> we set.
Thanks a lot for this work!
> An example:
>
> #./perf record -f -e irq:irq_handler_entry:irq==18:record
> or
> #./perf record -f -e irq:irq_handler_entry:irq==18 -R
> ^C
We may want to write complex filters. While looking at this patchset,
that seems possible this way, but the raw line may become unreadable:
perf record -f -e irq:irq_handler_entry:"irq==18 && (name == 'foo' || bar == 'blah')":record
Maybe we should add an option to let one also set the filters separately;
then we could do:
perf record -f -e -R irq:irq_handler_entry --filter "irq==18 && (name == 'foo' || bar == 'blah')"
Hmm.
> # ./perf trace
> version = 0.5
> perf-4303 ... irq_handler_entry: irq=18 handler=eth0
> init-0 ... irq_handler_entry: irq=18 handler=eth0
> init-0 ... irq_handler_entry: irq=18 handler=eth0
> init-0 ... irq_handler_entry: irq=18 handler=eth0
> init-0 ... irq_handler_entry: irq=18 handler=eth0
>
> ---
> include/linux/ftrace_event.h | 19 +++-
> include/linux/perf_counter.h | 1 +
> include/linux/syscalls.h | 14 ++-
> include/trace/ftrace.h | 10 +-
> kernel/perf_counter.c | 40 ++++++-
> kernel/trace/trace.h | 9 +-
> kernel/trace/trace_event_profile.c | 18 +++
> kernel/trace/trace_events_filter.c | 247 +++++++++++++++++++++---------------
> kernel/trace/trace_syscalls.c | 9 +-
> tools/perf/builtin-record.c | 12 ++
> tools/perf/util/parse-events.c | 48 ++++++-
> tools/perf/util/parse-events.h | 1 +
> 12 files changed, 303 insertions(+), 125 deletions(-)
>
Peter Zijlstra wrote:
> On Mon, 2009-09-07 at 18:48 +0200, Ingo Molnar wrote:
>> * Peter Zijlstra <[email protected]> wrote:
>>
>>> On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
>>>> Allow to set profile filter via ioctl.
>>> Hrm,.. not at all sure about this.. what are the ABI implications?
>> I think the ABI should be fine if it's always a sub-set of C syntax.
>> That would be C expressions initially. Hm?
>
> Right, so I've no clue what filter expressions look like, and the
> changelog doesn't help us at all. It doesn't mention its a well
> considered decision to henceforth freeze the expression syntax.
>
> Of course, since filters so far only work with tracepoint things, and
> since you can only come by tracepoint things through debugfs, and since
> anything debugfs is basically a free-for-all ABI-less world, we might be
> good, but then this is a very ill-defined ioctl() indeed.
>
> So please, consider this well -- there might not be a second chance.
>
Ok, the expressions are:
1. S = opr1 op opr2 (op: ==, !=, <, <=, >, >=.
opr1 should be a field in the format file)
2. E = S1 op S2 (op: ||, &&)
3. E = E1 op E2 (op: ||, &&)
4. () can be used
I don't think the syntax will be changed, but we may extend it, like
adding the not (!) operator. Also, for a func ptr, besides "func==0xccee4400",
we may want to allow "func==foo". Those extensions are ok for the
ABI, right?
>> An example:
>>
>> #./perf record -f -e irq:irq_handler_entry:irq==18:record
>> or
>> #./perf record -f -e irq:irq_handler_entry:irq==18 -R
>> ^C
>
> We may want to write complex filters. While looking at this patchset,
> that seems possible this way, but the raw line may become unreadable:
>
> perf record -f -e irq:irq_handler_entry:"irq==18 && (name == 'foo' || bar == 'blah')":record
>
> May be we should add an option to let one also set the filters seperately then
> we could do:
>
> perf record -f -e -R irq:irq_handler_entry --filter "irq==18 && (name == 'foo' || bar == 'blah')"
>
I had the same idea. ;)
But using this option, is it possible to specify different filters
for different events? like this:
perf record -f -e -R irq:irq_handler_entry --filter "irq==18"
-e irq:softirq_entry --filter "vec==1"
On Mon, Sep 07, 2009 at 04:12:53PM +0800, Li Zefan wrote:
> - add ftrace_profile_set_filter(), to set filter for a profile event
>
> - filter is enabled when profile probe is registered
>
> - filter is disabled when profile probe is unregistered
>
> - in ftrace_profile_##call(), record events only when
> filter_match_preds() returns 1
>
> Signed-off-by: Li Zefan <[email protected]>
Well, I feel a bit uncomfortable with this approach.
The events approach taken by perf is different from ftrace.
ftrace events activation/deactivation, ring buffer captures,
filters are all globals. And this is nice to perform kernel
tracing from debugfs files.
But perf has a per counter instance approach. This means
that when a tracepoint counter registers a filter, this should
be private to this tracepoint counter and not propagated to the
others.
So this should rely on a kind of per tracepoint counter
attribute, something that should probably be stored in
the struct hw_perf_counter like:
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -467,6 +467,7 @@ struct hw_perf_counter {
union { /* software */
atomic64_t count;
struct hrtimer hrtimer;
+ struct event_filter filter;
};
};
atomic64_t prev_count;
You may need to get the current perf context that can
be found in current->perf_counter_ctxp and then iterate
through the counter_list of this ctx to find the current counter
attached to this tracepoint (using the event id).
What is not nice is that we need to iterate in O(n), n being the
number of tracepoint counters attached to the current counter
context.
So to avoid the following costly sequence in the tracing fastpath:
- deref ctx->current->perf_counter_ctxp
- list every ctx->counter_list
- find the counter that matches
- deref counter->filter and test...
You could keep the profile_filter field (and profile_filter_active)
in struct ftrace_event_call but allocate them per cpu and
write these fields for a given event each time we enter/exit a
counter context that has a counter that uses this given event.
That's something we could do by using a struct pmu specific for
tracepoints. More precisely with enable/disable callbacks that would do
specific things and then rely on the perf_ops_generic pmu
callbacks.
the struct pmu::enable()/disable() callbacks are functions that are called
each time we schedule in/out a task group that has a counter that
uses the given pmu.
Ie: they are called each time we schedule in/out a counter.
So you have a struct ftrace_event_call. This event can be used in
several different counters instance at the same time. But in a given cpu,
only one of these counters can be currently in use.
Now if we build a simple struct pmu tp_pmu that mostly relies
on the perf_ops_generic pmu but also have a specific enable/disable
pair that calls perf_swcounter_enable/disable and also does:
/*
* We are scheduling this counter on the smp_processor_id() cpu
* (we are in atomic context, ctx->lock acquired) then we
* can safely write a local (per cpu) shortcut from the filter_profile
* field in the event to the counter filter.
*/
static int perf_tpcounter_enable(struct perf_counter *counter)
{
struct event_filter *filter, **shortcut;
int *enable;
struct ftrace_event_call *call;
int cpu = smp_processor_id();
call = find_event_by_id(counter->attr.config);
filter = &counter->hw.filter;
shortcut = &per_cpu_ptr(call->filter_profile, cpu)
enable = &per_cpu_var(call->filter_profile_enabled, cpu)
if (filter is present) {
*enable = 1;
*shortcut = filter;
}
return perf_swcounter_enable(counter);
}
/* We schedule out this counter from this cpu, erase the shortcut */
static void perf_tpcounter_disable(struct perf_counter *counter)
{
/* ... */
enable = 0;
shortcut = NULL;
perf_swcounter_disable(counter);
}
static const struct pmu perf_ops_tracepoint = {
.enable = perf_tpcounter_enable,
.disable = perf_tpcounter_disable,
.read = perf_swcounter_read,
.unthrottle = perf_swcounter_unthrottle,
};
See? Then you can have a O(1) retrieval of the current per
counter filter to apply from ftrace_profile_##call()
I hope I haven't been too confusing...
On Tue, Sep 08, 2009 at 09:06:49AM +0800, Li Zefan wrote:
> >> An example:
> >>
> >> #./perf record -f -e irq:irq_handler_entry:irq==18:record
> >> or
> >> #./perf record -f -e irq:irq_handler_entry:irq==18 -R
> >> ^C
> >
> > We may want to write complex filters. While looking at this patchset,
> > that seems possible this way, but the raw line may become unreadable:
> >
> > perf record -f -e irq:irq_handler_entry:"irq==18 && (name == 'foo' || bar == 'blah')":record
> >
> > Maybe we should add an option to let one also set the filters separately, then
> > we could do:
> >
> > perf record -f -e -R irq:irq_handler_entry --filter "irq==18 && (name == 'foo' || bar == 'blah')"
> >
>
> I had the same idea. ;)
>
> But using this option, is it possible to specify different filters
> for different events? like this:
>
> perf record -f -e -R irq:irq_handler_entry --filter "irq==18"
> -e irq:softirq_entry --filter "vec==1"
>
Exactly how I was imagining it :)
On Tue, Sep 08, 2009 at 04:01:19AM +0200, Frederic Weisbecker wrote:
> On Mon, Sep 07, 2009 at 04:12:53PM +0800, Li Zefan wrote:
> > - add ftrace_profile_set_filter(), to set filter for a profile event
> >
> > - filter is enabled when profile probe is registered
> >
> > - filter is disabled when profile probe is unregistered
> >
> > - in ftrace_profile_##call(), record events only when
> > filter_match_preds() returns 1
> >
> > Signed-off-by: Li Zefan <[email protected]>
>
>
>
> Well, I feel a bit uncomfortable with this approach.
>
> The events approach taken by perf is different from ftrace.
>
> ftrace events activation/deactivation, ring buffer captures,
> filters are all globals. And this is nice to perform kernel
> tracing from debugfs files.
>
> But perf has a per counter instance approach. This means
> that when a tracepoint counter registers a filter, this should
> be private to this tracepoint counter and not propagated to the
> others.
>
> So this should rely on a kind of per tracepoint counter
> attribute, something that we should probably be stored in
> the struct hw_perf_counter like:
>
> --- a/include/linux/perf_counter.h
> +++ b/include/linux/perf_counter.h
> @@ -467,6 +467,7 @@ struct hw_perf_counter {
> union { /* software */
> atomic64_t count;
> struct hrtimer hrtimer;
> + struct event_filter filter;
> };
> };
> atomic64_t prev_count;
>
>
> You may need to get the current perf context that can
> be found in current->perf_counter_ctxp and then iterate
> through the counter_list of this ctx to find the current counter
> attached to this tracepoint (using the event id).
>
> What is not nice is that we need to iterate in O(n), n being the
> number of tracepoint counters attached to the current counter
> context.
>
> So to avoid the following costly sequence in the tracing fastpath:
>
> - deref ctx->current->perf_counter_ctxp
> - list every ctx->counter_list
> - find the counter that matches
> - deref counter->filter and test...
>
> You could keep the profile_filter field (and profile_filter_active)
> in struct ftrace_event_call but allocate them per cpu and
> write these fields for a given event each time we enter/exit a
> counter context that has a counter that uses this given event.
>
> That's something we could do by using a struct pmu specific for
> tracepoints. More precisely with enable/disable callbacks that would do
> specific things and then rely on the perf_ops_generic pmu
> callbacks.
>
> the struct pmu::enable()/disable() callbacks are functions that are called
> each time we schedule in/out a task group that has a counter that
> uses the given pmu.
> Ie: they are called each time we schedule in/out a counter.
>
> So you have a struct ftrace_event_call. This event can be used in
> several different counters instance at the same time. But in a given cpu,
> only one of these counters can be currently in use.
I fear I've been confusing here.
Just to be clear:
Say we schedule in a task in cpu 0.
This task has a irq_entry tp counter. It makes no
sense to have two irq_entry counters for the same task, and
if it's possible I guess that should be fixed (unless I'm
missing relevant usecases).
Then when we enter the task, if we have an irq_entry counter
that becomes enabled, we can safely write a shortcut from its
ftrace_event_call structure to the counter filter.
But that must be done per cpu, because another task of the same
task group may be using this counter instance on another cpu.
That's why we need this field to be written per cpu.
> Now if we build a simple struct pmu tp_pmu that mostly relays
> on the perf_ops_generic pmu but also have a specific enable/disable
> pair that calls perf_swcounter_enable/disable and also does:
>
> /*
> * We are scheduling this counter on the smp_processor_id() cpu
> * (we are in atomic context, ctx->lock acquired) then we
> * can safely write a local (per cpu) shortcut from the filter_profile
> * field in the event to the counter filter.
> */
> static int perf_tpcounter_enable(struct perf_counter *counter)
> {
> struct event_filter *filter, **shortcut;
> int *enable;
> struct ftrace_event_call *call;
> int cpu = smp_processor_id();
>
> call = find_event_by_id(counter->attr.config);
> filter = &counter->hw.filter;
>
> shortcut = &per_cpu_ptr(call->filter_profile, cpu)
> enable = &per_cpu_var(call->filter_profile_enabled, cpu)
>
> if (filter is present) {
> *enable = 1;
> *shortcut = filter;
> }
>
> return perf_swcounter_enable(counter);
> }
>
> /* We schedule out this counter from this cpu, erase the shortcut */
> static void perf_tpcounter_disable(struct perf_counter *counter)
> {
> /* ... */
> enable = 0;
> shortcut = NULL;
>
> perf_swcounter_disable(counter);
> }
>
> static const struct pmu perf_ops_tracepoint = {
> .enable = perf_tpcounter_enable,
> .disable = perf_tpcounter_disable,
> .read = perf_swcounter_read,
> .unthrottle = perf_swcounter_unthrottle,
> };
>
>
> See? Then you can have a O(1) retrieval of the current per
> counter filter to apply from ftrace_profile_##call()
>
> I hope I haven't been too confusing...
>
* Li Zefan <[email protected]> wrote:
> Peter Zijlstra wrote:
> > On Mon, 2009-09-07 at 18:48 +0200, Ingo Molnar wrote:
> >> * Peter Zijlstra <[email protected]> wrote:
> >>
> >>> On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
> >>>> Allow to set profile filter via ioctl.
> >>> Hrm,.. not at all sure about this.. what are the ABI implications?
> >> I think the ABI should be fine if it's always a sub-set of C syntax.
> >> That would be C expressions initially. Hm?
> >
> > Right, so I've no clue what filter expressions look like, and the
> > changelog doesn't help us at all. It doesn't mention its a well
> > considered decision to henceforth freeze the expression syntax.
> >
> > Of course, since filters so far only work with tracepoint things, and
> > since you can only come by tracepoint things through debugfs, and since
> > anything debugfs is basically a free-for-all ABI-less world, we might be
> > good, but then this is a very ill-defined ioctl() indeed.
> >
> > So please, consider this well -- there might not be a second chance.
> >
>
> Ok, the expressions are:
>
> 1. S = opr1 op opr2 (op: ==, !=, <, <=, >, >=.
> opr1 should be a field in the format file)
> 2. E = S1 op S2 (op: ||, &&)
> 3. E = E1 op E2 (op: ||, &&)
> 4. () can be used
>
> I don't think the syntax will be changed, but we may extend it, like
> adding a not (!) operator. Like, for a func ptr, besides
> "func==0xccee4400", we may want to allow "func==foo". Those
> extensions are ok for the ABI, right?
Yeah - extensions (new operators, control structures, etc.) are fine
- incompatible changes are not. So as long as we stick to the C
syntax the ABI is: 'be a sub-set of C' - and that's easy to ensure
in the long run. Needs to be added prominently in form of comments,
etc.
It would also be useful for security engines: a filter attached to a
security probe point (or syscalls) would allow the runtime shaping
of security policy - to unprivileged user-space. If filters get
inherited by child tasks and if child tasks are not allowed to make
filters more permissive (i.e. if they can only add filters) that
would be an excellent tool for safe sandboxing like Google Chrome's
sandbox.
Btw., could we define the ABI in a way to allow not just expressions
in the future, but small C-syntax scripts too? I.e. in the long run
these filters could do dprobes alike safe scripting, injected by
unprivileged user-space and parsed/validated and executed in the
kernel.
It could also be useful for network filtering rules, etc. - and
everyone knows C syntax so it has an easy learning curve.
Do you see where i'm going? Filter expressions are a _very_ powerful
concept not just to tracing, and we want to spread it to more places
in the kernel. Perfcounters are a natural first hop - just lets keep
future options open too.
Ingo
* Frederic Weisbecker <[email protected]> wrote:
> On Tue, Sep 08, 2009 at 09:06:49AM +0800, Li Zefan wrote:
> > >> An example:
> > >>
> > >> #./perf record -f -e irq:irq_handler_entry:irq==18:record
> > >> or
> > >> #./perf record -f -e irq:irq_handler_entry:irq==18 -R
> > >> ^C
> > >
> > > We may want to write complex filters. While looking at this patchset,
> > > that seems possible this way, but the raw line may become unreadable:
> > >
> > > perf record -f -e irq:irq_handler_entry:"irq==18 && (name == 'foo' || bar == 'blah')":record
> > >
> > > Maybe we should add an option to let one also set the filters separately, then
> > > we could do:
> > >
> > > perf record -f -e -R irq:irq_handler_entry --filter "irq==18 && (name == 'foo' || bar == 'blah')"
> > >
> >
> > I had the same idea. ;)
> >
> > But using this option, is it possible to specify different filters
> > for different events? like this:
> >
> > perf record -f -e -R irq:irq_handler_entry --filter "irq==18"
> > -e irq:softirq_entry --filter "vec==1"
> >
>
>
> Exactly how I was imagining it :)
Yes, that looks like a nice syntax. First the event, then the
conditions that restrict its output.
Ingo
On Mon, 2009-09-07 at 18:55 +0200, Peter Zijlstra wrote:
> On Mon, 2009-09-07 at 18:48 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
> > > > Allow to set profile filter via ioctl.
> > >
> > > Hrm,.. not at all sure about this.. what are the ABI implications?
> >
> > I think the ABI should be fine if it's always a sub-set of C syntax.
> > That would be C expressions initially. Hm?
>
> Right, so I've no clue what filter expressions look like, and the
> changelog doesn't help us at all. It doesn't mention its a well
> considered decision to henceforth freeze the expression syntax.
>
> Of course, since filters so far only work with tracepoint things, and
> since you can only come by tracepoint things through debugfs, and since
> anything debugfs is basically a free-for-all ABI-less world, we might be
> good, but then this is a very ill-defined ioctl() indeed.
>
> So please, consider this well -- there might not be a second chance.
>
I've been meaning to write up something about the event filters - here's
a first stab that hopefully helps explain them...
Tom
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 2bcc8d4..50fe510 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -97,3 +97,160 @@ The format of this boot option is the same as described in section 2.1.
See The example provided in samples/trace_events
+4. Event formats
+================
+
+Each trace event has a 'format' file associated with it that contains
+a description of each field in a logged event. This information can
+be used to parse the binary trace stream, and is also the place to
+find the field names that can be used in event filters (see section 5
+below).
+
+It also displays the format string that will be used to print the
+event in text mode, along with the event name and ID used for
+profiling.
+
+Every event has a set of 'common' fields associated with it; these are
+the fields prefixed with 'common_'. The other fields vary between
+events and correspond to the fields defined in the TRACE_EVENT
+definition for that event.
+
+Here's the information displayed for the 'sched_wakeup' event:
+
+# cat /debug/tracing/events/sched/sched_wakeup/format
+
+name: sched_wakeup
+ID: 60
+format:
+ field:unsigned short common_type; offset:0; size:2;
+ field:unsigned char common_flags; offset:2; size:1;
+ field:unsigned char common_preempt_count; offset:3; size:1;
+ field:int common_pid; offset:4; size:4;
+ field:int common_tgid; offset:8; size:4;
+
+ field:char comm[TASK_COMM_LEN]; offset:12; size:16;
+ field:pid_t pid; offset:28; size:4;
+ field:int prio; offset:32; size:4;
+ field:int success; offset:36; size:4;
+ field:int cpu; offset:40; size:4;
+
+print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid, REC->prio, REC->success, REC->cpu
+
+5. Event filtering
+==================
+
+Trace events can be filtered in the kernel by associating boolean
+'filter expressions' with them. As soon as an event is logged into
+the trace buffer, its fields are checked against the filter expression
+associated with that event type. An event with field values that
+'match' the filter will appear in the trace output, and an event whose
+values don't match will be discarded. An event with no filter
+associated with it matches everything, which is the default when no
+filter has been set for an event.
+
+A filter expression consists of one or more 'predicates' that can be
+combined using the logical operators '&&' and '||'. A predicate is
+simply a clause that compares the value of a field contained within a
+logged event with a constant value and returns either 0 or 1 depending
+on whether the field value matched (1) or didn't match (0):
+
+ field-name relational-operator value
+
+Parentheses can be used to provide arbitrary logical groupings and
+double-quotes can be used to prevent the shell from interpreting
+operators as shell metacharacters.
+
+The field-names available for use in filters can be found in the
+'format' files for trace events (see section 4 above).
+
+The relational-operators depend on the type of the field being tested:
+
+The operators available for numeric fields are:
+
+==, !=, <, <=, >, >=
+
+And for string fields they are:
+
+==, !=
+
+Currently, only exact string matches are supported.
+
+Currently, the maximum number of predicates in a filter is set at 16.
+
+A filter for an individual event is set by writing a filter expression
+to the 'filter' file for the given event.
+
+For example:
+
+# cd /debug/tracing/events/sched/sched_wakeup
+# echo "common_preempt_count > 4" > filter
+
+A slightly more involved example:
+
+# cd /debug/tracing/events/sched/sched_signal_send
+# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
+
+If there was an error in the expression, you'll get an 'Invalid
+argument' error when setting it, and the erroneous string along with
+an error message can be seen by looking at the filter e.g.:
+
+# cd /debug/tracing/events/sched/sched_signal_send
+# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter
+-bash: echo: write error: Invalid argument
+# cat filter
+((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
+^
+parse_error: Field not found
+
+Currently the caret for an error always appears at the beginning of
+the filter string; the error message should still be useful though
+even without more accurate position info.
+
+To clear a filter, write a '0' to the filter file.
+
+For convenience, filters for every event in a subsystem can be set or
+cleared as a group by writing a filter expression into the filter file
+at the root of the subsystem. Note, however, that if a filter for any
+event within the subsystem lacks a field specified in the subsystem
+filter, or if the filter can't be applied for any other reason, the
+filter for that event will retain its previous setting. This can
+result in an unintended mixture of filters which could lead to
+confusing (to the user who might think different filters are in
+effect) trace output. Only filters that reference just the common
+fields can be guaranteed to propagate successfully to all events.
+
+To clear the filters for all events in a subsystem, write a '0' to the
+subsystem's filter file.
+
+Here are a few subsystem filter examples that also illustrate the
+above points:
+
+Clear the filters on all events in the sched subsystem:
+
+# cd /sys/kernel/debug/tracing/events/sched
+# echo 0 > filter
+# cat sched_switch/filter
+none
+# cat sched_wakeup/filter
+none
+
+Set a filter using only common fields for all events in the sched
+subsystem (all events end up with the same filter):
+
+# cd /sys/kernel/debug/tracing/events/sched
+# echo common_pid == 0 > filter
+# cat sched_switch/filter
+common_pid == 0
+# cat sched_wakeup/filter
+common_pid == 0
+
+Attempt to set a filter using a non-common field for all events in the
+sched subsystem (all events but those that have a prev_pid field retain
+their old filters):
+
+# cd /sys/kernel/debug/tracing/events/sched
+# echo prev_pid == 0 > filter
+# cat sched_switch/filter
+prev_pid == 0
+# cat sched_wakeup/filter
+common_pid == 0
On Tue, 2009-09-08 at 04:01 +0200, Frederic Weisbecker wrote:
> On Mon, Sep 07, 2009 at 04:12:53PM +0800, Li Zefan wrote:
> > - add ftrace_profile_set_filter(), to set filter for a profile event
> >
> > - filter is enabled when profile probe is registered
> >
> > - filter is disabled when profile probe is unregistered
> >
> > - in ftrace_profile_##call(), record events only when
> > filter_match_preds() returns 1
> >
> > Signed-off-by: Li Zefan <[email protected]>
>
>
>
> Well, I feel a bit uncomfortable with this approach.
>
> The events approach taken by perf is different from ftrace.
>
> ftrace events activation/deactivation, ring buffer captures,
> filters are all globals. And this is nice to perform kernel
> tracing from debugfs files.
>
> But perf has a per counter instance approach. This means
> that when a tracepoint counter registers a filter, this should
> be private to this tracepoint counter and not propagated to the
> others.
Agreed.
> So this should rely on a kind of per tracepoint counter
> attribute, something that we should probably be stored in
> the struct hw_perf_counter like:
>
> --- a/include/linux/perf_counter.h
> +++ b/include/linux/perf_counter.h
> @@ -467,6 +467,7 @@ struct hw_perf_counter {
> union { /* software */
> atomic64_t count;
> struct hrtimer hrtimer;
> + struct event_filter filter;
> };
> };
> atomic64_t prev_count;
>
>
> You may need to get the current perf context that can
> be found in current->perf_counter_ctxp and then iterate
> through the counter_list of this ctx to find the current counter
> attached to this tracepoint (using the event id).
>
> > What is not nice is that we need to iterate in O(n), n being the
> number of tracepoint counters attached to the current counter
> context.
>
> So to avoid the following costly sequence in the tracing fastpath:
>
> - deref ctx->current->perf_counter_ctxp
> - list every ctx->counter_list
> - find the counter that matches
> - deref counter->filter and test...
>
> You could keep the profile_filter field (and profile_filter_active)
> in struct ftrace_event_call but allocate them per cpu and
> write these fields for a given event each time we enter/exit a
> counter context that has a counter that uses this given event.
How would that work when you have two counters of the same type in one
context with different filter expressions?
> That's something we could do by using a struct pmu specific for
> tracepoints. More precisely with enable/disable callbacks that would do
> > specific things and then rely on the perf_ops_generic pmu
> callbacks.
>
> the struct pmu::enable()/disable() callbacks are functions that are called
> each time we schedule in/out a task group that has a counter that
> uses the given pmu.
> Ie: they are called each time we schedule in/out a counter.
>
> So you have a struct ftrace_event_call. This event can be used in
> several different counters instance at the same time. But in a given cpu,
> only one of these counters can be currently in use.
Not so, you can have as many counters as you want on any one particular
cpu. There is nothing that stops:
perf record -e timer:hrtimer_start -e timer:hrtimer_start -e
timer:hrtimer_start ...
from working, now add a different filter to each of those counter and
enjoy ;-)
I've been thinking of replacing that linear list with a better lookup,
like maybe an RB-tree or hash table, because we hit that silly O(n) loop
on every software event.
On Tue, 2009-09-08 at 08:49 +0800, Li Zefan wrote:
> Peter Zijlstra wrote:
> > On Mon, 2009-09-07 at 18:48 +0200, Ingo Molnar wrote:
> >> * Peter Zijlstra <[email protected]> wrote:
> >>
> >>> On Mon, 2009-09-07 at 16:13 +0800, Li Zefan wrote:
> >>>> Allow to set profile filter via ioctl.
> >>> Hrm,.. not at all sure about this.. what are the ABI implications?
> >> I think the ABI should be fine if it's always a sub-set of C syntax.
> >> That would be C expressions initially. Hm?
> >
> > Right, so I've no clue what filter expressions look like, and the
> > changelog doesn't help us at all. It doesn't mention its a well
> > considered decision to henceforth freeze the expression syntax.
> >
> > Of course, since filters so far only work with tracepoint things, and
> > since you can only come by tracepoint things through debugfs, and since
> > anything debugfs is basically a free-for-all ABI-less world, we might be
> > good, but then this is a very ill-defined ioctl() indeed.
> >
> > So please, consider this well -- there might not be a second chance.
> >
>
> Ok, the expressions are:
>
> 1. S = opr1 op opr2 (op: ==, !=, <, <=, >, >=.
> opr1 should be a field in the format file)
> 2. E = S1 op S2 (op: ||, &&)
> 3. E = E1 op E2 (op: ||, &&)
> 4. () can be used
>
> I don't think the syntax will be changed, but we may extend it, like
> adding a not (!) operator. Like, for a func ptr, besides "func==0xccee4400",
> we may want to allow "func==foo". Those extensions are ok for the
> ABI, right?
Sure, but my point is that you need to be aware that you're creating an
ABI and the changelog was virtually non-existent which didn't inspire
much confidence.
On Tue, Sep 08, 2009 at 10:35:45AM +0200, Peter Zijlstra wrote:
> On Tue, 2009-09-08 at 04:01 +0200, Frederic Weisbecker wrote:
> > You may need to get the current perf context that can
> > be found in current->perf_counter_ctxp and then iterate
> > through the counter_list of this ctx to find the current counter
> > attached to this tracepoint (using the event id).
> >
> > What is not nice is that we need to iterate in O(n), n being the
> > number of tracepoint counters attached to the current counter
> > context.
> >
> > So to avoid the following costly sequence in the tracing fastpath:
> >
> > - deref ctx->current->perf_counter_ctxp
> > - list every ctx->counter_list
> > - find the counter that matches
> > - deref counter->filter and test...
> >
> > You could keep the profile_filter field (and profile_filter_active)
> > in struct ftrace_event_call but allocate them per cpu and
> > write these fields for a given event each time we enter/exit a
> > counter context that has a counter that uses this given event.
>
> How would that work when you have two counters of the same type in one
> context with different filter expressions?
Oh so we can do that? That's what I wondered about.
> > That's something we could do by using a struct pmu specific for
> > tracepoints. More precisely with enable/disable callbacks that would do
> > specific things and then rely on the perf_ops_generic pmu
> > callbacks.
> >
> > the struct pmu::enable()/disable() callbacks are functions that are called
> > each time we schedule in/out a task group that has a counter that
> > uses the given pmu.
> > Ie: they are called each time we schedule in/out a counter.
> >
> > So you have a struct ftrace_event_call. This event can be used in
> > several different counters instance at the same time. But in a given cpu,
> > only one of these counters can be currently in use.
>
> Not so, you can have as many counters as you want on any one particular
> cpu. There is nothing that stops:
>
> perf record -e timer:hrtimer_start -e timer:hrtimer_start -e
> timer:hrtimer_start ...
>
> from working, now add a different filter to each of those counter and
> enjoy ;-)
>
Then may be we can do that here:
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
struct perf_sample_data *data)
{
list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
if (perf_swcounter_match(counter, type, event, data->regs))
perf_swcounter_add(counter, nr, nmi, data);
}
}
If we have two instances of a same counter id in the ctx, the sample
event will be added twice there.
What we need is to apply the filter there because we are "counter instance"
aware at this stage.
We can do our filter check inside perf_swcounter_match(). We just
need to have the filter and the ftrace event call as
struct hw_perf_counter fields and call filter_match_preds() from
perf_swcounter_match() (if the previous match tests have succeeded).
> I've been thinking of replacing that linear list with a better lookup,
> like maybe an RB-tree or hash table, because we hit that silly O(n) loop
> on every software event.
Yeah that indeed is also a problem :)
>>>> Hrm,.. not at all sure about this.. what are the ABI implications?
>>> I think the ABI should be fine if it's always a sub-set of C syntax.
>>> That would be C expressions initially. Hm?
>> Right, so I've no clue what filter expressions look like, and the
>> changelog doesn't help us at all. It doesn't mention its a well
>> considered decision to henceforth freeze the expression syntax.
>>
>> Of course, since filters so far only work with tracepoint things, and
>> since you can only come by tracepoint things through debugfs, and since
>> anything debugfs is basically a free-for-all ABI-less world, we might be
>> good, but then this is a very ill-defined ioctl() indeed.
>>
>> So please, consider this well -- there might not be a second chance.
>>
>
> I've been meaning to write up something about the event filters - here's
> a first stab that hopefully helps explain them...
>
Great!
Reviewed-by: Li Zefan <[email protected]>
Could you add your SOB and send it to Ingo?
Some nitpicks below:
> Tom
>
> diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
> index 2bcc8d4..50fe510 100644
> --- a/Documentation/trace/events.txt
> +++ b/Documentation/trace/events.txt
> @@ -97,3 +97,160 @@ The format of this boot option is the same as described in section 2.1.
>
> See The example provided in samples/trace_events
>
> +4. Event formats
> +================
> +
> +Each trace event has a 'format' file associated with it that contains
> +a description of each field in a logged event. This information can
> +be used to parse the binary trace stream, and is also the place to
> +find the field names that can be used in event filters (see section 5
> +below).
> +
> +It also displays the format string that will be used to print the
> +event in text mode, along with the event name and ID used for
> +profiling.
> +
> +Every event has a set of 'common' fields associated with it; these are
> +the fields prefixed with 'common_'. The other fields vary between
> +events and correspond to the fields defined in the TRACE_EVENT
> +definition for that event.
> +
> +Here's the information displayed for the 'sched_wakeup' event:
> +
> +# cat /debug/tracing/events/sched/sched_wakeup/format
> +
> +name: sched_wakeup
> +ID: 60
> +format:
> + field:unsigned short common_type; offset:0; size:2;
> + field:unsigned char common_flags; offset:2; size:1;
> + field:unsigned char common_preempt_count; offset:3; size:1;
> + field:int common_pid; offset:4; size:4;
> + field:int common_tgid; offset:8; size:4;
> +
> + field:char comm[TASK_COMM_LEN]; offset:12; size:16;
> + field:pid_t pid; offset:28; size:4;
> + field:int prio; offset:32; size:4;
> + field:int success; offset:36; size:4;
> + field:int cpu; offset:40; size:4;
> +
> +print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid, REC->prio, REC->success, REC->cpu
> +
> +5. Event filtering
> +==================
> +
How about adding titles 5.1, 5.2, 5.3?
5.1 expression syntax
5.2 set a filter
5.3 clear a filter
5.4 subsystem filter
> +Trace events can be filtered in the kernel by associating boolean
> +'filter expressions' with them. As soon as an event is logged into
> +the trace buffer, its fields are checked against the filter expression
> +associated with that event type. An event with field values that
> +'match' the filter will appear in the trace output, and an event whose
> +values don't match will be discarded. An event with no filter
> +associated with it matches everything, which is the default when no
> +filter has been set for an event.
> +
> +A filter expression consists of one or more 'predicates' that can be
> +combined using the logical operators '&&' and '||'. A predicate is
> +simply a clause that compares the value of a field contained within a
> +logged event with a constant value and returns either 0 or 1 depending
> +on whether the field value matched (1) or didn't match (0):
> +
> + field-name relational-operator value
> +
> +Parentheses can be used to provide arbitrary logical groupings and
> +double-quotes can be used to prevent the shell from interpreting
> +operators as shell metacharacters.
> +
> +The field-names available for use in filters can be found in the
> +'format' files for trace events (see section 4 above).
> +
> +The relational-operators depend on the type of the field being tested:
> +
> +The operators available for numeric fields are:
> +
> +==, !=, <, <=, >, >=
> +
> +And for string fields they are:
> +
> +==, !=
> +
> +Currently, only exact string matches are supported.
> +
> +Currently, the maximum number of predicates in a filter is set at 16.
> +
> +A filter for an individual event is set by writing a filter expression
> +to the 'filter' file for the given event.
> +
> +For example:
> +
> +# cd /debug/tracing/events/sched/sched_wakeup
> +# echo "common_preempt_count > 4" > filter
> +
> +A slightly more involved example:
> +
> +# cd /debug/tracing/events/sched/sched_signal_send
> +# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
> +
> +If there was an error in the expression, you'll get an 'Invalid
> +argument' error when setting it, and the erroneous string along with
> +an error message can be seen by looking at the filter e.g.:
> +
> +# cd /debug/tracing/events/sched/sched_signal_send
> +# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter
> +-bash: echo: write error: Invalid argument
> +# cat filter
> +((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
> +^
> +parse_error: Field not found
> +
> +Currently the caret for an error always appears at the beginning of
> +the filter string; the error message should be still be useful though
s/should be still be/should still be
> +even without more accurate position info.
> +
> +To clear a filter, write a '0' to the filter file.
> +
> +For convenience, filters for every event in a subsystem can be set or
> +cleared as a group by writing a filter expression into the filter file
> +at the root of the subsytem. Note, however, that if a filter for any
> +event within the subsystem lacks a field specified in the subsystem
> +filter, or if the filter can't be applied for any other reason, the
> +filter for that event will retain its previous setting. This can
> +result in an unintended mixture of filters which could lead to
> +confusing (to the user who might think different filters are in
> +effect) trace output. Only filters that reference just the common
> +fields can be guaranteed to propagate successfully to all events.
> +
> +To clear the filters for all events in a subsystem, write a '0' to the
> +subsystem's filter file.
> +
> +Here are a few subsystem filter examples that also illustrate the
> +above points:
> +
> +Clear the filters on all events in the sched subsytem:
> +
> +# cd /sys/kernel/debug/tracing/events/sched
> +# echo 0 > filter
> +# cat sched_switch/filter
> +none
> +# cat sched_switch/filter
> +none
> +
> +Set a filter using only common fields for all events in the sched
> +subsytem (all events end up with the same filter):
> +
> +# cd /sys/kernel/debug/tracing/events/sched
> +# echo common_pid == 0 > filter
> +# cat sched_switch/filter
> +common_pid == 0
> +# cat sched_wakeup/filter
> +common_pid == 0
> +
> +Attempt to set a filter using a non-common field for all events in the
> +sched subsytem (all events but those that have a prev_pid field retain
> +their old filters):
> +
> +# cd /sys/kernel/debug/tracing/events/sched
> +# echo prev_pid == 0 > filter
> +# cat sched_switch/filter
> +prev_pid == 0
> +# cat sched_wakeup/filter
> +common_pid == 0
>
On Wed, 2009-09-09 at 10:18 +0800, Li Zefan wrote:
> >>>> Hrm,.. not at all sure about this.. what are the ABI implications?
> >>> I think the ABI should be fine if it's always a sub-set of C syntax.
> >>> That would be C expressions initially. Hm?
> >> Right, so I've no clue what filter expressions look like, and the
> >> changelog doesn't help us at all. It doesn't mention its a well
> >> considered decision to henceforth freeze the expression syntax.
> >>
> >> Of course, since filters so far only work with tracepoint things, and
> >> since you can only come by tracepoint things through debugfs, and since
> >> anything debugfs is basically a free-for-all ABI-less world, we might be
> >> good, but then this is a very ill-defined ioctl() indeed.
> >>
> >> So please, consider this well -- there might not be a second chance.
> >>
> >
> > I've been meaning to write up something about the event filters - here's
> > a first stab that hopefully helps explain them...
> >
>
> Great!
>
> Reviewed-by: Li Zefan <[email protected]>
>
> Could you add your SOB and send it to Ingo?
>
> Some nitpicks below:
Sure, I'll send a new version shortly - thanks for the suggestions.
Tom
On Tue, 08 Sep 2009 02:01:55 -0500 Tom Zanussi wrote:
> I've been meaning to write up something about the event filters - here's
> a first stab that hopefully helps explain them...
>
> Tom
Hi Tom,
Pretty good info. Thanks.
Just a few small comments & question(s):
> diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
> index 2bcc8d4..50fe510 100644
> --- a/Documentation/trace/events.txt
> +++ b/Documentation/trace/events.txt
> @@ -97,3 +97,160 @@ The format of this boot option is the same as described in section 2.1.
>
> See The example provided in samples/trace_events
>
> +4. Event formats
> +================
> +
> +Each trace event has a 'format' file associated with it that contains
> +a description of each field in a logged event. This information can
> +be used to parse the binary trace stream, and is also the place to
> +find the field names that can be used in event filters (see section 5
> +below).
> +
> +It also displays the format string that will be used to print the
> +event in text mode, along with the event name and ID used for
> +profiling.
> +
> +Every event has a set of 'common' fields associated with it; these are
> +the fields prefixed with 'common_'. The other fields vary between
> +events and correspond to the fields defined in the TRACE_EVENT
> +definition for that event.
> +
> +Here's the information displayed for the 'sched_wakeup' event:
> +
> +# cat /debug/tracing/events/sched/sched_wakeup/format
> +
> +name: sched_wakeup
> +ID: 60
> +format:
> + field:unsigned short common_type; offset:0; size:2;
> + field:unsigned char common_flags; offset:2; size:1;
> + field:unsigned char common_preempt_count; offset:3; size:1;
> + field:int common_pid; offset:4; size:4;
> + field:int common_tgid; offset:8; size:4;
> +
> + field:char comm[TASK_COMM_LEN]; offset:12; size:16;
> + field:pid_t pid; offset:28; size:4;
> + field:int prio; offset:32; size:4;
> + field:int success; offset:36; size:4;
> + field:int cpu; offset:40; size:4;
> +
> +print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid, REC->prio, REC->success, REC->cpu
> +
> +5. Event filtering
> +==================
> +
> +Trace events can be filtered in the kernel by associating boolean
> +'filter expressions' with them. As soon as an event is logged into
> +the trace buffer, its fields are checked against the filter expression
> +associated with that event type. An event with field values that
> +'match' the filter will appear in the trace output, and an event whose
> +values don't match will be discarded. An event with no filter
> +associated with it matches everything, which is the default when no
> +filter has been set for an event.
> +
> +A filter expression consists of one or more 'predicates' that can be
> +combined using the logical operators '&&' and '||'. A predicate is
> +simply a clause that compares the value of a field contained within a
> +logged event with a constant value and returns either 0 or 1 depending
> +on whether the field value matched (1) or didn't match (0):
> +
> + field-name relational-operator value
> +
> +Parentheses can be used to provide arbitrary logical groupings and
> +double-quotes can be used to prevent the shell from interpreting
> +operators as shell metacharacters.
> +
> +The field-names available for use in filters can be found in the
> +'format' files for trace events (see section 4 above).
> +
> +The relational-operators depend on the type of the field being tested:
> +
> +The operators available for numeric fields are:
> +
> +==, !=, <, <=, >, >=
> +
> +And for string fields they are:
> +
> +==, !=
> +
> +Currently, only exact string matches are supported.
> +
> +Currently, the maximum number of predicates in a filter is set at 16.
in a filter is 16.
> +
> +A filter for an individual event is set by writing a filter expression
> +to the 'filter' file for the given event.
> +
> +For example:
> +
> +# cd /debug/tracing/events/sched/sched_wakeup
> +# echo "common_preempt_count > 4" > filter
> +
> +A slightly more involved example:
> +
> +# cd /debug/tracing/events/sched/sched_signal_send
> +# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
> +
> +If there was an error in the expression, you'll get an 'Invalid
If there is an error
> +argument' error when setting it, and the erroneous string along with
> +an error message can be seen by looking at the filter e.g.:
> +
> +# cd /debug/tracing/events/sched/sched_signal_send
> +# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter
> +-bash: echo: write error: Invalid argument
> +# cat filter
> +((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
> +^
> +parse_error: Field not found
> +
> +Currently the caret for an error always appears at the beginning of
caret ('^')
> +the filter string; the error message should be still be useful though
> +even without more accurate position info.
> +
> +To clear a filter, write a '0' to the filter file.
> +
> +For convenience, filters for every event in a subsystem can be set or
> +cleared as a group by writing a filter expression into the filter file
> +at the root of the subsytem. Note, however, that if a filter for any
> +event within the subsystem lacks a field specified in the subsystem
> +filter, or if the filter can't be applied for any other reason, the
> +filter for that event will retain its previous setting. This can
> +result in an unintended mixture of filters which could lead to
> +confusing (to the user who might think different filters are in
> +effect) trace output. Only filters that reference just the common
> +fields can be guaranteed to propagate successfully to all events.
> +
> +To clear the filters for all events in a subsystem, write a '0' to the
> +subsystem's filter file.
> +
> +Here are a few subsystem filter examples that also illustrate the
> +above points:
> +
> +Clear the filters on all events in the sched subsytem:
> +
> +# cd /sys/kernel/debug/tracing/events/sched
> +# echo 0 > filter
> +# cat sched_switch/filter
> +none
> +# cat sched_switch/filter
> +none
One of the above should be "sched_wakeup", like the examples below?
> +
> +Set a filter using only common fields for all events in the sched
> +subsytem (all events end up with the same filter):
> +
> +# cd /sys/kernel/debug/tracing/events/sched
> +# echo common_pid == 0 > filter
> +# cat sched_switch/filter
> +common_pid == 0
> +# cat sched_wakeup/filter
> +common_pid == 0
> +
> +Attempt to set a filter using a non-common field for all events in the
> +sched subsytem (all events but those that have a prev_pid field retain
> +their old filters):
> +
> +# cd /sys/kernel/debug/tracing/events/sched
> +# echo prev_pid == 0 > filter
> +# cat sched_switch/filter
> +prev_pid == 0
> +# cat sched_wakeup/filter
> +common_pid == 0
---
~Randy
LPC 2009, Sept. 23-25, Portland, Oregon
http://linuxplumbersconf.org/2009/
On Thu, 2009-09-10 at 16:01 -0700, Randy Dunlap wrote:
> On Tue, 08 Sep 2009 02:01:55 -0500 Tom Zanussi wrote:
>
> > I've been meaning to write up something about the event filters - here's
> > a first stab that hopefully helps explain them...
> >
> > Tom
>
> Hi Tom,
>
> Pretty good info. Thanks.
> Just a few small comments & question(s):
>
Thanks, Randy. I'll make those changes in the next version...
Tom