Hello.
Completely untested and _incomplete_. This ignores instance_delete()
and ftrace_event_format_fops, at least.
But I am not going to even try to finish this series unless you tell
me that you agree with this approach.
I have no idea what else I could miss. I probably understand no more
than 1% of this code, I won't be surprised if this can't work.
To remind, we also need other changes we are discussing.
Oleg.
kernel/trace/trace.h | 4 +-
kernel/trace/trace_events.c | 126 ++++++++++++++++--------------------
kernel/trace/trace_events_filter.c | 21 ++----
3 files changed, 66 insertions(+), 85 deletions(-)
Preparation to make the next patches more understandable.
The caller of trace_remove_event_call() is going to free call/files,
this means that every opened id/filter/enable/format file will use
the already freed memory via filp->private_data / inode->i_private.
Change remove_event_from_tracers() to clear ->i_private for every
child. This fixes nothing and even makes the crash more possible,
but this allows to fix the problem later.
Note: this doesn't affect instance_rmdir() paths. It has similar
problems and they will be fixed separately.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9d2b499..cbd1a57 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1534,6 +1534,14 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
}
}
+static void invalidate_event_file(struct dentry *dir)
+{
+ struct dentry *child;
+ /* ->i_mutex is not needed, nobody can create/remove a file */
+ list_for_each_entry(child, &dir->d_subdirs, d_u.d_child)
+ child->d_inode->i_private = NULL;
+}
+
static void remove_event_from_tracers(struct ftrace_event_call *call)
{
struct ftrace_event_file *file;
@@ -1545,6 +1553,7 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
continue;
list_del(&file->list);
+ invalidate_event_file(file->dir);
debugfs_remove_recursive(file->dir);
remove_subsystem(file->system);
kmem_cache_free(file_cachep, file);
--
1.5.5.1
ftrace_event_id_fops and event_id_read() is overcomplicated.
1. Change event_create_dir() to pass "data = call->event.type"
to debugfs_create_file().
This means that ftrace_event_id_fops doesn't need .open()
and event_id_read() can simply print (int)i_private
2. event_id_read() has no reason to kmalloc "struct trace_seq"
(more than PAGE_SIZE!), it can use a small buffer.
Note: "if (*ppos) return 0" looks strange and unneeded,
simple_read_from_buffer(ppos) should handle this case correctly.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 26 +++++++++-----------------
1 files changed, 9 insertions(+), 17 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cbd1a57..cefbe85 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -962,24 +962,17 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
- struct trace_seq *s;
- int r;
+ int id = (long)ACCESS_ONCE(file_inode(filp)->i_private);
+ char buf[32];
+ int len;
+ if (!id)
+ return -ENODEV;
if (*ppos)
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->event.type);
-
- r = simple_read_from_buffer(ubuf, cnt, ppos,
- s->buffer, s->len);
- kfree(s);
- return r;
+ len = sprintf(buf, "%d\n", id);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
static ssize_t
@@ -1264,7 +1257,6 @@ static const struct file_operations ftrace_event_format_fops = {
};
static const struct file_operations ftrace_event_id_fops = {
- .open = tracing_open_generic,
.read = event_id_read,
.llseek = default_llseek,
};
@@ -1496,8 +1488,8 @@ event_create_dir(struct dentry *parent,
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir, call,
- id);
+ trace_create_file("id", 0444, file->dir,
+ (void*)(long)call->event.type, id);
#endif
/*
--
1.5.5.1
Kill tracing_open_generic_file() and tracing_release_generic_file(),
they are racy anyway.
Instead, change event_enable_read() and event_enable_write() to rely
on event_mutex and file_inode(filp)->i_private != NULL check.
trace_array_get() goes away. NOTE! this is actually wrong until we
change instance_delete() path to nullify ->i_private.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 62 ++++++++++++++-----------------------------
1 files changed, 20 insertions(+), 42 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cefbe85..6401b51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -410,35 +410,6 @@ static void put_system(struct ftrace_subsystem_dir *dir)
}
/*
- * Open and update trace_array ref count.
- * Must have the current trace_array passed to it.
- */
-static int tracing_open_generic_file(struct inode *inode, struct file *filp)
-{
- struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
- int ret;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
- ret = tracing_open_generic(inode, filp);
- if (ret < 0)
- trace_array_put(tr);
- return ret;
-}
-
-static int tracing_release_generic_file(struct inode *inode, struct file *filp)
-{
- struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
-
- trace_array_put(tr);
-
- return 0;
-}
-
-/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
static int
@@ -679,15 +650,25 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
+ unsigned long flags;
char buf[4] = "0";
- if (file->flags & FTRACE_EVENT_FL_ENABLED &&
- !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ mutex_lock(&event_mutex);
+ file = file_inode(filp)->i_private;
+ if (likely(file))
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+ if (!file)
+ return -ENODEV;
+
+ if (flags & FTRACE_EVENT_FL_ENABLED &&
+ !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
- file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
+ flags & FTRACE_EVENT_FL_SOFT_MODE)
strcat(buf, "*");
strcat(buf, "\n");
@@ -699,13 +680,10 @@ static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
unsigned long val;
int ret;
- if (!file)
- return -EINVAL;
-
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -717,8 +695,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
switch (val) {
case 0:
case 1:
+ ret = -ENODEV;
mutex_lock(&event_mutex);
- ret = ftrace_event_enable_disable(file, val);
+ file = file_inode(filp)->i_private;
+ if (likely(file))
+ ret = ftrace_event_enable_disable(file, val);
mutex_unlock(&event_mutex);
break;
@@ -727,7 +708,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
*ppos += cnt;
-
return ret ? ret : cnt;
}
@@ -1242,10 +1222,8 @@ static const struct file_operations ftrace_set_event_fops = {
};
static const struct file_operations ftrace_enable_fops = {
- .open = tracing_open_generic_file,
.read = event_enable_read,
.write = event_enable_write,
- .release = tracing_release_generic_file,
.llseek = default_llseek,
};
--
1.5.5.1
1. Shift mutex_lock(event_mutex) from print/apply_event_filter to
the callers. And rename them just in case.
2. Change the callers, event_filter_read() and event_filter_write()
to read call = i_private under this mutex and abort if it is NULL.
3. Remove .open from ftrace_event_filter_fops.
Note: I think event_filter_read() should not use "struct trace_seq".
It buys nothing compared to the simple kmalloc'ed buffer, just needs
a bit more memory and efforts.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace.h | 4 ++--
kernel/trace/trace_events.c | 29 ++++++++++++++++++-----------
kernel/trace/trace_events_filter.c | 21 ++++++++-------------
3 files changed, 28 insertions(+), 26 deletions(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4a4f6e1..57199ac 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -985,9 +985,9 @@ struct filter_pred {
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_call *call,
+extern void __print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_call *call,
+extern int __apply_event_filter(struct ftrace_event_call *call,
char *filter_string);
extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6401b51..f4a86f9 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -959,9 +959,9 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
struct trace_seq *s;
- int r;
+ int r = -ENODEV;
if (*ppos)
return 0;
@@ -969,14 +969,18 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
-
trace_seq_init(s);
- print_event_filter(call, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ mutex_lock(&event_mutex);
+ call = file_inode(filp)->i_private;
+ if (likely(call))
+ __print_event_filter(call, s);
+ mutex_unlock(&event_mutex);
- kfree(s);
+ if (call)
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ kfree(s);
return r;
}
@@ -984,9 +988,9 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
char *buf;
- int err;
+ int err = -ENODEV;
if (cnt >= PAGE_SIZE)
return -EINVAL;
@@ -1001,13 +1005,17 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_event_filter(call, buf);
+ mutex_lock(&event_mutex);
+ call = file_inode(filp)->i_private;
+ if (likely(call))
+ err = __apply_event_filter(call, buf);
+ mutex_unlock(&event_mutex);
+
free_page((unsigned long) buf);
if (err < 0)
return err;
*ppos += cnt;
-
return cnt;
}
@@ -1240,7 +1248,6 @@ static const struct file_operations ftrace_event_id_fops = {
};
static const struct file_operations ftrace_event_filter_fops = {
- .open = tracing_open_generic,
.read = event_filter_read,
.write = event_filter_write,
.llseek = default_llseek,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0d883dc..f4b5eed 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
-void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
+/* caller must hold event_mutex */
+void __print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter;
+ struct event_filter *filter = call->filter;
- mutex_lock(&event_mutex);
- filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
trace_seq_printf(s, "none\n");
- mutex_unlock(&event_mutex);
}
void print_subsystem_event_filter(struct event_subsystem *system,
@@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
return err;
}
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+/* caller must hold event_mutex */
+int __apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
struct event_filter *filter;
int err = 0;
- mutex_lock(&event_mutex);
-
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(call);
filter = call->filter;
if (!filter)
- goto out_unlock;
+ goto out;
RCU_INIT_POINTER(call->filter, NULL);
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
- goto out_unlock;
+ goto out;
}
err = create_filter(call, filter_string, true, &filter);
@@ -1884,9 +1881,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
__free_filter(tmp);
}
}
-out_unlock:
- mutex_unlock(&event_mutex);
-
+out:
return err;
}
--
1.5.5.1
On Tue, 2013-07-16 at 20:57 +0200, Oleg Nesterov wrote:
> ftrace_event_id_fops and event_id_read() is overcomplicated.
>
> 1. Change event_create_dir() to pass "data = call->event.type"
> to debugfs_create_file().
>
> This means that ftrace_event_id_fops doesn't need .open()
> and event_id_read() can simply print (int)i_private
>
> 2. event_id_read() has no reason to kmalloc "struct trace_seq"
> (more than PAGE_SIZE!), it can use a small buffer.
Make #2 a separate patch. You can add it before this one.
-- Steve
>
> Note: "if (*ppos) return 0" looks strange and unneeded,
> simple_read_from_buffer(ppos) should handle this case correctly.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/trace/trace_events.c | 26 +++++++++-----------------
> 1 files changed, 9 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index cbd1a57..cefbe85 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -962,24 +962,17 @@ static int trace_format_open(struct inode *inode, struct file *file)
> static ssize_t
> event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> {
> - struct ftrace_event_call *call = filp->private_data;
> - struct trace_seq *s;
> - int r;
> + int id = (long)ACCESS_ONCE(file_inode(filp)->i_private);
> + char buf[32];
> + int len;
>
> + if (!id)
> + return -ENODEV;
> if (*ppos)
> return 0;
>
> - s = kmalloc(sizeof(*s), GFP_KERNEL);
> - if (!s)
> - return -ENOMEM;
> -
> - trace_seq_init(s);
> - trace_seq_printf(s, "%d\n", call->event.type);
> -
> - r = simple_read_from_buffer(ubuf, cnt, ppos,
> - s->buffer, s->len);
> - kfree(s);
> - return r;
> + len = sprintf(buf, "%d\n", id);
> + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
> }
>
> static ssize_t
> @@ -1264,7 +1257,6 @@ static const struct file_operations ftrace_event_format_fops = {
> };
>
> static const struct file_operations ftrace_event_id_fops = {
> - .open = tracing_open_generic,
> .read = event_id_read,
> .llseek = default_llseek,
> };
> @@ -1496,8 +1488,8 @@ event_create_dir(struct dentry *parent,
>
> #ifdef CONFIG_PERF_EVENTS
> if (call->event.type && call->class->reg)
> - trace_create_file("id", 0444, file->dir, call,
> - id);
> + trace_create_file("id", 0444, file->dir,
> + (void*)(long)call->event.type, id);
> #endif
>
> /*
On Tue, 2013-07-16 at 20:57 +0200, Oleg Nesterov wrote:
> Kill tracing_open_generic_file() and tracing_release_generic_file(),
> they are racy anyway.
>
> Instead, change event_enable_read() and event_enable_write() to rely
> on event_mutex and file_inode(filp)->i_private != NULL check.
>
> trace_array_get() goes away. NOTE! this is actually wrong until we
> change instance_delete() path to nullify ->i_private.
I think this needs to be done first. Want me to take a crack at it, and
we can interleave our patches if needed.
-- Steve
(2013/07/17 3:56), Oleg Nesterov wrote:
> Hello.
>
> Completely untested and _incomplete_. This ignores instance_delete()
> and ftrace_event_format_fops, at least.
>
> But I am not going to even try to finish this series unless you tell
> me that you agree with this approach.
>
> I have no idea what else I could miss. I probably understand no more
> than 1% of this code, I won't be surprised if this can't work.
>
> To remind, we also need other changes we are discussing.
At a glance, you're trying to change which operation will
fail. Currently, a user cannot remove an event while someone
has files related to the event open. And this approach
changes that so someone can remove the event even while the
files are open (and the read/write operations will fail).
Do I understand correctly?
Thank you,
--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]
On 07/17, Masami Hiramatsu wrote:
>
> At a glance, you're trying to change which operation will
> fail. Currently, a user cannot remove an event while someone
> has files related to the event open. And this approach
> changes that so someone can remove the event even while the
> files are open (and the read/write operations will fail).
> Do I understand correctly?
Yes.
Once again, I am still not sure and I am asking for your review.
But to me this looks much better. To simplify the discussion, lets
consider ftrace_enable_fops in particular.
- Why should .open() block rmdir or unregister_uprobe_event?
- Why do we need .open() at all? Whatever it can do to
validate file/call/etc, .read/write can do the same.
- If we kill .open/release, we do not need the nontrivial
refcounting. Everything becomes simple, no need to keep
the state "in between".
We need event_mutex anyway (and note that other f_op's can
also rely on other locks taken by trace_remove_event_call),
the validation degrades to the trivial != NULL check.
- This also simplifies trace_remove_event_call() paths, we
know that once it takes event_mutex nobody can play with
ftrace_event_file/ftrace_event_call we are going to free.
Oleg.
On 07/16, Steven Rostedt wrote:
>
> On Tue, 2013-07-16 at 20:57 +0200, Oleg Nesterov wrote:
> > trace_array_get() goes away. NOTE! this is actually wrong until we
> > change instance_delete() path to nullify ->i_private.
>
> I think this needs to be done first.
Do you mean "change instance_delete() path to nullify ->i_private" ?
Yes. Well yes and no, afaics, but please ignore. Of course, I won't
send the patches with the holes which I knew about.
Once again, this is just RFC to know your and Masami's opinion. If
you think this can work, I'll try to resend this series with the
additional bits to cover instance_delete() too. _Afaics_ this needs
some temporary uglifications "in between".
So far I am sending 5/4 and 6/4 which changes ftrace_event_format_fops,
in reply to 0/4. The only problem with this file is that f_start/next
imho asks for cleanup which comes as 5/4.
Ignoring instance_delete(), I think this is all we need to make
event/* safe.
So please tell me what do you think.
Oleg.
f_next() looks overcomplicated, and it is not strictly correct
even if this doesn't matter.
Say, FORMAT_FIELD_SEPERATOR should not return NULL (means EOF)
if trace_get_fields() returns an empty list, we should simply
advance to FORMAT_PRINTFMT as we do when we find the end of list.
1. Change f_next() to return "struct list_head *" rather than
"ftrace_event_field *", and change f_show() to do list_entry().
This simplifies the code a bit, only f_show() needs to know
about ftrace_event_field, and f_next() can play with ->prev
directly
2. Change f_next() to not play with ->prev / return inside the
switch() statement. It can simply set node = head/common_head,
the prev-or-advance-to-the-next-magic below does all work.
While at it. f_start() looks overcomplicated too. I don't think
*pos == 0 makes sense as a separate case, just change this code
to do "while" instead of "do/while".
The patch also moves f_start() down, close to f_stop(). This is
purely cosmetic, just to make the locking added by the next patch
more clear/visible.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 60 +++++++++++++++---------------------------
1 files changed, 22 insertions(+), 38 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f4a86f9..6c3e4e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -806,59 +806,33 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_event_call *call = m->private;
- struct ftrace_event_field *field;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
+ struct list_head *node = v;
(*pos)++;
switch ((unsigned long)v) {
case FORMAT_HEADER:
- if (unlikely(list_empty(common_head)))
- return NULL;
-
- field = list_entry(common_head->prev,
- struct ftrace_event_field, link);
- return field;
+ node = common_head;
+ break;
case FORMAT_FIELD_SEPERATOR:
- if (unlikely(list_empty(head)))
- return NULL;
-
- field = list_entry(head->prev, struct ftrace_event_field, link);
- return field;
+ node = head;
+ break;
case FORMAT_PRINTFMT:
/* all done */
return NULL;
}
- field = v;
- if (field->link.prev == common_head)
+ node = node->prev;
+ if (node == common_head)
return (void *)FORMAT_FIELD_SEPERATOR;
- else if (field->link.prev == head)
+ else if (node == head)
return (void *)FORMAT_PRINTFMT;
-
- field = list_entry(field->link.prev, struct ftrace_event_field, link);
-
- return field;
-}
-
-static void *f_start(struct seq_file *m, loff_t *pos)
-{
- loff_t l = 0;
- void *p;
-
- /* Start by showing the header */
- if (!*pos)
- return (void *)FORMAT_HEADER;
-
- p = (void *)FORMAT_HEADER;
- do {
- p = f_next(m, p, &l);
- } while (p && l < *pos);
-
- return p;
+ else
+ return node;
}
static int f_show(struct seq_file *m, void *v)
@@ -884,8 +858,7 @@ static int f_show(struct seq_file *m, void *v)
return 0;
}
- field = v;
-
+ field = list_entry(v, struct ftrace_event_field, link);
/*
* Smartly shows the array type(except dynamic array).
* Normal:
@@ -912,6 +885,17 @@ static int f_show(struct seq_file *m, void *v)
return 0;
}
+static void *f_start(struct seq_file *m, loff_t *pos)
+{
+ void *p = (void *)FORMAT_HEADER;
+ loff_t l = 0;
+
+ while (p && l < *pos)
+ p = f_next(m, p, &l);
+
+ return p;
+}
+
static void f_stop(struct seq_file *m, void *p)
{
}
--
1.5.5.1
Finally change f_start() to take event_mutex and verify i_private,
f_stop() drops this lock. This closes the races with event_remove.
Note: the usage of event_mutex is sub-optimal but simple, we can
change this later.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 17 +++++++++++++----
1 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6c3e4e6..dc6c1ee 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -805,7 +805,8 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
+ struct inode *inode = m->private;
+ struct ftrace_event_call *call = inode->i_private;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -837,7 +838,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = m->private;
+ struct inode *inode = m->private;
+ struct ftrace_event_call *call = inode->i_private;
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -887,9 +889,16 @@ static int f_show(struct seq_file *m, void *v)
static void *f_start(struct seq_file *m, loff_t *pos)
{
+ struct inode *inode = m->private;
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
+ mutex_lock(&event_mutex);
+ if (unlikely(!inode->i_private)) {
+ mutex_unlock(&event_mutex);
+ return ERR_PTR(-ENODEV);
+ }
+
while (p && l < *pos)
p = f_next(m, p, &l);
@@ -898,6 +907,7 @@ static void *f_start(struct seq_file *m, loff_t *pos)
static void f_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&event_mutex);
}
static const struct seq_operations trace_format_seq_ops = {
@@ -909,7 +919,6 @@ static const struct seq_operations trace_format_seq_ops = {
static int trace_format_open(struct inode *inode, struct file *file)
{
- struct ftrace_event_call *call = inode->i_private;
struct seq_file *m;
int ret;
@@ -918,7 +927,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = call;
+ m->private = inode;
return 0;
}
--
1.5.5.1
On 07/17, Oleg Nesterov wrote:
>
> f_next() looks overcomplicated, and it is not strictly correct
> even if this doesn't matter.
Honestly, I do not know what the changelog can say to explain
this patch except "make the code simpler/cleaner".
To simplify the review, please see f_next/f_start with this patch
applied below. f_show() does field = list_entry(v, ...).
Probably makes sense anyway, I even tried to test it. So I would
not mind if you apply it as a separate cleanup ;)
Oleg.
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_event_call *call = m->private;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
(*pos)++;
switch ((unsigned long)v) {
case FORMAT_HEADER:
node = common_head;
break;
case FORMAT_FIELD_SEPERATOR:
node = head;
break;
case FORMAT_PRINTFMT:
/* all done */
return NULL;
}
node = node->prev;
if (node == common_head)
return (void *)FORMAT_FIELD_SEPERATOR;
else if (node == head)
return (void *)FORMAT_PRINTFMT;
else
return node;
}
static void *f_start(struct seq_file *m, loff_t *pos)
{
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
while (p && l < *pos)
p = f_next(m, p, &l);
return p;
}
On 07/16, Steven Rostedt wrote:
>
> On Tue, 2013-07-16 at 20:57 +0200, Oleg Nesterov wrote:
> > ftrace_event_id_fops and event_id_read() is overcomplicated.
> >
> > 1. Change event_create_dir() to pass "data = call->event.type"
> > to debugfs_create_file().
> >
> > This means that ftrace_event_id_fops doesn't need .open()
> > and event_id_read() can simply print (int)i_private
> >
> > 2. event_id_read() has no reason to kmalloc "struct trace_seq"
> > (more than PAGE_SIZE!), it can use a small buffer.
>
> Make #2 a separate patch. You can add it before this one.
OK. I'll extract #2 and re-send.
Oleg.
(2013/07/17 23:43), Oleg Nesterov wrote:
> On 07/17, Masami Hiramatsu wrote:
>>
>> At a glance, you're trying to change which operation will
>> fail. Currently, a user cannot remove an event while someone
>> has files related to the event open. And this approach
>> changes that so someone can remove the event even while the
>> files are open (and the read/write operations will fail).
>> Do I understand correctly?
>
> Yes.
>
> Once again, I am still not sure and I am asking for your review.
OK,
> But to me this looks much better. To simplify the discussion, lets
> consider ftrace_enable_fops in particular.
>
> - Why should .open() block rmdir or unregister_uprobe_event?
Because it is opened and being prepared for use. :)
But, yeah, if we expect there is only one user using
ftrace, accessing the removing event file is meaningless.
It should be failed.
> - Why do we need .open() at all? Whatever it can do to
> validate file/call/etc, .read/write can do the same.
Currently, just for preparing and reserving.
> - If we kill .open/release, we do not need the nontrivial
> refcounting. Everything becomes simple, no need to keep
> the state "in between".
That also means checking existence under the locked mutex
in all operations. And we have to check it, which actually concerns me.
refcounting is not so small and itself is complex, but it just
needs to inc/dec on open/close.
> We need event_mutex anyway (and note that other f_op's can
> also rely on other locks taken by trace_remove_event_call),
> the validation degrades to the trivial != NULL check.
>
> - This also simplifies trace_remove_event_call() paths, we
> know that once it takes event_mutex nobody can play with
> ftrace_event_file/ftrace_event_call we are going to free.
Hmm, it seems that we can remove only refcount check, or more?
Thank you,
--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]
(2013/07/18 4:19), Oleg Nesterov wrote:
> On 07/16, Steven Rostedt wrote:
>>
>> On Tue, 2013-07-16 at 20:57 +0200, Oleg Nesterov wrote:
>>> trace_array_get() goes away. NOTE! this is actually wrong until we
>>> change instance_delete() path to nullify ->i_private.
>>
>> I think this needs to be done first.
>
> Do you mean "change instance_delete() path to nullify ->i_private" ?
>
> Yes. Well yes and no, afaics, but please ignore. Of course, I won't
> send the patches with the holes which I knew about.
>
> Once again, this is just RFC to know your and Masami's opinion. If
> you think this can work, I'll try to resend this series with the
> additional bits to cover instance_delete() too. _Afaics_ this needs
> some temporary uglifications "in between".
I think your proposal goes a good direction, but it also requires
completeness, because, as I pointed, we have to take care of all
operations are correctly checked except for open/close.
So, I agree with Steven, I'd like to see a complete patchset.
Thank you,
--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]
On 07/18, Masami Hiramatsu wrote:
>
> (2013/07/18 4:19), Oleg Nesterov wrote:
> > Once again, this is just RFC to know your and Masami's opinion. If
> > you think this can work, I'll try to resend this series with the
> > additional bits to cover instance_delete() too. _Afaics_ this needs
> > some temporary uglifications "in between".
>
> I think your proposal goes a good direction, but it also requires
> completeness, because, as I pointed, we have to take care of all
> operations are correctly checked except for open/close.
>
> So, I agree with Steven, I'd like to see a complete patchset.
OK. In fact _I think_ it is almost complete, just this series should
remove tracing_open_generic_file() in the last patch, after we change
__trace_remove_event_dirs() to use invalidate_event_files().
I'll try to finish this series and resend, probably tomorrow.
Oleg.
On 07/18, Masami Hiramatsu wrote:
>
> (2013/07/17 23:43), Oleg Nesterov wrote:
> > Once again, I am still not sure and I am asking for your review.
>
> OK,
Good ;)
> > - If we kill .open/release, we do not need the nontrivial
> > refcounting. Everything becomes simple, no need to keep
> > the state "in between".
>
> That also means to refrain checking existence under locking mutex
> in all operations.
Speaking of event_enable_write() it needs the same mutex anyway.
> And we have to check it, which I actually concern.
> refcounting is not so small and itself is complex, but it just
> needs to inc/dec on open/close.
And this inc/dec needs event_mutex too, and the code is not trivial.
But yes, sure, I am not saying that it is always a win performance-wise.
In particular, with the patches I sent event/format holds event_mutex
between .start and .stop. But again, this is only to make the patch
simple. We can narrow the scope of this lock, we can switch to i_mutex
(needs the trivial change in invalidate_event_files) which should not
be contended.
And of course, sometimes it is better to do the "hard work" in .open()
and make .read/write as fast/simple as possible. But not in event/*
case, I think.
> > - This also simplifies trace_remove_event_call() paths, we
> > know that once it takes event_mutex nobody can play with
> > ftrace_event_file/ftrace_event_call we are going to free.
>
> Hmm, it seems that we can remove only refcount check, or more?
But this check is not necessarily trivial too.
And to remind, personally I do not really like the fact that the
opened file blocks rmdir or unregister_probe_event().
To summarise. I believe that this approach is better (and simpler)
in general. But I understand that "better" is subjective, so I won't
argue. Not to mention, it can be simply wrong so I will heavily rely
on your/Steven's review anyway.
Oleg.