Hello.
Untested again, only for review. Sorry, I know this is annoying.
Will try to test tomorrow.
The only essential change is that "clear d_subdirs->i_private"
comes as a last patch, this needs the additional comment in the
changelogs of 1-4 but avoids the temporary complications.
I am sending this series right now because:
- I promised to do this today ;)
- Whatever I do, correctness-wise this needs your review
anyway.
- Even if correct, the very intent needs an ack from Masami
who seems to disagree with this approach.
This series shifts the validation from f_op->open() (which
currently doesn't work) to ->read/write.
While I personally think this makes more sense (simpler
and does not block rmdir/event_remove), I understand that
this is subjective, so I won't argue too much if you prefer
to add the additional ref-counts instead.
From 6/6:
Note: this doesn't fix other problems, event_remove() can
destroy the active ftrace_event_call, we need more changes but
those changes are completely orthogonal.
Oleg.
kernel/trace/trace.h | 4 +-
kernel/trace/trace_events.c | 176 ++++++++++++++++++------------------
kernel/trace/trace_events_filter.c | 21 ++---
3 files changed, 99 insertions(+), 102 deletions(-)
event_id_read() is racy, ftrace_event_call can be already freed
by trace_remove_event_call() callers.
Change event_create_dir() to pass "data = call->event.type", this
is all event_id_read() needs. ftrace_event_id_fops no longer needs
tracing_open_generic().
We add the new helper, event_file_data(), to read ->i_private, it
will have more users.
Note: currently ACCESS_ONCE() and "id != 0" check are not needed,
but we are going to change event_remove/rmdir to clear ->i_private.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 17 ++++++++++++-----
1 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 76eed53..77990c4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -409,6 +409,11 @@ static void put_system(struct ftrace_subsystem_dir *dir)
mutex_unlock(&event_mutex);
}
+static void *event_file_data(struct file *filp)
+{
+ return ACCESS_ONCE(file_inode(filp)->i_private);
+}
+
/*
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
@@ -946,14 +951,17 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ int id = (long)event_file_data(filp);
char buf[32];
int len;
if (*ppos)
return 0;
- len = sprintf(buf, "%d\n", call->event.type);
+ if (unlikely(!id))
+ return -ENODEV;
+
+ len = sprintf(buf, "%d\n", id);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
@@ -1240,7 +1248,6 @@ static const struct file_operations ftrace_event_format_fops = {
};
static const struct file_operations ftrace_event_id_fops = {
- .open = tracing_open_generic,
.read = event_id_read,
.llseek = default_llseek,
};
@@ -1488,8 +1495,8 @@ event_create_dir(struct dentry *parent,
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir, call,
- id);
+ trace_create_file("id", 0444, file->dir,
+ (void *)(long)call->event.type, id);
#endif
/*
--
1.5.5.1
tracing_open_generic_file() is racy, ftrace_event_file can be
already freed by rmdir or trace_remove_event_call().
Change event_enable_read() and event_disable_read() to read and
verify "file = i_private" under event_mutex.
This fixes nothing, but now we can change debugfs_remove("enable")
callers to nullify ->i_private and fix the problem.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 32 ++++++++++++++++++++------------
1 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 77990c4..821768e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -684,19 +684,28 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
+ unsigned long flags;
char buf[4] = "0";
- if (file->flags & FTRACE_EVENT_FL_ENABLED &&
- !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (likely(file))
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+ if (!file)
+ return -ENODEV;
+
+ if (flags & FTRACE_EVENT_FL_ENABLED &&
+ !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
- file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
+ flags & FTRACE_EVENT_FL_SOFT_MODE)
strcat(buf, "*");
strcat(buf, "\n");
-
return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
}
@@ -704,13 +713,10 @@ static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
unsigned long val;
int ret;
- if (!file)
- return -EINVAL;
-
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -722,8 +728,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
switch (val) {
case 0:
case 1:
+ ret = -ENODEV;
mutex_lock(&event_mutex);
- ret = ftrace_event_enable_disable(file, val);
+ file = event_file_data(filp);
+ if (likely(file))
+ ret = ftrace_event_enable_disable(file, val);
mutex_unlock(&event_mutex);
break;
@@ -732,7 +741,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
*ppos += cnt;
-
return ret ? ret : cnt;
}
--
1.5.5.1
Preparation for the next patch. Extract the common code from
remove_event_from_tracers() and __trace_remove_event_dirs()
into the new helper, remove_event_file_dir().
The patch looks more complicated than it actually is, it also
moves remove_subsystem() up to avoid the forward declaration.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 66 +++++++++++++++++++++----------------------
1 files changed, 32 insertions(+), 34 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4b43111..3ae037c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -409,11 +409,31 @@ static void put_system(struct ftrace_subsystem_dir *dir)
mutex_unlock(&event_mutex);
}
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+{
+ if (!dir)
+ return;
+
+ if (!--dir->nr_events) {
+ debugfs_remove_recursive(dir->entry);
+ list_del(&dir->list);
+ __put_system_dir(dir);
+ }
+}
+
static void *event_file_data(struct file *filp)
{
return ACCESS_ONCE(file_inode(filp)->i_private);
}
+static void remove_event_file_dir(struct ftrace_event_file *file)
+{
+ list_del(&file->list);
+ debugfs_remove_recursive(file->dir);
+ remove_subsystem(file->system);
+ kmem_cache_free(file_cachep, file);
+}
+
/*
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
@@ -1542,40 +1562,22 @@ event_create_dir(struct dentry *parent,
return 0;
}
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
-{
- if (!dir)
- return;
-
- if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
- list_del(&dir->list);
- __put_system_dir(dir);
- }
-}
-
static void remove_event_from_tracers(struct ftrace_event_call *call)
{
struct ftrace_event_file *file;
struct trace_array *tr;
do_for_each_event_file_safe(tr, file) {
-
- if (file->event_call != call)
- continue;
-
- list_del(&file->list);
- debugfs_remove_recursive(file->dir);
- remove_subsystem(file->system);
- kmem_cache_free(file_cachep, file);
-
- /*
- * The do_for_each_event_file_safe() is
- * a double loop. After finding the call for this
- * trace_array, we use break to jump to the next
- * trace_array.
- */
- break;
+ if (file->event_call == call) {
+ remove_event_file_dir(file);
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ }
} while_for_each_event_file();
}
@@ -2298,12 +2300,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
{
struct ftrace_event_file *file, *next;
- list_for_each_entry_safe(file, next, &tr->events, list) {
- list_del(&file->list);
- debugfs_remove_recursive(file->dir);
- remove_subsystem(file->system);
- kmem_cache_free(file_cachep, file);
- }
+ list_for_each_entry_safe(file, next, &tr->events, list)
+ remove_event_file_dir(file);
}
static void
--
1.5.5.1
event_filter_read/write() are racy, ftrace_event_call can be already
freed by trace_remove_event_call() callers.
1. Shift mutex_lock(event_mutex) from print/apply_event_filter to
the callers. Rename print/apply just in case.
2. Change the callers, event_filter_read() and event_filter_write()
to read i_private under this mutex and abort if it is NULL.
This fixes nothing, but now we can change debugfs_remove("filter")
callers to nullify ->i_private and fix the problem.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace.h | 4 ++--
kernel/trace/trace_events.c | 28 ++++++++++++++++++----------
kernel/trace/trace_events_filter.c | 21 ++++++++-------------
3 files changed, 28 insertions(+), 25 deletions(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index afaae41..8b7c72b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -985,9 +985,9 @@ struct filter_pred {
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_call *call,
+extern void __print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_call *call,
+extern int __apply_event_filter(struct ftrace_event_call *call,
char *filter_string);
extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 821768e..0f081c0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -977,9 +977,9 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
struct trace_seq *s;
- int r;
+ int r = -ENODEV;
if (*ppos)
return 0;
@@ -987,14 +987,18 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
-
trace_seq_init(s);
- print_event_filter(call, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ mutex_lock(&event_mutex);
+ call = event_file_data(filp);
+ if (likely(call))
+ __print_event_filter(call, s);
+ mutex_unlock(&event_mutex);
- kfree(s);
+ if (call)
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ kfree(s);
return r;
}
@@ -1002,9 +1006,9 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
char *buf;
- int err;
+ int err = -ENODEV;
if (cnt >= PAGE_SIZE)
return -EINVAL;
@@ -1019,13 +1023,17 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_event_filter(call, buf);
+ mutex_lock(&event_mutex);
+ call = event_file_data(filp);
+ if (likely(call))
+ err = __apply_event_filter(call, buf);
+ mutex_unlock(&event_mutex);
+
free_page((unsigned long) buf);
if (err < 0)
return err;
*ppos += cnt;
-
return cnt;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c7b75a..71f76ee 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
-void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
+/* caller must hold event_mutex */
+void __print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter;
+ struct event_filter *filter = call->filter;
- mutex_lock(&event_mutex);
- filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
trace_seq_puts(s, "none\n");
- mutex_unlock(&event_mutex);
}
void print_subsystem_event_filter(struct event_subsystem *system,
@@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
return err;
}
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+/* caller must hold event_mutex */
+int __apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
struct event_filter *filter;
int err = 0;
- mutex_lock(&event_mutex);
-
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(call);
filter = call->filter;
if (!filter)
- goto out_unlock;
+ goto out;
RCU_INIT_POINTER(call->filter, NULL);
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
- goto out_unlock;
+ goto out;
}
err = create_filter(call, filter_string, true, &filter);
@@ -1884,9 +1881,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
__free_filter(tmp);
}
}
-out_unlock:
- mutex_unlock(&event_mutex);
-
+out:
return err;
}
--
1.5.5.1
Change remove_event_file_dir() ->i_private for every file we
are going to remove.
tracing_open_generic_file() and tracing_release_generic_file()
can go away, ftrace_enable_fops and ftrace_event_filter_fops()
use tracing_open_generic() but only to check tracing_disabled.
This fixes all races with event_remove() or instance_delete().
f_op->read/write/whatever can never use the freed file/call,
all event/* files were changed to check and use ->i_private
under event_mutex.
Note: this doesn't fix other problems, event_remove() can
destroy the active ftrace_event_call, we need more changes but
those changes are completely orthogonal.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 40 ++++++++--------------------------------
1 files changed, 8 insertions(+), 32 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3ae037c..298e4fa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -428,42 +428,19 @@ static void *event_file_data(struct file *filp)
static void remove_event_file_dir(struct ftrace_event_file *file)
{
+ struct dentry *dir = file->dir;
+ struct dentry *child;
+ /* ->i_mutex is not needed, nobody can create/remove a file */
+ list_for_each_entry(child, &dir->d_subdirs, d_u.d_child)
+ child->d_inode->i_private = NULL;
+
list_del(&file->list);
- debugfs_remove_recursive(file->dir);
+ debugfs_remove_recursive(dir);
remove_subsystem(file->system);
kmem_cache_free(file_cachep, file);
}
/*
- * Open and update trace_array ref count.
- * Must have the current trace_array passed to it.
- */
-static int tracing_open_generic_file(struct inode *inode, struct file *filp)
-{
- struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
- int ret;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
- ret = tracing_open_generic(inode, filp);
- if (ret < 0)
- trace_array_put(tr);
- return ret;
-}
-
-static int tracing_release_generic_file(struct inode *inode, struct file *filp)
-{
- struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
-
- trace_array_put(tr);
-
- return 0;
-}
-
-/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
static int
@@ -1274,10 +1251,9 @@ static const struct file_operations ftrace_set_event_fops = {
};
static const struct file_operations ftrace_enable_fops = {
- .open = tracing_open_generic_file,
+ .open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
- .release = tracing_release_generic_file,
.llseek = default_llseek,
};
--
1.5.5.1
trace_format_open() and trace_format_seq_ops are racy, nothing
protects ftrace_event_call from trace_remove_event_call().
Change f_start() to take event_mutex and verify i_private != NULL,
change f_stop() to drop this lock.
This fixes nothing, but now we can change debugfs_remove("format")
callers to nullify ->i_private and fix the problem.
Note: the usage of event_mutex is sub-optimal but simple, we can
change this later.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 13 +++++++++----
1 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0f081c0..4b43111 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -838,7 +838,7 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -870,7 +870,7 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -923,6 +923,11 @@ static void *f_start(struct seq_file *m, loff_t *pos)
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ if (!event_file_data(m->private))
+ return ERR_PTR(-ENODEV);
+
while (l < *pos && p)
p = f_next(m, p, &l);
@@ -931,6 +936,7 @@ static void *f_start(struct seq_file *m, loff_t *pos)
static void f_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&event_mutex);
}
static const struct seq_operations trace_format_seq_ops = {
@@ -942,7 +948,6 @@ static const struct seq_operations trace_format_seq_ops = {
static int trace_format_open(struct inode *inode, struct file *file)
{
- struct ftrace_event_call *call = inode->i_private;
struct seq_file *m;
int ret;
@@ -951,7 +956,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = call;
+ m->private = file;
return 0;
}
--
1.5.5.1
On 07/23, Oleg Nesterov wrote:
>
> Will try to test tomorrow.
Well, it seems to work "as expected", but I forgot about debugfs
problems.
And I am not sure I understand these problems correctly, will try
to report more tomorrow. This makes me think again that perhaps
the subsystem logic is not correct, but probably I misunderstood
it for the 2nd time.
Oleg.
On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> @@ -732,7 +741,6 @@ event_enable_write(struct file *filp, const char
> __user *ubuf, size_t cnt,
> }
>
> *ppos += cnt;
> -
> return ret ? ret : cnt;
> }
Actually, I prefer the space.
-- Steve
On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> event_filter_read/write() are racy, ftrace_event_call can be already
> freed by trace_remove_event_call() callers.
>
> 1. Shift mutex_lock(event_mutex) from print/apply_event_filter to
> the callers. Rename print/apply just in case.
I wouldn't do the rename unless there is a print_/apply_ version added
later. Just add a comment that event_mutex must be held for those
functions.
>
> 2. Change the callers, event_filter_read() and event_filter_write()
> to read i_private under this mutex and abort if it is NULL.
>
> This fixes nothing, but now we can change debugfs_remove("filter")
> callers to nullify ->i_private and fix the the problem.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/trace/trace.h | 4 ++--
> kernel/trace/trace_events.c | 28 ++++++++++++++++++----------
> kernel/trace/trace_events_filter.c | 21 ++++++++-------------
> 3 files changed, 28 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index afaae41..8b7c72b 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -985,9 +985,9 @@ struct filter_pred {
>
> extern enum regex_type
> filter_parse_regex(char *buff, int len, char **search, int *not);
> -extern void print_event_filter(struct ftrace_event_call *call,
> +extern void __print_event_filter(struct ftrace_event_call *call,
> struct trace_seq *s);
> -extern int apply_event_filter(struct ftrace_event_call *call,
> +extern int __apply_event_filter(struct ftrace_event_call *call,
> char *filter_string);
> extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
> char *filter_string);
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 821768e..0f081c0 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -977,9 +977,9 @@ static ssize_t
> event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
> loff_t *ppos)
> {
> - struct ftrace_event_call *call = filp->private_data;
> + struct ftrace_event_call *call;
> struct trace_seq *s;
> - int r;
> + int r = -ENODEV;
>
> if (*ppos)
> return 0;
> @@ -987,14 +987,18 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
> s = kmalloc(sizeof(*s), GFP_KERNEL);
> if (!s)
> return -ENOMEM;
> -
Again, I prefer the space. Just my preference.
> trace_seq_init(s);
>
> - print_event_filter(call, s);
> - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
> + mutex_lock(&event_mutex);
> + call = event_file_data(filp);
> + if (likely(call))
This isn't a fast path. Remove the "likely". It just makes the code
ugly.
> + __print_event_filter(call, s);
> + mutex_unlock(&event_mutex);
>
> - kfree(s);
> + if (call)
Especially since we have no likely here.
> + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
>
> + kfree(s);
> return r;
> }
>
> @@ -1002,9 +1006,9 @@ static ssize_t
> event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
> loff_t *ppos)
> {
> - struct ftrace_event_call *call = filp->private_data;
> + struct ftrace_event_call *call;
> char *buf;
> - int err;
> + int err = -ENODEV;
>
> if (cnt >= PAGE_SIZE)
> return -EINVAL;
> @@ -1019,13 +1023,17 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
> }
> buf[cnt] = '\0';
>
> - err = apply_event_filter(call, buf);
> + mutex_lock(&event_mutex);
> + call = event_file_data(filp);
> + if (likely(call))
Again, remove the likely.
> + err = __apply_event_filter(call, buf);
> + mutex_unlock(&event_mutex);
> +
> free_page((unsigned long) buf);
> if (err < 0)
> return err;
>
> *ppos += cnt;
> -
> return cnt;
> }
>
> diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
> index 0c7b75a..71f76ee 100644
> --- a/kernel/trace/trace_events_filter.c
> +++ b/kernel/trace/trace_events_filter.c
> @@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
> free_page((unsigned long) buf);
> }
>
> -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
> +/* caller must hold event_mutex */
> +void __print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
> {
> - struct event_filter *filter;
> + struct event_filter *filter = call->filter;
>
> - mutex_lock(&event_mutex);
> - filter = call->filter;
> if (filter && filter->filter_string)
> trace_seq_printf(s, "%s\n", filter->filter_string);
> else
> trace_seq_puts(s, "none\n");
> - mutex_unlock(&event_mutex);
> }
>
> void print_subsystem_event_filter(struct event_subsystem *system,
> @@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
> return err;
> }
>
> -int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
> +/* caller must hold event_mutex */
> +int __apply_event_filter(struct ftrace_event_call *call, char *filter_string)
> {
> struct event_filter *filter;
> int err = 0;
We can remove the 0 init here.
>
> - mutex_lock(&event_mutex);
> -
> if (!strcmp(strstrip(filter_string), "0")) {
> filter_disable(call);
> filter = call->filter;
> if (!filter)
> - goto out_unlock;
> + goto out;
Remove the "out" label, and just return 0 here.
> RCU_INIT_POINTER(call->filter, NULL);
> /* Make sure the filter is not being used */
> synchronize_sched();
> __free_filter(filter);
> - goto out_unlock;
> + goto out;
Here too.
> }
>
> err = create_filter(call, filter_string, true, &filter);
> @@ -1884,9 +1881,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
> __free_filter(tmp);
> }
> }
> -out_unlock:
> - mutex_unlock(&event_mutex);
> -
> +out:
> return err;
> }
>
-- Steve
On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> static void remove_event_from_tracers(struct ftrace_event_call *call)
> {
> struct ftrace_event_file *file;
> struct trace_array *tr;
>
> do_for_each_event_file_safe(tr, file) {
> -
> - if (file->event_call != call)
> - continue;
> -
> - list_del(&file->list);
> - debugfs_remove_recursive(file->dir);
> - remove_subsystem(file->system);
> - kmem_cache_free(file_cachep, file);
> -
> - /*
> - * The do_for_each_event_file_safe() is
> - * a double loop. After finding the call for this
> - * trace_array, we use break to jump to the next
> - * trace_array.
> - */
> - break;
> + if (file->event_call == call) {
I don't care if you do it this way because it's just two lines, but the
reason I do the:
if (file->event_call != call)
continue;
is to keep the indentation down.
-- Steve
> + remove_event_file_dir(file);
> + /*
> + * The do_for_each_event_file_safe() is
> + * a double loop. After finding the call for this
> + * trace_array, we use break to jump to the next
> + * trace_array.
> + */
> + break;
> + }
> } while_for_each_event_file();
> }
>
> @@ -2298,12 +2300,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
> {
> struct ftrace_event_file *file, *next;
>
> - list_for_each_entry_safe(file, next, &tr->events, list) {
> - list_del(&file->list);
> - debugfs_remove_recursive(file->dir);
> - remove_subsystem(file->system);
> - kmem_cache_free(file_cachep, file);
> - }
> + list_for_each_entry_safe(file, next, &tr->events, list)
> + remove_event_file_dir(file);
> }
>
> static void
On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> Change remove_event_file_dir() ->i_private for every file we
> are going to remove.
>
> tracing_open_generic_file() and tracing_release_generic_file()
> can go away, ftrace_enable_fops and ftrace_event_filter_fops()
> use tracing_open_generic() but only to check tracing_disabled.
>
> This fixes all races with event_remove() or instance_delete().
> f_op->read/write/whatever can never use the freed file/call,
> all event/* files were changed to check and use ->i_private
> under event_mutex.
>
> Note: this doesn't not fix other problems, event_remove() can
> destroy the active ftrace_event_call, we need more changes but
> those changes are completely orthogonal.
Hmm, but this patch opens up that race right? We remove the tr ref
counter updates here.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/trace/trace_events.c | 40 ++++++++--------------------------------
> 1 files changed, 8 insertions(+), 32 deletions(-)
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 3ae037c..298e4fa 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -428,42 +428,19 @@ static void *event_file_data(struct file *filp)
>
> static void remove_event_file_dir(struct ftrace_event_file *file)
> {
> + struct dentry *dir = file->dir;
> + struct dentry *child;
Need space here.
> + /* ->i_mutex is not needed, nodody can create/remove a file */
"nodody" ;-)
-- Steve
> + list_for_each_entry(child, &dir->d_subdirs, d_u.d_child)
> + child->d_inode->i_private = NULL;
> +
> list_del(&file->list);
> - debugfs_remove_recursive(file->dir);
> + debugfs_remove_recursive(dir);
> remove_subsystem(file->system);
> kmem_cache_free(file_cachep, file);
> }
>
> /*
> - * Open and update trace_array ref count.
> - * Must have the current trace_array passed to it.
> - */
> -static int tracing_open_generic_file(struct inode *inode, struct file *filp)
> -{
> - struct ftrace_event_file *file = inode->i_private;
> - struct trace_array *tr = file->tr;
> - int ret;
> -
> - if (trace_array_get(tr) < 0)
> - return -ENODEV;
> -
> - ret = tracing_open_generic(inode, filp);
> - if (ret < 0)
> - trace_array_put(tr);
> - return ret;
> -}
> -
> -static int tracing_release_generic_file(struct inode *inode, struct file *filp)
> -{
> - struct ftrace_event_file *file = inode->i_private;
> - struct trace_array *tr = file->tr;
> -
> - trace_array_put(tr);
> -
> - return 0;
> -}
> -
> -/*
> * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
> */
> static int
> @@ -1274,10 +1251,9 @@ static const struct file_operations ftrace_set_event_fops = {
> };
>
> static const struct file_operations ftrace_enable_fops = {
> - .open = tracing_open_generic_file,
> + .open = tracing_open_generic,
> .read = event_enable_read,
> .write = event_enable_write,
> - .release = tracing_release_generic_file,
> .llseek = default_llseek,
> };
>
On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> event_id_read() is racy, ftrace_event_call can be already freed
> by trace_remove_event_call() callers.
>
> Change event_create_dir() to pass "data = call->event.type", this
> is all event_id_read() needs. ftrace_event_id_fops no longer needs
> tracing_open_generic().
>
> We add the new helper, event_file_data(), to read ->i_private, it
> will have more users.
>
> Note: currently ACCESS_ONCE() and "id != 0" check are not needed,
> but we are going to change event_remove/rmdir to clear ->i_mutex.
We are changing i_mutex or i_private?
Anyway, this still looks too complex for the id. Just pass the id number
to the filp->private_data, and use that. It should never be cleared, and
as we are not using any pointers it will always return what the id
is/was.
Keep tracing_open_generic() and have:
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t
*ppos)
{
int id = (int)(unsigned long)filp->private_data;
char buf[32];
int len;
if (*ppos)
return 0;
len = sprintf(buf, "%d\n", id);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
No need for accessing the inode->i_private.
-- Steve
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/trace/trace_events.c | 17 ++++++++++++-----
> 1 files changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 76eed53..77990c4 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -409,6 +409,11 @@ static void put_system(struct ftrace_subsystem_dir *dir)
> mutex_unlock(&event_mutex);
> }
>
> +static void *event_file_data(struct file *filp)
> +{
> + return ACCESS_ONCE(file_inode(filp)->i_private);
> +}
> +
> /*
> * Open and update trace_array ref count.
> * Must have the current trace_array passed to it.
> @@ -946,14 +951,17 @@ static int trace_format_open(struct inode *inode, struct file *file)
> static ssize_t
> event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> {
> - struct ftrace_event_call *call = filp->private_data;
> + int id = (long)event_file_data(filp);
> char buf[32];
> int len;
>
> if (*ppos)
> return 0;
>
> - len = sprintf(buf, "%d\n", call->event.type);
> + if (unlikely(!id))
> + return -ENODEV;
> +
> + len = sprintf(buf, "%d\n", id);
> return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
> }
>
> @@ -1240,7 +1248,6 @@ static const struct file_operations ftrace_event_format_fops = {
> };
>
> static const struct file_operations ftrace_event_id_fops = {
> - .open = tracing_open_generic,
> .read = event_id_read,
> .llseek = default_llseek,
> };
> @@ -1488,8 +1495,8 @@ event_create_dir(struct dentry *parent,
>
> #ifdef CONFIG_PERF_EVENTS
> if (call->event.type && call->class->reg)
> - trace_create_file("id", 0444, file->dir, call,
> - id);
> + trace_create_file("id", 0444, file->dir,
> + (void *)(long)call->event.type, id);
> #endif
>
> /*
On 07/24, Steven Rostedt wrote:
>
> On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> > Change remove_event_file_dir() ->i_private for every file we
> > are going to remove.
> >
> > tracing_open_generic_file() and tracing_release_generic_file()
> > can go away, ftrace_enable_fops and ftrace_event_filter_fops()
> > use tracing_open_generic() but only to check tracing_disabled.
> >
> > This fixes all races with event_remove() or instance_delete().
> > f_op->read/write/whatever can never use the freed file/call,
> > all event/* files were changed to check and use ->i_private
> > under event_mutex.
> >
> > Note: this doesn't fix other problems, event_remove() can
> > destroy the active ftrace_event_call, we need more changes but
> > those changes are completely orthogonal.
>
> Hmm, but this patch opens up that race right? We remove the tr ref
> counter updates here.
But we do not care or I missed something. instance_delete() takes
event_mutex and does __trace_remove_event_dirs() before anything
else. (perhaps it makes sense to move list_del() down but afaics
currently this doesn't matter).
If event_enable_write() takes this mutex first we can pretend it
was called even before instance_delete(). Otherwise _write() will
notice i_private == NULL and do nothing.
Let me also clarify which "other problems" I meant. We
still need the already discussed patch below, and we still need
the changes in kprobes/uprobes (you already made these patches).
Except, probe_remove_event_call() doesn't need the "call->flags"
check, of course.
Or I misunderstood?
As for your other comments - thanks, I'll update this series.
Oleg.
----------------------------------------------------------------------
tracing: trace_remove_event_call() should fail if call/file is in use
Change trace_remove_event_call(call) to return the error if this
call is active. This is what the callers assume but can't verify
outside of the tracing locks. Both trace_kprobe.c/trace_uprobe.c
need the additional changes, unregister_trace_probe() should abort
if trace_remove_event_call() fails.
We also check TRACE_EVENT_FL_REF_MASK to ensure that nobody opened
the files we are going to remove, this means that nobody can access
the soon-to-be-freed ftrace_event_file/call via filp->private_data.
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/linux/ftrace_event.h | 2 +-
kernel/trace/trace_events.c | 33 +++++++++++++++++++++++++++++++--
2 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 72ff2c6..bdf6bdd 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -338,7 +338,7 @@ extern int trace_define_field(struct ftrace_event_call *call, const char *type,
const char *name, int offset, int size,
int is_signed, int filter_type);
extern int trace_add_event_call(struct ftrace_event_call *call);
-extern void trace_remove_event_call(struct ftrace_event_call *call);
+extern int trace_remove_event_call(struct ftrace_event_call *call);
#define is_signed_type(type) (((type)(-1)) < (type)1)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 90cf243..1a5547e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1766,16 +1766,45 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
destroy_preds(call);
}
+static int probe_remove_event_call(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+ if (call->flags & TRACE_EVENT_FL_REF_MASK)
+ return -EBUSY;
+
+#ifdef CONFIG_PERF_EVENTS
+ if (call->perf_refcount)
+ return -EBUSY;
+#endif
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return -EBUSY;
+ break;
+ } while_for_each_event_file();
+
+ __trace_remove_event_call(call);
+
+ return 0;
+}
+
/* Remove an event_call */
-void trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct ftrace_event_call *call)
{
+ int ret;
+
mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
down_write(&trace_event_sem);
- __trace_remove_event_call(call);
+ ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+
+ return ret;
}
#define for_each_event(event, start, end) \
--
1.7.10.4
On 07/24, Steven Rostedt wrote:
>
> On Tue, 2013-07-23 at 22:59 +0200, Oleg Nesterov wrote:
> > event_id_read() is racy, ftrace_event_call can be already freed
> > by trace_remove_event_call() callers.
> >
> > Change event_create_dir() to pass "data = call->event.type", this
> > is all event_id_read() needs. ftrace_event_id_fops no longer needs
> > tracing_open_generic().
> >
> > We add the new helper, event_file_data(), to read ->i_private, it
> > will have more users.
> >
> > Note: currently ACCESS_ONCE() and "id != 0" check are not needed,
> > but we are going to change event_remove/rmdir to clear ->i_mutex.
>
> We are changing i_mutex or i_private?
i_private, yes, will fix.
Steven, I agree with all your comments, thanks. Except this one.
> Anyway, this still looks too complex for the id. Just pass the id number
> to the filp->private_data, and use that. It should never be cleared, and
> as we are not using any pointers it will always return what the id
> is/was.
>
> Keep tracing_open_generic() and have:
>
> event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t
> *ppos)
> {
> int id = (int)(unsigned long)filp->private_data;
> char buf[32];
> int len;
>
> if (*ppos)
> return 0;
>
> len = sprintf(buf, "%d\n", id);
> return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
> }
>
> No need for accessing the inode->i_private.
Unless you strongly object, I'd prefer to avoid filp->private_data
and use event_file_data(). Also, I'd like to keep "if (!id) ENODEV".
The main reason is the consistency with other f_op's, and this allows
us to change event_file_data/remove_event_file_dir without breaking
event_id_read().
Also. Even if we use filp->private_data, it can be NULL/0, ->open()
can race with rmdir. Of course we could update remove_event_file_dir()
to skip this file, but I am sure we should not do this.
Oleg.
On 07/24, Oleg Nesterov wrote:
>
> On 07/23, Oleg Nesterov wrote:
> >
> > Will try to test tomorrow.
>
> Well, it seems to work "as expected",
Yes. And so far I think it is fine.
> but I forgot about debugfs
> problems.
And yes. We still have the problems with "Failed to create system
directory" if subsystem was removed while someone keeps the file
opened.
BUT: so far I think this is another and unrelated problem, see
below.
> This makes me think again that perhaps
> the subsystem logic is not correct, but probably I misunderstood
> it for the 2nd time.
Damn yes, I misread this code again.
Now I am almost sure fs/debugfs is wrong. At least it doesn't match
my expectations ;)
Suppose we have
dir1/
file1
dir2/
file2
somewhere in debugfs.
Suppose that someone opens dir1/dir2/file2.
Now. debugfs_remove_recursive(dir1/dir2) succeeds, and dir1/dir2 goes
away.
But debugfs_remove_recursive(dir1) silently fails and doesn't remove
this directory. It relies on list_empty(d_subdirs) and this doesn't
look right.
If nothing else, note that simple_empty() doesn't blindly return
list_empty(d_subdirs), this list still has the deleted nodes.
The patch below fixes the problem, but I do not think it is really
correct, I'll try to think more.
Oleg.
--- a/fs/debugfs/inode.c~ 2013-03-20 12:54:00.000000000 +0100
+++ b/fs/debugfs/inode.c 2013-07-25 18:31:46.000000000 +0200
@@ -566,7 +579,7 @@ void debugfs_remove_recursive(struct den
* If "child" isn't empty, walk down the tree and
* remove all its descendants first.
*/
- if (!list_empty(&child->d_subdirs)) {
+ if (debugfs_positive(child) && !list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
mutex_lock(&parent->d_inode->i_mutex);
Hello.
debugfs_remove_recursive() looks wrong to me. The patch below is
probably wrong too and it was only slightly tested, but could you
please confirm my understanding?
First of all, the usage of simple_release_fs() doesn't look correct
but please ignore, the patch doesn't try to fix this.
The problem is, debugfs_remove_recursive() wrongly (I think) assumes
that a) !list_empty(d_subdirs) means that this dir should be removed,
and b) if d_subdirs does not become empty after __debugfs_remove()
debugfs_remove_recursive() gives up and doesn't even try to remove
other entries.
I think this is wrong, and at least we need the additional
debugfs_positive() check before list_empty() or just simple_empty()
instead.
Test-case. Suppose we have
dir1/
dir2/
file2
file1
somewhere in debugfs.
Suppose that someone opens dir1/dir2/file2 and keeps it opened.
Now. debugfs_remove_recursive(dir1/dir2) succeeds, and dir1/dir2 goes
away.
But debugfs_remove_recursive(dir1) silently fails and doesn't remove
this directory. Because it tries to delete (the already deleted)
dir1/dir2/file2 again and then fails due to "Avoid infinite loop"
logic.
The patch seems to fix the problem. Note also that the new code
tries to remove as much as possible, iow it does not give up if
__debugfs_remove() fails for any reason. However I am not sure
we want this, perhaps it should simply abort and return the err.
To simplify the review, this is how it looks with the patch applied:
void debugfs_remove_recursive(struct dentry *dentry)
{
struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
parent = dentry->d_parent;
if (!parent || !parent->d_inode)
return;
parent = dentry;
down:
mutex_lock(&parent->d_inode->i_mutex);
child = list_first_entry(&parent->d_subdirs, struct dentry, d_u.d_child);
while (&child->d_u.d_child != &parent->d_subdirs) {
next = list_next_entry(child, d_u.d_child);
if (!debugfs_positive(child))
goto next;
/* XXX: simple_empty(child) instead ? */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
goto down;
}
up:
__debugfs_remove(child, parent);
next:
child = next;
}
mutex_unlock(&parent->d_inode->i_mutex);
if (parent != dentry) {
child = parent;
next = list_next_entry(child, d_u.d_child);
parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
goto up;
}
parent = dentry->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
__debugfs_remove(dentry, parent);
mutex_unlock(&parent->d_inode->i_mutex);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
Oleg.
---
debugfs/inode.c | 69 +++++++++++++++++++++-----------------------------------
1 file changed, 26 insertions(+), 43 deletions(-)
--- x/fs/debugfs/inode.c 2013-07-25 19:08:08.000000000 +0200
+++ x/fs/debugfs/inode.c 2013-07-25 21:18:26.000000000 +0200
@@ -532,6 +532,9 @@ void debugfs_remove(struct dentry *dentr
}
EXPORT_SYMBOL_GPL(debugfs_remove);
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
/**
* debugfs_remove_recursive - recursively removes a directory
* @dentry: a pointer to a the dentry of the directory to be removed.
@@ -546,8 +549,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
*/
void debugfs_remove_recursive(struct dentry *dentry)
{
- struct dentry *child;
- struct dentry *parent;
+ struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
@@ -557,54 +559,35 @@ void debugfs_remove_recursive(struct den
return;
parent = dentry;
+ down:
mutex_lock(&parent->d_inode->i_mutex);
+ child = list_first_entry(&parent->d_subdirs, struct dentry, d_u.d_child);
- while (1) {
- /*
- * When all dentries under "parent" has been removed,
- * walk up the tree until we reach our starting point.
- */
- if (list_empty(&parent->d_subdirs)) {
- mutex_unlock(&parent->d_inode->i_mutex);
- if (parent == dentry)
- break;
- parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
- }
- child = list_entry(parent->d_subdirs.next, struct dentry,
- d_u.d_child);
- next_sibling:
-
- /*
- * If "child" isn't empty, walk down the tree and
- * remove all its descendants first.
- */
+ while (&child->d_u.d_child != &parent->d_subdirs) {
+ next = list_next_entry(child, d_u.d_child);
+
+ if (!debugfs_positive(child))
+ goto next;
+
+ /* XXX: simple_empty(child) instead ? */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
- mutex_lock(&parent->d_inode->i_mutex);
- continue;
+ goto down;
}
+ up:
__debugfs_remove(child, parent);
- if (parent->d_subdirs.next == &child->d_u.d_child) {
- /*
- * Try the next sibling.
- */
- if (child->d_u.d_child.next != &parent->d_subdirs) {
- child = list_entry(child->d_u.d_child.next,
- struct dentry,
- d_u.d_child);
- goto next_sibling;
- }
-
- /*
- * Avoid infinite loop if we fail to remove
- * one dentry.
- */
- mutex_unlock(&parent->d_inode->i_mutex);
- break;
- }
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ next:
+ child = next;
+ }
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (parent != dentry) {
+ child = parent;
+ next = list_next_entry(child, d_u.d_child);
+ parent = parent->d_parent;
+ mutex_lock(&parent->d_inode->i_mutex);
+ goto up;
}
parent = dentry->d_parent;
On 07/25, Oleg Nesterov wrote:
>
> To simplify the review, this is how it looks with the patch applied:
v2. We can simply use list_for_each_entry_safe() and
list_next_entry() should be called under ->i_mutex. Although
debugfs_remove_recursive() can race with itself anyway, but
still.
And the code looks much simpler. But I do not know what did
I miss.
Oleg.
void debugfs_remove_recursive(struct dentry *dentry)
{
struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
parent = dentry->d_parent;
if (!parent || !parent->d_inode)
return;
parent = dentry;
down:
mutex_lock(&parent->d_inode->i_mutex);
list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
if (!debugfs_positive(child))
continue;
/* XXX: simple_empty(child) instead ? */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
goto down;
}
up:
__debugfs_remove(child, parent);
}
mutex_unlock(&parent->d_inode->i_mutex);
if (parent != dentry) {
child = parent;
parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
next = list_next_entry(child, d_u.d_child);
goto up;
}
parent = dentry->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
__debugfs_remove(dentry, parent);
mutex_unlock(&parent->d_inode->i_mutex);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
On Thu, Jul 25, 2013 at 10:04:23PM +0200, Oleg Nesterov wrote:
> On 07/25, Oleg Nesterov wrote:
> >
> > To simplify the review, this is how it looks with the patch applied:
>
> v2. We can simply use list_for_each_entry_safe() and
> list_next_entry() should be called under ->i_mutex. Although
> debugfs_remove_recursive() can race with itself anyway, but
> still.
>
And the code looks much simpler. But I do not know what
I missed.
debugfs has a number of known issues with removing files / directories.
Al has helpfully pointed them out to me, and it's on my list of things
to fix, but it keeps getting pushed down by other things. Hopefully in
a few weeks...
Anyway, your changes below look good in this version, I don't see
anything obviously wrong with it, but maybe others do?
Care to turn this version into a patch I can test out?
thanks,
greg k-h
(2013/07/26 5:04), Oleg Nesterov wrote:
> On 07/25, Oleg Nesterov wrote:
>>
>> To simplify the review, this is how it looks with the patch applied:
>
> v2. We can simply use list_for_each_entry_safe() and
> list_next_entry() should be called under ->i_mutex. Although
> debugfs_remove_recursive() can race with itself anyway, but
> still.
>
> And the code looks much simpler. But I do not know what
> I missed.
>
> Oleg.
>
> void debugfs_remove_recursive(struct dentry *dentry)
> {
> struct dentry *child, *next, *parent;
>
> if (IS_ERR_OR_NULL(dentry))
> return;
>
> parent = dentry->d_parent;
> if (!parent || !parent->d_inode)
> return;
>
> parent = dentry;
> down:
> mutex_lock(&parent->d_inode->i_mutex);
> list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
Perhaps, you can use list_for_each_entry_safe_continue() here, as below.
parent = dentry;
down:
child = list_first_entry_or_null(&parent->d_subdirs,
typeof(*child), d_u.d_child);
mutex_lock(&parent->d_inode->i_mutex);
restart:
list_for_each_entry_safe_continue(child, next, &parent->d_subdirs, d_u.d_child) {
> if (!debugfs_positive(child))
> continue;
>
> /* XXX: simple_empty(child) instead ? */
> if (!list_empty(&child->d_subdirs)) {
> mutex_unlock(&parent->d_inode->i_mutex);
> parent = child;
> goto down;
> }
> up:
> __debugfs_remove(child, parent);
> }
Then, you can avoid jumping into the loop, just restart it from
parent as below.
mutex_unlock(&parent->d_inode->i_mutex);
child = parent;
parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
__debugfs_remove(child, parent);
if (child != dentry)
goto restart;
> mutex_unlock(&parent->d_inode->i_mutex);
> simple_release_fs(&debugfs_mount, &debugfs_mount_count);
> }
It's just an idea, which came up to my mind. :)
Thank you,
--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]
On 07/26, Masami Hiramatsu wrote:
>
> (2013/07/26 5:04), Oleg Nesterov wrote:
> >
> > parent = dentry;
> > down:
> > mutex_lock(&parent->d_inode->i_mutex);
> > list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
>
> Perhaps, you can use list_for_each_entry_safe_continue() here, as below.
>
> parent = dentry;
> down:
> child = list_first_entry_or_null(&parent->d_subdirs,
> typeof(*child), d_u.d_child);
> mutex_lock(&parent->d_inode->i_mutex);
>
> restart:
> list_for_each_entry_safe_continue(child, next, &parent->d_subdirs, d_u.d_child) {
>
> > if (!debugfs_positive(child))
> > continue;
> >
> > /* XXX: simple_empty(child) instead ? */
> > if (!list_empty(&child->d_subdirs)) {
> > mutex_unlock(&parent->d_inode->i_mutex);
> > parent = child;
> > goto down;
> > }
> > up:
> > __debugfs_remove(child, parent);
> > }
>
> Then, you can avoid jumping into the loop, just restart it from
> parent as below.
Yes, but I'd prefer to jump into the loop. This is subjective, but looks
a bit more understandable to me.
Because "goto down/up" are actually "call/return", and "jump up" looks
like return-after-recursive-debugfs_remove_recursive-call.
However,
> if (child != dentry)
> goto restart;
>
> > mutex_unlock(&parent->d_inode->i_mutex);
Yes, I realized this right after I sent the email ;)
We can factor out the final ->d_parent/mutex_lock if we check
"child != dentry" instead of "parent != dentry".
I'll send the patch in a minute. Thanks.
Oleg.
On 07/25, Greg Kroah-Hartman wrote:
>
> Anyway, your changes below look good in this version, I don't see
> anything obviously wrong with it, but maybe others do?
Please see 1/1.
Changes:
- s/list_next_entry/list_entry/. It would be nice to move
list_next_entry() from kernel/events/core.c to list.h
but this needs another patch
- Do "parent = parent->d_parent" before "!= dentry" check,
this factors out the common code in "goto up" and final
__debugfs_remove(dentry)
- Add simple_release_fs() back.
The patch is hardly readable, so I attached the code below again.
Note!!! The changelog contains the simple test-case. However, this
test-case can actually reveal more problems because kernel/trace/
is buggy too. (hopefully we already have the necessary fixes).
Steven, Greg. Assuming that nobody objects and the patch is fine.
Perhaps this patch can be routed via rostedt/linux-trace tree?
This connects to other problems we are working on, it would be
nice to have this patch in the same tree to simplify the testing.
Oleg.
void debugfs_remove_recursive(struct dentry *dentry)
{
struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
parent = dentry->d_parent;
if (!parent || !parent->d_inode)
return;
parent = dentry;
down:
mutex_lock(&parent->d_inode->i_mutex);
list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
if (!debugfs_positive(child))
continue;
/* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
goto down;
}
up:
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
mutex_unlock(&parent->d_inode->i_mutex);
child = parent;
parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
if (child != dentry) {
next = list_entry(child->d_u.d_child.next, struct dentry,
d_u.d_child);
goto up;
}
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
mutex_unlock(&parent->d_inode->i_mutex);
}
debugfs_remove_recursive() is wrong,
1. it wrongly assumes that !list_empty(d_subdirs) means that this
dir should be removed.
This is not that bad by itself, but:
2. if d_subdirs does not become empty after __debugfs_remove()
it gives up and silently fails, it doesn't even try to remove
other entries.
However ->d_subdirs can be non-empty because it still has the
already deleted !debugfs_positive() entries.
3. simple_release_fs() is called even if __debugfs_remove() fails.
Suppose we have
dir1/
dir2/
file2
file1
and someone opens dir1/dir2/file2.
Now, debugfs_remove_recursive(dir1/dir2) succeeds, and dir1/dir2 goes
away.
But debugfs_remove_recursive(dir1) silently fails and doesn't remove
this directory. Because it tries to delete (the already deleted)
dir1/dir2/file2 again and then fails due to "Avoid infinite loop"
logic.
Test-case:
#!/bin/sh
cd /sys/kernel/debug/tracing
echo 'p:probe/sigprocmask sigprocmask' >> kprobe_events
sleep 1000 < events/probe/sigprocmask/id &
echo -n >| kprobe_events
[ -d events/probe ] && echo "ERR!! failed to rm probe"
And after that it is not possible to create another probe entry.
With this patch debugfs_remove_recursive() skips !debugfs_positive()
files although this is not strictly needed. The most important change
is that it does not try to make ->d_subdirs empty, it simply scans
the whole list(s) recursively and removes as much as possible.
Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/debugfs/inode.c | 69 ++++++++++++++++-----------------------------------
1 files changed, 22 insertions(+), 47 deletions(-)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4888cb3..c7c83ff 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
*/
void debugfs_remove_recursive(struct dentry *dentry)
{
- struct dentry *child;
- struct dentry *parent;
+ struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
@@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry)
return;
parent = dentry;
+ down:
mutex_lock(&parent->d_inode->i_mutex);
+ list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
+ if (!debugfs_positive(child))
+ continue;
- while (1) {
- /*
- * When all dentries under "parent" has been removed,
- * walk up the tree until we reach our starting point.
- */
- if (list_empty(&parent->d_subdirs)) {
- mutex_unlock(&parent->d_inode->i_mutex);
- if (parent == dentry)
- break;
- parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
- }
- child = list_entry(parent->d_subdirs.next, struct dentry,
- d_u.d_child);
- next_sibling:
-
- /*
- * If "child" isn't empty, walk down the tree and
- * remove all its descendants first.
- */
+ /* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
- mutex_lock(&parent->d_inode->i_mutex);
- continue;
+ goto down;
}
- __debugfs_remove(child, parent);
- if (parent->d_subdirs.next == &child->d_u.d_child) {
- /*
- * Try the next sibling.
- */
- if (child->d_u.d_child.next != &parent->d_subdirs) {
- child = list_entry(child->d_u.d_child.next,
- struct dentry,
- d_u.d_child);
- goto next_sibling;
- }
-
- /*
- * Avoid infinite loop if we fail to remove
- * one dentry.
- */
- mutex_unlock(&parent->d_inode->i_mutex);
- break;
- }
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ up:
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
- parent = dentry->d_parent;
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
- __debugfs_remove(dentry, parent);
+
+ if (child != dentry) {
+ next = list_entry(child->d_u.d_child.next, struct dentry,
+ d_u.d_child);
+ goto up;
+ }
+
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
mutex_unlock(&parent->d_inode->i_mutex);
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
--
1.5.5.1
debugfs_remove_recursive() is wrong,
1. it wrongly assumes that !list_empty(d_subdirs) means that this
dir should be removed.
This is not that bad by itself, but:
2. if d_subdirs does not become empty after __debugfs_remove()
it gives up and silently fails, it doesn't even try to remove
other entries.
However ->d_subdirs can be non-empty because it still has the
already deleted !debugfs_positive() entries.
3. simple_release_fs() is called even if __debugfs_remove() fails.
Suppose we have
dir1/
dir2/
file2
file1
and someone opens dir1/dir2/file2.
Now, debugfs_remove_recursive(dir1/dir2) succeeds, and dir1/dir2 goes
away.
But debugfs_remove_recursive(dir1) silently fails and doesn't remove
this directory. Because it tries to delete (the already deleted)
dir1/dir2/file2 again and then fails due to "Avoid infinite loop"
logic.
Test-case:
#!/bin/sh
cd /sys/kernel/debug/tracing
echo 'p:probe/sigprocmask sigprocmask' >> kprobe_events
sleep 1000 < events/probe/sigprocmask/id &
echo -n >| kprobe_events
[ -d events/probe ] && echo "ERR!! failed to rm probe"
And after that it is not possible to create another probe entry.
With this patch debugfs_remove_recursive() skips !debugfs_positive()
files although this is not strictly needed. The most important change
is that it does not try to make ->d_subdirs empty, it simply scans
the whole list(s) recursively and removes as much as possible.
Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/debugfs/inode.c | 69 ++++++++++++++++-----------------------------------
1 files changed, 22 insertions(+), 47 deletions(-)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4888cb3..c7c83ff 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
*/
void debugfs_remove_recursive(struct dentry *dentry)
{
- struct dentry *child;
- struct dentry *parent;
+ struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
@@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry)
return;
parent = dentry;
+ down:
mutex_lock(&parent->d_inode->i_mutex);
+ list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
+ if (!debugfs_positive(child))
+ continue;
- while (1) {
- /*
- * When all dentries under "parent" has been removed,
- * walk up the tree until we reach our starting point.
- */
- if (list_empty(&parent->d_subdirs)) {
- mutex_unlock(&parent->d_inode->i_mutex);
- if (parent == dentry)
- break;
- parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
- }
- child = list_entry(parent->d_subdirs.next, struct dentry,
- d_u.d_child);
- next_sibling:
-
- /*
- * If "child" isn't empty, walk down the tree and
- * remove all its descendants first.
- */
+ /* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
- mutex_lock(&parent->d_inode->i_mutex);
- continue;
+ goto down;
}
- __debugfs_remove(child, parent);
- if (parent->d_subdirs.next == &child->d_u.d_child) {
- /*
- * Try the next sibling.
- */
- if (child->d_u.d_child.next != &parent->d_subdirs) {
- child = list_entry(child->d_u.d_child.next,
- struct dentry,
- d_u.d_child);
- goto next_sibling;
- }
-
- /*
- * Avoid infinite loop if we fail to remove
- * one dentry.
- */
- mutex_unlock(&parent->d_inode->i_mutex);
- break;
- }
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ up:
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
- parent = dentry->d_parent;
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
- __debugfs_remove(dentry, parent);
+
+ if (child != dentry) {
+ next = list_entry(child->d_u.d_child.next, struct dentry,
+ d_u.d_child);
+ goto up;
+ }
+
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
mutex_unlock(&parent->d_inode->i_mutex);
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
--
1.5.5.1
Argh, sorry for noise.
Please ignore the extra email with the wrong subject.
On 07/26, Oleg Nesterov wrote:
>
> debugfs_remove_recursive() is wrong,
>
> 1. it wrongly assumes that !list_empty(d_subdirs) means that this
> dir should be removed.
>
> This is not that bad by itself, but:
>
> 2. if d_subdirs does not become empty after __debugfs_remove()
> it gives up and silently fails, it doesn't even try to remove
> other entries.
>
> However ->d_subdirs can be non-empty because it still has the
> already deleted !debugfs_positive() entries.
>
On Fri, 2013-07-26 at 17:11 +0200, Oleg Nesterov wrote:
>
> Steven, Greg. Assuming that nobody objects and the patch is fine.
> Perhaps this patch can be routed via rostedt/linux-trace tree?
I'm fine with pushing it. But that requires Greg's Acked-by.
-- Steve
On Fri, Jul 26, 2013 at 11:30:13AM -0400, Steven Rostedt wrote:
> On Fri, 2013-07-26 at 17:11 +0200, Oleg Nesterov wrote:
> >
> > Steven, Greg. Assuming that nobody objects and the patch is fine.
> > Perhaps this patch can be routed via rostedt/linux-trace tree?
>
> I'm fine with pushing it. But that requires Greg's Acked-by.
I can take it through my driver-core tree, which normally handles
debugfs patches, so it's no problem for me to take it.
Let me beat on it a bit first...
thanks,
greg k-h
On Fri, Jul 26, 2013 at 11:30:13AM -0400, Steven Rostedt wrote:
> On Fri, 2013-07-26 at 17:11 +0200, Oleg Nesterov wrote:
> >
> > Steven, Greg. Assuming that nobody objects and the patch is fine.
> > Perhaps this patch can be routed via rostedt/linux-trace tree?
>
> I'm fine with pushing it. But that requires Greg's Acked-by.
Ah, I see that there are tracing fixes that need this, that makes sense
to go through the tracing tree, I'll go ack it...
thanks,
greg k-h
On Fri, Jul 26, 2013 at 05:12:56PM +0200, Oleg Nesterov wrote:
> debugfs_remove_recursive() is wrong,
>
> 1. it wrongly assumes that !list_empty(d_subdirs) means that this
> dir should be removed.
>
> This is not that bad by itself, but:
>
> 2. if d_subdirs does not become empty after __debugfs_remove()
> it gives up and silently fails, it doesn't even try to remove
> other entries.
>
> However ->d_subdirs can be non-empty because it still has the
> already deleted !debugfs_positive() entries.
>
> 3. simple_release_fs() is called even if __debugfs_remove() fails.
>
> Suppose we have
>
> dir1/
> dir2/
> file2
> file1
>
> and someone opens dir1/dir2/file2.
>
> Now, debugfs_remove_recursive(dir1/dir2) succeeds, and dir1/dir2 goes
> away.
>
> But debugfs_remove_recursive(dir1) silently fails and doesn't remove
> this directory. Because it tries to delete (the already deleted)
> dir1/dir2/file2 again and then fails due to "Avoid infinite loop"
> logic.
>
> Test-case:
>
> #!/bin/sh
>
> cd /sys/kernel/debug/tracing
> echo 'p:probe/sigprocmask sigprocmask' >> kprobe_events
> sleep 1000 < events/probe/sigprocmask/id &
> echo -n >| kprobe_events
>
> [ -d events/probe ] && echo "ERR!! failed to rm probe"
>
> And after that it is not possible to create another probe entry.
>
> With this patch debugfs_remove_recursive() skips !debugfs_positive()
> files although this is not strictly needed. The most important change
> is that it does not try to make ->d_subdirs empty, it simply scans
> the whole list(s) recursively and removes as much as possible.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: Greg Kroah-Hartman <[email protected]>
On Fri, 2013-07-26 at 10:38 -0700, Greg Kroah-Hartman wrote:
> > Signed-off-by: Oleg Nesterov <[email protected]>
>
> Acked-by: Greg Kroah-Hartman <[email protected]>
Thanks Greg, I appreciate this.
-- Steve