2013-07-04 03:40:44

by Steven Rostedt

[permalink] [raw]
Subject: [RFC][PATCH 1/4] tracing: Add ref count to ftrace_event_call

From: "Steven Rostedt (Red Hat)" <[email protected]>

When one of the event files is opened, we need to prevent them from
being removed. Modules do this with the module owner set (automated
from the VFS layer). The ftrace buffer instances have a ref count added
to the trace_array when the enabled file is opened (the others are not
that big of a deal, as they only reference the event calls which
still exist when an instance disappears). But kprobes and uprobes
do not have any protection.

# cd /sys/kernel/debug/tracing
# echo 'p:sigprocmask sigprocmask' > kprobe_events || exit -1
# enable_probe() {
sleep 10
echo 1
}
# file=events/kprobes/sigprocmask/enable
# enable_probe > $file &
> kprobe_events

The above will corrupt the kprobe system, as the write to the enable
file will happen after the kprobe was deleted.

Trying to create the probe again fails:

# echo 'p:sigprocmask sigprocmask' > kprobe_events
# cat kprobe_events
p:kprobes/sigprocmask sigprocmask
# ls events/kprobes/
enable filter

Notice that the sigprocmask does not exist.

The first step to fix this is by adding ref counts to the ftrace_event_calls.
Using the flags field, and protecting the updates with the event_mutex,
the ref count will use the first 20 bits of the flags field, and the
current flags will be shifted 20 bits.

Link: http://lkml.kernel.org/r/[email protected]

Suggested-by: Masami Hiramatsu <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/linux/ftrace_event.h | 6 ++++
kernel/trace/trace_events.c | 76 +++++++++++++++++++++++++++++++++++++++---
2 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4372658..72ff2c6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -195,6 +195,7 @@ extern int ftrace_event_reg(struct ftrace_event_call *event,
enum trace_reg type, void *data);

enum {
+ TRACE_EVENT_FL_REF_MAX_BIT = 20,
TRACE_EVENT_FL_FILTERED_BIT,
TRACE_EVENT_FL_CAP_ANY_BIT,
TRACE_EVENT_FL_NO_SET_FILTER_BIT,
@@ -203,6 +204,8 @@ enum {
};

/*
+ * Event ref count uses the first 20 bits of the flags field.
+ *
* Event flags:
* FILTERED - The event has a filter attached
* CAP_ANY - Any user can enable for perf
@@ -213,6 +216,7 @@ enum {
* it is best to clear the buffers that used it).
*/
enum {
+ TRACE_EVENT_FL_REF_MAX = (1 << TRACE_EVENT_FL_REF_MAX_BIT),
TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
@@ -220,6 +224,8 @@ enum {
TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
};

+#define TRACE_EVENT_FL_REF_MASK (TRACE_EVENT_FL_REF_MAX - 1)
+
struct ftrace_event_call {
struct list_head list;
struct ftrace_event_class *class;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7d85429..90cf243 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -391,6 +391,28 @@ static void __get_system_dir(struct ftrace_subsystem_dir *dir)
__get_system(dir->subsystem);
}

+static int ftrace_event_call_get(struct ftrace_event_call *call)
+{
+ int ret = 0;
+
+ mutex_lock(&event_mutex);
+ if ((call->flags & TRACE_EVENT_FL_REF_MASK) == TRACE_EVENT_FL_REF_MAX - 1)
+ ret = -EBUSY;
+ else
+ call->flags++;
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static void ftrace_event_call_put(struct ftrace_event_call *call)
+{
+ mutex_lock(&event_mutex);
+ if (!WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_REF_MASK)))
+ call->flags--;
+ mutex_unlock(&event_mutex);
+}
+
static void __put_system_dir(struct ftrace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
@@ -424,7 +446,15 @@ static int tracing_open_generic_file(struct inode *inode, struct file *filp)

ret = tracing_open_generic(inode, filp);
if (ret < 0)
- trace_array_put(tr);
+ goto fail;
+
+ ret = ftrace_event_call_get(file->event_call);
+ if (ret < 0)
+ goto fail;
+
+ return 0;
+ fail:
+ trace_array_put(tr);
return ret;
}

@@ -433,12 +463,40 @@ static int tracing_release_generic_file(struct inode *inode, struct file *filp)
struct ftrace_event_file *file = inode->i_private;
struct trace_array *tr = file->tr;

+ ftrace_event_call_put(file->event_call);
trace_array_put(tr);

return 0;
}

/*
+ * Open and update call ref count.
+ * Must have ftrace_event_call passed in.
+ */
+static int tracing_open_generic_call(struct inode *inode, struct file *filp)
+{
+ struct ftrace_event_call *call = inode->i_private;
+
+ return ftrace_event_call_get(call);
+}
+
+static int tracing_release_generic_call(struct inode *inode, struct file *filp)
+{
+ struct ftrace_event_call *call = inode->i_private;
+
+ ftrace_event_call_put(call);
+ return 0;
+}
+
+static int tracing_seq_release_call(struct inode *inode, struct file *filp)
+{
+ struct ftrace_event_call *call = inode->i_private;
+
+ ftrace_event_call_put(call);
+ return seq_release(inode, filp);
+}
+
+/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
static int
@@ -949,10 +1007,16 @@ static int trace_format_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;

- ret = seq_open(file, &trace_format_seq_ops);
+ ret = ftrace_event_call_get(call);
if (ret < 0)
return ret;

+ ret = seq_open(file, &trace_format_seq_ops);
+ if (ret < 0) {
+ ftrace_event_call_put(call);
+ return ret;
+ }
+
m = file->private_data;
m->private = call;

@@ -1260,20 +1324,22 @@ static const struct file_operations ftrace_event_format_fops = {
.open = trace_format_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = tracing_seq_release_call,
};

static const struct file_operations ftrace_event_id_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_call,
.read = event_id_read,
.llseek = default_llseek,
+ .release = tracing_release_generic_call,
};

static const struct file_operations ftrace_event_filter_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_call,
.read = event_filter_read,
.write = event_filter_write,
.llseek = default_llseek,
+ .release = tracing_release_generic_call,
};

static const struct file_operations ftrace_subsystem_filter_fops = {
--
1.7.10.4


Subject: Re: [RFC][PATCH 1/4] tracing: Add ref count to ftrace_event_call

(2013/07/04 12:33), Steven Rostedt wrote:
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 7d85429..90cf243 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -391,6 +391,28 @@ static void __get_system_dir(struct ftrace_subsystem_dir *dir)
> __get_system(dir->subsystem);
> }
>
> +static int ftrace_event_call_get(struct ftrace_event_call *call)
> +{
> + int ret = 0;
> +
> + mutex_lock(&event_mutex);
> + if ((call->flags & TRACE_EVENT_FL_REF_MASK) == TRACE_EVENT_FL_REF_MAX - 1)
> + ret = -EBUSY;
> + else
> + call->flags++;
> + mutex_unlock(&event_mutex);
> +
> + return ret;
> +}
> +
> +static void ftrace_event_call_put(struct ftrace_event_call *call)
> +{
> + mutex_lock(&event_mutex);
> + if (!WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_REF_MASK)))
> + call->flags--;
> + mutex_unlock(&event_mutex);
> +}

Hmm, I might misunderstand, but it seems that there is a small unsafe
time slot.

> @@ -424,7 +446,15 @@ static int tracing_open_generic_file(struct inode *inode, struct file *filp)
>
> ret = tracing_open_generic(inode, filp);
> if (ret < 0)
> - trace_array_put(tr);
> + goto fail;
> +
> + ret = ftrace_event_call_get(file->event_call);
> + if (ret < 0)
> + goto fail;
> +
> + return 0;
> + fail:
> + trace_array_put(tr);
> return ret;
> }
>
> @@ -433,12 +463,40 @@ static int tracing_release_generic_file(struct inode *inode, struct file *filp)
> struct ftrace_event_file *file = inode->i_private;
> struct trace_array *tr = file->tr;
>
> + ftrace_event_call_put(file->event_call);
> trace_array_put(tr);
>
> return 0;
> }

Here, we first get an event_file from inode->i_private, and then lock
event_mutex and get/put the refcount. This would be safe if we got (found) the
object from some list of event_files under the mutex, but we just use inode->i_private.

This can cause a race, as in the scenario below:

CPU0 CPU1
open(kprobe_events)
trace_remove_event_call() open(enable)
lock event_mutex refer event_file
event_remove() wait for unlock event_mutex
...
free event_file
unlock event_mutex
lock event_mutex
add refcount of event_file->call (*)

So, at the (*) point, the event_file is already freed. Thus there is still
an unsafe time slot. Or, did I miss something (possibly..)?

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

Subject: [RFC PATCH] tracing: Atomically get refcounts of event_call and trace_array

Currently tracing_open_generic_file gets an event_file from
inode->i_private, and then locks event_mutex and gets a refcount.
However, this can cause a race, as in the scenario below:

CPU0 CPU1
open(kprobe_events)
trace_remove_event_call() open(enable)
lock event_mutex get event_file from inode->i_private
event_remove() wait for unlock event_mutex
...
free event_file
unlock event_mutex
lock event_mutex
add refcount of event_file->call (*)

So, at the (*) point, the event_file is already freed and we
may access the corrupted object.
The same thing could happen with the trace_array because it is also
directly accessed from the event_file.

To avoid this, when opening events/*/*/enable, we must atomically
do the following: ensure the ftrace_event_file object still exists on a
trace_array, and get refcounts of event_file->call and the trace_array.


CPU0 CPU1
open(kprobe_events)
trace_remove_event_call() open(enable)
lock event_mutex get event_file from inode->i_private
event_remove() wait for unlock event_mutex
...
free event_file
unlock event_mutex
lock event_mutex
search the event_file and failed
unlock event_mutex
return -ENODEV

Signed-off-by: Masami Hiramatsu <[email protected]>
---
kernel/trace/trace_events.c | 58 +++++++++++++++++++++++++++++++++----------
1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1a5547e..db6b107 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -391,15 +391,24 @@ static void __get_system_dir(struct ftrace_subsystem_dir *dir)
__get_system(dir->subsystem);
}

-static int ftrace_event_call_get(struct ftrace_event_call *call)
+static int __ftrace_event_call_get(struct ftrace_event_call *call)
{
int ret = 0;

- mutex_lock(&event_mutex);
if ((call->flags & TRACE_EVENT_FL_REF_MASK) == TRACE_EVENT_FL_REF_MAX - 1)
ret = -EBUSY;
else
call->flags++;
+
+ return ret;
+}
+
+static int ftrace_event_call_get(struct ftrace_event_call *call)
+{
+ int ret = 0;
+
+ mutex_lock(&event_mutex);
+ ret = __ftrace_event_call_get(call);
mutex_unlock(&event_mutex);

return ret;
@@ -413,6 +422,35 @@ static void ftrace_event_call_put(struct ftrace_event_call *call)
mutex_unlock(&event_mutex);
}

+static int ftrace_event_file_get(struct ftrace_event_file *this_file)
+{
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
+ int ret = -ENODEV;
+
+ mutex_lock(&event_mutex);
+ do_for_each_event_file(tr, file) {
+ if (file == this_file) {
+ ret = __ftrace_event_call_get(file->event_call);
+ if (!ret)
+ tr->ref++;
+ goto out_unlock;
+ }
+ } while_for_each_event_file();
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static void ftrace_event_file_put(struct ftrace_event_file *file)
+{
+ struct trace_array *tr = file->tr;
+
+ ftrace_event_call_put(file->event_call);
+ trace_array_put(tr);
+}
+
static void __put_system_dir(struct ftrace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
@@ -438,33 +476,27 @@ static void put_system(struct ftrace_subsystem_dir *dir)
static int tracing_open_generic_file(struct inode *inode, struct file *filp)
{
struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
int ret;

- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
- ret = tracing_open_generic(inode, filp);
+ ret = ftrace_event_file_get(file);
if (ret < 0)
- goto fail;
+ return ret;

- ret = ftrace_event_call_get(file->event_call);
+ ret = tracing_open_generic(inode, filp);
if (ret < 0)
goto fail;

return 0;
fail:
- trace_array_put(tr);
+ ftrace_event_file_put(file);
return ret;
}

static int tracing_release_generic_file(struct inode *inode, struct file *filp)
{
struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;

- ftrace_event_call_put(file->event_call);
- trace_array_put(tr);
+ ftrace_event_file_put(file);

return 0;
}

Subject: Re: [RFC PATCH] tracing: Atomically get refcounts of event_call and trace_array

Steven, Oleg,

I think your patches are OK, but not enough.
Here is an additional patch to fix the unsafe case which I found.
Could you review this too?

(2013/07/04 20:55), Masami Hiramatsu wrote:
> Currently ftrace_open_generic_file gets an event_file from
> inode->i_private, and then locks event_mutex and gets refcount.
> However, this can cause a race as below scenario;
>
> CPU0 CPU1
> open(kprobe_events)
> trace_remove_event_call() open(enable)
> lock event_mutex get event_file from inode->i_private
> event_remove() wait for unlock event_mutex
> ...
> free event_file
> unlock event_mutex
> lock event_mutex
> add refcount of event_file->call (*)
>
> So, at (*) point, the event_file is already freed and we
> may access the corrupted object.
> The same thing could happen on trace_array because it is also
> directly accessed from event_file.
>
> To avoid this, when opening events/*/*/enable, we must atomically
> do; ensure the ftrace_event_file object still exists on a trace_array,
> and get refcounts of event_file->call and the trace_array.
>
>
> CPU0 CPU1
> open(kprobe_events)
> trace_remove_event_call() open(enable)
> lock event_mutex get event_file from inode->i_private
> event_remove() wait for unlock event_mutex
> ...
> free event_file
> unlock event_mutex
> lock event_mutex
> search the event_file and failed
> unlock event_mutex
> return -ENODEV
>
> Signed-off-by: Masami Hiramatsu <[email protected]>
> ---
> kernel/trace/trace_events.c | 58 +++++++++++++++++++++++++++++++++----------
> 1 file changed, 45 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 1a5547e..db6b107 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -391,15 +391,24 @@ static void __get_system_dir(struct ftrace_subsystem_dir *dir)
> __get_system(dir->subsystem);
> }
>
> -static int ftrace_event_call_get(struct ftrace_event_call *call)
> +static int __ftrace_event_call_get(struct ftrace_event_call *call)
> {
> int ret = 0;
>
> - mutex_lock(&event_mutex);
> if ((call->flags & TRACE_EVENT_FL_REF_MASK) == TRACE_EVENT_FL_REF_MAX - 1)
> ret = -EBUSY;
> else
> call->flags++;
> +
> + return ret;
> +}
> +
> +static int ftrace_event_call_get(struct ftrace_event_call *call)
> +{
> + int ret = 0;
> +
> + mutex_lock(&event_mutex);
> + ret = __ftrace_event_call_get(call);
> mutex_unlock(&event_mutex);
>
> return ret;
> @@ -413,6 +422,35 @@ static void ftrace_event_call_put(struct ftrace_event_call *call)
> mutex_unlock(&event_mutex);
> }
>
> +static int ftrace_event_file_get(struct ftrace_event_file *this_file)
> +{
> + struct ftrace_event_file *file;
> + struct trace_array *tr;
> + int ret = -ENODEV;
> +
> + mutex_lock(&event_mutex);
> + do_for_each_event_file(tr, file) {
> + if (file == this_file) {
> + ret = __ftrace_event_call_get(file->event_call);
> + if (!ret)
> + tr->ref++;
> + goto out_unlock;
> + }
> + } while_for_each_event_file();
> + out_unlock:
> + mutex_unlock(&event_mutex);
> +
> + return ret;
> +}
> +
> +static void ftrace_event_file_put(struct ftrace_event_file *file)
> +{
> + struct trace_array *tr = file->tr;
> +
> + ftrace_event_call_put(file->event_call);
> + trace_array_put(tr);
> +}
> +
> static void __put_system_dir(struct ftrace_subsystem_dir *dir)
> {
> WARN_ON_ONCE(dir->ref_count == 0);
> @@ -438,33 +476,27 @@ static void put_system(struct ftrace_subsystem_dir *dir)
> static int tracing_open_generic_file(struct inode *inode, struct file *filp)
> {
> struct ftrace_event_file *file = inode->i_private;
> - struct trace_array *tr = file->tr;
> int ret;
>
> - if (trace_array_get(tr) < 0)
> - return -ENODEV;
> -
> - ret = tracing_open_generic(inode, filp);
> + ret = ftrace_event_file_get(file);
> if (ret < 0)
> - goto fail;
> + return ret;
>
> - ret = ftrace_event_call_get(file->event_call);
> + ret = tracing_open_generic(inode, filp);
> if (ret < 0)
> goto fail;
>
> return 0;
> fail:
> - trace_array_put(tr);
> + ftrace_event_file_put(file);
> return ret;
> }
>
> static int tracing_release_generic_file(struct inode *inode, struct file *filp)
> {
> struct ftrace_event_file *file = inode->i_private;
> - struct trace_array *tr = file->tr;
>
> - ftrace_event_call_put(file->event_call);
> - trace_array_put(tr);
> + ftrace_event_file_put(file);
>
> return 0;
> }

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

2013-07-04 12:35:27

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC PATCH] tracing: Atomically get refcounts of event_call and trace_array

Sorry for the top post. FYI today is a US holiday and tomorrow I'll probably only work a half day. I may not get a chance to look at it till Monday.

-- Steve


Masami Hiramatsu <[email protected]> wrote:

>Steven, Oleg,
>
>I think your patches are OK, but not enough.
>Here is an additional patch to fix the unsafe case which I found.
>Could you review this too?
>
>(2013/07/04 20:55), Masami Hiramatsu wrote:
>> Currently ftrace_open_generic_file gets an event_file from
>> inode->i_private, and then locks event_mutex and gets refcount.
>> However, this can cause a race as below scenario;
>>
>> CPU0 CPU1
>> open(kprobe_events)
>> trace_remove_event_call() open(enable)
>> lock event_mutex get event_file from inode->i_private
>> event_remove() wait for unlock event_mutex
>> ...
>> free event_file
>> unlock event_mutex
>> lock event_mutex
>> add refcount of event_file->call (*)
>>
>> So, at (*) point, the event_file is already freed and we
>> may access the corrupted object.
>> The same thing could happen on trace_array because it is also
>> directly accessed from event_file.
>>
>> To avoid this, when opening events/*/*/enable, we must atomically
>> do; ensure the ftrace_event_file object still exists on a
>trace_array,
>> and get refcounts of event_file->call and the trace_array.
>>
>>
>> CPU0 CPU1
>> open(kprobe_events)
>> trace_remove_event_call() open(enable)
>> lock event_mutex get event_file from inode->i_private
>> event_remove() wait for unlock event_mutex
>> ...
>> free event_file
>> unlock event_mutex
>> lock event_mutex
>> search the event_file and failed
>> unlock event_mutex
>> return -ENODEV
>>
>> Signed-off-by: Masami Hiramatsu <[email protected]>
>> ---
>> kernel/trace/trace_events.c | 58
>+++++++++++++++++++++++++++++++++----------
>> 1 file changed, 45 insertions(+), 13 deletions(-)
>>
>> diff --git a/kernel/trace/trace_events.c
>b/kernel/trace/trace_events.c
>> index 1a5547e..db6b107 100644
>> --- a/kernel/trace/trace_events.c
>> +++ b/kernel/trace/trace_events.c
>> @@ -391,15 +391,24 @@ static void __get_system_dir(struct
>ftrace_subsystem_dir *dir)
>> __get_system(dir->subsystem);
>> }
>>
>> -static int ftrace_event_call_get(struct ftrace_event_call *call)
>> +static int __ftrace_event_call_get(struct ftrace_event_call *call)
>> {
>> int ret = 0;
>>
>> - mutex_lock(&event_mutex);
>> if ((call->flags & TRACE_EVENT_FL_REF_MASK) ==
>TRACE_EVENT_FL_REF_MAX - 1)
>> ret = -EBUSY;
>> else
>> call->flags++;
>> +
>> + return ret;
>> +}
>> +
>> +static int ftrace_event_call_get(struct ftrace_event_call *call)
>> +{
>> + int ret = 0;
>> +
>> + mutex_lock(&event_mutex);
>> + ret = __ftrace_event_call_get(call);
>> mutex_unlock(&event_mutex);
>>
>> return ret;
>> @@ -413,6 +422,35 @@ static void ftrace_event_call_put(struct
>ftrace_event_call *call)
>> mutex_unlock(&event_mutex);
>> }
>>
>> +static int ftrace_event_file_get(struct ftrace_event_file
>*this_file)
>> +{
>> + struct ftrace_event_file *file;
>> + struct trace_array *tr;
>> + int ret = -ENODEV;
>> +
>> + mutex_lock(&event_mutex);
>> + do_for_each_event_file(tr, file) {
>> + if (file == this_file) {
>> + ret = __ftrace_event_call_get(file->event_call);
>> + if (!ret)
>> + tr->ref++;
>> + goto out_unlock;
>> + }
>> + } while_for_each_event_file();
>> + out_unlock:
>> + mutex_unlock(&event_mutex);
>> +
>> + return ret;
>> +}
>> +
>> +static void ftrace_event_file_put(struct ftrace_event_file *file)
>> +{
>> + struct trace_array *tr = file->tr;
>> +
>> + ftrace_event_call_put(file->event_call);
>> + trace_array_put(tr);
>> +}
>> +
>> static void __put_system_dir(struct ftrace_subsystem_dir *dir)
>> {
>> WARN_ON_ONCE(dir->ref_count == 0);
>> @@ -438,33 +476,27 @@ static void put_system(struct
>ftrace_subsystem_dir *dir)
>> static int tracing_open_generic_file(struct inode *inode, struct
>file *filp)
>> {
>> struct ftrace_event_file *file = inode->i_private;
>> - struct trace_array *tr = file->tr;
>> int ret;
>>
>> - if (trace_array_get(tr) < 0)
>> - return -ENODEV;
>> -
>> - ret = tracing_open_generic(inode, filp);
>> + ret = ftrace_event_file_get(file);
>> if (ret < 0)
>> - goto fail;
>> + return ret;
>>
>> - ret = ftrace_event_call_get(file->event_call);
>> + ret = tracing_open_generic(inode, filp);
>> if (ret < 0)
>> goto fail;
>>
>> return 0;
>> fail:
>> - trace_array_put(tr);
>> + ftrace_event_file_put(file);
>> return ret;
>> }
>>
>> static int tracing_release_generic_file(struct inode *inode, struct
>file *filp)
>> {
>> struct ftrace_event_file *file = inode->i_private;
>> - struct trace_array *tr = file->tr;
>>
>> - ftrace_event_call_put(file->event_call);
>> - trace_array_put(tr);
>> + ftrace_event_file_put(file);
>>
>> return 0;
>> }

--
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

2013-07-05 00:37:23

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RFC PATCH] tracing: Atomically get refcounts of event_call and trace_array

On 07/04, Masami Hiramatsu wrote:
>
> Currently ftrace_open_generic_file gets an event_file from
> inode->i_private, and then locks event_mutex and gets refcount.
> However, this can cause a race as below scenario;
>
> CPU0 CPU1
> open(kprobe_events)
> trace_remove_event_call() open(enable)
> lock event_mutex get event_file from inode->i_private
> event_remove() wait for unlock event_mutex
> ...
> free event_file
> unlock event_mutex
> lock event_mutex
> add refcount of event_file->call (*)
>
> So, at (*) point, the event_file is already freed and we
> may access the corrupted object.

Yes, but the same can happen with event_call, so it seems that
this patch is not enough too.

Say, open(id) can take event_mutex when the caller of
trace_remove_event_call() has already freed ftrace_event_call.

Or I missed something...

Perhaps we can rely on d_unlinked() ? IOW, the caller of
__ftrace_event_call_get() should take event_mutex, check
d_unhashed(f_dentry) and only then do _get().

Nasty, I agree.

Oleg.

Subject: Re: [RFC PATCH] tracing: Atomically get refcounts of event_call and trace_array

(2013/07/05 9:32), Oleg Nesterov wrote:
> On 07/04, Masami Hiramatsu wrote:
>>
>> Currently ftrace_open_generic_file gets an event_file from
>> inode->i_private, and then locks event_mutex and gets refcount.
>> However, this can cause a race as below scenario;
>>
>> CPU0 CPU1
>> open(kprobe_events)
>> trace_remove_event_call() open(enable)
>> lock event_mutex get event_file from inode->i_private
>> event_remove() wait for unlock event_mutex
>> ...
>> free event_file
>> unlock event_mutex
>> lock event_mutex
>> add refcount of event_file->call (*)
>>
>> So, at (*) point, the event_file is already freed and we
>> may access the corrupted object.
>
> Yes, but the same can happen with event_call, so it seems that
> this patch is not enough too.

Oops, right!

> Say, open(id) can take event_mutex when the caller of
> trace_remove_event_call() has already freed ftrace_event_call.
>
> Or I missed something...
>
> Perhaps we can rely on d_unlinked() ? IOW, the caller of
> __ftrace_event_call_get() should take event_mutex, check
> d_unhashed(f_dentry) and only then do _get().

It sounds a good idea! :)

Thank you!

>
> Nasty, I agree.
>
> Oleg.
>
>


--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

Subject: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

Currently tracing_open_generic_file gets an event_file from
inode->i_private, and then locks event_mutex and gets a refcount.
However, this can cause a race, as in the scenario below:

CPU0 CPU1
open(kprobe_events)
trace_remove_event_call() open(enable)
lock event_mutex get event_file from inode->i_private
event_remove() wait for unlock event_mutex
...
free event_file
unlock event_mutex
lock event_mutex
add refcount of event_file->call (*)

So, at the (*) point, the event_file is already freed and we
may access the corrupted object.
The same thing could happen with the event_call because it is also
directly accessed from i_private at some points.

To avoid this, when opening events/*/*/enable, we have to ensure
that the dentry of the file is not unlinked yet, while event_mutex
is locked.

CPU0 CPU1
open(kprobe_events)
trace_remove_event_call() open(enable)
lock event_mutex get event_file from inode->i_private
event_remove() wait for unlock event_mutex
...
dput(dentry)
free event_file
unlock event_mutex
lock event_mutex
check !d_unhashed(dentry) and failed
unlock event_mutex
return -ENODEV

Note: trace_array_get(tr) always ensures the existence of tr in
trace_arrays, so the above check is not needed in the case where
i_private directly points to the tr.

Changes from V1:
- Use d_unhashed(f_dentry) to ensure the event exists according
to Oleg's comment.

Signed-off-by: Masami Hiramatsu <[email protected]>
Suggested-by: Oleg Nesterov <[email protected]>
---
kernel/trace/trace_events.c | 70 ++++++++++++++++++++++++++++++++++---------
1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1a5547e..06fdac0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -391,15 +391,32 @@ static void __get_system_dir(struct ftrace_subsystem_dir *dir)
__get_system(dir->subsystem);
}

-static int ftrace_event_call_get(struct ftrace_event_call *call)
+static int __ftrace_event_call_get(struct ftrace_event_call *call)
{
int ret = 0;

- mutex_lock(&event_mutex);
if ((call->flags & TRACE_EVENT_FL_REF_MASK) == TRACE_EVENT_FL_REF_MAX - 1)
ret = -EBUSY;
else
call->flags++;
+
+ return ret;
+}
+
+static int ftrace_event_call_get(struct ftrace_event_call *call,
+ struct dentry *dentry)
+{
+ int ret = -ENODEV;
+
+ mutex_lock(&event_mutex);
+ spin_lock(&dentry->d_lock);
+ if (d_unhashed(dentry)) /* This file is already unlinked */
+ goto out_unlock;
+
+ ret = __ftrace_event_call_get(call);
+
+ out_unlock:
+ spin_unlock(&dentry->d_lock);
mutex_unlock(&event_mutex);

return ret;
@@ -413,6 +430,35 @@ static void ftrace_event_call_put(struct ftrace_event_call *call)
mutex_unlock(&event_mutex);
}

+static int ftrace_event_file_get(struct ftrace_event_file *file,
+ struct dentry *dentry)
+{
+ int ret = -ENODEV;
+
+ mutex_lock(&event_mutex);
+ spin_lock(&dentry->d_lock);
+ if (d_unhashed(dentry)) /* This file is already unlinked */
+ goto out_unlock;
+
+ ret = __ftrace_event_call_get(file->event_call);
+ if (!ret)
+ file->tr->ref++;
+
+ out_unlock:
+ spin_unlock(&dentry->d_lock);
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static void ftrace_event_file_put(struct ftrace_event_file *file)
+{
+ struct trace_array *tr = file->tr;
+
+ ftrace_event_call_put(file->event_call);
+ trace_array_put(tr);
+}
+
static void __put_system_dir(struct ftrace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
@@ -438,33 +484,27 @@ static void put_system(struct ftrace_subsystem_dir *dir)
static int tracing_open_generic_file(struct inode *inode, struct file *filp)
{
struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
int ret;

- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
- ret = tracing_open_generic(inode, filp);
+ ret = ftrace_event_file_get(file, filp->f_dentry);
if (ret < 0)
- goto fail;
+ return ret;

- ret = ftrace_event_call_get(file->event_call);
+ ret = tracing_open_generic(inode, filp);
if (ret < 0)
goto fail;

return 0;
fail:
- trace_array_put(tr);
+ ftrace_event_file_put(file);
return ret;
}

static int tracing_release_generic_file(struct inode *inode, struct file *filp)
{
struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;

- ftrace_event_call_put(file->event_call);
- trace_array_put(tr);
+ ftrace_event_file_put(file);

return 0;
}
@@ -477,7 +517,7 @@ static int tracing_open_generic_call(struct inode *inode, struct file *filp)
{
struct ftrace_event_call *call = inode->i_private;

- return ftrace_event_call_get(call);
+ return ftrace_event_call_get(call, filp->f_dentry);
}

static int tracing_release_generic_call(struct inode *inode, struct file *filp)
@@ -1007,7 +1047,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;

- ret = ftrace_event_call_get(call);
+ ret = ftrace_event_call_get(call, file->f_dentry);
if (ret < 0)
return ret;

2013-07-15 18:22:38

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

On 07/09, Masami Hiramatsu wrote:
>
> To avoid this, when opening events/*/*/enable, we have to ensure
> the dentry of the file is not unlinked yet, under event_mutex
> is locked.

Probably this can work, but I am starting to think that this ref
count becomes toooooo complex....

Could you please look at the draft patch I sent in another email?

Oleg.

Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

(2013/07/16 3:16), Oleg Nesterov wrote:
> On 07/09, Masami Hiramatsu wrote:
>>
>> To avoid this, when opening events/*/*/enable, we have to ensure
>> the dentry of the file is not unlinked yet, under event_mutex
>> is locked.
>
> Probably this can work, but I am starting to think that this ref
> count becomes toooooo complex....

Oleg, I agree that it is a somewhat complex way to fix this, but it can fix it.

However, I think we should fix bugs first, because there are bugs.
I don't like to leave a bug in the kernel as it is, while we re-
start discussing new approach...

> Could you please look at the draft patch I sent in another email?

I'm happy to see the series, and I guess it will take a long
discussion and review time again.

So, I'd like to suggest you getting these fixes into ftrace as
first-aid treatment, and discuss your new series.

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

2013-07-17 14:57:23

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

On 07/17, Masami Hiramatsu wrote:
>
> (2013/07/16 3:16), Oleg Nesterov wrote:
> > On 07/09, Masami Hiramatsu wrote:
> >>
> >> To avoid this, when opening events/*/*/enable, we have to ensure
> >> the dentry of the file is not unlinked yet, under event_mutex
> >> is locked.
> >
> > Probably this can work, but I am starting to think that this ref
> > count becomes toooooo complex....
>
> Oleg, I agree that is a bit complex way to fix but it can fix.
>
> However, I think we should fix bugs first, because there are bugs.
> I don't like to leave a bug in the kernel as it is, while we re-
> start discussing new approach...

And I am trying to fix the same problems, just _I think_ there is
another and simpler approach.

> So, I'd like to suggest you getting these fixes into ftrace as
> first-aid treatment, and discuss your new series.

Well, perhaps you are right... But this TRACE_EVENT_FL_REF_MASK code
is new too; it is not that we only need a few small fixlets to finish it.

So I think that it makes sense to discuss the alternatives before we
decide what exactly we should do.

Oleg.

Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

(2013/07/17 23:51), Oleg Nesterov wrote:
> On 07/17, Masami Hiramatsu wrote:
>>
>> (2013/07/16 3:16), Oleg Nesterov wrote:
>>> On 07/09, Masami Hiramatsu wrote:
>>>>
>>>> To avoid this, when opening events/*/*/enable, we have to ensure
>>>> the dentry of the file is not unlinked yet, under event_mutex
>>>> is locked.
>>>
>>> Probably this can work, but I am starting to think that this ref
>>> count becomes toooooo complex....
>>
>> Oleg, I agree that is a bit complex way to fix but it can fix.
>>
>> However, I think we should fix bugs first, because there are bugs.
>> I don't like to leave a bug in the kernel as it is, while we re-
>> start discussing new approach...
>
> And I am trying to fix the same problems, just _I think_ there is
> another and simpler approach.

I see.
I also just feel uncomfortable with leaving obvious bugs; it is like
having ants in my pants :)

>> So, I'd like to suggest you getting these fixes into ftrace as
>> first-aid treatment, and discuss your new series.
>
> Well, perhaps you are right... But this TRACE_EVENT_FL_REF_MASK code
> is new too, it is not that we only need a small fixlets to finish it.

Do you mean that TRACE_EVENT_FL_REF_MASK may also have some problems?

> So I think that it makes sense to discuss the alternatives before we
> decide what exactly we should do.

Your approach is also interesting for me, indeed. However, it is so
different from current one. I think you should clarify what bug you
would like to solve and how. And we should clarify what ants^H^H^H^H
bugs we have found in ftrace too. ;)

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

2013-07-18 14:56:53

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

On 07/18, Masami Hiramatsu wrote:
>
> (2013/07/17 23:51), Oleg Nesterov wrote:
> > Well, perhaps you are right... But this TRACE_EVENT_FL_REF_MASK code
> > is new too, it is not that we only need a small fixlets to finish it.
>
> Would you mean that TRACE_EVENT_FL_REF_MASK may also have some problems?

It was you who initially pointed that it does have problems ;)

And, _afaics_ your patch which tries to fix this problem is not
exactly correct.

It removes trace_array_get/put from tracing_open_generic_file() and
tracing_release_generic_file(). This assumes that "call->flags++" is
enough, but it is not.

Yes, the next patch adds the "flags & TRACE_EVENT_FL_REF_MASK" check
into trace_remove_event_call() path. But this is still racy wrt
instance_delete() unless I missed something.

IOW, I believe that either .open() should do trace_array_get(), or
__trace_remove_event_dirs() needs another for-each-file loop which
checks file->call->flags & TRACE_EVENT_FL_REF_MASK.


> > So I think that it makes sense to discuss the alternatives before we
> > decide what exactly we should do.
>
> Your approach is also interesting for me, indeed. However, it is so
> different from current one. I think you should clarify what bug you
> would like to solve and how.

The same bugs which Steven's 1/4 tries to solve ;)

Oleg.

Subject: Re: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

(2013/07/18 23:51), Oleg Nesterov wrote:
> On 07/18, Masami Hiramatsu wrote:
>>
>> (2013/07/17 23:51), Oleg Nesterov wrote:
>>> Well, perhaps you are right... But this TRACE_EVENT_FL_REF_MASK code
>>> is new too, it is not that we only need a small fixlets to finish it.
>>
>> Would you mean that TRACE_EVENT_FL_REF_MASK may also have some problems?
>
> It was you who initially pointed that it does have problems ;)
>
> And, _afaics_ your patch which tries to fix this problem is not
> exactly correct.

Hm,

> It removes trace_array_get/put from tracing_open_generic_file() and
> tracing_release_generic_file(). This assumes that "call->flags++" is
> enough, but it is not.

No, it replaces trace_array_get/put with ftrace_event_file_get/put
which calls trace_array_get/put inside.
(Just one point, previous ftrace_event_file_get has a racy point
when it does tr->ref++, it should be fixed.)

> Yes, the next patch adds the "flags & TRACE_EVENT_FL_REF_MASK" check
> into trace_remove_event_call() path. But this is still racy wrt
> instance_delete() unless I missed something.
>
> IOW, I believe that either .open() should do trace_array_get(), or
> __trace_remove_event_dirs() needs another for-each-file loop which
> checks file->call->flags & TRACE_EVENT_FL_REF_MASK.

Agreed :)

>>> So I think that it makes sense to discuss the alternatives before we
>>> decide what exactly we should do.
>>
>> Your approach is also interesting for me, indeed. However, it is so
>> different from current one. I think you should clarify what bug you
>> would like to solve and how.
>
> The same bugs which Steven's 1/4 tries to solve ;)

OK, let me confirm that: do you mean we still need 2/4 - 4/4?

Thank you,
--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

2013-07-19 13:38:49

by Oleg Nesterov

[permalink] [raw]
Subject: Re: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

On 07/19, Masami Hiramatsu wrote:
>
> (2013/07/18 23:51), Oleg Nesterov wrote:
> > It removes trace_array_get/put from tracing_open_generic_file() and
> > tracing_release_generic_file(). This assumes that "call->flags++" is
> > enough, but it is not.
>
> No, it replaces trace_array_get/put with ftrace_event_file_get/put
> which calls trace_array_get/put inside.

Ah, I didn't notice your patch adds "file->tr->ref++" into
ftrace_event_file_get...

So I was wrong in any case, thanks for correcting me.

But,

> (Just one point, previous ftrace_event_file_get has a racy point
> when it does tr->ref++, it should be fixed.)

Not sure what you mean, but unless I missed something again, this
"tr->ref++" above still looks racy. instance_delete() checks tr->ref
first, then it takes event_mutex and removes/kfrees event_files.

But this doesn't really matter even if I am right, surely this can
be fixed. My only point, imho this is more complex than necessary.

In particular,

> > IOW, I believe that either .open() should do trace_array_get(), or
> > __trace_remove_event_dirs() needs another for-each-file loop which
> > checks file->call->flags & TRACE_EVENT_FL_REF_MASK.
>
> Agreed :)

Yes ;) and this makes the ref-counting even more complex, we use
different methods to avoid the races with rmdir and event_remove().

> > The same bugs which Steven's 1/4 tries to solve ;)
>
> OK, let me confirm that, would you mean we still need 2/4 - 4/4?

Yes, yes.

Oleg.

Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

(2013/07/19 22:33), Oleg Nesterov wrote:
> On 07/19, Masami Hiramatsu wrote:
>>
>> (2013/07/18 23:51), Oleg Nesterov wrote:
>>> It removes trace_array_get/put from tracing_open_generic_file() and
>>> tracing_release_generic_file(). This assumes that "call->flags++" is
>>> enough, but it is not.
>>
>> No, it replaces trace_array_get/put with ftrace_event_file_get/put
>> which calls trace_array_get/put inside.
>
> Ah, I didn't notice your patch adds "file->tr->ref++" into
> ftrace_event_file_get...
>
> So I was wrong in any case, thanks for correcting me.
>
> But,
>
>> (Just one point, previous ftrace_event_file_get has a racy point
>> when it does tr->ref++, it should be fixed.)
>
> Not sure what do you mean, but unless I missed something again this
> "tr->ref++" above still looks racy. instance_delete() checks tr->ref
> first, then it takes event_mutex and removes/kfrees event_files.
>
> But this doesn't really matter even if I am right, surely this can
> be fixed. My only point, imho this is more complex than necessary.

I see, so I'd like to see the fix. However, I'm not sure
we have enough time to fix that cleanly. Note that except
for the timing bug, we still leave a kernel bug which can
easily be reproduced as Jovi reported.

> In particular,
>
>>> IOW, I believe that either .open() should do trace_array_get(), or
>>> __trace_remove_event_dirs() needs another for-each-file loop which
>>> checks file->call->flags & TRACE_EVENT_FL_REF_MASK.
>>
>> Agreed :)
>
> Yes ;) and this makes the ref-counting even more complex, we use
> different methods to avoid the races with rmdir and event_remove().
>
>>> The same bugs which Steven's 1/4 tries to solve ;)
>>
>> OK, let me confirm that, would you mean we still need 2/4 - 4/4?
>
> Yes, yes.

And those depend on 1/4...

Thank you,


--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]

2013-07-22 17:10:10

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

On 07/22, Masami Hiramatsu wrote:
>
> (2013/07/19 22:33), Oleg Nesterov wrote:
> > My only point, imho this is more complex than necessary.
>
> I see, so I'd like to see the fix. However, I'm not sure
> we have enough time to fix that cleanly.

I promise, tomorrow I'll re-send the RFC patches, so if you don't
like them we can switch back to refcounting.

Sorry for delay. Today I was busy with other bugs I "found" in
subsystem_open/etc code, but when I tried to fix them I realized
that I have misread this code.

> Note that except
> for the timing bug, we still leave a kernel bug which can
> easily be reproduced as Jovi reported.

Could you please remind me?

> >> OK, let me confirm that, would you mean we still need 2/4 - 4/4?
> >
> > Yes, yes.
>
> And those are depends on 1/4...

Not at all or I missed something (quite possible). Just 2/4 should
not check ->flags, of course. 3/4 looks "obviously fine", 4/4 was
already merged.

Oleg.

2013-07-23 21:11:26

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RFC PATCH V2] tracing: Check f_dentry before accessing event_file/call in inode->i_private

On 07/22, Oleg Nesterov wrote:
>
> On 07/22, Masami Hiramatsu wrote:
> >
> > (2013/07/19 22:33), Oleg Nesterov wrote:
>
> > >> OK, let me confirm that, would you mean we still need 2/4 - 4/4?
> > >
> > > Yes, yes.
> >
> > And those are depends on 1/4...
>
> Not at all or I missed something (quite possible). Just 2/4 should
> not check ->flags, of course. 3/4 looks "obviously fine", 4/4 was
> already merged.

Sorry for confusion, I meant that your a232e270dcb is already merged.

4/4 is still needed. But again, _afaics_ 2-4 can go on top of the
"tracing: open/delete fixes" series I sent, only 2/4 needs the small
update.

Oleg.