2018-04-18 06:30:53

by Song Liu

[permalink] [raw]
Subject: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c

As Miklos reported and suggested:

This pattern repeats two times in trace_uprobe.c and in
kernel/events/core.c as well:

ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
goto fail_address_parse;

inode = igrab(d_inode(path.dentry));
path_put(&path);

And it's wrong. You can only hold a reference to the inode if you
have an active ref to the superblock as well (which is normally
through path.mnt) or holding s_umount.

This way unmounting the containing filesystem while the tracepoint is
active will give you the "VFS: Busy inodes after unmount..." message
and a crash when the inode is finally put.

Solution: store path instead of inode.

This patch fixes two instances in trace_uprobe.c.

Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
Cc: Steven Rostedt <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Howard McLauchlan <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
Reported-by: Miklos Szeredi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
---
kernel/trace/trace_uprobe.c | 42 ++++++++++++++----------------------------
1 file changed, 14 insertions(+), 28 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0d450b4..80dfcdf 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -55,7 +55,7 @@ struct trace_uprobe {
struct list_head list;
struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
- struct inode *inode;
+ struct path path;
char *filename;
unsigned long offset;
unsigned long nhit;
@@ -289,7 +289,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
for (i = 0; i < tu->tp.nr_args; i++)
traceprobe_free_probe_arg(&tu->tp.args[i]);

- iput(tu->inode);
+ path_put(&tu->path);
kfree(tu->tp.call.class->system);
kfree(tu->tp.call.name);
kfree(tu->filename);
@@ -363,7 +363,6 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
static int create_trace_uprobe(int argc, char **argv)
{
struct trace_uprobe *tu;
- struct inode *inode;
char *arg, *event, *group, *filename;
char buf[MAX_EVENT_NAME_LEN];
struct path path;
@@ -371,7 +370,6 @@ static int create_trace_uprobe(int argc, char **argv)
bool is_delete, is_return;
int i, ret;

- inode = NULL;
ret = 0;
is_delete = false;
is_return = false;
@@ -448,14 +446,6 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;

- inode = igrab(d_inode(path.dentry));
- path_put(&path);
-
- if (!inode || !S_ISREG(inode->i_mode)) {
- ret = -EINVAL;
- goto fail_address_parse;
- }
-
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
@@ -490,7 +480,8 @@ static int create_trace_uprobe(int argc, char **argv)
goto fail_address_parse;
}
tu->offset = offset;
- tu->inode = inode;
+ tu->path.mnt = path.mnt;
+ tu->path.dentry = path.dentry;
tu->filename = kstrdup(filename, GFP_KERNEL);

if (!tu->filename) {
@@ -558,7 +549,7 @@ static int create_trace_uprobe(int argc, char **argv)
return ret;

fail_address_parse:
- iput(inode);
+ path_put(&path);

pr_info("Failed to parse address or file.\n");

@@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
goto err_flags;

tu->consumer.filter = filter;
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
+ &tu->consumer);
if (ret)
goto err_buffer;

@@ -981,7 +973,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)

WARN_ON(!uprobe_filter_is_empty(&tu->filter));

- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ uprobe_unregister(d_inode(tu->path.dentry), tu->offset, &tu->consumer);
tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;

uprobe_buffer_disable();
@@ -1056,7 +1048,8 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
write_unlock(&tu->filter.rwlock);

if (!done)
- return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ return uprobe_apply(d_inode(tu->path.dentry), tu->offset,
+ &tu->consumer, false);

return 0;
}
@@ -1088,7 +1081,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)

err = 0;
if (!done) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(d_inode(tu->path.dentry),
+ tu->offset, &tu->consumer, true);
if (err)
uprobe_perf_close(tu, event);
}
@@ -1352,7 +1346,6 @@ struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
{
struct trace_uprobe *tu;
- struct inode *inode;
struct path path;
int ret;

@@ -1360,14 +1353,6 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
if (ret)
return ERR_PTR(ret);

- inode = igrab(d_inode(path.dentry));
- path_put(&path);
-
- if (!inode || !S_ISREG(inode->i_mode)) {
- iput(inode);
- return ERR_PTR(-EINVAL);
- }
-
/*
* local trace_kprobes are not added to probe_list, so they are never
* searched in find_trace_kprobe(). Therefore, there is no concern of
@@ -1383,7 +1368,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
}

tu->offset = offs;
- tu->inode = inode;
+ tu->path.mnt = path.mnt;
+ tu->path.dentry = path.dentry;
tu->filename = kstrdup(name, GFP_KERNEL);
init_trace_event_call(tu, &tu->tp.call);

--
2.9.5



2018-04-18 06:30:56

by Song Liu

[permalink] [raw]
Subject: [PATCH 2/2] perf/core: fix bad use of igrab in kernel/event/core.c

As Miklos reported and suggested:

This pattern repeats two times in trace_uprobe.c and in
kernel/events/core.c as well:

ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
goto fail_address_parse;

inode = igrab(d_inode(path.dentry));
path_put(&path);

And it's wrong. You can only hold a reference to the inode if you
have an active ref to the superblock as well (which is normally
through path.mnt) or holding s_umount.

This way unmounting the containing filesystem while the tracepoint is
active will give you the "VFS: Busy inodes after unmount..." message
and a crash when the inode is finally put.

Solution: store path instead of inode.

This patch fixes the issue in kernel/events/core.c.

NOTE: Based on my understanding, perf_addr_filter only supports intel_pt.
However, my test system doesn't support address filtering (or I made a
mistake?). Therefore, I have NOT tested this patch.

Could someone please help test it?

Fixes: 375637bc5249 ("perf/core: Introduce address range filtering")
Cc: Alexander Shishkin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Reported-by: Miklos Szeredi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
---
arch/x86/events/intel/pt.c | 4 ++--
include/linux/perf_event.h | 2 +-
kernel/events/core.c | 21 +++++++++------------
3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b99394..8d016ce 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;

- if (!filter->inode) {
+ if (!filter->path.dentry) {
if (!valid_kernel_ip(filter->offset))
return -EINVAL;

@@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;

list_for_each_entry(filter, &head->list, entry) {
- if (filter->inode && !offs[range]) {
+ if (filter->path.dentry && !offs[range]) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..88922d8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,7 +467,7 @@ enum perf_addr_filter_action_t {
*/
struct perf_addr_filter {
struct list_head entry;
- struct inode *inode;
+ struct path path;
unsigned long offset;
unsigned long size;
enum perf_addr_filter_action_t action;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7af828..7d711ed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6668,7 +6668,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)

raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (filter->inode) {
+ if (filter->path.dentry) {
event->addr_filters_offs[count] = 0;
restart++;
}
@@ -7333,7 +7333,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_inode(file))
return false;

if (filter->offset > offset + size)
@@ -8674,8 +8674,7 @@ static void free_filters_list(struct list_head *filters)
struct perf_addr_filter *filter, *iter;

list_for_each_entry_safe(filter, iter, filters, entry) {
- if (filter->inode)
- iput(filter->inode);
+ path_put(&filter->path);
list_del(&filter->entry);
kfree(filter);
}
@@ -8772,7 +8771,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
- if (filter->inode)
+ if (filter->path.dentry)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);

@@ -8846,7 +8845,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
{
struct perf_addr_filter *filter = NULL;
char *start, *orig, *filename = NULL;
- struct path path;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
unsigned int kernel = 0;
@@ -8959,19 +8957,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail_free_name;

/* look up the path and grab its inode */
- ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
if (ret)
goto fail_free_name;

- filter->inode = igrab(d_inode(path.dentry));
- path_put(&path);
kfree(filename);
filename = NULL;

ret = -EINVAL;
- if (!filter->inode ||
- !S_ISREG(filter->inode->i_mode))
- /* free_filters_list() will iput() */
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
goto fail;

event->addr_filters.nr_file_filters++;
--
2.9.5


2018-04-18 14:05:31

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c

On Wed, Apr 18, 2018 at 8:29 AM, Song Liu <[email protected]> wrote:
> As Miklos reported and suggested:
>
> This pattern repeats two times in trace_uprobe.c and in
> kernel/events/core.c as well:
>
> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
> if (ret)
> goto fail_address_parse;
>
> inode = igrab(d_inode(path.dentry));
> path_put(&path);
>
> And it's wrong. You can only hold a reference to the inode if you
> have an active ref to the superblock as well (which is normally
> through path.mnt) or holding s_umount.
>
> This way unmounting the containing filesystem while the tracepoint is
> active will give you the "VFS: Busy inodes after unmount..." message
> and a crash when the inode is finally put.
>
> Solution: store path instead of inode.
>
> This patch fixes two instances in trace_uprobe.c.
>
> Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
> Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
> Cc: Steven Rostedt <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Howard McLauchlan <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Srikar Dronamraju <[email protected]>
> Reported-by: Miklos Szeredi <[email protected]>
> Signed-off-by: Song Liu <[email protected]>
> ---
> kernel/trace/trace_uprobe.c | 42 ++++++++++++++----------------------------
> 1 file changed, 14 insertions(+), 28 deletions(-)
>
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 0d450b4..80dfcdf 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -55,7 +55,7 @@ struct trace_uprobe {
> struct list_head list;
> struct trace_uprobe_filter filter;
> struct uprobe_consumer consumer;
> - struct inode *inode;
> + struct path path;
> char *filename;
> unsigned long offset;
> unsigned long nhit;
> @@ -289,7 +289,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
> for (i = 0; i < tu->tp.nr_args; i++)
> traceprobe_free_probe_arg(&tu->tp.args[i]);
>
> - iput(tu->inode);
> + path_put(&tu->path);
> kfree(tu->tp.call.class->system);
> kfree(tu->tp.call.name);
> kfree(tu->filename);
> @@ -363,7 +363,6 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
> static int create_trace_uprobe(int argc, char **argv)
> {
> struct trace_uprobe *tu;
> - struct inode *inode;
> char *arg, *event, *group, *filename;
> char buf[MAX_EVENT_NAME_LEN];
> struct path path;
> @@ -371,7 +370,6 @@ static int create_trace_uprobe(int argc, char **argv)
> bool is_delete, is_return;
> int i, ret;
>
> - inode = NULL;
> ret = 0;
> is_delete = false;
> is_return = false;
> @@ -448,14 +446,6 @@ static int create_trace_uprobe(int argc, char **argv)
> if (ret)
> goto fail_address_parse;
>
> - inode = igrab(d_inode(path.dentry));

This is not against -linus tree.

> - path_put(&path);
> -
> - if (!inode || !S_ISREG(inode->i_mode)) {
> - ret = -EINVAL;
> - goto fail_address_parse;
> - }
> -
> ret = kstrtoul(arg, 0, &offset);
> if (ret)
> goto fail_address_parse;
> @@ -490,7 +480,8 @@ static int create_trace_uprobe(int argc, char **argv)
> goto fail_address_parse;
> }
> tu->offset = offset;
> - tu->inode = inode;
> + tu->path.mnt = path.mnt;
> + tu->path.dentry = path.dentry;

You can just assign the whole structure. No need to mess with
individual members.

tu->path = path;

> tu->filename = kstrdup(filename, GFP_KERNEL);
>
> if (!tu->filename) {
> @@ -558,7 +549,7 @@ static int create_trace_uprobe(int argc, char **argv)
> return ret;
>
> fail_address_parse:
> - iput(inode);
> + path_put(&path);
>
> pr_info("Failed to parse address or file.\n");
>
> @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
> goto err_flags;
>
> tu->consumer.filter = filter;
> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
> + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
> + &tu->consumer);

It is not entirely clear how the lifetime of uprobe relates to the
lifetime of trace_uprobe. Is the uprobe object never going to survive
its creator trace_uprobe object?

If that's the case, it warrants a comment. If that's not the case,
then the path would need to be passed to uprobe_register() which would
need to obtain its own reference.

> if (ret)
> goto err_buffer;
>
> @@ -981,7 +973,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
>
> WARN_ON(!uprobe_filter_is_empty(&tu->filter));
>
> - uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
> + uprobe_unregister(d_inode(tu->path.dentry), tu->offset, &tu->consumer);
> tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
>
> uprobe_buffer_disable();
> @@ -1056,7 +1048,8 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
> write_unlock(&tu->filter.rwlock);
>
> if (!done)
> - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
> + return uprobe_apply(d_inode(tu->path.dentry), tu->offset,
> + &tu->consumer, false);
>
> return 0;
> }
> @@ -1088,7 +1081,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
>
> err = 0;
> if (!done) {
> - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
> + err = uprobe_apply(d_inode(tu->path.dentry),
> + tu->offset, &tu->consumer, true);
> if (err)
> uprobe_perf_close(tu, event);
> }
> @@ -1352,7 +1346,6 @@ struct trace_event_call *
> create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
> {
> struct trace_uprobe *tu;
> - struct inode *inode;
> struct path path;
> int ret;
>
> @@ -1360,14 +1353,6 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
> if (ret)
> return ERR_PTR(ret);
>
> - inode = igrab(d_inode(path.dentry));
> - path_put(&path);
> -
> - if (!inode || !S_ISREG(inode->i_mode)) {
> - iput(inode);
> - return ERR_PTR(-EINVAL);
> - }
> -
> /*
> * local trace_kprobes are not added to probe_list, so they are never
> * searched in find_trace_kprobe(). Therefore, there is no concern of
> @@ -1383,7 +1368,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
> }
>
> tu->offset = offs;
> - tu->inode = inode;
> + tu->path.mnt = path.mnt;
> + tu->path.dentry = path.dentry;

tu->path = path


> tu->filename = kstrdup(name, GFP_KERNEL);
> init_trace_event_call(tu, &tu->tp.call);
>
> --
> 2.9.5
>

2018-04-18 14:26:29

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c

On Wed, 18 Apr 2018 16:03:42 +0200
Miklos Szeredi <[email protected]> wrote:

> > @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
> > goto err_flags;
> >
> > tu->consumer.filter = filter;
> > - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
> > + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
> > + &tu->consumer);
>
> It is not entirely clear how the lifetime of uprobe relates to the
> lifetime of trace_uprobe. Is the uprobe object never going to survive
> its creator trace_uprobe object?

Not exactly sure what you mean here.

The trace_uprobe (the probe event) is created, it doesn't do anything
until it is enabled. This function is called when it is enabled. The
trace_uprobe (probe event) can not be deleted while it is enabled
(EBUSY).

Are you asking what happens if the file is deleted while it has a probe?
That I don't know about (haven't tried it out). But I would hope that
it keeps a reference to the inode, isn't that what the igrab is for?
And is now being replaced by a reference on the path, or is that the
problem?

-- Steve


>
> If that's the case, it warrants a comment. If that's not the case,
> then the path would need to be passed to uprobe_register() which would
> need to obtain its own reference.
>
> > if (ret)
> > goto err_buffer;
> >

2018-04-18 14:41:55

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c

On Wed, Apr 18, 2018 at 4:25 PM, Steven Rostedt <[email protected]> wrote:
> On Wed, 18 Apr 2018 16:03:42 +0200
> Miklos Szeredi <[email protected]> wrote:
>
>> > @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
>> > goto err_flags;
>> >
>> > tu->consumer.filter = filter;
>> > - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
>> > + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
>> > + &tu->consumer);
>>
>> It is not entirely clear how the lifetime of uprobe relates to the
>> lifetime of trace_uprobe. Is the uprobe object never going to survive
>> its creator trace_uprobe object?
>
> Not exactly sure what you mean here.
>
> The trace_uprobe (the probe event) is created, it doesn't do anything
> until it is enabled. This function is called when it is enabled. The
> trace_uprobe (probe event) can not be deleted while it is enabled
> (EBUSY).
>
> Are you asking what happens if the file is deleted while it has probe?
> That I don't know about (haven't tried it out). But I would hope that
> it keeps a reference to the inode, isn't that what the igrab is for?
> And is now being replaced by a reference on the path, or is that the
> problem?

No, that's not the problem.

What I don't see is how the uprobe object relates to the trace_uprobe object.

Because after the patch the uprobe object still only has a ref to the
inode, and that can lead to the same issue as with trace_uprobe.
OTOH if uprobe can't survive its creating trace_uprobe, then it
doesn't need to take a ref to the inode at all, since trace_uprobe
already holds it. Taking an extra ref isn't incorrect, it's just
unnecessary and confusing.

So this needs to be cleared up in some way.

Thanks,
Miklos

2018-04-18 15:20:44

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c

On Wed, 18 Apr 2018 16:40:19 +0200
Miklos Szeredi <[email protected]> wrote:


> > The trace_uprobe (the probe event) is created, it doesn't do anything
> > until it is enabled. This function is called when it is enabled. The
> > trace_uprobe (probe event) can not be deleted while it is enabled
> > (EBUSY).
> >
> > Are you asking what happens if the file is deleted while it has probe?
> > That I don't know about (haven't tried it out). But I would hope that
> > it keeps a reference to the inode, isn't that what the igrab is for?
> > And is now being replaced by a reference on the path, or is that the
> > problem?
>
> No, that's not the problem.
>
> What I don't see is how the uprobe object relates to the trace_uprobe object.
>
> Because after the patch the uprobe object still only has a ref to the
> inode, and that can lead to the same issue as with trace_uprobe.
> OTOH if uprobe can't survive its creating trace_uprobe, then it
> doesn't need to take a ref to the inode at all, since trace_uprobe
> already holds it. Taking an extra ref isn't incorrect, it's just
> unnecessary and confusing.
>
> So this needs to be cleared up in some way.

The uprobe created by the trace_uprobe creation must be deleted before
the trace_uprobe can be deleted. Basically we have this:

# cd /sys/kernel/tracing
# echo "uprobe creation text" > uprobe_events

The trace_uprobe is created (but not the uprobe itself). This is what
calls create_trace_uprobe().

# echo 1 > events/uprobes/enable

This enables all the trace uprobe events, which creates the uprobes.
This is the action that calls probe_event_enable(), which creates
uprobes.

At this point, any write to uprobe_events that would destroy the trace
uprobes would return with -EBUSY, and the trace uprobes will not be
deleted.

# echo 0 > events/uprobes/enable

This will call the probe_event_disable() which will call
uprobe_unregister() which will destroy the uprobe.

Now we can delete the trace uprobe.

Does that answer your question? A uprobe created for trace uprobes can
not survive the trace uprobe itself.

-- Steve

2018-04-18 16:10:43

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c


> On Apr 18, 2018, at 7:03 AM, Miklos Szeredi <[email protected]> wrote:
>
> On Wed, Apr 18, 2018 at 8:29 AM, Song Liu <[email protected]> wrote:
>> As Miklos reported and suggested:
>>
>> This pattern repeats two times in trace_uprobe.c and in
>> kernel/events/core.c as well:
>>
>> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
>> if (ret)
>> goto fail_address_parse;
>>
>> inode = igrab(d_inode(path.dentry));
>> path_put(&path);
>>
>> And it's wrong. You can only hold a reference to the inode if you
>> have an active ref to the superblock as well (which is normally
>> through path.mnt) or holding s_umount.
>>
>> This way unmounting the containing filesystem while the tracepoint is
>> active will give you the "VFS: Busy inodes after unmount..." message
>> and a crash when the inode is finally put.
>>
>> Solution: store path instead of inode.
>>
>> This patch fixes two instances in trace_uprobe.c.
>>
>> Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
>> Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
>> Cc: Steven Rostedt <[email protected]>
>> Cc: Ingo Molnar <[email protected]>
>> Cc: Howard McLauchlan <[email protected]>
>> Cc: Josef Bacik <[email protected]>
>> Cc: Srikar Dronamraju <[email protected]>
>> Reported-by: Miklos Szeredi <[email protected]>
>> Signed-off-by: Song Liu <[email protected]>
>> ---
>> kernel/trace/trace_uprobe.c | 42 ++++++++++++++----------------------------
>> 1 file changed, 14 insertions(+), 28 deletions(-)
>>
>> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
>> index 0d450b4..80dfcdf 100644
>> --- a/kernel/trace/trace_uprobe.c
>> +++ b/kernel/trace/trace_uprobe.c
>> @@ -55,7 +55,7 @@ struct trace_uprobe {
>> struct list_head list;
>> struct trace_uprobe_filter filter;
>> struct uprobe_consumer consumer;
>> - struct inode *inode;
>> + struct path path;
>> char *filename;
>> unsigned long offset;
>> unsigned long nhit;
>> @@ -289,7 +289,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
>> for (i = 0; i < tu->tp.nr_args; i++)
>> traceprobe_free_probe_arg(&tu->tp.args[i]);
>>
>> - iput(tu->inode);
>> + path_put(&tu->path);
>> kfree(tu->tp.call.class->system);
>> kfree(tu->tp.call.name);
>> kfree(tu->filename);
>> @@ -363,7 +363,6 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
>> static int create_trace_uprobe(int argc, char **argv)
>> {
>> struct trace_uprobe *tu;
>> - struct inode *inode;
>> char *arg, *event, *group, *filename;
>> char buf[MAX_EVENT_NAME_LEN];
>> struct path path;
>> @@ -371,7 +370,6 @@ static int create_trace_uprobe(int argc, char **argv)
>> bool is_delete, is_return;
>> int i, ret;
>>
>> - inode = NULL;
>> ret = 0;
>> is_delete = false;
>> is_return = false;
>> @@ -448,14 +446,6 @@ static int create_trace_uprobe(int argc, char **argv)
>> if (ret)
>> goto fail_address_parse;
>>
>> - inode = igrab(d_inode(path.dentry));
>
> This is not against -linus tree.

These patches are against tip/perf/core. I can also send a version against
the -linus tree.

>
>> - path_put(&path);
>> -
>> - if (!inode || !S_ISREG(inode->i_mode)) {
>> - ret = -EINVAL;
>> - goto fail_address_parse;
>> - }
>> -
>> ret = kstrtoul(arg, 0, &offset);
>> if (ret)
>> goto fail_address_parse;
>> @@ -490,7 +480,8 @@ static int create_trace_uprobe(int argc, char **argv)
>> goto fail_address_parse;
>> }
>> tu->offset = offset;
>> - tu->inode = inode;
>> + tu->path.mnt = path.mnt;
>> + tu->path.dentry = path.dentry;
>
> You can just assign the whole structure. No need to mess with
> individual members.
>
> tu->path = path;
Will fix in v2.
>
>> tu->filename = kstrdup(filename, GFP_KERNEL);
>>
>> if (!tu->filename) {
>> @@ -558,7 +549,7 @@ static int create_trace_uprobe(int argc, char **argv)
>> return ret;
>>
>> fail_address_parse:
>> - iput(inode);
>> + path_put(&path);
>>
>> pr_info("Failed to parse address or file.\n");
>>
>> @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
>> goto err_flags;
>>
>> tu->consumer.filter = filter;
>> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
>> + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
>> + &tu->consumer);
>
> It is not entirely clear how the lifetime of uprobe relates to the
> lifetime of trace_uprobe. Is the uprobe object never going to survive
> its creator trace_uprobe object?
>
> If that's the case, it warrants a comment. If that's not the case,
> then the path would need to be passed to uprobe_register() which would
> need to obtain its own reference.

trace_uprobe will not be freed before the uprobe object. trace_uprobe
holds a reference to struct path (with path_get()).

>
>> if (ret)
>> goto err_buffer;
>>
>> @@ -981,7 +973,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
>>
>> WARN_ON(!uprobe_filter_is_empty(&tu->filter));
>>
>> - uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
>> + uprobe_unregister(d_inode(tu->path.dentry), tu->offset, &tu->consumer);
>> tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
>>
>> uprobe_buffer_disable();
>> @@ -1056,7 +1048,8 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
>> write_unlock(&tu->filter.rwlock);
>>
>> if (!done)
>> - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
>> + return uprobe_apply(d_inode(tu->path.dentry), tu->offset,
>> + &tu->consumer, false);
>>
>> return 0;
>> }
>> @@ -1088,7 +1081,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
>>
>> err = 0;
>> if (!done) {
>> - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
>> + err = uprobe_apply(d_inode(tu->path.dentry),
>> + tu->offset, &tu->consumer, true);
>> if (err)
>> uprobe_perf_close(tu, event);
>> }
>> @@ -1352,7 +1346,6 @@ struct trace_event_call *
>> create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
>> {
>> struct trace_uprobe *tu;
>> - struct inode *inode;
>> struct path path;
>> int ret;
>>
>> @@ -1360,14 +1353,6 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
>> if (ret)
>> return ERR_PTR(ret);
>>
>> - inode = igrab(d_inode(path.dentry));
>> - path_put(&path);
>> -
>> - if (!inode || !S_ISREG(inode->i_mode)) {
>> - iput(inode);
>> - return ERR_PTR(-EINVAL);
>> - }
>> -
>> /*
>> * local trace_kprobes are not added to probe_list, so they are never
>> * searched in find_trace_kprobe(). Therefore, there is no concern of
>> @@ -1383,7 +1368,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
>> }
>>
>> tu->offset = offs;
>> - tu->inode = inode;
>> + tu->path.mnt = path.mnt;
>> + tu->path.dentry = path.dentry;
>
> tu->path = path
>
>
>> tu->filename = kstrdup(name, GFP_KERNEL);
>> init_trace_event_call(tu, &tu->tp.call);
>>
>> --
>> 2.9.5
>>


2018-04-18 16:17:31

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c



> On Apr 18, 2018, at 7:25 AM, Steven Rostedt <[email protected]> wrote:
>
> On Wed, 18 Apr 2018 16:03:42 +0200
> Miklos Szeredi <[email protected]> wrote:
>
>>> @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
>>> goto err_flags;
>>>
>>> tu->consumer.filter = filter;
>>> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
>>> + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
>>> + &tu->consumer);
>>
>> It is not entirely clear how the lifetime of uprobe relates to the
>> lifetime of trace_uprobe. Is the uprobe object never going to survive
>> its creator trace_uprobe object?
>
> Not exactly sure what you mean here.
>
> The trace_uprobe (the probe event) is created, it doesn't do anything
> until it is enabled. This function is called when it is enabled. The
> trace_uprobe (probe event) can not be deleted while it is enabled
> (EBUSY).
>
> Are you asking what happens if the file is deleted while it has probe?
> That I don't know about (haven't tried it out). But I would hope that
> it keeps a reference to the inode, isn't that what the igrab is for?
> And is now being replaced by a reference on the path, or is that the
> problem?
>
> -- Steve
>

Just as Miklos pointed out, I ran tests with the uprobe and confirmed
that igrab() is not sufficient to prevent umount. When we change it to
path_get()/path_put(), umount will abort because of the trace_uprobe.

Song


>>
>> If that's the case, it warrants a comment. If that's not the case,
>> then the path would need to be passed to uprobe_register() which would
>> need to obtain its own reference.
>>
>>> if (ret)
>>> goto err_buffer;
>>>


2018-04-18 16:27:04

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 1/2] tracing: fix bad use of igrab in trace_uprobe.c

On Wed, 18 Apr 2018 16:08:50 +0000
Song Liu <[email protected]> wrote:

> > This is not against -linus tree.
>
> These patches are against tip/perf/core. I can also send version against
> -linus tree.

I can take this patch in my tree (which is currently equal to
4.17-rc1). The other patch should go through tip.

-- Steve

2018-04-19 06:18:40

by Alexander Shishkin

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/core: fix bad use of igrab in kernel/event/core.c

On Tue, Apr 17, 2018 at 11:29:07PM -0700, Song Liu wrote:
> As Miklos reported and suggested:
>
> This pattern repeats two times in trace_uprobe.c and in
> kernel/events/core.c as well:
>
> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
> if (ret)
> goto fail_address_parse;
>
> inode = igrab(d_inode(path.dentry));
> path_put(&path);
>
> And it's wrong. You can only hold a reference to the inode if you
> have an active ref to the superblock as well (which is normally
> through path.mnt) or holding s_umount.

Oops. I must have snatched it from the uprobe code without thinking.

> This way unmounting the containing filesystem while the tracepoint is
> active will give you the "VFS: Busy inodes after unmount..." message
> and a crash when the inode is finally put.
>
> Solution: store path instead of inode.
>
> This patch fixes the issue in kernel/events/core.c.
>
> NOTE: Based on my understanding, perf_addr_filter only supports intel_pt.

Coresight too, but that's probably even further away from what you have.

> However, my test system doesn't support address filtering (or I made a
> mistake?). Therefore, I have NOT tested this patch.

Check /sys/devices/intel_pt/caps/num_address_ranges, if it's non-zero,
it's supported.

> Could someone please help test it?

Yes:

Reviewed-and-tested-by: Alexander Shishkin <[email protected]>

The subject line needs a little love to be more like other perf commits, but
other than that, looks good.

Thanks!


2018-05-22 22:00:25

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/core: fix bad use of igrab in kernel/event/core.c


> On Apr 18, 2018, at 11:17 PM, Alexander Shishkin <[email protected]> wrote:
>
> On Tue, Apr 17, 2018 at 11:29:07PM -0700, Song Liu wrote:
>> As Miklos reported and suggested:
>>
>> This pattern repeats two times in trace_uprobe.c and in
>> kernel/events/core.c as well:
>>
>> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
>> if (ret)
>> goto fail_address_parse;
>>
>> inode = igrab(d_inode(path.dentry));
>> path_put(&path);
>>
>> And it's wrong. You can only hold a reference to the inode if you
>> have an active ref to the superblock as well (which is normally
>> through path.mnt) or holding s_umount.
>
> Oops. I must have snatched it from the uprobe code without thinking.
>
>> This way unmounting the containing filesystem while the tracepoint is
>> active will give you the "VFS: Busy inodes after unmount..." message
>> and a crash when the inode is finally put.
>>
>> Solution: store path instead of inode.
>>
>> This patch fixes the issue in kernel/events/core.c.
>>
>> NOTE: Based on my understanding, perf_addr_filter only supports intel_pt.
>
> Coresight too, but that's probably even further away from what you have.
>
>> However, my test system doesn't support address filtering (or I made a
>> mistake?). Therefore, I have NOT tested this patch.
>
> Check /sys/devices/intel_pt/caps/num_address_ranges, if it's non-zero,
> it's supported.
>
>> Could someone please help test it?
>
> Yes:
>
> Reviewed-and-tested-by: Alexander Shishkin <[email protected]>
>
> The subject line needs a little love to be more like other perf commits, but
> other than that, looks good.
>
> Thanks!

Did this patch ever make it into tip/perf/XX trees? If not, what shall I do
to move it ahead?

Thanks,
Song


2018-05-23 13:14:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/core: fix bad use of igrab in kernel/event/core.c

On Tue, May 22, 2018 at 09:56:18PM +0000, Song Liu wrote:
> Did this patch ever make it into tip/perf/XX trees? If not, what shall I do
> to move it ahead?

Got it now, thanks for the reminder.

Subject: [tip:perf/core] perf/core: Fix bad use of igrab()

Commit-ID: 9511bce9fe8e5e6c0f923c09243a713eba560141
Gitweb: https://git.kernel.org/tip/9511bce9fe8e5e6c0f923c09243a713eba560141
Author: Song Liu <[email protected]>
AuthorDate: Tue, 17 Apr 2018 23:29:07 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 25 May 2018 08:11:10 +0200

perf/core: Fix bad use of igrab()

As Miklos reported and suggested:

"This pattern repeats two times in trace_uprobe.c and in
kernel/events/core.c as well:

ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
goto fail_address_parse;

inode = igrab(d_inode(path.dentry));
path_put(&path);

And it's wrong. You can only hold a reference to the inode if you
have an active ref to the superblock as well (which is normally
through path.mnt) or holding s_umount.

This way unmounting the containing filesystem while the tracepoint is
active will give you the "VFS: Busy inodes after unmount..." message
and a crash when the inode is finally put.

Solution: store path instead of inode."

This patch fixes the issue in kernel/events/core.c.

Reviewed-and-tested-by: Alexander Shishkin <[email protected]>
Reported-by: Miklos Szeredi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Fixes: 375637bc5249 ("perf/core: Introduce address range filtering")
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/events/intel/pt.c | 4 ++--
include/linux/perf_event.h | 2 +-
kernel/events/core.c | 21 +++++++++------------
3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b993942a0e4..8d016ce5b80d 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;

- if (!filter->inode) {
+ if (!filter->path.dentry) {
if (!valid_kernel_ip(filter->offset))
return -EINVAL;

@@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;

list_for_each_entry(filter, &head->list, entry) {
- if (filter->inode && !offs[range]) {
+ if (filter->path.dentry && !offs[range]) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index def866f7269b..bea0b0cd4bf7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,7 +467,7 @@ enum perf_addr_filter_action_t {
*/
struct perf_addr_filter {
struct list_head entry;
- struct inode *inode;
+ struct path path;
unsigned long offset;
unsigned long size;
enum perf_addr_filter_action_t action;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ce6aa5ff3c96..24dea13a27ed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6668,7 +6668,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)

raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (filter->inode) {
+ if (filter->path.dentry) {
event->addr_filters_offs[count] = 0;
restart++;
}
@@ -7333,7 +7333,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_inode(file))
return false;

if (filter->offset > offset + size)
@@ -8686,8 +8686,7 @@ static void free_filters_list(struct list_head *filters)
struct perf_addr_filter *filter, *iter;

list_for_each_entry_safe(filter, iter, filters, entry) {
- if (filter->inode)
- iput(filter->inode);
+ path_put(&filter->path);
list_del(&filter->entry);
kfree(filter);
}
@@ -8784,7 +8783,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
- if (filter->inode)
+ if (filter->path.dentry)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);

@@ -8858,7 +8857,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
{
struct perf_addr_filter *filter = NULL;
char *start, *orig, *filename = NULL;
- struct path path;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
unsigned int kernel = 0;
@@ -8971,19 +8969,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail_free_name;

/* look up the path and grab its inode */
- ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
if (ret)
goto fail_free_name;

- filter->inode = igrab(d_inode(path.dentry));
- path_put(&path);
kfree(filename);
filename = NULL;

ret = -EINVAL;
- if (!filter->inode ||
- !S_ISREG(filter->inode->i_mode))
- /* free_filters_list() will iput() */
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
goto fail;

event->addr_filters.nr_file_filters++;