As Miklos reported and suggested:
This pattern repeats two times in trace_uprobe.c and in
kernel/events/core.c as well:
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
goto fail_address_parse;
inode = igrab(d_inode(path.dentry));
path_put(&path);
And it's wrong. You can only hold a reference to the inode if you
have an active ref to the superblock as well (which is normally
through path.mnt) or holding s_umount.
This way unmounting the containing filesystem while the tracepoint is
active will give you the "VFS: Busy inodes after unmount..." message
and a crash when the inode is finally put.
Solution: store path instead of inode.
This patch fixes two instances in trace_uprobe.c.
Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
Cc: Steven Rostedt <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Howard McLauchlan <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Srikar Dronamraju <[email protected]>
Reported-by: Miklos Szeredi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
---
kernel/trace/trace_uprobe.c | 42 ++++++++++++++----------------------------
1 file changed, 14 insertions(+), 28 deletions(-)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0d450b4..80dfcdf 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -55,7 +55,7 @@ struct trace_uprobe {
struct list_head list;
struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
- struct inode *inode;
+ struct path path;
char *filename;
unsigned long offset;
unsigned long nhit;
@@ -289,7 +289,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
for (i = 0; i < tu->tp.nr_args; i++)
traceprobe_free_probe_arg(&tu->tp.args[i]);
- iput(tu->inode);
+ path_put(&tu->path);
kfree(tu->tp.call.class->system);
kfree(tu->tp.call.name);
kfree(tu->filename);
@@ -363,7 +363,6 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
static int create_trace_uprobe(int argc, char **argv)
{
struct trace_uprobe *tu;
- struct inode *inode;
char *arg, *event, *group, *filename;
char buf[MAX_EVENT_NAME_LEN];
struct path path;
@@ -371,7 +370,6 @@ static int create_trace_uprobe(int argc, char **argv)
bool is_delete, is_return;
int i, ret;
- inode = NULL;
ret = 0;
is_delete = false;
is_return = false;
@@ -448,14 +446,6 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;
- inode = igrab(d_inode(path.dentry));
- path_put(&path);
-
- if (!inode || !S_ISREG(inode->i_mode)) {
- ret = -EINVAL;
- goto fail_address_parse;
- }
-
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
@@ -490,7 +480,8 @@ static int create_trace_uprobe(int argc, char **argv)
goto fail_address_parse;
}
tu->offset = offset;
- tu->inode = inode;
+ tu->path.mnt = path.mnt;
+ tu->path.dentry = path.dentry;
tu->filename = kstrdup(filename, GFP_KERNEL);
if (!tu->filename) {
@@ -558,7 +549,7 @@ static int create_trace_uprobe(int argc, char **argv)
return ret;
fail_address_parse:
- iput(inode);
+ path_put(&path);
pr_info("Failed to parse address or file.\n");
@@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
goto err_flags;
tu->consumer.filter = filter;
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
+ &tu->consumer);
if (ret)
goto err_buffer;
@@ -981,7 +973,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ uprobe_unregister(d_inode(tu->path.dentry), tu->offset, &tu->consumer);
tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
uprobe_buffer_disable();
@@ -1056,7 +1048,8 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
write_unlock(&tu->filter.rwlock);
if (!done)
- return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ return uprobe_apply(d_inode(tu->path.dentry), tu->offset,
+ &tu->consumer, false);
return 0;
}
@@ -1088,7 +1081,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
err = 0;
if (!done) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(d_inode(tu->path.dentry),
+ tu->offset, &tu->consumer, true);
if (err)
uprobe_perf_close(tu, event);
}
@@ -1352,7 +1346,6 @@ struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
{
struct trace_uprobe *tu;
- struct inode *inode;
struct path path;
int ret;
@@ -1360,14 +1353,6 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
if (ret)
return ERR_PTR(ret);
- inode = igrab(d_inode(path.dentry));
- path_put(&path);
-
- if (!inode || !S_ISREG(inode->i_mode)) {
- iput(inode);
- return ERR_PTR(-EINVAL);
- }
-
/*
* local trace_kprobes are not added to probe_list, so they are never
* searched in find_trace_kprobe(). Therefore, there is no concern of
@@ -1383,7 +1368,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
}
tu->offset = offs;
- tu->inode = inode;
+ tu->path.mnt = path.mnt;
+ tu->path.dentry = path.dentry;
tu->filename = kstrdup(name, GFP_KERNEL);
init_trace_event_call(tu, &tu->tp.call);
--
2.9.5
As Miklos reported and suggested:
This pattern repeats two times in trace_uprobe.c and in
kernel/events/core.c as well:
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
goto fail_address_parse;
inode = igrab(d_inode(path.dentry));
path_put(&path);
And it's wrong. You can only hold a reference to the inode if you
have an active ref to the superblock as well (which is normally
through path.mnt) or holding s_umount.
This way unmounting the containing filesystem while the tracepoint is
active will give you the "VFS: Busy inodes after unmount..." message
and a crash when the inode is finally put.
Solution: store path instead of inode.
This patch fixes the issue in kernel/events/core.c.
NOTE: Based on my understanding, perf_addr_filter only supports intel_pt.
However, my test system doesn't support address filtering (or I made a
mistake?). Therefore, I have NOT tested this patch.
Could someone please help test it?
Fixes: 375637bc5249 ("perf/core: Introduce address range filtering")
Cc: Alexander Shishkin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Reported-by: Miklos Szeredi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
---
arch/x86/events/intel/pt.c | 4 ++--
include/linux/perf_event.h | 2 +-
kernel/events/core.c | 21 +++++++++------------
3 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b99394..8d016ce 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;
- if (!filter->inode) {
+ if (!filter->path.dentry) {
if (!valid_kernel_ip(filter->offset))
return -EINVAL;
@@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;
list_for_each_entry(filter, &head->list, entry) {
- if (filter->inode && !offs[range]) {
+ if (filter->path.dentry && !offs[range]) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..88922d8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,7 +467,7 @@ enum perf_addr_filter_action_t {
*/
struct perf_addr_filter {
struct list_head entry;
- struct inode *inode;
+ struct path path;
unsigned long offset;
unsigned long size;
enum perf_addr_filter_action_t action;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7af828..7d711ed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6668,7 +6668,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (filter->inode) {
+ if (filter->path.dentry) {
event->addr_filters_offs[count] = 0;
restart++;
}
@@ -7333,7 +7333,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_inode(file))
return false;
if (filter->offset > offset + size)
@@ -8674,8 +8674,7 @@ static void free_filters_list(struct list_head *filters)
struct perf_addr_filter *filter, *iter;
list_for_each_entry_safe(filter, iter, filters, entry) {
- if (filter->inode)
- iput(filter->inode);
+ path_put(&filter->path);
list_del(&filter->entry);
kfree(filter);
}
@@ -8772,7 +8771,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
- if (filter->inode)
+ if (filter->path.dentry)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -8846,7 +8845,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
{
struct perf_addr_filter *filter = NULL;
char *start, *orig, *filename = NULL;
- struct path path;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
unsigned int kernel = 0;
@@ -8959,19 +8957,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail_free_name;
/* look up the path and grab its inode */
- ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
if (ret)
goto fail_free_name;
- filter->inode = igrab(d_inode(path.dentry));
- path_put(&path);
kfree(filename);
filename = NULL;
ret = -EINVAL;
- if (!filter->inode ||
- !S_ISREG(filter->inode->i_mode))
- /* free_filters_list() will iput() */
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
goto fail;
event->addr_filters.nr_file_filters++;
--
2.9.5
On Wed, Apr 18, 2018 at 8:29 AM, Song Liu <[email protected]> wrote:
> As Miklos reported and suggested:
>
> This pattern repeats two times in trace_uprobe.c and in
> kernel/events/core.c as well:
>
> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
> if (ret)
> goto fail_address_parse;
>
> inode = igrab(d_inode(path.dentry));
> path_put(&path);
>
> And it's wrong. You can only hold a reference to the inode if you
> have an active ref to the superblock as well (which is normally
> through path.mnt) or holding s_umount.
>
> This way unmounting the containing filesystem while the tracepoint is
> active will give you the "VFS: Busy inodes after unmount..." message
> and a crash when the inode is finally put.
>
> Solution: store path instead of inode.
>
> This patch fixes two instances in trace_uprobe.c.
>
> Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
> Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
> Cc: Steven Rostedt <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Howard McLauchlan <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Srikar Dronamraju <[email protected]>
> Reported-by: Miklos Szeredi <[email protected]>
> Signed-off-by: Song Liu <[email protected]>
> ---
> kernel/trace/trace_uprobe.c | 42 ++++++++++++++----------------------------
> 1 file changed, 14 insertions(+), 28 deletions(-)
>
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 0d450b4..80dfcdf 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -55,7 +55,7 @@ struct trace_uprobe {
> struct list_head list;
> struct trace_uprobe_filter filter;
> struct uprobe_consumer consumer;
> - struct inode *inode;
> + struct path path;
> char *filename;
> unsigned long offset;
> unsigned long nhit;
> @@ -289,7 +289,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
> for (i = 0; i < tu->tp.nr_args; i++)
> traceprobe_free_probe_arg(&tu->tp.args[i]);
>
> - iput(tu->inode);
> + path_put(&tu->path);
> kfree(tu->tp.call.class->system);
> kfree(tu->tp.call.name);
> kfree(tu->filename);
> @@ -363,7 +363,6 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
> static int create_trace_uprobe(int argc, char **argv)
> {
> struct trace_uprobe *tu;
> - struct inode *inode;
> char *arg, *event, *group, *filename;
> char buf[MAX_EVENT_NAME_LEN];
> struct path path;
> @@ -371,7 +370,6 @@ static int create_trace_uprobe(int argc, char **argv)
> bool is_delete, is_return;
> int i, ret;
>
> - inode = NULL;
> ret = 0;
> is_delete = false;
> is_return = false;
> @@ -448,14 +446,6 @@ static int create_trace_uprobe(int argc, char **argv)
> if (ret)
> goto fail_address_parse;
>
> - inode = igrab(d_inode(path.dentry));
This is not against -linus tree.
> - path_put(&path);
> -
> - if (!inode || !S_ISREG(inode->i_mode)) {
> - ret = -EINVAL;
> - goto fail_address_parse;
> - }
> -
> ret = kstrtoul(arg, 0, &offset);
> if (ret)
> goto fail_address_parse;
> @@ -490,7 +480,8 @@ static int create_trace_uprobe(int argc, char **argv)
> goto fail_address_parse;
> }
> tu->offset = offset;
> - tu->inode = inode;
> + tu->path.mnt = path.mnt;
> + tu->path.dentry = path.dentry;
You can just assign the whole structure. No need to mess with
individual members.
tu->path = path;
> tu->filename = kstrdup(filename, GFP_KERNEL);
>
> if (!tu->filename) {
> @@ -558,7 +549,7 @@ static int create_trace_uprobe(int argc, char **argv)
> return ret;
>
> fail_address_parse:
> - iput(inode);
> + path_put(&path);
>
> pr_info("Failed to parse address or file.\n");
>
> @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
> goto err_flags;
>
> tu->consumer.filter = filter;
> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
> + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
> + &tu->consumer);
It is not entirely clear how the lifetime of uprobe relates to the
lifetime of trace_uprobe. Is the uprobe object never going to survive
its creator trace_uprobe object?
If that's the case, it warrants a comment. If that's not the case,
then the path would need to be passed to uprobe_register() which would
need to obtain its own reference.
> if (ret)
> goto err_buffer;
>
> @@ -981,7 +973,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
>
> WARN_ON(!uprobe_filter_is_empty(&tu->filter));
>
> - uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
> + uprobe_unregister(d_inode(tu->path.dentry), tu->offset, &tu->consumer);
> tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
>
> uprobe_buffer_disable();
> @@ -1056,7 +1048,8 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
> write_unlock(&tu->filter.rwlock);
>
> if (!done)
> - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
> + return uprobe_apply(d_inode(tu->path.dentry), tu->offset,
> + &tu->consumer, false);
>
> return 0;
> }
> @@ -1088,7 +1081,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
>
> err = 0;
> if (!done) {
> - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
> + err = uprobe_apply(d_inode(tu->path.dentry),
> + tu->offset, &tu->consumer, true);
> if (err)
> uprobe_perf_close(tu, event);
> }
> @@ -1352,7 +1346,6 @@ struct trace_event_call *
> create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
> {
> struct trace_uprobe *tu;
> - struct inode *inode;
> struct path path;
> int ret;
>
> @@ -1360,14 +1353,6 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
> if (ret)
> return ERR_PTR(ret);
>
> - inode = igrab(d_inode(path.dentry));
> - path_put(&path);
> -
> - if (!inode || !S_ISREG(inode->i_mode)) {
> - iput(inode);
> - return ERR_PTR(-EINVAL);
> - }
> -
> /*
> * local trace_kprobes are not added to probe_list, so they are never
> * searched in find_trace_kprobe(). Therefore, there is no concern of
> @@ -1383,7 +1368,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
> }
>
> tu->offset = offs;
> - tu->inode = inode;
> + tu->path.mnt = path.mnt;
> + tu->path.dentry = path.dentry;
tu->path = path
> tu->filename = kstrdup(name, GFP_KERNEL);
> init_trace_event_call(tu, &tu->tp.call);
>
> --
> 2.9.5
>
On Wed, 18 Apr 2018 16:03:42 +0200
Miklos Szeredi <[email protected]> wrote:
> > @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
> > goto err_flags;
> >
> > tu->consumer.filter = filter;
> > - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
> > + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
> > + &tu->consumer);
>
> It is not entirely clear how the lifetime of uprobe relates to the
> lifetime of trace_uprobe. Is the uprobe object never going to survive
> its creator trace_uprobe object?
Not exactly sure what you mean here.
The trace_uprobe (the probe event) is created, it doesn't do anything
until it is enabled. This function is called when it is enabled. The
trace_uprobe (probe event) can not be deleted while it is enabled
(EBUSY).
Are you asking what happens if the file is deleted while it has a probe?
That I don't know about (haven't tried it out). But I would hope that
it keeps a reference to the inode, isn't that what the igrab is for?
And is now being replaced by a reference on the path, or is that the
problem?
-- Steve
>
> If that's the case, it warrants a comment. If that's not the case,
> then the path would need to be passed to uprobe_resister() which would
> need to obtain its own reference.
>
> > if (ret)
> > goto err_buffer;
> >
On Wed, Apr 18, 2018 at 4:25 PM, Steven Rostedt <[email protected]> wrote:
> On Wed, 18 Apr 2018 16:03:42 +0200
> Miklos Szeredi <[email protected]> wrote:
>
>> > @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
>> > goto err_flags;
>> >
>> > tu->consumer.filter = filter;
>> > - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
>> > + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
>> > + &tu->consumer);
>>
>> It is not entirely clear how the lifetime of uprobe relates to the
>> lifetime of trace_uprobe. Is the uprobe object never going to survive
>> its creator trace_uprobe object?
>
> Not exactly sure what you mean here.
>
> The trace_uprobe (the probe event) is created, it doesn't do anything
> until it is enabled. This function is called when it is enabled. The
> trace_uprobe (probe event) can not be deleted while it is enabled
> (EBUSY).
>
> Are you asking what happens if the file is deleted while it has probe?
> That I don't know about (haven't tried it out). But I would hope that
> it keeps a reference to the inode, isn't that what the igrab is for?
> And is now being replaced by a reference on the path, or is that the
> problem?
No, that's not the problem.
What I don't see is how the uprobe object relates to the trace_uprobe object.
Because after the patch the uprobe object still only has a ref to the
inode, and that can lead to the same issue as with trace_uprobe.
OTOH if uprobe can't survive its creating trace_uprobe, then it
doesn't need to take a ref to the inode at all, since trace_uprobe
already holds it. Taking an extra ref isn't incorrect, it's just
unnecessary and confusing.
So this needs to be cleared up in some way.
Thanks,
Miklos
On Wed, 18 Apr 2018 16:40:19 +0200
Miklos Szeredi <[email protected]> wrote:
> > The trace_uprobe (the probe event) is created, it doesn't do anything
> > until it is enabled. This function is called when it is enabled. The
> > trace_uprobe (probe event) can not be deleted while it is enabled
> > (EBUSY).
> >
> > Are you asking what happens if the file is deleted while it has probe?
> > That I don't know about (haven't tried it out). But I would hope that
> > it keeps a reference to the inode, isn't that what the igrab is for?
> > And is now being replaced by a reference on the path, or is that the
> > problem?
>
> No, that's not the problem.
>
> What I don't see is how the uprobe object relates to the trace_uprobe object.
>
> Because after the patch the uprobe object still only has a ref to the
> inode, and that can lead to the same issue as with trace_uprobe.
> OTOH if uprobe can't survive its creating trace_uprobe, then it
> doesn't need to take a ref to the inode at all, since trace_uprobe
> already holds it. Taking an extra ref isn't incorrect, it's just
> unnecessary and confusing.
>
> So this needs to be cleared up in some way.
The uprobe created by the trace_uprobe creation must be deleted before
the trace_uprobe can be deleted. Basically we have this:
# cd /sys/kernel/tracing
# echo "uprobe creation text" > uprobe_events
The trace_uprobe is created (but not the uprobe itself). This is what
calls create_trace_uprobe().
# echo 1 > events/uprobes/enable
This enables all the trace uprobe events, which creates the uprobes.
This is the action that calls probe_event_enable(), which creates
uprobes.
At this point, any write to uprobe_events that would destroy the trace
uprobes would return with -EBUSY, and the trace uprobes will not be
deleted.
# echo 0 > events/uprobes/enable
This will call the probe_event_disable() which will call
uprobe_unregister() which will destroy the uprobe.
Now we can delete the trace uprobe.
Does that answer your question? A uprobe created for trace uprobes can
not survive the trace uprobe itself.
-- Steve
> On Apr 18, 2018, at 7:03 AM, Miklos Szeredi <[email protected]> wrote:
>
> On Wed, Apr 18, 2018 at 8:29 AM, Song Liu <[email protected]> wrote:
>> As Miklos reported and suggested:
>>
>> This pattern repeats two times in trace_uprobe.c and in
>> kernel/events/core.c as well:
>>
>> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
>> if (ret)
>> goto fail_address_parse;
>>
>> inode = igrab(d_inode(path.dentry));
>> path_put(&path);
>>
>> And it's wrong. You can only hold a reference to the inode if you
>> have an active ref to the superblock as well (which is normally
>> through path.mnt) or holding s_umount.
>>
>> This way unmounting the containing filesystem while the tracepoint is
>> active will give you the "VFS: Busy inodes after unmount..." message
>> and a crash when the inode is finally put.
>>
>> Solution: store path instead of inode.
>>
>> This patch fixes two instances in trace_uprobe.c.
>>
>> Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
>> Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
>> Cc: Steven Rostedt <[email protected]>
>> Cc: Ingo Molnar <[email protected]>
>> Cc: Howard McLauchlan <[email protected]>
>> Cc: Josef Bacik <[email protected]>
>> Cc: Srikar Dronamraju <[email protected]>
>> Reported-by: Miklos Szeredi <[email protected]>
>> Signed-off-by: Song Liu <[email protected]>
>> ---
>> kernel/trace/trace_uprobe.c | 42 ++++++++++++++----------------------------
>> 1 file changed, 14 insertions(+), 28 deletions(-)
>>
>> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
>> index 0d450b4..80dfcdf 100644
>> --- a/kernel/trace/trace_uprobe.c
>> +++ b/kernel/trace/trace_uprobe.c
>> @@ -55,7 +55,7 @@ struct trace_uprobe {
>> struct list_head list;
>> struct trace_uprobe_filter filter;
>> struct uprobe_consumer consumer;
>> - struct inode *inode;
>> + struct path path;
>> char *filename;
>> unsigned long offset;
>> unsigned long nhit;
>> @@ -289,7 +289,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
>> for (i = 0; i < tu->tp.nr_args; i++)
>> traceprobe_free_probe_arg(&tu->tp.args[i]);
>>
>> - iput(tu->inode);
>> + path_put(&tu->path);
>> kfree(tu->tp.call.class->system);
>> kfree(tu->tp.call.name);
>> kfree(tu->filename);
>> @@ -363,7 +363,6 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
>> static int create_trace_uprobe(int argc, char **argv)
>> {
>> struct trace_uprobe *tu;
>> - struct inode *inode;
>> char *arg, *event, *group, *filename;
>> char buf[MAX_EVENT_NAME_LEN];
>> struct path path;
>> @@ -371,7 +370,6 @@ static int create_trace_uprobe(int argc, char **argv)
>> bool is_delete, is_return;
>> int i, ret;
>>
>> - inode = NULL;
>> ret = 0;
>> is_delete = false;
>> is_return = false;
>> @@ -448,14 +446,6 @@ static int create_trace_uprobe(int argc, char **argv)
>> if (ret)
>> goto fail_address_parse;
>>
>> - inode = igrab(d_inode(path.dentry));
>
> This is not against -linus tree.
These patches are against tip/perf/core. I can also send version against
-linus tree.
>
>> - path_put(&path);
>> -
>> - if (!inode || !S_ISREG(inode->i_mode)) {
>> - ret = -EINVAL;
>> - goto fail_address_parse;
>> - }
>> -
>> ret = kstrtoul(arg, 0, &offset);
>> if (ret)
>> goto fail_address_parse;
>> @@ -490,7 +480,8 @@ static int create_trace_uprobe(int argc, char **argv)
>> goto fail_address_parse;
>> }
>> tu->offset = offset;
>> - tu->inode = inode;
>> + tu->path.mnt = path.mnt;
>> + tu->path.dentry = path.dentry;
>
> You can just assign the whole structure. No need to mess with
> individual members.
>
> tu->path = path;
Will fix in v2.
>
>> tu->filename = kstrdup(filename, GFP_KERNEL);
>>
>> if (!tu->filename) {
>> @@ -558,7 +549,7 @@ static int create_trace_uprobe(int argc, char **argv)
>> return ret;
>>
>> fail_address_parse:
>> - iput(inode);
>> + path_put(&path);
>>
>> pr_info("Failed to parse address or file.\n");
>>
>> @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
>> goto err_flags;
>>
>> tu->consumer.filter = filter;
>> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
>> + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
>> + &tu->consumer);
>
> It is not entirely clear how the lifetime of uprobe relates to the
> lifetime of trace_uprobe. Is the uprobe object never going to survive
> its creator trace_uprobe object?
>
> If that's the case, it warrants a comment. If that's not the case,
> then the path would need to be passed to uprobe_resister() which would
> need to obtain its own reference.
trace_uprobe will not be freed before the uprobe object. trace_uprobe
holds reference to struct path (with path_get()).
>
>> if (ret)
>> goto err_buffer;
>>
>> @@ -981,7 +973,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
>>
>> WARN_ON(!uprobe_filter_is_empty(&tu->filter));
>>
>> - uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
>> + uprobe_unregister(d_inode(tu->path.dentry), tu->offset, &tu->consumer);
>> tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
>>
>> uprobe_buffer_disable();
>> @@ -1056,7 +1048,8 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
>> write_unlock(&tu->filter.rwlock);
>>
>> if (!done)
>> - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
>> + return uprobe_apply(d_inode(tu->path.dentry), tu->offset,
>> + &tu->consumer, false);
>>
>> return 0;
>> }
>> @@ -1088,7 +1081,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
>>
>> err = 0;
>> if (!done) {
>> - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
>> + err = uprobe_apply(d_inode(tu->path.dentry),
>> + tu->offset, &tu->consumer, true);
>> if (err)
>> uprobe_perf_close(tu, event);
>> }
>> @@ -1352,7 +1346,6 @@ struct trace_event_call *
>> create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
>> {
>> struct trace_uprobe *tu;
>> - struct inode *inode;
>> struct path path;
>> int ret;
>>
>> @@ -1360,14 +1353,6 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
>> if (ret)
>> return ERR_PTR(ret);
>>
>> - inode = igrab(d_inode(path.dentry));
>> - path_put(&path);
>> -
>> - if (!inode || !S_ISREG(inode->i_mode)) {
>> - iput(inode);
>> - return ERR_PTR(-EINVAL);
>> - }
>> -
>> /*
>> * local trace_kprobes are not added to probe_list, so they are never
>> * searched in find_trace_kprobe(). Therefore, there is no concern of
>> @@ -1383,7 +1368,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
>> }
>>
>> tu->offset = offs;
>> - tu->inode = inode;
>> + tu->path.mnt = path.mnt;
>> + tu->path.dentry = path.dentry;
>
> tu->path = path
>
>
>> tu->filename = kstrdup(name, GFP_KERNEL);
>> init_trace_event_call(tu, &tu->tp.call);
>>
>> --
>> 2.9.5
>>
> On Apr 18, 2018, at 7:25 AM, Steven Rostedt <[email protected]> wrote:
>
> On Wed, 18 Apr 2018 16:03:42 +0200
> Miklos Szeredi <[email protected]> wrote:
>
>>> @@ -937,7 +928,8 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
>>> goto err_flags;
>>>
>>> tu->consumer.filter = filter;
>>> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
>>> + ret = uprobe_register(d_inode(tu->path.dentry), tu->offset,
>>> + &tu->consumer);
>>
>> It is not entirely clear how the lifetime of uprobe relates to the
>> lifetime of trace_uprobe. Is the uprobe object never going to survive
>> its creator trace_uprobe object?
>
> Not exactly sure what you mean here.
>
> The trace_uprobe (the probe event) is created, it doesn't do anything
> until it is enabled. This function is called when it is enabled. The
> trace_uprobe (probe event) can not be deleted while it is enabled
> (EBUSY).
>
> Are you asking what happens if the file is deleted while it has probe?
> That I don't know about (haven't tried it out). But I would hope that
> it keeps a reference to the inode, isn't that what the igrab is for?
> And is now being replaced by a reference on the path, or is that the
> problem?
>
> -- Steve
>
Just as Miklos pointed out, I ran tests with the uprobe and confirmed
that igrab() is not sufficient to prevent umount. When we change it to
path_get()/path_put(), umount will abort because of the trace_uprobe.
Song
>>
>> If that's the case, it warrants a comment. If that's not the case,
>> then the path would need to be passed to uprobe_resister() which would
>> need to obtain its own reference.
>>
>>> if (ret)
>>> goto err_buffer;
>>>
On Wed, 18 Apr 2018 16:08:50 +0000
Song Liu <[email protected]> wrote:
> > This is not against -linus tree.
>
> These patches are against tip/perf/core. I can also send version against
> -linus tree.
I can take this patch in my tree (which is currently equal to
4.17-rc1). The other patch should go through tip.
-- Steve
On Tue, Apr 17, 2018 at 11:29:07PM -0700, Song Liu wrote:
> As Miklos reported and suggested:
>
> This pattern repeats two times in trace_uprobe.c and in
> kernel/events/core.c as well:
>
> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
> if (ret)
> goto fail_address_parse;
>
> inode = igrab(d_inode(path.dentry));
> path_put(&path);
>
> And it's wrong. You can only hold a reference to the inode if you
> have an active ref to the superblock as well (which is normally
> through path.mnt) or holding s_umount.
Oops. I must have snatched it from the uprobe code without thinking.
> This way unmounting the containing filesystem while the tracepoint is
> active will give you the "VFS: Busy inodes after unmount..." message
> and a crash when the inode is finally put.
>
> Solution: store path instead of inode.
>
> This patch fixes the issue in kernel/event/core.c.
>
> NOTE: Based on my understanding, perf_addr_filter only supports intel_pt.
Coresight too, but that's probably even further away from what you have.
> However, my test system doesn't support address filtering (or I made a
> mistake?). Therefore, I have NOT tested this patch.
Check /sys/devices/intel_pt/caps/num_address_ranges, if it's non-zero,
it's supported.
> Could someone please help test it?
Yes:
Reviewed-and-tested-by: Alexander Shishkin <[email protected]>
The subject line needs a little love to be more like other perf commits, but
other than that, looks good.
Thanks!
> On Apr 18, 2018, at 11:17 PM, Alexander Shishkin <[email protected]> wrote:
>
> On Tue, Apr 17, 2018 at 11:29:07PM -0700, Song Liu wrote:
>> As Miklos reported and suggested:
>>
>> This pattern repeats two times in trace_uprobe.c and in
>> kernel/events/core.c as well:
>>
>> ret = kern_path(filename, LOOKUP_FOLLOW, &path);
>> if (ret)
>> goto fail_address_parse;
>>
>> inode = igrab(d_inode(path.dentry));
>> path_put(&path);
>>
>> And it's wrong. You can only hold a reference to the inode if you
>> have an active ref to the superblock as well (which is normally
>> through path.mnt) or holding s_umount.
>
> Oops. I must have snatched it from the uprobe code without thinking.
>
>> This way unmounting the containing filesystem while the tracepoint is
>> active will give you the "VFS: Busy inodes after unmount..." message
>> and a crash when the inode is finally put.
>>
>> Solution: store path instead of inode.
>>
>> This patch fixes the issue in kernel/event/core.c.
>>
>> NOTE: Based on my understanding, perf_addr_filter only supports intel_pt.
>
> Coresight too, but that's probably even further away from what you have.
>
>> However, my test system doesn't support address filtering (or I made a
>> mistake?). Therefore, I have NOT tested this patch.
>
> Check /sys/devices/intel_pt/caps/num_address_ranges, if it's non-zero,
> it's supported.
>
>> Could someone please help test it?
>
> Yes:
>
> Reviewed-and-tested-by: Alexander Shishkin <[email protected]>
>
> The subject line needs a little love to be more like other perf commits, but
> other than that, looks good.
>
> Thanks!
Did this patch ever make it into tip/perf/XX trees? If not, what shall I do
to move it ahead?
Thanks,
Song
On Tue, May 22, 2018 at 09:56:18PM +0000, Song Liu wrote:
> Did this patch ever make into tip/perf/XX trees? If not, what shall I do
> to move it ahead?
Got it now, thanks for the reminder.
Commit-ID: 9511bce9fe8e5e6c0f923c09243a713eba560141
Gitweb: https://git.kernel.org/tip/9511bce9fe8e5e6c0f923c09243a713eba560141
Author: Song Liu <[email protected]>
AuthorDate: Tue, 17 Apr 2018 23:29:07 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 25 May 2018 08:11:10 +0200
perf/core: Fix bad use of igrab()
As Miklos reported and suggested:
"This pattern repeats two times in trace_uprobe.c and in
kernel/events/core.c as well:
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
goto fail_address_parse;
inode = igrab(d_inode(path.dentry));
path_put(&path);
And it's wrong. You can only hold a reference to the inode if you
have an active ref to the superblock as well (which is normally
through path.mnt) or holding s_umount.
This way unmounting the containing filesystem while the tracepoint is
active will give you the "VFS: Busy inodes after unmount..." message
and a crash when the inode is finally put.
Solution: store path instead of inode."
This patch fixes the issue in kernel/events/core.c.
Reviewed-and-tested-by: Alexander Shishkin <[email protected]>
Reported-by: Miklos Szeredi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Fixes: 375637bc5249 ("perf/core: Introduce address range filtering")
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/events/intel/pt.c | 4 ++--
include/linux/perf_event.h | 2 +-
kernel/events/core.c | 21 +++++++++------------
3 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b993942a0e4..8d016ce5b80d 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;
- if (!filter->inode) {
+ if (!filter->path.dentry) {
if (!valid_kernel_ip(filter->offset))
return -EINVAL;
@@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;
list_for_each_entry(filter, &head->list, entry) {
- if (filter->inode && !offs[range]) {
+ if (filter->path.dentry && !offs[range]) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index def866f7269b..bea0b0cd4bf7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,7 +467,7 @@ enum perf_addr_filter_action_t {
*/
struct perf_addr_filter {
struct list_head entry;
- struct inode *inode;
+ struct path path;
unsigned long offset;
unsigned long size;
enum perf_addr_filter_action_t action;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ce6aa5ff3c96..24dea13a27ed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6668,7 +6668,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (filter->inode) {
+ if (filter->path.dentry) {
event->addr_filters_offs[count] = 0;
restart++;
}
@@ -7333,7 +7333,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_inode(file))
return false;
if (filter->offset > offset + size)
@@ -8686,8 +8686,7 @@ static void free_filters_list(struct list_head *filters)
struct perf_addr_filter *filter, *iter;
list_for_each_entry_safe(filter, iter, filters, entry) {
- if (filter->inode)
- iput(filter->inode);
+ path_put(&filter->path);
list_del(&filter->entry);
kfree(filter);
}
@@ -8784,7 +8783,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
- if (filter->inode)
+ if (filter->path.dentry)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -8858,7 +8857,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
{
struct perf_addr_filter *filter = NULL;
char *start, *orig, *filename = NULL;
- struct path path;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
unsigned int kernel = 0;
@@ -8971,19 +8969,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail_free_name;
/* look up the path and grab its inode */
- ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
if (ret)
goto fail_free_name;
- filter->inode = igrab(d_inode(path.dentry));
- path_put(&path);
kfree(filename);
filename = NULL;
ret = -EINVAL;
- if (!filter->inode ||
- !S_ISREG(filter->inode->i_mode))
- /* free_filters_list() will iput() */
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
goto fail;
event->addr_filters.nr_file_filters++;