When toying around with debugfs, intentionally trying to break things,
I managed to get it into a reproducible endless loop when cleaning up.
debugfs_remove_recursive() completely ignores that entries found
on ->d_subdirs may already be d_delete()d, but not yet d_kill()ed.
In this case, the first two entries found have both been such dentries
(d_iname = ".", btw), while later in the list there was still a real,
not yet deleted entry.
That way, the "goto next_sibling;" did not catch this case,
the "break the infinit loop if we fail to remove something"
did not catch it either.
Disclaimer: I'm not a dentries and inodes guy...
Still, I think the "is this child a non-empty subdir" check
was just wrong. This is my fix:
- if (list_empty(&child->d_subdirs))
+ if (!simple_emty(child))
Also, always trying to __debugfs_remove() the first entry found from
parent->d_subdirs.next is wrong, we need to skip over any already
deleted children. I introduced the debugfs_find_next_positive_child()
helper for this.
If I understand it correctly, if we do it this way, it can never fail.
That is, as long as we can be sure that no new dentries will be created
while we are in debugfs_remove_recursive().
So the
if (debugfs_positive(child)) {
/*
* Avoid infinite loop if we fail to remove
* one dentry.
is probably dead code now?
As an additional fix, to prevent an access after free and resulting Oops,
I serialize dereferencing of attr->get/set and attr->data with d_delete(),
using the file->f_dentry->d_parent->d_inode->i_mutex.
If this file->f_dentry meanwhile has been deleted, simple_attr_read()
and simple_attr_write() now returns -ESTALE. (Any other error code preferences?)
With this patch, I was able to
cd /sys/debugfs/test-module/some/dir
exec 7< some-file
rmmod test-module
cat <&7
without any infinite loops, hangs, oopses or other problems,
and as expected get an ESTALE for the cat.
Without the patch, I'll get either an infinite loop and rmmod never
terminates, or cat oopses.
If you think this is correct, please comment on the FIXME
below, and help me write a nice commit message.
If you think this is still wrong or even makes things worse,
please help me with a proper fix ;-)
Patch is against upstream as of yesterday,
but looks like it still applies way back into 2009, 2.6.3x,
so if it is correct, it may even qualify for the stable branches?
Cheers,
Lars
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92..fc80900 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -520,6 +520,19 @@ void debugfs_remove(struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(debugfs_remove);
+static struct dentry *debugfs_find_next_positive_child(struct dentry *parent)
+{
+ struct dentry *tmp;
+ list_for_each_entry(tmp, &parent->d_subdirs, d_u.d_child) {
+ if (debugfs_positive(tmp))
+ return tmp;
+ pr_debug("debugfs_remove_recursive: skipped over %.32s(%p)/%.32s(%p)\n",
+ parent->d_iname, parent,
+ tmp->d_iname, tmp);
+ }
+ return NULL;
+}
+
/**
* debugfs_remove_recursive - recursively removes a directory
* @dentry: a pointer to a the dentry of the directory to be removed.
@@ -549,42 +562,36 @@ void debugfs_remove_recursive(struct dentry *dentry)
while (1) {
/*
- * When all dentries under "parent" has been removed,
+ * Skip over any already d_delete()d,
+ * but not yet d_kill()ed children.
+ */
+ child = debugfs_find_next_positive_child(parent);
+
+ /*
+ * When all dentries under "parent" have been removed,
* walk up the tree until we reach our starting point.
*/
- if (list_empty(&parent->d_subdirs)) {
+ if (!child) {
mutex_unlock(&parent->d_inode->i_mutex);
if (parent == dentry)
break;
parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
+ continue;
}
- child = list_entry(parent->d_subdirs.next, struct dentry,
- d_u.d_child);
- next_sibling:
/*
* If "child" isn't empty, walk down the tree and
* remove all its descendants first.
*/
- if (!list_empty(&child->d_subdirs)) {
+ if (!simple_empty(child)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
mutex_lock(&parent->d_inode->i_mutex);
continue;
}
__debugfs_remove(child, parent);
- if (parent->d_subdirs.next == &child->d_u.d_child) {
- /*
- * Try the next sibling.
- */
- if (child->d_u.d_child.next != &parent->d_subdirs) {
- child = list_entry(child->d_u.d_child.next,
- struct dentry,
- d_u.d_child);
- goto next_sibling;
- }
-
+ if (debugfs_positive(child)) {
/*
* Avoid infinite loop if we fail to remove
* one dentry.
diff --git a/fs/libfs.c b/fs/libfs.c
index 7cc37ca..bc5f3f3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -794,8 +794,24 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
if (*ppos) { /* continued read */
size = strlen(attr->get_buf);
} else { /* first read */
+ struct dentry *parent;
u64 val;
+
+ ret = -ESTALE;
+ parent = file->f_dentry->d_parent;
+ /* FIXME do I need to check this? */
+ if (!parent || !parent->d_inode)
+ goto out;
+
+ /* serialize with d_delete() */
+ mutex_lock(&parent->d_inode->i_mutex);
+ if (!simple_positive(file->f_dentry)) {
+ mutex_unlock(&parent->d_inode->i_mutex);
+ goto out;
+ }
+
ret = attr->get(attr->data, &val);
+ mutex_unlock(&parent->d_inode->i_mutex);
if (ret)
goto out;
@@ -813,6 +829,7 @@ out:
ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
+ struct dentry *parent;
struct simple_attr *attr;
u64 val;
size_t size;
@@ -833,7 +850,22 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
attr->set_buf[size] = '\0';
val = simple_strtoll(attr->set_buf, NULL, 0);
+
+ ret = -ESTALE;
+ parent = file->f_dentry->d_parent;
+ /* FIXME do I need to check this? */
+ if (!parent || !parent->d_inode)
+ goto out;
+
+ /* serialize with d_delete() */
+ mutex_lock(&parent->d_inode->i_mutex);
+ if (!simple_positive(file->f_dentry)) {
+ mutex_unlock(&parent->d_inode->i_mutex);
+ goto out;
+ }
+
ret = attr->set(attr->data, val);
+ mutex_unlock(&parent->d_inode->i_mutex);
if (ret == 0)
ret = len; /* on success, claim we got the whole input */
out:
On Fri, 2012-11-23 at 18:15 +0100, Lars Ellenberg wrote:
> When toying around with debugfs, intentionally trying to break things,
> I managed to get it into a reproducible endless loop when cleaning up.
>
> debugfs_remove_recursive() completely ignores that entries found
> on ->d_subdirs may already be d_delete()d, but not yet d_kill()ed.
>
> In this case, the first two entries found have both been such dentries
> (d_iname = ".", btw), while later in the list there was still a real,
> not yet deleted entry.
>
> That way, the "goto next_sibling;" did not catch this case,
> the "break the infinit loop if we fail to remove something"
> did not catch it either.
>
>
> Disclaimer: I'm not a dentries and inodes guy...
I'm not a dentries or inodes guy either, so I wont comment on the actual
merits of this patch.
>
> Still, I think the "is this child a non-empty subdir" check
> was just wrong. This is my fix:
> - if (list_empty(&child->d_subdirs))
> + if (!simple_emty(child))
"simple_empty"
>
> Also, always trying to __debugfs_remove() the first entry found from
> parent->d_subdirs.next is wrong, we need to skip over any already
> deleted children. I introduced the debugfs_find_next_positive_child()
> helper for this.
>
> If I understand it correctly, if we do it this way, it can never fail.
> That is, as long as we can be sure that no new dentries will be created
> while we are in debugfs_remove_recursive().
> So the
> if (debugfs_positive(child)) {
> /*
> * Avoid infinite loop if we fail to remove
> * one dentry.
> is probably dead code now?
>
>
> As an additional fix, to prevent an access after free and resulting Oops,
> I serialize dereferencing of attr->get/set and attr->data with d_delete(),
> using the file->f_dentry->d_parent->d_inode->i_mutex.
>
> If this file->f_dentry meanwhile has been deleted, simple_attr_read()
> and simple_attr_write() now returns -ESTALE. (Any other error code preferences?)
>
>
> With this patch, I was able to
> cd /sys/debugfs/test-module/some/dir
> exec 7< some-file
> rmmod test-module
> cat <&7
I saw this and thought "hmm, I wonder if the trace_events have issues,
as they create debugfs directories and files via modules too". I went
and tried to reproduce but couldn't get passed the rmmod, as the module
count gets incremented for any open files that the module creates. I
take it that you didn't add that feature to your test module.
>
> without any infinite loops, hangs, oopses or other problems,
> and as expected get an ESTALE for the cat.
>
> Without the patch, I'll get either an infinite loop and rmmod never
> terminates, or cat oopses.
>
>
> If you think this is correct, please comment on the FIXME
> below, and help me write a nice commit message.
>
> If you think this is still wrong or even makes things worse,
> please help me with a proper fix ;-)
>
>
> Patch is against upstream as of yesterday,
> but looks like it still applies way back into 2009, 2.6.3x,
> so if it is correct, it may even qualify for the stable branches?
>
Now, is there any current user of debugfs that is susceptible for this
bug? I'm not saying that these issues shouldn't be fixed. But I'm also
concerned about exploits and other things that just a root user may
accidentally cause harm. If there's current problem then maybe this
isn't needed for stable. But should probably be fixed for the future.
-- Steve
On Tue, Nov 27, 2012 at 10:16:28PM -0500, Steven Rostedt wrote:
> On Fri, 2012-11-23 at 18:15 +0100, Lars Ellenberg wrote:
> > When toying around with debugfs, intentionally trying to break things,
> > I managed to get it into a reproducible endless loop when cleaning up.
> >
> > debugfs_remove_recursive() completely ignores that entries found
> > on ->d_subdirs may already be d_delete()d, but not yet d_kill()ed.
> >
> > In this case, the first two entries found have both been such dentries
> > (d_iname = ".", btw), while later in the list there was still a real,
> > not yet deleted entry.
> >
> > That way, the "goto next_sibling;" did not catch this case,
> > the "break the infinit loop if we fail to remove something"
> > did not catch it either.
> >
> >
> > Disclaimer: I'm not a dentries and inodes guy...
>
> I'm not a dentries or inodes guy either, so I wont comment on the actual
> merits of this patch.
Though you submitted the "next_sibling" code back in 2009, right?
That's why you ended up on Cc.
I suspect that even back then, the real problem was
"already dead but still linked" dentries, as below:
> > Still, I think the "is this child a non-empty subdir" check
> > was just wrong. This is my fix:
> > - if (list_empty(&child->d_subdirs))
> > + if (!simple_emty(child))
>
> "simple_empty"
Of course. I mistyped both lines in the email body :-/
In the patch below it was correct:
- if (!list_empty(&child->d_subdirs)) {
+ if (!simple_empty(child)) {
> > Also, always trying to __debugfs_remove() the first entry found from
> > parent->d_subdirs.next is wrong, we need to skip over any already
> > deleted children. I introduced the debugfs_find_next_positive_child()
> > helper for this.
> >
> > If I understand it correctly, if we do it this way, it can never fail.
> > That is, as long as we can be sure that no new dentries will be created
> > while we are in debugfs_remove_recursive().
> > So the
> > if (debugfs_positive(child)) {
> > /*
> > * Avoid infinite loop if we fail to remove
> > * one dentry.
> > is probably dead code now?
> >
> >
> > As an additional fix, to prevent an access after free and resulting Oops,
> > I serialize dereferencing of attr->get/set and attr->data with d_delete(),
> > using the file->f_dentry->d_parent->d_inode->i_mutex.
> >
> > If this file->f_dentry meanwhile has been deleted, simple_attr_read()
> > and simple_attr_write() now returns -ESTALE. (Any other error code preferences?)
> >
> >
> > With this patch, I was able to
> > cd /sys/debugfs/test-module/some/dir
> > exec 7< some-file
> > rmmod test-module
> > cat <&7
>
> I saw this and thought "hmm, I wonder if the trace_events have issues,
> as they create debugfs directories and files via modules too". I went
> and tried to reproduce but couldn't get passed the rmmod, as the module
> count gets incremented for any open files that the module creates.
Is that so.
> I take it that you didn't add that feature to your test module.
If you use the "debugfs_create_u32" (and similar "_<simple attribute>")
wrappers, you don't get a chance to do so from the attribute callbacks,
all callbacks are predefined, you just submit a pointer.
You mean something like this, or am I missing something?
__module_get();
debugfs_create_u32();
And later
debugfs_remove_recursive();
module_put();
You still have the race between someone opening some attribute file,
you removing the attribute and releasing your refcount, then the other
guy dereferencing that pointer in the read() call.
I think you can only deal with that race,
if you provide your own file_operations for all your attributes.
I'm about to add some such debugfs entries for our driver, and would
like to avoid re-implementing all of the simple_attribute fops.
> > without any infinite loops, hangs, oopses or other problems,
> > and as expected get an ESTALE for the cat.
> >
> > Without the patch, I'll get either an infinite loop and rmmod never
> > terminates, or cat oopses.
> >
> >
> > If you think this is correct, please comment on the FIXME
> > below, and help me write a nice commit message.
> >
> > If you think this is still wrong or even makes things worse,
> > please help me with a proper fix ;-)
> >
> >
> > Patch is against upstream as of yesterday,
> > but looks like it still applies way back into 2009, 2.6.3x,
> > so if it is correct, it may even qualify for the stable branches?
> >
>
> Now, is there any current user of debugfs that is susceptible for this
> bug?
I'm unsure. Grepping for debugfs_create_u32, I suspect that some of the
crypto or wifi modules may be affected.
> I'm not saying that these issues shouldn't be fixed. But I'm also
> concerned about exploits and other things that just a root user may
> accidentally cause harm.
I'm concerned about monitoring/statistic gathering stuff (running as any
user) may hold some debugfs file open,
and race with removal of dynamic debugfs entries.
> If there's current problem then maybe this
> isn't needed for stable. But should probably be fixed for the future.
I attach my "hello-debugfs" module.
# make -C /lib/modules/`uname -r`/build M=$PWD modules
Test case1:
cd /sys/kernel/debug/hello-debugfs/dir1/dir3/
rmmod
will unload the module, but give up on removing
/sys/kernel/debug/hello-debugfs/dir1
You won't be able to insmod a second time.
Test case 2:
cd /sys/kernel/debug/hello-debugfs/dir1/dir3/
exec 7< file7
rmmod
cat <&7
As described above, depending on I don't know what exactly,
the outcome was either some infinite loop, or cat will oops/panic.
Note that all but rmmod may be done as normal user,
e.g. some monitoring or statistics gathering tool,
and the rmmod is just a placeholder for
"any event that triggers removal of dynamic debugfs entries".
Lars
On Wed, Nov 28, 2012 at 10:37:54AM +0100, Lars Ellenberg wrote:
> Note that all but rmmod may be done as normal user,
Ok, that is not exactly correct,
I forgot about debugfs being mounted 0700 ;-)
Still...
Lars
On Wed, 2012-11-28 at 10:37 +0100, Lars Ellenberg wrote:
> On Tue, Nov 27, 2012 at 10:16:28PM -0500, Steven Rostedt wrote:
> > On Fri, 2012-11-23 at 18:15 +0100, Lars Ellenberg wrote:
> > > When toying around with debugfs, intentionally trying to break things,
> > > I managed to get it into a reproducible endless loop when cleaning up.
> > >
> > > debugfs_remove_recursive() completely ignores that entries found
> > > on ->d_subdirs may already be d_delete()d, but not yet d_kill()ed.
> > >
> > > In this case, the first two entries found have both been such dentries
> > > (d_iname = ".", btw), while later in the list there was still a real,
> > > not yet deleted entry.
> > >
> > > That way, the "goto next_sibling;" did not catch this case,
> > > the "break the infinit loop if we fail to remove something"
> > > did not catch it either.
> > >
> > >
> > > Disclaimer: I'm not a dentries and inodes guy...
> >
> > I'm not a dentries or inodes guy either, so I wont comment on the actual
> > merits of this patch.
>
> Though you submitted the "next_sibling" code back in 2009, right?
> That's why you ended up on Cc.
2009! Egad, I was another person back then ;-)
Yes, I was digging into the bowels of debugfs and it's use of dentries
back then and understood a lot more of it back then too.
> I suspect that even back then, the real problem was
> "already dead but still linked" dentries, as below:
Could be.
>
> > > Still, I think the "is this child a non-empty subdir" check
> > > was just wrong. This is my fix:
> > > - if (list_empty(&child->d_subdirs))
> > > + if (!simple_emty(child))
> >
> > "simple_empty"
>
> Of course. I mistyped both lines in the email body :-/
> In the patch below it was correct:
> - if (!list_empty(&child->d_subdirs)) {
> + if (!simple_empty(child)) {
>
> > > Also, always trying to __debugfs_remove() the first entry found from
> > > parent->d_subdirs.next is wrong, we need to skip over any already
> > > deleted children. I introduced the debugfs_find_next_positive_child()
> > > helper for this.
> > >
> > > If I understand it correctly, if we do it this way, it can never fail.
> > > That is, as long as we can be sure that no new dentries will be created
> > > while we are in debugfs_remove_recursive().
> > > So the
> > > if (debugfs_positive(child)) {
> > > /*
> > > * Avoid infinite loop if we fail to remove
> > > * one dentry.
> > > is probably dead code now?
> > >
> > >
> > > As an additional fix, to prevent an access after free and resulting Oops,
> > > I serialize dereferencing of attr->get/set and attr->data with d_delete(),
> > > using the file->f_dentry->d_parent->d_inode->i_mutex.
> > >
> > > If this file->f_dentry meanwhile has been deleted, simple_attr_read()
> > > and simple_attr_write() now returns -ESTALE. (Any other error code preferences?)
> > >
> > >
> > > With this patch, I was able to
> > > cd /sys/debugfs/test-module/some/dir
> > > exec 7< some-file
> > > rmmod test-module
> > > cat <&7
> >
> > I saw this and thought "hmm, I wonder if the trace_events have issues,
> > as they create debugfs directories and files via modules too". I went
> > and tried to reproduce but couldn't get passed the rmmod, as the module
> > count gets incremented for any open files that the module creates.
>
> Is that so.
>
> > I take it that you didn't add that feature to your test module.
>
> If you use the "debugfs_create_u32" (and similar "_<simple attribute>")
> wrappers, you don't get a chance to do so from the attribute callbacks,
> all callbacks are predefined, you just submit a pointer.
>
> You mean something like this, or am I missing something?
> __module_get();
> debugfs_create_u32();
> And later
> debugfs_remove_recursive();
> module_put();
>
> You still have the race between someone opening some attribute file,
> you removing the attribute and releasing your refcount, then the other
> guy dereferencing that pointer in the read() call.
>
> I think you can only deal with that race,
> if you provide your own file_operations for all your attributes.
>
> I'm about to add some such debugfs entries for our driver, and would
> like to avoid re-implementing all of the simple_attribute fops.
I went through a lot of hassle to get this to work. If you look at
kernel/trace/trace_events.c:trace_create_file_ops() you'll see that I
dynamically create a structure that has the specific file_operations in
it. I then set the "owner" field for each of them to the given module.
I haven't looked at the implementation of the owner field in the file
operations, but it does seem to keep me from removing any module that
created any of these files.
>
> > > without any infinite loops, hangs, oopses or other problems,
> > > and as expected get an ESTALE for the cat.
> > >
> > > Without the patch, I'll get either an infinite loop and rmmod never
> > > terminates, or cat oopses.
> > >
> > >
> > > If you think this is correct, please comment on the FIXME
> > > below, and help me write a nice commit message.
> > >
> > > If you think this is still wrong or even makes things worse,
> > > please help me with a proper fix ;-)
> > >
> > >
> > > Patch is against upstream as of yesterday,
> > > but looks like it still applies way back into 2009, 2.6.3x,
> > > so if it is correct, it may even qualify for the stable branches?
> > >
> >
> > Now, is there any current user of debugfs that is susceptible for this
> > bug?
>
> I'm unsure. Grepping for debugfs_create_u32, I suspect that some of the
> crypto or wifi modules may be affected.
Yeah, looking at some of the drivers here, it does seem that they have
issues.
>
> > I'm not saying that these issues shouldn't be fixed. But I'm also
> > concerned about exploits and other things that just a root user may
> > accidentally cause harm.
>
> I'm concerned about monitoring/statistic gathering stuff (running as any
> user) may hold some debugfs file open,
> and race with removal of dynamic debugfs entries.
I agree.
>
> > If there's current problem then maybe this
> > isn't needed for stable. But should probably be fixed for the future.
>
> I attach my "hello-debugfs" module.
> # make -C /lib/modules/`uname -r`/build M=$PWD modules
>
> Test case1:
> cd /sys/kernel/debug/hello-debugfs/dir1/dir3/
> rmmod
> will unload the module, but give up on removing
> /sys/kernel/debug/hello-debugfs/dir1
> You won't be able to insmod a second time.
>
> Test case 2:
> cd /sys/kernel/debug/hello-debugfs/dir1/dir3/
> exec 7< file7
> rmmod
> cat <&7
> As described above, depending on I don't know what exactly,
> the outcome was either some infinite loop, or cat will oops/panic.
>
> Note that all but rmmod may be done as normal user,
> e.g. some monitoring or statistics gathering tool,
> and the rmmod is just a placeholder for
> "any event that triggers removal of dynamic debugfs entries".
Thanks for the test case. This is something that Greg should look into
(I love volunteering others :-)
-- Steve