If there are some mount points(not exported for nfs) under
a pseudo root, after client's operation of those entry under
the root, anyone can unmount those mount points until nfsd stop.
# cat /etc/exports
/nfs/xfs *(rw,insecure,no_subtree_check,no_root_squash)
/nfs/pnfs *(rw,insecure,no_subtree_check,no_root_squash)
# ll /nfs/
total 0
drwxr-xr-x. 3 root root 84 Apr 21 22:27 pnfs
drwxr-xr-x. 3 root root 84 Apr 21 22:27 test
drwxr-xr-x. 2 root root 6 Apr 20 22:01 xfs
# mount /dev/sde /nfs/test
# df
Filesystem 1K-blocks Used Available Use% Mounted on
......
/dev/sdd 1038336 32944 1005392 4% /nfs/pnfs
/dev/sdc 10475520 32928 10442592 1% /nfs/xfs
/dev/sde 999320 1284 929224 1% /nfs/test
# mount -t nfs 127.0.0.1:/nfs/ /mnt
# ll /mnt/*/
/mnt/pnfs/:
total 0
-rw-r--r--. 1 root root 0 Apr 21 22:23 attr
drwxr-xr-x. 2 root root 6 Apr 21 22:19 tmp
/mnt/xfs/:
total 0
# umount /nfs/test/
umount: /nfs/test/: target is busy
(In some cases useful info about processes that
use the device is found by lsof(8) or fuser(1).)
I don't think that's user expect, they want umount /nfs/test/.
It's caused by exports cache of nfsd holds the reference of
the path (here is /nfs/test/), so, it can't umounted until nfsd stop.
There is one strange thing exist, the export cache status of
/nfs/test/ is CACHE_NEGATIVE, also hold the reference of the path.
I think nfsd should not hold the reference when CACHE_NEGATIVE.
I can't find a better way to put the reference, just path it after
svc_export_update().
Any comments are welcome, thanks.
Signed-off-by: Kinglong Mee <[email protected]>
---
fs/nfsd/export.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c3e3b6e..5595cffc7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -793,9 +793,15 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
int hash = svc_export_hash(old);
ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash);
- if (ch)
- return container_of(ch, struct svc_export, h);
- else
+ if (ch) {
+ struct svc_export *exp = container_of(ch, struct svc_export, h);
+ if (test_bit(CACHE_NEGATIVE, &exp->h.flags)) {
+ path_put(&exp->ex_path);
+ exp->ex_path.mnt = NULL;
+ exp->ex_path.dentry = NULL;
+ }
+ return exp;
+ } else
return NULL;
}
--
2.3.5
On Tue, Apr 21, 2015 at 10:50:31PM +0800, Kinglong Mee wrote:
> If there are some mount points(not exported for nfs) under
> a pseudo root, after client's operation of those entry under
> the root, anyone can unmount those mount points until nfsd stop.
>
> # cat /etc/exports
> /nfs/xfs *(rw,insecure,no_subtree_check,no_root_squash)
> /nfs/pnfs *(rw,insecure,no_subtree_check,no_root_squash)
> # ll /nfs/
> total 0
> drwxr-xr-x. 3 root root 84 Apr 21 22:27 pnfs
> drwxr-xr-x. 3 root root 84 Apr 21 22:27 test
> drwxr-xr-x. 2 root root 6 Apr 20 22:01 xfs
> # mount /dev/sde /nfs/test
> # df
> Filesystem 1K-blocks Used Available Use% Mounted on
> ......
> /dev/sdd 1038336 32944 1005392 4% /nfs/pnfs
> /dev/sdc 10475520 32928 10442592 1% /nfs/xfs
> /dev/sde 999320 1284 929224 1% /nfs/test
> # mount -t nfs 127.0.0.1:/nfs/ /mnt
> # ll /mnt/*/
> /mnt/pnfs/:
> total 0
> -rw-r--r--. 1 root root 0 Apr 21 22:23 attr
> drwxr-xr-x. 2 root root 6 Apr 21 22:19 tmp
>
> /mnt/xfs/:
> total 0
> # umount /nfs/test/
> umount: /nfs/test/: target is busy
> (In some cases useful info about processes that
> use the device is found by lsof(8) or fuser(1).)
>
> I don't think that's user expect, they want umount /nfs/test/.
I agree that this is annoying.
> It's caused by exports cache of nfsd holds the reference of
> the path (here is /nfs/test/), so, it can't umounted until nfsd stop.
A cache flush (exportfs -f) should also do the job, as long as you
then manage to unmount before the cache entry is re-added. (So I
suppose if you wanted to be completely safe:
exportfs -f
killall -STOP rpc.mountd
umount /nfs/test/
killall -CONT rpc.mountd
)
> There is one strange thing exist, the export cache status of
> /nfs/test/ is CACHE_NEGATIVE, also hold the reference of the path.
> I think nfsd should not hold the reference when CACHE_NEGATIVE.
>
> I can't find a better way to put the reference, just path it after
> svc_export_update().
Unfortunately ex_path is part of the key--it's what we need to do
lookups in this cache. I'm surprised something like ls /mnt/ still
works after this.
I'm not sure what to do. I wonder if we really need the struct path as
part of the key, or if we could get away with just a string path?
That's what we're actually passing to userspace, after all.
--b.
>
> Any comments are welcome, thanks.
>
> Signed-off-by: Kinglong Mee <[email protected]>
> ---
> fs/nfsd/export.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
> index c3e3b6e..5595cffc7 100644
> --- a/fs/nfsd/export.c
> +++ b/fs/nfsd/export.c
> @@ -793,9 +793,15 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
> int hash = svc_export_hash(old);
>
> ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash);
> - if (ch)
> - return container_of(ch, struct svc_export, h);
> - else
> + if (ch) {
> + struct svc_export *exp = container_of(ch, struct svc_export, h);
> + if (test_bit(CACHE_NEGATIVE, &exp->h.flags)) {
> + path_put(&exp->ex_path);
> + exp->ex_path.mnt = NULL;
> + exp->ex_path.dentry = NULL;
> + }
> + return exp;
> + } else
> return NULL;
> }
>
> --
> 2.3.5
On Tue, 21 Apr 2015 17:54:17 -0400 "J. Bruce Fields" <[email protected]>
>
> Unfortunately ex_path is part of the key--it's what we need to do
> lookups in this cache. I'm surprised something like ls /mnt/ still
> works after this.
>
> I'm not sure what to do. I wonder if we really need the struct path as
> part of the key, or if we could get away with just a string path?
> That's what we're actually passing to userspace, after all.
I wonder if we can use Al's new 'fs_pin' stuff, so that when the filesystem
is unmounted we get a call-back and can release the filesystem...
NeilBrown
On 4/22/2015 5:54 AM, J. Bruce Fields wrote:
> On Tue, Apr 21, 2015 at 10:50:31PM +0800, Kinglong Mee wrote:
>> If there are some mount points(not exported for nfs) under
>> a pseudo root, after client's operation of those entry under
>> the root, anyone can unmount those mount points until nfsd stop.
>>
>> # cat /etc/exports
>> /nfs/xfs *(rw,insecure,no_subtree_check,no_root_squash)
>> /nfs/pnfs *(rw,insecure,no_subtree_check,no_root_squash)
>> # ll /nfs/
>> total 0
>> drwxr-xr-x. 3 root root 84 Apr 21 22:27 pnfs
>> drwxr-xr-x. 3 root root 84 Apr 21 22:27 test
>> drwxr-xr-x. 2 root root 6 Apr 20 22:01 xfs
>> # mount /dev/sde /nfs/test
>> # df
>> Filesystem 1K-blocks Used Available Use% Mounted on
>> ......
>> /dev/sdd 1038336 32944 1005392 4% /nfs/pnfs
>> /dev/sdc 10475520 32928 10442592 1% /nfs/xfs
>> /dev/sde 999320 1284 929224 1% /nfs/test
>> # mount -t nfs 127.0.0.1:/nfs/ /mnt
>> # ll /mnt/*/
>> /mnt/pnfs/:
>> total 0
>> -rw-r--r--. 1 root root 0 Apr 21 22:23 attr
>> drwxr-xr-x. 2 root root 6 Apr 21 22:19 tmp
>>
>> /mnt/xfs/:
>> total 0
>> # umount /nfs/test/
>> umount: /nfs/test/: target is busy
>> (In some cases useful info about processes that
>> use the device is found by lsof(8) or fuser(1).)
>>
>> I don't think that's user expect, they want umount /nfs/test/.
>
> I agree that this is annoying.
>
>> It's caused by exports cache of nfsd holds the reference of
>> the path (here is /nfs/test/), so, it can't umounted until nfsd stop.
>
> A cache flush (exportfs -f) should also do the job, as long as you
> then manage to unmount before the cache entry is re-added. (So I
> suppose if you wanted to be completely safe:
>
> exportfs -f
> killall -STOP rpc.mountd
> umount /nfs/test/
> killall -CONT rpc.mountd
> )
Yes, that's right.
>
>> There is one strange thing exist, the export cache status of
>> /nfs/test/ is CACHE_NEGATIVE, also hold the reference of the path.
>> I think nfsd should not hold the reference when CACHE_NEGATIVE.
>>
>> I can't find a better way to put the reference, just path it after
>> svc_export_update().
>
> Unfortunately ex_path is part of the key--it's what we need to do
> lookups in this cache. I'm surprised something like ls /mnt/ still
> works after this.
I just do some simplify tests, and can't make sure everything is right.
>
> I'm not sure what to do. I wonder if we really need the struct path as
> part of the key, or if we could get away with just a string path?
> That's what we're actually passing to userspace, after all.
I have do a simplify grep of ex_path, it is used in many places.
I don't think a string path can resolve the problem.
Reference of dentry/mnt is like a cache, avoids re-finding of them,
it is necessary to store them in svc_export.
Neil points out another way of 'fs_pin', I will check that.
thanks,
Kinglong Mee
>
> --b.
>
>>
>> Any comments are welcome, thanks.
>>
>> Signed-off-by: Kinglong Mee <[email protected]>
>> ---
>> fs/nfsd/export.c | 12 +++++++++---
>> 1 file changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
>> index c3e3b6e..5595cffc7 100644
>> --- a/fs/nfsd/export.c
>> +++ b/fs/nfsd/export.c
>> @@ -793,9 +793,15 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
>> int hash = svc_export_hash(old);
>>
>> ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash);
>> - if (ch)
>> - return container_of(ch, struct svc_export, h);
>> - else
>> + if (ch) {
>> + struct svc_export *exp = container_of(ch, struct svc_export, h);
>> + if (test_bit(CACHE_NEGATIVE, &exp->h.flags)) {
>> + path_put(&exp->ex_path);
>> + exp->ex_path.mnt = NULL;
>> + exp->ex_path.dentry = NULL;
>> + }
>> + return exp;
>> + } else
>> return NULL;
>> }
>>
>> --
>> 2.3.5
>
On Wed, Apr 22, 2015 at 07:11:30PM +0800, Kinglong Mee wrote:
> On 4/22/2015 5:54 AM, J. Bruce Fields wrote:
> > On Tue, Apr 21, 2015 at 10:50:31PM +0800, Kinglong Mee wrote:
> >> If there are some mount points(not exported for nfs) under
> >> a pseudo root, after client's operation of those entry under
> >> the root, anyone can unmount those mount points until nfsd stop.
> >>
> >> # cat /etc/exports
> >> /nfs/xfs *(rw,insecure,no_subtree_check,no_root_squash)
> >> /nfs/pnfs *(rw,insecure,no_subtree_check,no_root_squash)
> >> # ll /nfs/
> >> total 0
> >> drwxr-xr-x. 3 root root 84 Apr 21 22:27 pnfs
> >> drwxr-xr-x. 3 root root 84 Apr 21 22:27 test
> >> drwxr-xr-x. 2 root root 6 Apr 20 22:01 xfs
> >> # mount /dev/sde /nfs/test
> >> # df
> >> Filesystem 1K-blocks Used Available Use% Mounted on
> >> ......
> >> /dev/sdd 1038336 32944 1005392 4% /nfs/pnfs
> >> /dev/sdc 10475520 32928 10442592 1% /nfs/xfs
> >> /dev/sde 999320 1284 929224 1% /nfs/test
> >> # mount -t nfs 127.0.0.1:/nfs/ /mnt
> >> # ll /mnt/*/
> >> /mnt/pnfs/:
> >> total 0
> >> -rw-r--r--. 1 root root 0 Apr 21 22:23 attr
> >> drwxr-xr-x. 2 root root 6 Apr 21 22:19 tmp
> >>
> >> /mnt/xfs/:
> >> total 0
> >> # umount /nfs/test/
> >> umount: /nfs/test/: target is busy
> >> (In some cases useful info about processes that
> >> use the device is found by lsof(8) or fuser(1).)
> >>
> >> I don't think that's user expect, they want umount /nfs/test/.
> >
> > I agree that this is annoying.
> >
> >> It's caused by exports cache of nfsd holds the reference of
> >> the path (here is /nfs/test/), so, it can't umounted until nfsd stop.
> >
> > A cache flush (exportfs -f) should also do the job, as long as you
> > then manage to unmount before the cache entry is re-added. (So I
> > suppose if you wanted to be completely safe:
> >
> > exportfs -f
> > killall -STOP rpc.mountd
> > umount /nfs/test/
> > killall -CONT rpc.mountd
> > )
>
> Yes, that's right.
>
> >
> >> There is one strange thing exist, the export cache status of
> >> /nfs/test/ is CACHE_NEGATIVE, also hold the reference of the path.
> >> I think nfsd should not hold the reference when CACHE_NEGATIVE.
> >>
> >> I can't find a better way to put the reference, just path it after
> >> svc_export_update().
> >
> > Unfortunately ex_path is part of the key--it's what we need to do
> > lookups in this cache. I'm surprised something like ls /mnt/ still
> > works after this.
>
> I just do some simplify tests, and can't make sure everything is right.
>
> >
> > I'm not sure what to do. I wonder if we really need the struct path as
> > part of the key, or if we could get away with just a string path?
> > That's what we're actually passing to userspace, after all.
>
> I have do a simplify grep of ex_path, it is used in many places.
> I don't think a string path can resolve the problem.
I was thinking we could still keep ex_path, but as part of the "value"
rather than part of the "key"? I'm not sure if that works.
> Reference of dentry/mnt is like a cache, avoids re-finding of them,
> it is necessary to store them in svc_export.
>
> Neil points out another way of 'fs_pin', I will check that.
Yes, that'd be interesting. On a very quick look--I don't understand
what it's meant to do at all. But if it does provide a way to get a
callback on umount, that'd certainly be interesting....
--b.
>
> thanks,
> Kinglong Mee
>
> >
> > --b.
> >
> >>
> >> Any comments are welcome, thanks.
> >>
> >> Signed-off-by: Kinglong Mee <[email protected]>
> >> ---
> >> fs/nfsd/export.c | 12 +++++++++---
> >> 1 file changed, 9 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
> >> index c3e3b6e..5595cffc7 100644
> >> --- a/fs/nfsd/export.c
> >> +++ b/fs/nfsd/export.c
> >> @@ -793,9 +793,15 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
> >> int hash = svc_export_hash(old);
> >>
> >> ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash);
> >> - if (ch)
> >> - return container_of(ch, struct svc_export, h);
> >> - else
> >> + if (ch) {
> >> + struct svc_export *exp = container_of(ch, struct svc_export, h);
> >> + if (test_bit(CACHE_NEGATIVE, &exp->h.flags)) {
> >> + path_put(&exp->ex_path);
> >> + exp->ex_path.mnt = NULL;
> >> + exp->ex_path.dentry = NULL;
> >> + }
> >> + return exp;
> >> + } else
> >> return NULL;
> >> }
> >>
> >> --
> >> 2.3.5
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 22 Apr 2015 11:07:03 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> > Reference of dentry/mnt is like a cache, avoids re-finding of them,
> > it is necessary to store them in svc_export.
> >
> > Neil points out another way of 'fs_pin', I will check that.
>
> Yes, that'd be interesting. On a very quick look--I don't understand
> what it's meant to do at all. But if it does provide a way to get a
> callback on umount, that'd certainly be interesting....
Yeah, on a quick look it isn't really obvious at all.
But I didn't read the code. I read
https://lwn.net/Articles/636730/
which says:
In its place is a mechanism by which an object can be added to a vfsmount
structure (which represents a mounted filesystem); that object supports
only one method: kill(). These "pin" objects hang around until the final
reference to the vfsmount goes away, at which point each one's kill() function
is called. It thus is a clean mechanism for performing cleanup when a filesystem
goes away.
This is used to close "BSD process accounting" files on umount, and sound
like the perfect interface for flushing things from the sunrpc cache on
umount.
NeilBrown
On 4/23/2015 7:44 AM, NeilBrown wrote:
> On Wed, 22 Apr 2015 11:07:03 -0400 "J. Bruce Fields" <[email protected]>
> wrote:
>>> Reference of dentry/mnt is like a cache, avoids re-finding of them,
>>> it is necessary to store them in svc_export.
>>>
>>> Neil points out another way of 'fs_pin', I will check that.
>>
>> Yes, that'd be interesting. On a very quick look--I don't understand
>> what it's meant to do at all. But if it does provide a way to get a
>> callback on umount, that'd certainly be interesting....
>
> Yeah, on a quick look it isn't really obvious at all.
>
> But I didn't read the code. I read
> https://lwn.net/Articles/636730/
>
> which says:
>
> In its place is a mechanism by which an object can be added to a vfsmount
> structure (which represents a mounted filesystem); that object supports
> only one method: kill(). These "pin" objects hang around until the final
> reference to the vfsmount goes away, at which point each one's kill() function
> is called. It thus is a clean mechanism for performing cleanup when a filesystem
> goes away.
>
> This is used to close "BSD process accounting" files on umount, and sound
> like the perfect interface for flushing things from the sunrpc cache on
> umount.
I have review those codes in fs/fs_pin.c and kernel/acct.c.
'fs_pin' is used to fix the race between file->f_path.mnt for acct
and umount, file->f_path.mnt just holds the reference of vfsmount
but not keep busy, umount will check the reference and return busy.
The logical is same as sunrpc cache for exports.
Before commit 3064c3563b ("death to mnt_pinned"), acct uses a hack
method, acct get a pin to vfsmount and then put the reference,
so umount finds no other reference and callback those pins in vfsmount,
at last umount success.
After that commit, besides pin to original vfsmount and put the reference
of the original vfsmount, acct clone the vfsmount storing in file->f_path.mnt.
The different between those two methods is the value of file->f_path.mnt,
the first one, it contains the original vfsmnt without reference,
the second one, contains a cloned vfsmnt with reference.
I have test the first method, pins to vfsmount for all exports cache,
nfsd can work but sometimes will delay or hang at rpc.mountd's calling
of name_to_handle_at, I can't find the reason.
Also test the second method, there are many problem caused by the cloned
vfsmount, mnt_clone_internal() change mnt_root values to the path dentry.
Those cases are tested for all exports cache, but, I think nfsd should
put the reference of vfsmount when finding exports upcall fail.
The success exports cache should also take it.
The following is a draft of the first method.
thanks,
Kinglong Mee
----------------------------------------------------------------------
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 611b540..553e8b1 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -17,6 +17,7 @@ void pin_remove(struct fs_pin *pin)
wake_up_locked(&pin->wait);
spin_unlock_irq(&pin->wait.lock);
}
+EXPORT_SYMBOL(pin_remove);
void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
{
@@ -26,11 +27,13 @@ void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head
hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
spin_unlock(&pin_lock);
}
+EXPORT_SYMBOL(pin_insert_group);
void pin_insert(struct fs_pin *pin, struct vfsmount *m)
{
pin_insert_group(pin, m, &m->mnt_sb->s_pins);
}
+EXPORT_SYMBOL(pin_insert);
void pin_kill(struct fs_pin *p)
{
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c3e3b6e..3e3df8c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -309,7 +309,7 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
static void svc_export_put(struct kref *ref)
{
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
- path_put(&exp->ex_path);
+ pin_remove(&exp->ex_pin);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp->ex_uuid);
@@ -695,15 +695,26 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b)
orig->ex_path.mnt == new->ex_path.mnt;
}
+static void export_pin_kill(struct fs_pin *pin)
+{
+ struct svc_export *exp = container_of(pin, struct svc_export, ex_pin);
+
+ write_lock(&exp->cd->hash_lock);
+ cache_mark_expired(&exp->h);
+ pin_remove(pin);
+ write_unlock(&exp->cd->hash_lock);
+}
+
static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
{
struct svc_export *new = container_of(cnew, struct svc_export, h);
struct svc_export *item = container_of(citem, struct svc_export, h);
+ init_fs_pin(&new->ex_pin, export_pin_kill);
kref_get(&item->ex_client->ref);
new->ex_client = item->ex_client;
new->ex_path = item->ex_path;
- path_get(&item->ex_path);
+ pin_insert_group(&new->ex_pin, new->ex_path.mnt, NULL);
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 1f52bfc..718a27e 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -4,6 +4,7 @@
#ifndef NFSD_EXPORT_H
#define NFSD_EXPORT_H
+#include <linux/fs_pin.h>
#include <linux/sunrpc/cache.h>
#include <uapi/linux/nfsd/export.h>
@@ -49,6 +50,7 @@ struct svc_export {
struct auth_domain * ex_client;
int ex_flags;
struct path ex_path;
+ struct fs_pin ex_pin;
kuid_t ex_anon_uid;
kgid_t ex_anon_gid;
int ex_fsid;
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 437ddb6..6936684 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -206,6 +206,11 @@ static inline int cache_is_expired(struct cache_detail *detail, struct cache_hea
(detail->flush_time > h->last_refresh);
}
+static inline void cache_mark_expired(struct cache_head *h)
+{
+ h->expiry_time = seconds_since_boot() - 1;
+}
+
extern int cache_check(struct cache_detail *detail,
struct cache_head *h, struct cache_req *rqstp);
extern void cache_flush(void);
On Thu, 23 Apr 2015 20:52:40 +0800 Kinglong Mee <[email protected]> wrote:
> On 4/23/2015 7:44 AM, NeilBrown wrote:
> > On Wed, 22 Apr 2015 11:07:03 -0400 "J. Bruce Fields" <[email protected]>
> > wrote:
> >>> Reference of dentry/mnt is like a cache, avoids re-finding of them,
> >>> it is necessary to store them in svc_export.
> >>>
> >>> Neil points out another way of 'fs_pin', I will check that.
> >>
> >> Yes, that'd be interesting. On a very quick look--I don't understand
> >> what it's meant to do at all. But if it does provide a way to get a
> >> callback on umount, that'd certainly be interesting....
> >
> > Yeah, on a quick look it isn't really obvious at all.
> >
> > But I didn't read the code. I read
> > https://lwn.net/Articles/636730/
> >
> > which says:
> >
> > In its place is a mechanism by which an object can be added to a vfsmount
> > structure (which represents a mounted filesystem); that object supports
> > only one method: kill(). These "pin" objects hang around until the final
> > reference to the vfsmount goes away, at which point each one's kill() function
> > is called. It thus is a clean mechanism for performing cleanup when a filesystem
> > goes away.
> >
> > This is used to close "BSD process accounting" files on umount, and sound
> > like the perfect interface for flushing things from the sunrpc cache on
> > umount.
>
> I have review those codes in fs/fs_pin.c and kernel/acct.c.
> 'fs_pin' is used to fix the race between file->f_path.mnt for acct
> and umount, file->f_path.mnt just holds the reference of vfsmount
> but not keep busy, umount will check the reference and return busy.
>
> The logical is same as sunrpc cache for exports.
>
> Before commit 3064c3563b ("death to mnt_pinned"), acct uses a hack
> method, acct get a pin to vfsmount and then put the reference,
> so umount finds no other reference and callback those pins in vfsmount,
> at last umount success.
>
> After that commit, besides pin to original vfsmount and put the reference
> of the original vfsmount, acct clone the vfsmount storing in file->f_path.mnt.
>
> The different between those two methods is the value of file->f_path.mnt,
> the first one, it contains the original vfsmnt without reference,
> the second one, contains a cloned vfsmnt with reference.
>
> I have test the first method, pins to vfsmount for all exports cache,
> nfsd can work but sometimes will delay or hang at rpc.mountd's calling
> of name_to_handle_at, I can't find the reason.
A kernel stack trace of exactly where it is hanging would help a lot.
>
> Also test the second method, there are many problem caused by the cloned
> vfsmount, mnt_clone_internal() change mnt_root values to the path dentry.
>
> Those cases are tested for all exports cache, but, I think nfsd should
> put the reference of vfsmount when finding exports upcall fail.
> The success exports cache should also take it.
>
> The following is a draft of the first method.
I think the first method sounds a lot better than the second.
>
> thanks,
> Kinglong Mee
>
> ----------------------------------------------------------------------
> diff --git a/fs/fs_pin.c b/fs/fs_pin.c
> index 611b540..553e8b1 100644
> --- a/fs/fs_pin.c
> +++ b/fs/fs_pin.c
> @@ -17,6 +17,7 @@ void pin_remove(struct fs_pin *pin)
> wake_up_locked(&pin->wait);
> spin_unlock_irq(&pin->wait.lock);
> }
> +EXPORT_SYMBOL(pin_remove);
>
> void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
> {
> @@ -26,11 +27,13 @@ void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head
> hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
> spin_unlock(&pin_lock);
> }
> +EXPORT_SYMBOL(pin_insert_group);
>
> void pin_insert(struct fs_pin *pin, struct vfsmount *m)
> {
> pin_insert_group(pin, m, &m->mnt_sb->s_pins);
> }
> +EXPORT_SYMBOL(pin_insert);
>
> void pin_kill(struct fs_pin *p)
> {
> diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
> index c3e3b6e..3e3df8c 100644
> --- a/fs/nfsd/export.c
> +++ b/fs/nfsd/export.c
> @@ -309,7 +309,7 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
> static void svc_export_put(struct kref *ref)
> {
> struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
> - path_put(&exp->ex_path);
> + pin_remove(&exp->ex_pin);
> auth_domain_put(exp->ex_client);
> nfsd4_fslocs_free(&exp->ex_fslocs);
> kfree(exp->ex_uuid);
> @@ -695,15 +695,26 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b)
> orig->ex_path.mnt == new->ex_path.mnt;
> }
>
> +static void export_pin_kill(struct fs_pin *pin)
> +{
> + struct svc_export *exp = container_of(pin, struct svc_export, ex_pin);
> +
> + write_lock(&exp->cd->hash_lock);
> + cache_mark_expired(&exp->h);
> + pin_remove(pin);
> + write_unlock(&exp->cd->hash_lock);
> +}
I think you need to add some sort of delay here. The svc_export entry might
still be in active use by an nfsd thread and you need to wait for that to
complete.
I think you need to wait for the refcnt to drop to '1'. Maybe create a
global wait_queue to wait for that.
Actually... svc_expkey contains a reference to a 'path' too, so you need to
use pinning to purge those when the filesystem is unmounted.
Oh.. and you don't want to call 'pin_remove' from export_pin_kill(). It is
called from svc_export_put() and calling from both could be problematic.
Hmmm... Ahhhh.
If you get export_pin_kill to only call cache_mark_expired() (maybe move the
locking into that function) and *not* call pin_remove(), then the pin will
remain on the list and ->done will be -1.
So mnt_pin_kill will loop around and this time pin_kill() will wait for
p->done to be set.
So we really need that thing to be removed from cache promptly. I don't
think we can wait for the next time cache_clean will be run. We could
call cache_flush, but I think I'd rather remove just that one entry.
Also.... this requires that the pin (and the struct containing it) be freed
using RCU.
So:
- add an rcu_head to svc_export and svc_expkey
- use rcu_kfree to free both those objects
- write a 'cache_force_expire()' which sets the expiry time, and then
removes the entry from the cache.
- Use pin_insert_group rather then mntget for both svc_export and svc_expkey
- have the 'kill' functions for both call cache_force_expire(), but *not*
pin_remove
> +
> static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
> {
> struct svc_export *new = container_of(cnew, struct svc_export, h);
> struct svc_export *item = container_of(citem, struct svc_export, h);
>
> + init_fs_pin(&new->ex_pin, export_pin_kill);
> kref_get(&item->ex_client->ref);
> new->ex_client = item->ex_client;
> new->ex_path = item->ex_path;
> - path_get(&item->ex_path);
> + pin_insert_group(&new->ex_pin, new->ex_path.mnt, NULL);
This doesn't look right. path_get does a 'mntget()' and a 'dget()'.
You are replacing 'mntget()' with 'pin_insert_group()', but I think you still
need the dget().
Similarly you need a dput() up where you removed path_put().
Thanks for working on this!
NeilBrown
> new->ex_fslocs.locations = NULL;
> new->ex_fslocs.locations_count = 0;
> new->ex_fslocs.migrated = 0;
> diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
> index 1f52bfc..718a27e 100644
> --- a/fs/nfsd/export.h
> +++ b/fs/nfsd/export.h
> @@ -4,6 +4,7 @@
> #ifndef NFSD_EXPORT_H
> #define NFSD_EXPORT_H
>
> +#include <linux/fs_pin.h>
> #include <linux/sunrpc/cache.h>
> #include <uapi/linux/nfsd/export.h>
>
> @@ -49,6 +50,7 @@ struct svc_export {
> struct auth_domain * ex_client;
> int ex_flags;
> struct path ex_path;
> + struct fs_pin ex_pin;
> kuid_t ex_anon_uid;
> kgid_t ex_anon_gid;
> int ex_fsid;
> diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
> index 437ddb6..6936684 100644
> --- a/include/linux/sunrpc/cache.h
> +++ b/include/linux/sunrpc/cache.h
> @@ -206,6 +206,11 @@ static inline int cache_is_expired(struct cache_detail *detail, struct cache_hea
> (detail->flush_time > h->last_refresh);
> }
>
> +static inline void cache_mark_expired(struct cache_head *h)
> +{
> + h->expiry_time = seconds_since_boot() - 1;
> +}
> +
> extern int cache_check(struct cache_detail *detail,
> struct cache_head *h, struct cache_req *rqstp);
> extern void cache_flush(void);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/24/2015 11:00 AM, NeilBrown wrote:
> On Thu, 23 Apr 2015 20:52:40 +0800 Kinglong Mee <[email protected]> wrote:
>
>> On 4/23/2015 7:44 AM, NeilBrown wrote:
>>> On Wed, 22 Apr 2015 11:07:03 -0400 "J. Bruce Fields" <[email protected]>
>>> wrote:
>>>>> Reference of dentry/mnt is like a cache, avoids re-finding of them,
>>>>> it is necessary to store them in svc_export.
>>>>>
>>>>> Neil points out another way of 'fs_pin', I will check that.
>>>>
>>>> Yes, that'd be interesting. On a very quick look--I don't understand
>>>> what it's meant to do at all. But if it does provide a way to get a
>>>> callback on umount, that'd certainly be interesting....
>>>
>>> Yeah, on a quick look it isn't really obvious at all.
>>>
>>> But I didn't read the code. I read
>>> https://lwn.net/Articles/636730/
>>>
>>> which says:
>>>
>>> In its place is a mechanism by which an object can be added to a vfsmount
>>> structure (which represents a mounted filesystem); that object supports
>>> only one method: kill(). These "pin" objects hang around until the final
>>> reference to the vfsmount goes away, at which point each one's kill() function
>>> is called. It thus is a clean mechanism for performing cleanup when a filesystem
>>> goes away.
>>>
>>> This is used to close "BSD process accounting" files on umount, and sound
>>> like the perfect interface for flushing things from the sunrpc cache on
>>> umount.
>>
>> I have review those codes in fs/fs_pin.c and kernel/acct.c.
>> 'fs_pin' is used to fix the race between file->f_path.mnt for acct
>> and umount, file->f_path.mnt just holds the reference of vfsmount
>> but not keep busy, umount will check the reference and return busy.
>>
>> The logical is same as sunrpc cache for exports.
>>
>> Before commit 3064c3563b ("death to mnt_pinned"), acct uses a hack
>> method, acct get a pin to vfsmount and then put the reference,
>> so umount finds no other reference and callback those pins in vfsmount,
>> at last umount success.
>>
>> After that commit, besides pin to original vfsmount and put the reference
>> of the original vfsmount, acct clone the vfsmount storing in file->f_path.mnt.
>>
>> The different between those two methods is the value of file->f_path.mnt,
>> the first one, it contains the original vfsmnt without reference,
>> the second one, contains a cloned vfsmnt with reference.
>>
>> I have test the first method, pins to vfsmount for all exports cache,
>> nfsd can work but sometimes will delay or hang at rpc.mountd's calling
>> of name_to_handle_at, I can't find the reason.
>
> A kernel stack trace of exactly where it is hanging would help a lot.
Witch adding fs_pin to exports, it seems there is a mutex race between
nfsd and rpc.mountd for locking parent inode.
Apr 27 19:57:03 ntest kernel: rpc.mountd D ffff88006ac5fc28 0 1152 1 0x00000000
Apr 27 19:57:03 ntest kernel: ffff88006ac5fc28 ffff880068c38970 ffff880068c3de60 ffff88006ac5fc08
Apr 27 19:57:03 ntest kernel: ffff88006ac60000 ffff880035ecf694 ffff880068c3de60 00000000ffffffff
Apr 27 19:57:03 ntest kernel: ffff880035ecf698 ffff88006ac5fc48 ffffffff8177ffd7 0000000000000000
Apr 27 19:57:03 ntest kernel: Call Trace:
Apr 27 19:57:03 ntest kernel: [<ffffffff8177ffd7>] schedule+0x37/0x90
Apr 27 19:57:03 ntest kernel: [<ffffffff8178031e>] schedule_preempt_disabled+0xe/0x10
Apr 27 19:57:03 ntest kernel: [<ffffffff81781e92>] __mutex_lock_slowpath+0xb2/0x120
Apr 27 19:57:03 ntest kernel: [<ffffffff81781f23>] mutex_lock+0x23/0x40
Apr 27 19:57:03 ntest kernel: [<ffffffff81230714>] lookup_slow+0x34/0xc0
Apr 27 19:57:03 ntest kernel: [<ffffffff812358ee>] path_lookupat+0x89e/0xc60
Apr 27 19:57:03 ntest kernel: [<ffffffff81205192>] ? kmem_cache_alloc+0x1e2/0x260
Apr 27 19:57:03 ntest kernel: [<ffffffff812367a6>] ? getname_flags+0x56/0x200
Apr 27 19:57:03 ntest kernel: [<ffffffff81235cd7>] filename_lookup+0x27/0xc0
Apr 27 19:57:03 ntest kernel: [<ffffffff81237b53>] user_path_at_empty+0x63/0xd0
Apr 27 19:57:03 ntest kernel: [<ffffffff8121d722>] ? put_object+0x32/0x60
Apr 27 19:57:03 ntest kernel: [<ffffffff8121d94d>] ? delete_object_full+0x2d/0x40
Apr 27 19:57:03 ntest kernel: [<ffffffff8123e273>] ? dput+0x33/0x230
Apr 27 19:57:03 ntest kernel: [<ffffffff81237bd1>] user_path_at+0x11/0x20
Apr 27 19:57:03 ntest kernel: [<ffffffff81286489>] SyS_name_to_handle_at+0x59/0x200
Apr 27 19:57:03 ntest kernel: [<ffffffff81783f6e>] system_call_fastpath+0x12/0x71
Apr 27 19:57:03 ntest kernel: nfsd S ffff88006e92b708 0 1170 2 0x00000000
Apr 27 19:57:03 ntest kernel: ffff88006e92b708 ffffffff81c12480 ffff88006acbcb80 ffff88006e92b758
Apr 27 19:57:03 ntest kernel: ffff88006e92c000 ffffffff82290040 ffff88006e92b758 ffffffff82290040
Apr 27 19:57:03 ntest kernel: 00000000fffffff5 ffff88006e92b728 ffffffff8177ffd7 0000000100043a09
Apr 27 19:57:03 ntest kernel: Call Trace:
Apr 27 19:57:03 ntest kernel: [<ffffffff8177ffd7>] schedule+0x37/0x90
Apr 27 19:57:03 ntest kernel: [<ffffffff81782e97>] schedule_timeout+0x117/0x230
Apr 27 19:57:03 ntest kernel: [<ffffffff8110b240>] ? internal_add_timer+0xb0/0xb0
Apr 27 19:57:03 ntest kernel: [<ffffffff81780ff3>] wait_for_completion_interruptible_timeout+0xf3/0x150
Apr 27 19:57:03 ntest kernel: [<ffffffff810cb090>] ? wake_up_state+0x20/0x20
Apr 27 19:57:03 ntest kernel: [<ffffffffa01350cc>] cache_wait_req.isra.10+0x9c/0x110 [sunrpc]
Apr 27 19:57:03 ntest kernel: [<ffffffffa0134760>] ? sunrpc_init_cache_detail+0xc0/0xc0 [sunrpc]
Apr 27 19:57:03 ntest kernel: [<ffffffffa0135c54>] cache_check+0x1d4/0x380 [sunrpc]
Apr 27 19:57:03 ntest kernel: [<ffffffff8133248c>] ? inode_doinit_with_dentry+0x48c/0x6a0
Apr 27 19:57:03 ntest kernel: [<ffffffffa045e132>] exp_get_by_name+0x82/0xb0 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffff81776d0a>] ? kmemleak_free+0x3a/0xa0
Apr 27 19:57:03 ntest kernel: [<ffffffff81332210>] ? inode_doinit_with_dentry+0x210/0x6a0
Apr 27 19:57:03 ntest kernel: [<ffffffff813326bc>] ? selinux_d_instantiate+0x1c/0x20
Apr 27 19:57:03 ntest kernel: [<ffffffff8123def7>] ? _d_rehash+0x37/0x40
Apr 27 19:57:03 ntest kernel: [<ffffffff8123f776>] ? d_splice_alias+0xa6/0x2d0
Apr 27 19:57:03 ntest kernel: [<ffffffff812be90b>] ? ext4_lookup+0xdb/0x160
Apr 27 19:57:03 ntest kernel: [<ffffffffa0460114>] rqst_exp_get_by_name+0x64/0x140 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa045a8f6>] nfsd_cross_mnt+0x76/0x1b0 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa0475300>] nfsd4_encode_dirent+0x160/0x3d0 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa04751a0>] ? nfsd4_encode_getattr+0x40/0x40 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa045c631>] nfsd_readdir+0x1c1/0x2a0 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa045a1e0>] ? nfsd_direct_splice_actor+0x20/0x20 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa046bd70>] nfsd4_encode_readdir+0x120/0x220 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa047573d>] nfsd4_encode_operation+0x7d/0x190 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa046a51d>] nfsd4_proc_compound+0x24d/0x6f0 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa0455f83>] nfsd_dispatch+0xc3/0x220 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa012a01b>] svc_process_common+0x43b/0x690 [sunrpc]
Apr 27 19:57:03 ntest kernel: [<ffffffffa012b133>] svc_process+0x103/0x1b0 [sunrpc]
Apr 27 19:57:03 ntest kernel: [<ffffffffa045596f>] nfsd+0xff/0x170 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffffa0455870>] ? nfsd_destroy+0x80/0x80 [nfsd]
Apr 27 19:57:03 ntest kernel: [<ffffffff810bd378>] kthread+0xd8/0xf0
Apr 27 19:57:03 ntest kernel: [<ffffffff810bd2a0>] ? kthread_worker_fn+0x180/0x180
Apr 27 19:57:03 ntest kernel: [<ffffffff81784362>] ret_from_fork+0x42/0x70
Apr 27 19:57:03 ntest kernel: [<ffffffff810bd2a0>] ? kthread_worker_fn+0x180/0x180
>>
>> Also test the second method, there are many problem caused by the cloned
>> vfsmount, mnt_clone_internal() change mnt_root values to the path dentry.
>>
>> Those cases are tested for all exports cache, but, I think nfsd should
>> put the reference of vfsmount when finding exports upcall fail.
>> The success exports cache should also take it.
>>
>> The following is a draft of the first method.
>
> I think the first method sounds a lot better than the second.
>
>>
>> thanks,
>> Kinglong Mee
>>
>> ----------------------------------------------------------------------
>> diff --git a/fs/fs_pin.c b/fs/fs_pin.c
>> index 611b540..553e8b1 100644
>> --- a/fs/fs_pin.c
>> +++ b/fs/fs_pin.c
>> @@ -17,6 +17,7 @@ void pin_remove(struct fs_pin *pin)
>> wake_up_locked(&pin->wait);
>> spin_unlock_irq(&pin->wait.lock);
>> }
>> +EXPORT_SYMBOL(pin_remove);
>>
>> void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
>> {
>> @@ -26,11 +27,13 @@ void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head
>> hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
>> spin_unlock(&pin_lock);
>> }
>> +EXPORT_SYMBOL(pin_insert_group);
>>
>> void pin_insert(struct fs_pin *pin, struct vfsmount *m)
>> {
>> pin_insert_group(pin, m, &m->mnt_sb->s_pins);
>> }
>> +EXPORT_SYMBOL(pin_insert);
>>
>> void pin_kill(struct fs_pin *p)
>> {
>> diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
>> index c3e3b6e..3e3df8c 100644
>> --- a/fs/nfsd/export.c
>> +++ b/fs/nfsd/export.c
>> @@ -309,7 +309,7 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
>> static void svc_export_put(struct kref *ref)
>> {
>> struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
>> - path_put(&exp->ex_path);
>> + pin_remove(&exp->ex_pin);
>> auth_domain_put(exp->ex_client);
>> nfsd4_fslocs_free(&exp->ex_fslocs);
>> kfree(exp->ex_uuid);
>> @@ -695,15 +695,26 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b)
>> orig->ex_path.mnt == new->ex_path.mnt;
>> }
>>
>> +static void export_pin_kill(struct fs_pin *pin)
>> +{
>> + struct svc_export *exp = container_of(pin, struct svc_export, ex_pin);
>> +
>> + write_lock(&exp->cd->hash_lock);
>> + cache_mark_expired(&exp->h);
>> + pin_remove(pin);
>> + write_unlock(&exp->cd->hash_lock);
>> +}
>
> I think you need to add some sort of delay here. The svc_export entry might
> still be in active use by an nfsd thread and you need to wait for that to
> complete.
>
> I think you need to wait for the refcnt to drop to '1'. Maybe create a
> global wait_queue to wait for that.
>
> Actually... svc_expkey contains a reference to a 'path' too, so you need to
> use pinning to purge those when the filesystem is unmounted.
>
> Oh.. and you don't want to call 'pin_remove' from export_pin_kill(). It is
> called from svc_export_put() and calling from both could be problematic.
>
> Hmmm... Ahhhh.
>
> If you get export_pin_kill to only call cache_mark_expired() (maybe move the
> locking into that function) and *not* call pin_remove(), then the pin will
> remain on the list and ->done will be -1.
> So mnt_pin_kill will loop around and this time pin_kill() will wait for
> p->done to be set.
> So we really need that thing to be removed from cache promptly. I don't
> think we can wait for the next time cache_clean will be run. We could
> call cache_flush, but I think I'd rather remove just that one entry.
>
> Also.... this requires that the pin (and the struct containing it) be freed
> using RCU.
>
> So:
> - add an rcu_head to svc_export and svc_expkey
> - use rcu_kfree to free both those objects
> - write a 'cache_force_expire()' which sets the expiry time, and then
> removes the entry from the cache.
> - Use pin_insert_group rather then mntget for both svc_export and svc_expkey
> - have the 'kill' functions for both call cache_force_expire(), but *not*
> pin_remove
Thanks very much for your comment in great detail.
I will send a patch after fix the above race and basic tests.
thanks,
Kinglong Mee
>
>> +
>> static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
>> {
>> struct svc_export *new = container_of(cnew, struct svc_export, h);
>> struct svc_export *item = container_of(citem, struct svc_export, h);
>>
>> + init_fs_pin(&new->ex_pin, export_pin_kill);
>> kref_get(&item->ex_client->ref);
>> new->ex_client = item->ex_client;
>> new->ex_path = item->ex_path;
>> - path_get(&item->ex_path);
>> + pin_insert_group(&new->ex_pin, new->ex_path.mnt, NULL);
>
> This doesn't look right. path_get does a 'mntget()' and a 'dget()'.
> You are replacing 'mntget()' with 'pin_insert_group()', but I think you still
> need the dget().
>
> Similarly you need a dput() up where you removed path_put().
>
>
> Thanks for working on this!
>
> NeilBrown
>
>
>
>> new->ex_fslocs.locations = NULL;
>> new->ex_fslocs.locations_count = 0;
>> new->ex_fslocs.migrated = 0;
>> diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
>> index 1f52bfc..718a27e 100644
>> --- a/fs/nfsd/export.h
>> +++ b/fs/nfsd/export.h
>> @@ -4,6 +4,7 @@
>> #ifndef NFSD_EXPORT_H
>> #define NFSD_EXPORT_H
>>
>> +#include <linux/fs_pin.h>
>> #include <linux/sunrpc/cache.h>
>> #include <uapi/linux/nfsd/export.h>
>>
>> @@ -49,6 +50,7 @@ struct svc_export {
>> struct auth_domain * ex_client;
>> int ex_flags;
>> struct path ex_path;
>> + struct fs_pin ex_pin;
>> kuid_t ex_anon_uid;
>> kgid_t ex_anon_gid;
>> int ex_fsid;
>> diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
>> index 437ddb6..6936684 100644
>> --- a/include/linux/sunrpc/cache.h
>> +++ b/include/linux/sunrpc/cache.h
>> @@ -206,6 +206,11 @@ static inline int cache_is_expired(struct cache_detail *detail, struct cache_hea
>> (detail->flush_time > h->last_refresh);
>> }
>>
>> +static inline void cache_mark_expired(struct cache_head *h)
>> +{
>> + h->expiry_time = seconds_since_boot() - 1;
>> +}
>> +
>> extern int cache_check(struct cache_detail *detail,
>> struct cache_head *h, struct cache_req *rqstp);
>> extern void cache_flush(void);
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
On Mon, 27 Apr 2015 20:11:48 +0800 Kinglong Mee <[email protected]> wrote:
> On 4/24/2015 11:00 AM, NeilBrown wrote:
> > On Thu, 23 Apr 2015 20:52:40 +0800 Kinglong Mee <[email protected]> wrote:
> >
> >> On 4/23/2015 7:44 AM, NeilBrown wrote:
> >>> On Wed, 22 Apr 2015 11:07:03 -0400 "J. Bruce Fields" <[email protected]>
> >>> wrote:
> >>>>> Reference of dentry/mnt is like a cache, avoids re-finding of them,
> >>>>> it is necessary to store them in svc_export.
> >>>>>
> >>>>> Neil points out another way of 'fs_pin', I will check that.
> >>>>
> >>>> Yes, that'd be interesting. On a very quick look--I don't understand
> >>>> what it's meant to do at all. But if it does provide a way to get a
> >>>> callback on umount, that'd certainly be interesting....
> >>>
> >>> Yeah, on a quick look it isn't really obvious at all.
> >>>
> >>> But I didn't read the code. I read
> >>> https://lwn.net/Articles/636730/
> >>>
> >>> which says:
> >>>
> >>> In its place is a mechanism by which an object can be added to a vfsmount
> >>> structure (which represents a mounted filesystem); that object supports
> >>> only one method: kill(). These "pin" objects hang around until the final
> >>> reference to the vfsmount goes away, at which point each one's kill() function
> >>> is called. It thus is a clean mechanism for performing cleanup when a filesystem
> >>> goes away.
> >>>
> >>> This is used to close "BSD process accounting" files on umount, and sound
> >>> like the perfect interface for flushing things from the sunrpc cache on
> >>> umount.
> >>
> >> I have review those codes in fs/fs_pin.c and kernel/acct.c.
> >> 'fs_pin' is used to fix the race between file->f_path.mnt for acct
> >> and umount, file->f_path.mnt just holds the reference of vfsmount
> >> but not keep busy, umount will check the reference and return busy.
> >>
> >> The logical is same as sunrpc cache for exports.
> >>
> >> Before commit 3064c3563b ("death to mnt_pinned"), acct uses a hack
> >> method, acct get a pin to vfsmount and then put the reference,
> >> so umount finds no other reference and callback those pins in vfsmount,
> >> at last umount success.
> >>
> >> After that commit, besides pin to original vfsmount and put the reference
> >> of the original vfsmount, acct clone the vfsmount storing in file->f_path.mnt.
> >>
> >> The different between those two methods is the value of file->f_path.mnt,
> >> the first one, it contains the original vfsmnt without reference,
> >> the second one, contains a cloned vfsmnt with reference.
> >>
> >> I have test the first method, pins to vfsmount for all exports cache,
> >> nfsd can work but sometimes will delay or hang at rpc.mountd's calling
> >> of name_to_handle_at, I can't find the reason.
> >
> > A kernel stack trace of exactly where it is hanging would help a lot.
>
> Witch adding fs_pin to exports, it seems there is a mutex race between
> nfsd and rpc.mountd for locking parent inode.
>
> Apr 27 19:57:03 ntest kernel: rpc.mountd D ffff88006ac5fc28 0 1152 1 0x00000000
> Apr 27 19:57:03 ntest kernel: ffff88006ac5fc28 ffff880068c38970 ffff880068c3de60 ffff88006ac5fc08
> Apr 27 19:57:03 ntest kernel: ffff88006ac60000 ffff880035ecf694 ffff880068c3de60 00000000ffffffff
> Apr 27 19:57:03 ntest kernel: ffff880035ecf698 ffff88006ac5fc48 ffffffff8177ffd7 0000000000000000
> Apr 27 19:57:03 ntest kernel: Call Trace:
> Apr 27 19:57:03 ntest kernel: [<ffffffff8177ffd7>] schedule+0x37/0x90
> Apr 27 19:57:03 ntest kernel: [<ffffffff8178031e>] schedule_preempt_disabled+0xe/0x10
> Apr 27 19:57:03 ntest kernel: [<ffffffff81781e92>] __mutex_lock_slowpath+0xb2/0x120
> Apr 27 19:57:03 ntest kernel: [<ffffffff81781f23>] mutex_lock+0x23/0x40
> Apr 27 19:57:03 ntest kernel: [<ffffffff81230714>] lookup_slow+0x34/0xc0
> Apr 27 19:57:03 ntest kernel: [<ffffffff812358ee>] path_lookupat+0x89e/0xc60
> Apr 27 19:57:03 ntest kernel: [<ffffffff81205192>] ? kmem_cache_alloc+0x1e2/0x260
> Apr 27 19:57:03 ntest kernel: [<ffffffff812367a6>] ? getname_flags+0x56/0x200
> Apr 27 19:57:03 ntest kernel: [<ffffffff81235cd7>] filename_lookup+0x27/0xc0
> Apr 27 19:57:03 ntest kernel: [<ffffffff81237b53>] user_path_at_empty+0x63/0xd0
> Apr 27 19:57:03 ntest kernel: [<ffffffff8121d722>] ? put_object+0x32/0x60
> Apr 27 19:57:03 ntest kernel: [<ffffffff8121d94d>] ? delete_object_full+0x2d/0x40
> Apr 27 19:57:03 ntest kernel: [<ffffffff8123e273>] ? dput+0x33/0x230
> Apr 27 19:57:03 ntest kernel: [<ffffffff81237bd1>] user_path_at+0x11/0x20
> Apr 27 19:57:03 ntest kernel: [<ffffffff81286489>] SyS_name_to_handle_at+0x59/0x200
> Apr 27 19:57:03 ntest kernel: [<ffffffff81783f6e>] system_call_fastpath+0x12/0x71
> Apr 27 19:57:03 ntest kernel: nfsd S ffff88006e92b708 0 1170 2 0x00000000
> Apr 27 19:57:03 ntest kernel: ffff88006e92b708 ffffffff81c12480 ffff88006acbcb80 ffff88006e92b758
> Apr 27 19:57:03 ntest kernel: ffff88006e92c000 ffffffff82290040 ffff88006e92b758 ffffffff82290040
> Apr 27 19:57:03 ntest kernel: 00000000fffffff5 ffff88006e92b728 ffffffff8177ffd7 0000000100043a09
> Apr 27 19:57:03 ntest kernel: Call Trace:
> Apr 27 19:57:03 ntest kernel: [<ffffffff8177ffd7>] schedule+0x37/0x90
> Apr 27 19:57:03 ntest kernel: [<ffffffff81782e97>] schedule_timeout+0x117/0x230
> Apr 27 19:57:03 ntest kernel: [<ffffffff8110b240>] ? internal_add_timer+0xb0/0xb0
> Apr 27 19:57:03 ntest kernel: [<ffffffff81780ff3>] wait_for_completion_interruptible_timeout+0xf3/0x150
> Apr 27 19:57:03 ntest kernel: [<ffffffff810cb090>] ? wake_up_state+0x20/0x20
> Apr 27 19:57:03 ntest kernel: [<ffffffffa01350cc>] cache_wait_req.isra.10+0x9c/0x110 [sunrpc]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa0134760>] ? sunrpc_init_cache_detail+0xc0/0xc0 [sunrpc]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa0135c54>] cache_check+0x1d4/0x380 [sunrpc]
> Apr 27 19:57:03 ntest kernel: [<ffffffff8133248c>] ? inode_doinit_with_dentry+0x48c/0x6a0
> Apr 27 19:57:03 ntest kernel: [<ffffffffa045e132>] exp_get_by_name+0x82/0xb0 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffff81776d0a>] ? kmemleak_free+0x3a/0xa0
> Apr 27 19:57:03 ntest kernel: [<ffffffff81332210>] ? inode_doinit_with_dentry+0x210/0x6a0
> Apr 27 19:57:03 ntest kernel: [<ffffffff813326bc>] ? selinux_d_instantiate+0x1c/0x20
> Apr 27 19:57:03 ntest kernel: [<ffffffff8123def7>] ? _d_rehash+0x37/0x40
> Apr 27 19:57:03 ntest kernel: [<ffffffff8123f776>] ? d_splice_alias+0xa6/0x2d0
> Apr 27 19:57:03 ntest kernel: [<ffffffff812be90b>] ? ext4_lookup+0xdb/0x160
> Apr 27 19:57:03 ntest kernel: [<ffffffffa0460114>] rqst_exp_get_by_name+0x64/0x140 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa045a8f6>] nfsd_cross_mnt+0x76/0x1b0 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa0475300>] nfsd4_encode_dirent+0x160/0x3d0 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa04751a0>] ? nfsd4_encode_getattr+0x40/0x40 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa045c631>] nfsd_readdir+0x1c1/0x2a0 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa045a1e0>] ? nfsd_direct_splice_actor+0x20/0x20 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa046bd70>] nfsd4_encode_readdir+0x120/0x220 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa047573d>] nfsd4_encode_operation+0x7d/0x190 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa046a51d>] nfsd4_proc_compound+0x24d/0x6f0 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa0455f83>] nfsd_dispatch+0xc3/0x220 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa012a01b>] svc_process_common+0x43b/0x690 [sunrpc]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa012b133>] svc_process+0x103/0x1b0 [sunrpc]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa045596f>] nfsd+0xff/0x170 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffffa0455870>] ? nfsd_destroy+0x80/0x80 [nfsd]
> Apr 27 19:57:03 ntest kernel: [<ffffffff810bd378>] kthread+0xd8/0xf0
> Apr 27 19:57:03 ntest kernel: [<ffffffff810bd2a0>] ? kthread_worker_fn+0x180/0x180
> Apr 27 19:57:03 ntest kernel: [<ffffffff81784362>] ret_from_fork+0x42/0x70
> Apr 27 19:57:03 ntest kernel: [<ffffffff810bd2a0>] ? kthread_worker_fn+0x180/0x180
>
Hmm... That looks bad. I wonder why we haven't hit the deadlock before.
nfsd gets a LOOKUP request for $DIR/name which is a mountpoint, or a READDIR
request for $DIR which contains 'name' which is a mountpoint.
nfsd_lookup_dentry of nfsd_buffered_readdir lock i_mutex on $DIR, then do
the mountpoint check. If the mounted filesystem is not exported, but the $DIR
filesystem has 'crossmnt', an upcall to mountd asks for export information.
When mountd try to do a lookup on the name, it works fine if lookup_fast finds
the child, but it deadlocks on the parent's i_mutex if it has to go to
lookup_slow.
It should only go to lookup_slow() if:
- the child isn't in cache .. but as it is mounted on, it must be
- d_revalidate returns 0
This probably means that $DIR is on an NFS filesystem and it failed the
revalidation for some reason....
Kinglong: can you reproduce this easily? If so can you enable nfs debugging
and collect the output?
I want to confirm that the dfprintk in nfs_lookup_revalidate is saying that
the name is invalid.
In any case, holding a mutex while waiting for an upcall is probably a bad
idea. We should try to find a way to work around that...
Any ideas Bruce ?-)
NeilBrown
On 4/29/2015 10:57 AM, NeilBrown wrote:
> On Mon, 27 Apr 2015 20:11:48 +0800 Kinglong Mee <[email protected]> wrote:
>
>> On 4/24/2015 11:00 AM, NeilBrown wrote:
>>> On Thu, 23 Apr 2015 20:52:40 +0800 Kinglong Mee <[email protected]> wrote:
>>>
>>>> On 4/23/2015 7:44 AM, NeilBrown wrote:
>>>>> On Wed, 22 Apr 2015 11:07:03 -0400 "J. Bruce Fields" <[email protected]>
>>>>> wrote:
>>>>>>> Reference of dentry/mnt is like a cache, avoids re-finding of them,
>>>>>>> it is necessary to store them in svc_export.
>>>>>>>
>>>>>>> Neil points out another way of 'fs_pin', I will check that.
>>>>>>
>>>>>> Yes, that'd be interesting. On a very quick look--I don't understand
>>>>>> what it's meant to do at all. But if it does provide a way to get a
>>>>>> callback on umount, that'd certainly be interesting....
>>>>>
>>>>> Yeah, on a quick look it isn't really obvious at all.
>>>>>
>>>>> But I didn't read the code. I read
>>>>> https://lwn.net/Articles/636730/
>>>>>
>>>>> which says:
>>>>>
>>>>> In its place is a mechanism by which an object can be added to a vfsmount
>>>>> structure (which represents a mounted filesystem); that object supports
>>>>> only one method: kill(). These "pin" objects hang around until the final
>>>>> reference to the vfsmount goes away, at which point each one's kill() function
>>>>> is called. It thus is a clean mechanism for performing cleanup when a filesystem
>>>>> goes away.
>>>>>
>>>>> This is used to close "BSD process accounting" files on umount, and sound
>>>>> like the perfect interface for flushing things from the sunrpc cache on
>>>>> umount.
>>>>
>>>> I have review those codes in fs/fs_pin.c and kernel/acct.c.
>>>> 'fs_pin' is used to fix the race between file->f_path.mnt for acct
>>>> and umount, file->f_path.mnt just holds the reference of vfsmount
>>>> but not keep busy, umount will check the reference and return busy.
>>>>
>>>> The logical is same as sunrpc cache for exports.
>>>>
>>>> Before commit 3064c3563b ("death to mnt_pinned"), acct uses a hack
>>>> method, acct get a pin to vfsmount and then put the reference,
>>>> so umount finds no other reference and callback those pins in vfsmount,
>>>> at last umount success.
>>>>
>>>> After that commit, besides pin to original vfsmount and put the reference
>>>> of the original vfsmount, acct clone the vfsmount storing in file->f_path.mnt.
>>>>
>>>> The different between those two methods is the value of file->f_path.mnt,
>>>> the first one, it contains the original vfsmnt without reference,
>>>> the second one, contains a cloned vfsmnt with reference.
>>>>
>>>> I have test the first method, pins to vfsmount for all exports cache,
>>>> nfsd can work but sometimes will delay or hang at rpc.mountd's calling
>>>> of name_to_handle_at, I can't find the reason.
>>>
>>> A kernel stack trace of exactly where it is hanging would help a lot.
>>
>> Witch adding fs_pin to exports, it seems there is a mutex race between
>> nfsd and rpc.mountd for locking parent inode.
>>
>> Apr 27 19:57:03 ntest kernel: rpc.mountd D ffff88006ac5fc28 0 1152 1 0x00000000
>> Apr 27 19:57:03 ntest kernel: ffff88006ac5fc28 ffff880068c38970 ffff880068c3de60 ffff88006ac5fc08
>> Apr 27 19:57:03 ntest kernel: ffff88006ac60000 ffff880035ecf694 ffff880068c3de60 00000000ffffffff
>> Apr 27 19:57:03 ntest kernel: ffff880035ecf698 ffff88006ac5fc48 ffffffff8177ffd7 0000000000000000
>> Apr 27 19:57:03 ntest kernel: Call Trace:
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8177ffd7>] schedule+0x37/0x90
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8178031e>] schedule_preempt_disabled+0xe/0x10
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81781e92>] __mutex_lock_slowpath+0xb2/0x120
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81781f23>] mutex_lock+0x23/0x40
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81230714>] lookup_slow+0x34/0xc0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff812358ee>] path_lookupat+0x89e/0xc60
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81205192>] ? kmem_cache_alloc+0x1e2/0x260
>> Apr 27 19:57:03 ntest kernel: [<ffffffff812367a6>] ? getname_flags+0x56/0x200
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81235cd7>] filename_lookup+0x27/0xc0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81237b53>] user_path_at_empty+0x63/0xd0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8121d722>] ? put_object+0x32/0x60
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8121d94d>] ? delete_object_full+0x2d/0x40
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8123e273>] ? dput+0x33/0x230
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81237bd1>] user_path_at+0x11/0x20
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81286489>] SyS_name_to_handle_at+0x59/0x200
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81783f6e>] system_call_fastpath+0x12/0x71
>> Apr 27 19:57:03 ntest kernel: nfsd S ffff88006e92b708 0 1170 2 0x00000000
>> Apr 27 19:57:03 ntest kernel: ffff88006e92b708 ffffffff81c12480 ffff88006acbcb80 ffff88006e92b758
>> Apr 27 19:57:03 ntest kernel: ffff88006e92c000 ffffffff82290040 ffff88006e92b758 ffffffff82290040
>> Apr 27 19:57:03 ntest kernel: 00000000fffffff5 ffff88006e92b728 ffffffff8177ffd7 0000000100043a09
>> Apr 27 19:57:03 ntest kernel: Call Trace:
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8177ffd7>] schedule+0x37/0x90
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81782e97>] schedule_timeout+0x117/0x230
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8110b240>] ? internal_add_timer+0xb0/0xb0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81780ff3>] wait_for_completion_interruptible_timeout+0xf3/0x150
>> Apr 27 19:57:03 ntest kernel: [<ffffffff810cb090>] ? wake_up_state+0x20/0x20
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa01350cc>] cache_wait_req.isra.10+0x9c/0x110 [sunrpc]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa0134760>] ? sunrpc_init_cache_detail+0xc0/0xc0 [sunrpc]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa0135c54>] cache_check+0x1d4/0x380 [sunrpc]
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8133248c>] ? inode_doinit_with_dentry+0x48c/0x6a0
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa045e132>] exp_get_by_name+0x82/0xb0 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81776d0a>] ? kmemleak_free+0x3a/0xa0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81332210>] ? inode_doinit_with_dentry+0x210/0x6a0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff813326bc>] ? selinux_d_instantiate+0x1c/0x20
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8123def7>] ? _d_rehash+0x37/0x40
>> Apr 27 19:57:03 ntest kernel: [<ffffffff8123f776>] ? d_splice_alias+0xa6/0x2d0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff812be90b>] ? ext4_lookup+0xdb/0x160
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa0460114>] rqst_exp_get_by_name+0x64/0x140 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa045a8f6>] nfsd_cross_mnt+0x76/0x1b0 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa0475300>] nfsd4_encode_dirent+0x160/0x3d0 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa04751a0>] ? nfsd4_encode_getattr+0x40/0x40 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa045c631>] nfsd_readdir+0x1c1/0x2a0 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa045a1e0>] ? nfsd_direct_splice_actor+0x20/0x20 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa046bd70>] nfsd4_encode_readdir+0x120/0x220 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa047573d>] nfsd4_encode_operation+0x7d/0x190 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa046a51d>] nfsd4_proc_compound+0x24d/0x6f0 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa0455f83>] nfsd_dispatch+0xc3/0x220 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa012a01b>] svc_process_common+0x43b/0x690 [sunrpc]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa012b133>] svc_process+0x103/0x1b0 [sunrpc]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa045596f>] nfsd+0xff/0x170 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffffa0455870>] ? nfsd_destroy+0x80/0x80 [nfsd]
>> Apr 27 19:57:03 ntest kernel: [<ffffffff810bd378>] kthread+0xd8/0xf0
>> Apr 27 19:57:03 ntest kernel: [<ffffffff810bd2a0>] ? kthread_worker_fn+0x180/0x180
>> Apr 27 19:57:03 ntest kernel: [<ffffffff81784362>] ret_from_fork+0x42/0x70
>> Apr 27 19:57:03 ntest kernel: [<ffffffff810bd2a0>] ? kthread_worker_fn+0x180/0x180
>>
>
> Hmm... That looks bad. I wonder why we haven't hit the deadlock before.
It appears after nfs-utils commit 6091c0a4c4
("mountd: add support for case-insensitive file names")
adds using name_to_handle_at which will locking parent.
>
> nfsd gets a LOOKUP request for $DIR/name which is a mountpoint, or a READDIR
> request for $DIR which contains 'name' which is a mountpoint.
>
> nfsd_lookup_dentry of nfsd_buffered_readdir lock i_mutex on $DIR, then do
> the mountpoint check. If the mounted filesystem is not exported, but the $DIR
> filesystem has 'crossmnt', an upcall to mountd asks for export information.
> When mountd try to do a lookup on the name, it works fine if lookup_fast finds
> the child, but it deadlocks on the parent's i_mutex if it has to go to
> lookup_slow.
I have send a patch for it,
"[PATCH] NFSD: Avoid race of locking parent's mutex at cross mount"
>
> It should only go to lookup_slow() if:
> - the child isn't in cache .. but as it is mounted on, it must be
Yes, it is.
If after some operations of those files, they are cached,
the race will not appear.
> - d_revalidate returns 0
>
> This probably means that $DIR is on an NFS filesystem and it failed the
> revalidation for some reason....
It is caused by readdir of pseudo root, there are many unexported files in it.
>
> Kinglong: can you reproduce this easily? If so can you enable nfs debugging
> and collect the output?
> I want to confirm that the dfprintk in nfs_lookup_revalidate is saying that
> the name is invalid.
Yes, I will send the reproduce method in the other threads, add cc you.
thanks,
Kinglong Mee
On Wed, Apr 29, 2015 at 12:57:28PM +1000, NeilBrown wrote:
> In any case, holding a mutex while waiting for an upcall is probably a bad
> idea. We should try to find a way to work around that...
Yeah, it sounds fragile even if we could fix this particular issue in
mountd.
> Any ideas Bruce ?-)
Looking at nfsd_buffered_readdir():
/*
* Various filldir functions may end up calling back into
* lookup_one_len() and the file system's ->lookup() method.
* These expect i_mutex to be held, as it would within readdir.
*/
host_err = mutex_lock_killable(&dir_inode->i_mutex);
and as far as I can tell Kinglong's approach (adding an unlock and lock
around nfsd_cross_mnt() calls might actually be OK.
Though in general I expect
lock()
...code...
unlock()
to mark a critical section and would be unpleasantly surprised to
discover that a function somewhere in the middle had a buried
unlock/lock pair.
Maybe drop the locking from nfsd_buffered_readdir and *just* take the
i_mutex around lookup_one_len(), if that's the only place we need it?
--b.
On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Wed, Apr 29, 2015 at 12:57:28PM +1000, NeilBrown wrote:
> > In any case, holding a mutex while waiting for an upcall is probably a bad
> > idea. We should try to find a way to work around that...
>
> Yeah, it sounds fragile even if we could fix this particular issue in
> mountd.
>
> > Any ideas Bruce ?-)
>
> Looking at nfsd_buffered_readdir():
>
> /*
> * Various filldir functions may end up calling back into
> * lookup_one_len() and the file system's ->lookup() method.
> * These expect i_mutex to be held, as it would within readdir.
> */
> host_err = mutex_lock_killable(&dir_inode->i_mutex);
>
> and as far as I can tell Kinglong's approach (adding an unlock and lock
> around nfsd_cross_mnt() calls might actually be OK.
Yes, I think now that it would work. It would look odd though, as you
suggest.
There would need to be very clear comments explaining why the lock is being
managed that way.
>
> Though in general I expect
>
> lock()
> ...code...
> unlock()
>
> to mark a critical section and would be unpleasantly surprised to
> discover that a function somewhere in the middle had a buried
> unlock/lock pair.
>
> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> i_mutex around lookup_one_len(), if that's the only place we need it?
I think it is the only place needed. Certainly normal path lookup only takes
i_mutex very briefly around __lookup_hash().
One cost would be that we would take the mutex for every name instead of once
for a whole set of names. That is generally frowned upon for performance
reasons.
An alternative might be to stop using lookup_one_len, and use kern_path()
instead. Or kern_path_mountpoint if we don't want to cross a mountpoint.
They would only take the i_mutex if absolutely necessary.
The draw back is that they don't take a 'len' argument, so we would need to
make sure the name is nul terminated (and maybe double-check that there is
no '/'??). It would be fairly easy to nul-terminate names from readdir -
just use a bit more space in the buffer in nfsd_buffered_filldir().
I'm less sure about the locking needed in nfsd_lookup_dentry().
The comments suggests that there is good reason to keep the lock for an
extended period. But that reasoning might only apply to files, and
nfsd_mountpoint should only be true for directories... I hope.
A different approach would be to pass NULL for the rqstp to nfsd_cross_mnt().
This will ensure it doesn't block, but it might fail incorrectly.
If it does fail, we drop the lock, retry with the real rqstp, retake the lock
and .... no, I think that gets a bit too messy.
I think I'm in favour of:
- not taking the lock in readdir
- maybe not taking it in lookup
- using kern_path_mountpoint or kern_path
- nul terminating then names, copying the nfsd_lookup name to
__getname() if necessary.
Sound reasonable?
NeilBrown
On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
> wrote:
> > Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> > i_mutex around lookup_one_len(), if that's the only place we need it?
>
> I think it is the only place needed. Certainly normal path lookup
> only takes i_mutex very briefly around __lookup_hash().
>
> One cost would be that we would take the mutex for every name instead
> of once for a whole set of names. That is generally frowned upon for
> performance reasons.
>
> An alternative might be to stop using lookup_one_len, and use
> kern_path() instead. Or kern_path_mountpoint if we don't want to
> cross a mountpoint.
>
> They would only take the i_mutex if absolutely necessary.
>
> The draw back is that they don't take a 'len' argument, so we would
> need to make sure the name is nul terminated (and maybe double-check
> that there is no '/'??). It would be fairly easy to nul-terminate
> names from readdir - just use a bit more space in the buffer in
> nfsd_buffered_filldir().
They're also a lot more complicated than lookup_one_len. Is any of this
extra stuff they do (audit?) going to bite us?
For me understanding that would be harder than writing a variant of
lookup_one_len that took the i_mutex when required. But I guess that's
my problem, I should try to understand the lookup code better....
> I'm less sure about the locking needed in nfsd_lookup_dentry(). The
> comments suggests that there is good reason to keep the lock for an
> extended period. But that reasoning might only apply to files, and
> nfsd_mountpoint should only be true for directories... I hope.
I thought d_mountpoint() could be true for files, but certainly we won't
be doing nfs4 opens on directories.
> A different approach would be to pass NULL for the rqstp to
> nfsd_cross_mnt(). This will ensure it doesn't block, but it might
> fail incorrectly. If it does fail, we drop the lock, retry with the
> real rqstp, retake the lock and .... no, I think that gets a bit too
> messy.
Yes.
> I think I'm in favour of:
> - not taking the lock in readdir
> - maybe not taking it in lookup
> - using kern_path_mountpoint or kern_path
> - nul terminating then names, copying the nfsd_lookup name to
> __getname() if necessary.
>
> Sound reasonable?
I guess so, though I wish I understood kern_path_mountpoint better.
And nfsd_lookup_dentry looks like work for another day. No, wait, I
forgot the goal here: you're right, nfsd_lookup_dentry has the same
upcall-under-i_mutex problem, so we need to fix it too.
OK, agreed.
--b.
On Thu, 30 Apr 2015 17:36:02 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> > On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
> > wrote:
> > > Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> > > i_mutex around lookup_one_len(), if that's the only place we need it?
> >
> > I think it is the only place needed. Certainly normal path lookup
> > only takes i_mutex very briefly around __lookup_hash().
> >
> > One cost would be that we would take the mutex for every name instead
> > of once for a whole set of names. That is generally frowned upon for
> > performance reasons.
> >
> > An alternative might be to stop using lookup_one_len, and use
> > kern_path() instead. Or kern_path_mountpoint if we don't want to
> > cross a mountpoint.
> >
> > They would only take the i_mutex if absolutely necessary.
> >
> > The draw back is that they don't take a 'len' argument, so we would
> > need to make sure the name is nul terminated (and maybe double-check
> > that there is no '/'??). It would be fairly easy to nul-terminate
> > names from readdir - just use a bit more space in the buffer in
> > nfsd_buffered_filldir().
>
> They're also a lot more complicated than lookup_one_len. Is any of this
> extra stuff they do (audit?) going to bite us?
>
> For me understanding that would be harder than writing a variant of
> lookup_one_len that took the i_mutex when required. But I guess that's
> my problem, I should try to understand the lookup code better....
Tempting though ... see below (untested).
While writing that I began to wonder if lookup_one_len is really the right
interface to be used, even though it was introduced (in 2.3.99pre2-4)
specifically for nfsd.
The problem is that it assumes things about the filesystem. So it makes
perfect sense for various filesystems to use it on themselves, but I'm not
sure how *right* it is for nfsd (or cachefiles etc) to use it on some
*other* filesystem.
The particular issue is that it avoids the d_revalidate call.
Both vfat and reiserfs have that call ... I wonder if that could ever be a
problem.
So I'm really leaning towards creating a variant of kern_path_mountpoint and
using a variant of that which takes a length.
I think adding the d_revalidate is correct and even adding auditing is
probably a good idea.
Maybe I'll try that (unless someone beats me to it...)
NeilBrown
>
> > I'm less sure about the locking needed in nfsd_lookup_dentry(). The
> > comments suggests that there is good reason to keep the lock for an
> > extended period. But that reasoning might only apply to files, and
> > nfsd_mountpoint should only be true for directories... I hope.
>
> I thought d_mountpoint() could be true for files, but certainly we won't
> be doing nfs4 opens on directories.
>
> > A different approach would be to pass NULL for the rqstp to
> > nfsd_cross_mnt(). This will ensure it doesn't block, but it might
> > fail incorrectly. If it does fail, we drop the lock, retry with the
> > real rqstp, retake the lock and .... no, I think that gets a bit too
> > messy.
>
> Yes.
>
> > I think I'm in favour of:
> > - not taking the lock in readdir
> > - maybe not taking it in lookup
> > - using kern_path_mountpoint or kern_path
> > - nul terminating then names, copying the nfsd_lookup name to
> > __getname() if necessary.
> >
> > Sound reasonable?
>
> I guess so, though I wish I understood kern_path_mountpoint better.
>
> And nfsd_lookup_dentry looks like work for another day. No, wait, I
> forgot the goal here: you're right, nfsd_lookup_dentry has the same
> upcall-under-i_mutex problem, so we need to fix it too.
>
> OK, agreed.
>
> --b.
diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..d6b2dc36029e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2182,6 +2182,56 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = __d_lookup(base, &this);
+ if (ret)
+ retrun ret;
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> An alternative might be to stop using lookup_one_len, and use kern_path()
> instead. Or kern_path_mountpoint if we don't want to cross a mountpoint.
NAK. This is a monumentally bad idea - please, do not do it. Not to mention
anything else, kern_path() is a _lot_ heavier, including much worse stack
footprint.
On Fri, May 01, 2015 at 11:53:26AM +1000, NeilBrown wrote:
> While writing that I began to wonder if lookup_one_len is really the right
> interface to be used, even though it was introduced (in 2.3.99pre2-4)
> specifically for nfsd.
> The problem is that it assumes things about the filesystem. So it makes
> perfect sense for various filesystems to use it on themselves, but I'm not
> sure how *right* it is for nfsd (or cachefiles etc) to use it on some
> *other* filesystem.
> The particular issue is that it avoids the d_revalidate call.
> Both vfat and reiserfs have that call ... I wonder if that could ever be a
> problem.
>
> So I'm really leaning towards creating a variant of kern_path_mountpoint and
> using a variant of that which takes a length.
NAK. As in, "no way in hell". And yes, lookup_one_len() *does* revalidate -
RTFS(lookup_dcache), please.
What kind of consistency warranties do callers expect, BTW? You do realize
that between iterate_dir() and callbacks an entry might have been removed
and/or replaced?
On Fri, 1 May 2015 03:03:24 +0100 Al Viro <[email protected]> wrote:
> On Fri, May 01, 2015 at 11:53:26AM +1000, NeilBrown wrote:
> > While writing that I began to wonder if lookup_one_len is really the right
> > interface to be used, even though it was introduced (in 2.3.99pre2-4)
> > specifically for nfsd.
> > The problem is that it assumes things about the filesystem. So it makes
> > perfect sense for various filesystems to use it on themselves, but I'm not
> > sure how *right* it is for nfsd (or cachefiles etc) to use it on some
> > *other* filesystem.
> > The particular issue is that it avoids the d_revalidate call.
> > Both vfat and reiserfs have that call ... I wonder if that could ever be a
> > problem.
> >
> > So I'm really leaning towards creating a variant of kern_path_mountpoint and
> > using a variant of that which takes a length.
>
> NAK. As in, "no way in hell". And yes, lookup_one_len() *does* revalidate -
> RTFS(lookup_dcache), please.
Damn - I always seems to get lost when I'm following those call paths.
lookup_one_len -> __lookup_hash -> lookup_dcache -> d_lookup,d_revalidate -> __d_lookup
-> lookup_real -> i_op->lookup
I think I was confusing __lookup_hash with __d_lookup in my thoughts.
>
> What kind of consistency warranties do callers expect, BTW? You do realize
> that between iterate_dir() and callbacks an entry might have been removed
> and/or replaced?
For READDIR_PLUS, lookup_one_len is called on each name and it requires
i_mutex, so the code currently holds i_mutex over the whole sequence.
This is triggering a deadlock.
We could just grab/drop i_mutex over each call to lookup_one_len(), but that
sort of thing is usually frowned upon, and we don't really always *need*
i_mutex if the lookup can be served from the d_cache.
So I'm looking for the best way to perform the lookup without holding i_mutex
for too long.
It sounds like you are suggesting something like lookup_one_len_unlocked(),
which .... uhm...
I was going to say uses lookup_dcache, but that needs i_mutex.
It calls d_lookup(), which doesn't seem to really need i_mutex, and
d_revalidate().
Does the later need i_mutex? I don't think so.
So maybe it is just how d_lookup handles failure that needs i_mutex.
So lookup_one_len_unlocked() could call d_lookup and d_revalidate and if
that all worked nicely, return the result. If it didn't, grab i_mutex and try
again??
Or do we just wear the cost of taking i_mutex for each name in the directory
during READDIR_PLUS?
Thanks,
NeilBrown
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > What kind of consistency warranties do callers expect, BTW? You do realize
> > that between iterate_dir() and callbacks an entry might have been removed
> > and/or replaced?
>
> For READDIR_PLUS, lookup_one_len is called on each name and it requires
> i_mutex, so the code currently holds i_mutex over the whole sequence.
> This is triggering a deadlock.
Yes, I've seen the context. However, you are _not_ holding it between
actual iterate_dir() and those callbacks, which opens a window when
directory might have been changed.
Again, what kind of consistency is expected by callers? Are they ready to
cope with "there's no such entry anymore" or "inumber is nothing like
what we'd put in ->ino, since it's no the same object" or "->d_type is
completely unrelated to what we'd found, since the damn thing had been
removed and created from scratch"?
On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
> On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > that between iterate_dir() and callbacks an entry might have been removed
> > > and/or replaced?
> >
> > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > This is triggering a deadlock.
>
> Yes, I've seen the context. However, you are _not_ holding it between
> actual iterate_dir() and those callbacks, which opens a window when
> directory might have been changed.
>
> Again, what kind of consistency is expected by callers? Are they ready to
> cope with "there's no such entry anymore" or "inumber is nothing like
> what we'd put in ->ino, since it's no the same object" or "->d_type is
> completely unrelated to what we'd found, since the damn thing had been
> removed and created from scratch"?
Ah, sorry.
Yes, the callers are prepared for "there's no such entry anymore".
They don't use d_type, so don't care if it might be meaningless.
NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
about ino changing.
In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
thence to compose_entry_fh() and it should report failure if
dchild->d_inode->i_ino doesn't match.
Simply not returning the extra attributes is perfectly acceptable in NFSv3.
So it looks like we are mostly OK here - we don't really need i_mutex to be
held for very long.
NeilBrown
On Fri, May 01, 2015 at 01:08:26PM +1000, NeilBrown wrote:
> On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
>
> > On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > > that between iterate_dir() and callbacks an entry might have been removed
> > > > and/or replaced?
> > >
> > > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > > This is triggering a deadlock.
> >
> > Yes, I've seen the context. However, you are _not_ holding it between
> > actual iterate_dir() and those callbacks, which opens a window when
> > directory might have been changed.
> >
> > Again, what kind of consistency is expected by callers? Are they ready to
> > cope with "there's no such entry anymore" or "inumber is nothing like
> > what we'd put in ->ino, since it's no the same object" or "->d_type is
> > completely unrelated to what we'd found, since the damn thing had been
> > removed and created from scratch"?
>
> Ah, sorry.
>
> Yes, the callers are prepared for "there's no such entry anymore".
> They don't use d_type, so don't care if it might be meaningless.
> NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
> about ino changing.
>
> In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
> thence to compose_entry_fh() and it should report failure if
> dchild->d_inode->i_ino doesn't match.
Just to make sure I understand the concern..... So it shouldn't really
be a problem if readdir and lookup find different objects for the same
name, the problem is just when we mix attributes from the two objects,
right? Looks like the v3 code could return an inode number derived from
the readdir and a filehandle from the lookup, which is a problem. The
v4 code will get everything from the result of the lookup, which should
be OK.
> Simply not returning the extra attributes is perfectly acceptable in NFSv3.
Right, so no big deal anyway.--b.
> So it looks like we are mostly OK here - we don't really need i_mutex to be
> held for very long.
>
> NeilBrown
>
On Fri, 1 May 2015 09:29:53 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Fri, May 01, 2015 at 01:08:26PM +1000, NeilBrown wrote:
> > On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
> >
> > > On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > > > that between iterate_dir() and callbacks an entry might have been removed
> > > > > and/or replaced?
> > > >
> > > > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > > > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > > > This is triggering a deadlock.
> > >
> > > Yes, I've seen the context. However, you are _not_ holding it between
> > > actual iterate_dir() and those callbacks, which opens a window when
> > > directory might have been changed.
> > >
> > > Again, what kind of consistency is expected by callers? Are they ready to
> > > cope with "there's no such entry anymore" or "inumber is nothing like
> > > what we'd put in ->ino, since it's no the same object" or "->d_type is
> > > completely unrelated to what we'd found, since the damn thing had been
> > > removed and created from scratch"?
> >
> > Ah, sorry.
> >
> > Yes, the callers are prepared for "there's no such entry anymore".
> > They don't use d_type, so don't care if it might be meaningless.
> > NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
> > about ino changing.
> >
> > In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
> > thence to compose_entry_fh() and it should report failure if
> > dchild->d_inode->i_ino doesn't match.
>
> Just to make sure I understand the concern..... So it shouldn't really
> be a problem if readdir and lookup find different objects for the same
> name, the problem is just when we mix attributes from the two objects,
> right? Looks like the v3 code could return an inode number derived from
> the readdir and a filehandle from the lookup, which is a problem. The
> v4 code will get everything from the result of the lookup, which should
> be OK.
That agrees with my understanding, yes.
I did wonder for a little while about the possibility of a directory
containing both 'a' and 'b', and NFSv4 doing the readdir and the stat of 'a',
and the a "mv a b" happening before the stat of 'b'.
Then the readdir response will show both 'a' and 'b' referring to the same
object with a link count of 1.
I can't quite decide if that is a problem or not.
>
> > Simply not returning the extra attributes is perfectly acceptable in NFSv3.
>
> Right, so no big deal anyway.--b.
Not a big deal, but we should really add a patch like the following ("like"
as in "actually compile tested and documented" which this one isn't).
NeilBrown
>
> > So it looks like we are mostly OK here - we don't really need i_mutex to be
> > held for very long.
> >
> > NeilBrown
> >
>
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e4b2b4322553..f6e7cbabac5a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
static __be32
compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
- const char *name, int namlen)
+ const char *name, int namlen, u64 ino)
{
struct svc_export *exp;
struct dentry *dparent, *dchild;
@@ -830,19 +830,21 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
goto out;
if (d_really_is_negative(dchild))
goto out;
+ if (dchild->d_inode->i_ino != ino)
+ goto out;
rv = fh_compose(fhp, exp, dchild, &cd->fh);
out:
dput(dchild);
return rv;
}
-static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
{
struct svc_fh *fh = &cd->scratch;
__be32 err;
fh_init(fh, NFS3_FHSIZE);
- err = compose_entry_fh(cd, fh, name, namlen);
+ err = compose_entry_fh(cd, fh, name, namlen, ino);
if (err) {
*p++ = 0;
*p++ = 0;
@@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p = encode_entry_baggage(cd, p, name, namlen, ino);
if (plus)
- p = encode_entryplus_baggage(cd, p, name, namlen);
+ p = encode_entryplus_baggage(cd, p, name, namlen, ino);
num_entry_words = p - cd->buffer;
} else if (*(page+1) != NULL) {
/* temporarily encode entry into next page, then move back to
@@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
if (plus)
- p1 = encode_entryplus_baggage(cd, p1, name, namlen);
+ p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
/* determine entry word length and lengths to go in pages */
num_entry_words = p1 - tmp;
On Sun, May 03, 2015 at 09:16:53AM +1000, NeilBrown wrote:
> On Fri, 1 May 2015 09:29:53 -0400 "J. Bruce Fields" <[email protected]>
> wrote:
>
> > On Fri, May 01, 2015 at 01:08:26PM +1000, NeilBrown wrote:
> > > On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
> > >
> > > > On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > > > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > > > > that between iterate_dir() and callbacks an entry might have been removed
> > > > > > and/or replaced?
> > > > >
> > > > > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > > > > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > > > > This is triggering a deadlock.
> > > >
> > > > Yes, I've seen the context. However, you are _not_ holding it between
> > > > actual iterate_dir() and those callbacks, which opens a window when
> > > > directory might have been changed.
> > > >
> > > > Again, what kind of consistency is expected by callers? Are they ready to
> > > > cope with "there's no such entry anymore" or "inumber is nothing like
> > > > what we'd put in ->ino, since it's no the same object" or "->d_type is
> > > > completely unrelated to what we'd found, since the damn thing had been
> > > > removed and created from scratch"?
> > >
> > > Ah, sorry.
> > >
> > > Yes, the callers are prepared for "there's no such entry anymore".
> > > They don't use d_type, so don't care if it might be meaningless.
> > > NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
> > > about ino changing.
> > >
> > > In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
> > > thence to compose_entry_fh() and it should report failure if
> > > dchild->d_inode->i_ino doesn't match.
> >
> > Just to make sure I understand the concern..... So it shouldn't really
> > be a problem if readdir and lookup find different objects for the same
> > name, the problem is just when we mix attributes from the two objects,
> > right? Looks like the v3 code could return an inode number derived from
> > the readdir and a filehandle from the lookup, which is a problem. The
> > v4 code will get everything from the result of the lookup, which should
> > be OK.
>
> That agrees with my understanding, yes.
>
> I did wonder for a little while about the possibility of a directory
> containing both 'a' and 'b', and NFSv4 doing the readdir and the stat of 'a',
> and the a "mv a b" happening before the stat of 'b'.
>
> Then the readdir response will show both 'a' and 'b' referring to the same
> object with a link count of 1.
>
> I can't quite decide if that is a problem or not.
My understanding is that that's completely normal behavior for lots of
filesystems.
Googling around.... Here's Ted on the question:
http://yarchive.net/comp/linux/readdir_nonatomicity.html
In some cases it won't even just get lost, but the old and new
name can both be returned. For example, if you assume the use
of a simple non-tree, linked-list implementation of a directory,
such can be found in Solaris's ufs, BSD 4.3's FFS, Linux's ext2
and minix filesystems, and many others, and you have a fully
tightly packed directory (i.e., no gaps), with the directory
entry "foo" at the beginning of the file, and readdir() has
already returned the first "foo" entry when some other
application renames it "Supercalifragilisticexpialidocious", the
new name will not fit in the old name's directory location, so
it will be placed at the end of the directory --- where it will
be returned by readdir() a second time.
This is not a bug; the POSIX specification explicitly allows
this behavior. If a filename is renamed during a readdir()
session of a directory, it is undefined where that neither,
either, or both of the new and old filenames will be returned.
--b.
On Sat, 2 May 2015 20:37:43 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Sun, May 03, 2015 at 09:16:53AM +1000, NeilBrown wrote:
> > On Fri, 1 May 2015 09:29:53 -0400 "J. Bruce Fields" <[email protected]>
> > wrote:
> >
> > > On Fri, May 01, 2015 at 01:08:26PM +1000, NeilBrown wrote:
> > > > On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
> > > >
> > > > > On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > > > > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > > > > > that between iterate_dir() and callbacks an entry might have been removed
> > > > > > > and/or replaced?
> > > > > >
> > > > > > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > > > > > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > > > > > This is triggering a deadlock.
> > > > >
> > > > > Yes, I've seen the context. However, you are _not_ holding it between
> > > > > actual iterate_dir() and those callbacks, which opens a window when
> > > > > directory might have been changed.
> > > > >
> > > > > Again, what kind of consistency is expected by callers? Are they ready to
> > > > > cope with "there's no such entry anymore" or "inumber is nothing like
> > > > > what we'd put in ->ino, since it's no the same object" or "->d_type is
> > > > > completely unrelated to what we'd found, since the damn thing had been
> > > > > removed and created from scratch"?
> > > >
> > > > Ah, sorry.
> > > >
> > > > Yes, the callers are prepared for "there's no such entry anymore".
> > > > They don't use d_type, so don't care if it might be meaningless.
> > > > NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
> > > > about ino changing.
> > > >
> > > > In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
> > > > thence to compose_entry_fh() and it should report failure if
> > > > dchild->d_inode->i_ino doesn't match.
> > >
> > > Just to make sure I understand the concern..... So it shouldn't really
> > > be a problem if readdir and lookup find different objects for the same
> > > name, the problem is just when we mix attributes from the two objects,
> > > right? Looks like the v3 code could return an inode number derived from
> > > the readdir and a filehandle from the lookup, which is a problem. The
> > > v4 code will get everything from the result of the lookup, which should
> > > be OK.
> >
> > That agrees with my understanding, yes.
> >
> > I did wonder for a little while about the possibility of a directory
> > containing both 'a' and 'b', and NFSv4 doing the readdir and the stat of 'a',
> > and the a "mv a b" happening before the stat of 'b'.
> >
> > Then the readdir response will show both 'a' and 'b' referring to the same
> > object with a link count of 1.
> >
> > I can't quite decide if that is a problem or not.
>
> My understanding is that that's completely normal behavior for lots of
> filesystems.
>
> Googling around.... Here's Ted on the question:
>
> http://yarchive.net/comp/linux/readdir_nonatomicity.html
>
> In some cases it won't even just get lost, but the old and new
> name can both be returned. For example, if you assume the use
> of a simple non-tree, linked-list implementation of a directory,
> such can be found in Solaris's ufs, BSD 4.3's FFS, Linux's ext2
> and minix filesystems, and many others, and you have a fully
> tightly packed directory (i.e., no gaps), with the directory
> entry "foo" at the beginning of the file, and readdir() has
> already returned the first "foo" entry when some other
> application renames it "Supercalifragilisticexpialidocious", the
> new name will not fit in the old name's directory location, so
> it will be placed at the end of the directory --- where it will
> be returned by readdir() a second time.
>
> This is not a bug; the POSIX specification explicitly allows
> this behavior. If a filename is renamed during a readdir()
> session of a directory, it is undefined where that neither,
> either, or both of the new and old filenames will be returned.
>
I think that is a slightly different situation to the one I was imagining.
Ted's observation here is completely about readdir results.
A NFS READDIR_PLUS result can be used to satisfy subsequence stat() requests.
I don't think it would ever be correct to
stat('a')
stat('b')
stat('a')
and get exactly the same stat info in every case, including inode number and
ctime and link count of '1'.
If those stats were served from the READDIRPLUS results that I described
above, that is exactly what you would get.
I'm not sure if the post-op attributes would be enough to tell the client it
needs to do a GETATTR again straight away to verify things.
But if the attr info stays cached (which is kind-of the point of
READDIR_PLUS), this is a very different circumstance than the one Ted
described.
Still not sure how important it is, but I like the NFSv3 option of just not
returning attributes if we aren't certain of them.
An option for NFSv4 might be to abort/retry the readdir op if the directory
has changed at all(?).
NeilBrown
On Sun, May 03, 2015 at 09:16:53AM +1000, NeilBrown wrote:
> On Fri, 1 May 2015 09:29:53 -0400 "J. Bruce Fields" <[email protected]>
> wrote:
>
> > On Fri, May 01, 2015 at 01:08:26PM +1000, NeilBrown wrote:
> > > On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
> > >
> > > > On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > > > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > > > > that between iterate_dir() and callbacks an entry might have been removed
> > > > > > and/or replaced?
> > > > >
> > > > > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > > > > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > > > > This is triggering a deadlock.
> > > >
> > > > Yes, I've seen the context. However, you are _not_ holding it between
> > > > actual iterate_dir() and those callbacks, which opens a window when
> > > > directory might have been changed.
> > > >
> > > > Again, what kind of consistency is expected by callers? Are they ready to
> > > > cope with "there's no such entry anymore" or "inumber is nothing like
> > > > what we'd put in ->ino, since it's no the same object" or "->d_type is
> > > > completely unrelated to what we'd found, since the damn thing had been
> > > > removed and created from scratch"?
> > >
> > > Ah, sorry.
> > >
> > > Yes, the callers are prepared for "there's no such entry anymore".
> > > They don't use d_type, so don't care if it might be meaningless.
> > > NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
> > > about ino changing.
> > >
> > > In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
> > > thence to compose_entry_fh() and it should report failure if
> > > dchild->d_inode->i_ino doesn't match.
> >
> > Just to make sure I understand the concern..... So it shouldn't really
> > be a problem if readdir and lookup find different objects for the same
> > name, the problem is just when we mix attributes from the two objects,
> > right? Looks like the v3 code could return an inode number derived from
> > the readdir and a filehandle from the lookup, which is a problem. The
> > v4 code will get everything from the result of the lookup, which should
> > be OK.
>
> That agrees with my understanding, yes.
>
> I did wonder for a little while about the possibility of a directory
> containing both 'a' and 'b', and NFSv4 doing the readdir and the stat of 'a',
> and the a "mv a b" happening before the stat of 'b'.
>
> Then the readdir response will show both 'a' and 'b' referring to the same
> object with a link count of 1.
>
> I can't quite decide if that is a problem or not.
>
>
> >
> > > Simply not returning the extra attributes is perfectly acceptable in NFSv3.
> >
> > Right, so no big deal anyway.--b.
>
> Not a big deal, but we should really add a patch like the following ("like"
> as in "actually compile tested and documented" which this one isn't).
Doesn't seem to break anything. Any second thoughts, or can I add a
signed-off-by?
--b.
commit e11f8acace69
Author: NeilBrown <[email protected]>
Date: Sun May 3 09:16:53 2015 +1000
nfsd: stop READDIRPLUS returning inconsistent attributes
The NFSv3 READDIRPLUS gets some of the returned attributes from the
readdir, and some from an inode returned from a new lookup. The two
objects could be different thanks to intervening renames.
The attributes in READDIRPLUS are optional, so let's just skip them if
we notice this case.
Signed-off-by: J. Bruce Fields <[email protected]>
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e4b2b4322553..f6e7cbabac5a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
static __be32
compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
- const char *name, int namlen)
+ const char *name, int namlen, u64 ino)
{
struct svc_export *exp;
struct dentry *dparent, *dchild;
@@ -830,19 +830,21 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
goto out;
if (d_really_is_negative(dchild))
goto out;
+ if (dchild->d_inode->i_ino != ino)
+ goto out;
rv = fh_compose(fhp, exp, dchild, &cd->fh);
out:
dput(dchild);
return rv;
}
-static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
{
struct svc_fh *fh = &cd->scratch;
__be32 err;
fh_init(fh, NFS3_FHSIZE);
- err = compose_entry_fh(cd, fh, name, namlen);
+ err = compose_entry_fh(cd, fh, name, namlen, ino);
if (err) {
*p++ = 0;
*p++ = 0;
@@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p = encode_entry_baggage(cd, p, name, namlen, ino);
if (plus)
- p = encode_entryplus_baggage(cd, p, name, namlen);
+ p = encode_entryplus_baggage(cd, p, name, namlen, ino);
num_entry_words = p - cd->buffer;
} else if (*(page+1) != NULL) {
/* temporarily encode entry into next page, then move back to
@@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
if (plus)
- p1 = encode_entryplus_baggage(cd, p1, name, namlen);
+ p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
/* determine entry word length and lengths to go in pages */
num_entry_words = p1 - tmp;
On Fri, May 01, 2015 at 11:53:26AM +1000, NeilBrown wrote:
> On Thu, 30 Apr 2015 17:36:02 -0400 "J. Bruce Fields" <[email protected]>
> wrote:
>
> > On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> > > On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
> > > wrote:
> > > > Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> > > > i_mutex around lookup_one_len(), if that's the only place we need it?
> > >
> > > I think it is the only place needed. Certainly normal path lookup
> > > only takes i_mutex very briefly around __lookup_hash().
> > >
> > > One cost would be that we would take the mutex for every name instead
> > > of once for a whole set of names. That is generally frowned upon for
> > > performance reasons.
> > >
> > > An alternative might be to stop using lookup_one_len, and use
> > > kern_path() instead. Or kern_path_mountpoint if we don't want to
> > > cross a mountpoint.
> > >
> > > They would only take the i_mutex if absolutely necessary.
> > >
> > > The draw back is that they don't take a 'len' argument, so we would
> > > need to make sure the name is nul terminated (and maybe double-check
> > > that there is no '/'??). It would be fairly easy to nul-terminate
> > > names from readdir - just use a bit more space in the buffer in
> > > nfsd_buffered_filldir().
> >
> > They're also a lot more complicated than lookup_one_len. Is any of this
> > extra stuff they do (audit?) going to bite us?
> >
> > For me understanding that would be harder than writing a variant of
> > lookup_one_len that took the i_mutex when required. But I guess that's
> > my problem, I should try to understand the lookup code better....
>
> Tempting though ... see below (untested).
With documentation and a stab at the nfsd stuff (also untested. OK, and
pretty much unread. Compiles, though!)
--b.
commit 6023d45abd1a
Author: NeilBrown <[email protected]>
Date: Fri May 1 11:53:26 2015 +1000
nfsd: don't hold i_mutex over userspace upcalls
We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir. If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.
In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about. We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.
With some care, we may be able to avoid that in rpc.mountd. But it
seems better just to avoid holding a mutex while waiting on userspace.
It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex. So we could just drop the i_mutex elsewhere and do
something like
mutex_lock()
lookup_one_len()
mutex_unlock()
In may cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.
Signed-off-by: J. Bruce Fields <[email protected]>
diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..5ec103d4775d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * Must be called with the parented i_mutex held.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2182,6 +2184,68 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = __d_lookup(base, &this);
+ if (ret)
+ return ret;
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..cc7995762190 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
+ if (!S_ISREG(d_inode(dentry)->i_mode)) {
+ /*
+ * Never mind, we're not going to open this
+ * anyway. And we don't want to hold it over
+ * the userspace upcalls in nfsd_mountpoint. */
+ fh_unlock(fhp);
+ }
/*
* check if we have crossed a mount point ...
*/
@@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
Cc Steve, Viro,
On 5/1/2015 5:36 AM, J. Bruce Fields wrote:
> On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
>> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
>> wrote:
>>> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
>>> i_mutex around lookup_one_len(), if that's the only place we need it?
As description in other thread, before the upcall to rpc.mountd,
nfsd have call lookup_one_len() for the file, but why rpc.mountd
also blocked in lookup ?
There is a bug in rpc.mountd when checking sub-directory,
it sets bad patch length for child.
If parent if "/nfs/xfs" and child is "/nfs/test", the child name
will be truncated to "/nfs/tes" for strlen(parent), "/nfs/test"
have exist in kernel's cache for the lookup_one_len(), but
"/nfs/tes" is a bad path, which needs lookup_slow(), so blocked.
static int is_subdirectory(char *child, char *parent)
{
/* Check is child is strictly a subdirectory of
* parent or a more distant descendant.
*/
size_t l = strlen(parent);
if (strcmp(parent, "/") == 0 && child[1] != 0)
return 1;
return (same_path(child, parent, l) && child[l] == '/');
}
The following path makes a correct path, not a truncated path.
Have be tested, everything is OK.
thanks,
Kinglong Mee
-----------------------------------------------------------------------------------
On Tue, 05 May 2015 11:53:27 +0800 Kinglong Mee <[email protected]> wrote:
> Cc Steve, Viro,
>
> On 5/1/2015 5:36 AM, J. Bruce Fields wrote:
> > On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> >> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
> >> wrote:
> >>> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> >>> i_mutex around lookup_one_len(), if that's the only place we need it?
>
> As description in other thread, before the upcall to rpc.mountd,
> nfsd have call lookup_one_len() for the file, but why rpc.mountd
> also blocked in lookup ?
>
> There is a bug in rpc.mountd when checking sub-directory,
> it sets bad patch length for child.
>
> If parent if "/nfs/xfs" and child is "/nfs/test", the child name
> will be truncated to "/nfs/tes" for strlen(parent), "/nfs/test"
> have exist in kernel's cache for the lookup_one_len(), but
> "/nfs/tes" is a bad path, which needs lookup_slow(), so blocked.
Testing for "/nfs/tes" certain seems like a wrong thing to do.
>
> static int is_subdirectory(char *child, char *parent)
> {
> /* Check is child is strictly a subdirectory of
> * parent or a more distant descendant.
> */
> size_t l = strlen(parent);
>
> if (strcmp(parent, "/") == 0 && child[1] != 0)
> return 1;
>
> return (same_path(child, parent, l) && child[l] == '/');
I guess this should be:
child[l] == '/' && same_path(child, parent, l)
That way there would be no risk of truncating anything.
Can you please test if that one-line change removes the problem?
> }
>
> The following path makes a correct path, not a truncated path.
> Have be tested, everything is OK.
>
> thanks,
> Kinglong Mee
>
> -----------------------------------------------------------------------------------
> >From 70b9d1d93a24db8a7837998cb7eb0ff4e98480a6 Mon Sep 17 00:00:00 2001
> From: Kinglong Mee <[email protected]>
> Date: Tue, 5 May 2015 11:47:20 +0800
> Subject: [PATCH] mountd: Case-insensitive path length must equal to parent
>
> Commit 6091c0a4c4 (mountd: add support for case-insensitive file names)
> introdues a bug cause mutex race when looking bad path.
I think we should be clear that the mutex race is already present.
I think you are right that there is a bug here which is making it easy to
trigger, but it isn't exactly "causing" the bug.
Thanks,
NeilBrown
>
> Signed-off-by: Kinglong Mee <[email protected]>
> ---
> utils/mountd/cache.c | 22 +++++++++++++++++++++-
> 1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/utils/mountd/cache.c b/utils/mountd/cache.c
> index 7d250f9..9d9a1bb 100644
> --- a/utils/mountd/cache.c
> +++ b/utils/mountd/cache.c
> @@ -468,16 +468,36 @@ fallback:
> return 1;
> }
>
> +static int subdir_len(char *name, int count_slashes)
> +{
> + char *ptr = NULL;
> + int i;
> +
> + ptr = name;
> + for (i = 0; i < count_slashes + 1; i++) {
> + ptr = strchr(ptr, '/');
> + if (NULL == ptr)
> + return strlen(name);
> + ptr++;
> + }
> +
> + return ptr - name;
> +}
> +
> static int is_subdirectory(char *child, char *parent)
> {
> /* Check is child is strictly a subdirectory of
> * parent or a more distant descendant.
> */
> - size_t l = strlen(parent);
> + size_t l = subdir_len(child, count_slashes(parent));
>
> if (strcmp(parent, "/") == 0 && child[1] != 0)
> return 1;
>
> + /* Case-insensitive path length must equal to parent */
> + if (l != strlen(parent))
> + return 0;
> +
> return (same_path(child, parent, l) && child[l] == '/');
> }
>
On 5/5/2015 12:19 PM, NeilBrown wrote:
> On Tue, 05 May 2015 11:53:27 +0800 Kinglong Mee <[email protected]> wrote:
>
>> Cc Steve, Viro,
>>
>> On 5/1/2015 5:36 AM, J. Bruce Fields wrote:
>>> On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
>>>> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
>>>> wrote:
>>>>> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
>>>>> i_mutex around lookup_one_len(), if that's the only place we need it?
>>
>> As description in other thread, before the upcall to rpc.mountd,
>> nfsd have call lookup_one_len() for the file, but why rpc.mountd
>> also blocked in lookup ?
>>
>> There is a bug in rpc.mountd when checking sub-directory,
>> it sets bad patch length for child.
>>
>> If parent if "/nfs/xfs" and child is "/nfs/test", the child name
>> will be truncated to "/nfs/tes" for strlen(parent), "/nfs/test"
>> have exist in kernel's cache for the lookup_one_len(), but
>> "/nfs/tes" is a bad path, which needs lookup_slow(), so blocked.
>
> Testing for "/nfs/tes" certain seems like a wrong thing to do.
>
>>
>> static int is_subdirectory(char *child, char *parent)
>> {
>> /* Check is child is strictly a subdirectory of
>> * parent or a more distant descendant.
>> */
>> size_t l = strlen(parent);
>>
>> if (strcmp(parent, "/") == 0 && child[1] != 0)
>> return 1;
>>
>> return (same_path(child, parent, l) && child[l] == '/');
>
> I guess this should be:
>
> child[l] == '/' && same_path(child, parent, l)
>
> That way there would be no risk of truncating anything.
>
> Can you please test if that one-line change removes the problem?
Yes, that's OK.
I have think about the "/nfs/nfs" and "/nfs/123" of the first patch,
yes, that's redundant.
>
>> }
>>
>> The following path makes a correct path, not a truncated path.
>> Have be tested, everything is OK.
>>
>> thanks,
>> Kinglong Mee
>>
>> -----------------------------------------------------------------------------------
>> >From 70b9d1d93a24db8a7837998cb7eb0ff4e98480a6 Mon Sep 17 00:00:00 2001
>> From: Kinglong Mee <[email protected]>
>> Date: Tue, 5 May 2015 11:47:20 +0800
>> Subject: [PATCH] mountd: Case-insensitive path length must equal to parent
>>
>> Commit 6091c0a4c4 (mountd: add support for case-insensitive file names)
>> introdues a bug cause mutex race when looking bad path.
>
> I think we should be clear that the mutex race is already present.
> I think you are right that there is a bug here which is making it easy to
> trigger, but it isn't exactly "causing" the bug.
Thanks for the comments.
thanks,
Kinglong Mee
-----------------------------------------------------------------------------------
On Tue, May 05, 2015 at 04:32:06PM +0800, Kinglong Mee wrote:
> -----------------------------------------------------------------------------------
> >From d831154bb7e527f9003e16ac049526be5ed90228 Mon Sep 17 00:00:00 2001
> From: Kinglong Mee <[email protected]>
> Date: Tue, 5 May 2015 16:24:16 +0800
> Subject: [PATCH] mountd: Case-insensitive path length must equals to parent
>
> Commit 6091c0a4c4 (mountd: add support for case-insensitive file names)
> introduces looking up bad path which is easy to trigger a present mutex race.
ACK to this patch. (Steved, did you get this?)
--b.
>
> Signed-off-by: Kinglong Mee <[email protected]>
> ---
> utils/mountd/cache.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/utils/mountd/cache.c b/utils/mountd/cache.c
> index 7d250f9..155695a 100644
> --- a/utils/mountd/cache.c
> +++ b/utils/mountd/cache.c
> @@ -478,7 +478,7 @@ static int is_subdirectory(char *child, char *parent)
> if (strcmp(parent, "/") == 0 && child[1] != 0)
> return 1;
>
> - return (same_path(child, parent, l) && child[l] == '/');
> + return (child[l] == '/' && same_path(child, parent, l));
> }
>
> static int path_matches(nfs_export *exp, char *path)
> --
> 2.4.0
On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> On Fri, May 01, 2015 at 11:53:26AM +1000, NeilBrown wrote:
>> On Thu, 30 Apr 2015 17:36:02 -0400 "J. Bruce Fields" <[email protected]>
>> wrote:
>>
>>> On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
>>>> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <[email protected]>
>>>> wrote:
>>>>> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
>>>>> i_mutex around lookup_one_len(), if that's the only place we need it?
>>>>
>>>> I think it is the only place needed. Certainly normal path lookup
>>>> only takes i_mutex very briefly around __lookup_hash().
>>>>
>>>> One cost would be that we would take the mutex for every name instead
>>>> of once for a whole set of names. That is generally frowned upon for
>>>> performance reasons.
>>>>
>>>> An alternative might be to stop using lookup_one_len, and use
>>>> kern_path() instead. Or kern_path_mountpoint if we don't want to
>>>> cross a mountpoint.
>>>>
>>>> They would only take the i_mutex if absolutely necessary.
>>>>
>>>> The draw back is that they don't take a 'len' argument, so we would
>>>> need to make sure the name is nul terminated (and maybe double-check
>>>> that there is no '/'??). It would be fairly easy to nul-terminate
>>>> names from readdir - just use a bit more space in the buffer in
>>>> nfsd_buffered_filldir().
>>>
>>> They're also a lot more complicated than lookup_one_len. Is any of this
>>> extra stuff they do (audit?) going to bite us?
>>>
>>> For me understanding that would be harder than writing a variant of
>>> lookup_one_len that took the i_mutex when required. But I guess that's
>>> my problem, I should try to understand the lookup code better....
>>
>> Tempting though ... see below (untested).
>
> With documentation and a stab at the nfsd stuff (also untested. OK, and
> pretty much unread. Compiles, though!)
>
> --b.
>
> commit 6023d45abd1a
> Author: NeilBrown <[email protected]>
> Date: Fri May 1 11:53:26 2015 +1000
>
> nfsd: don't hold i_mutex over userspace upcalls
>
> We need information about exports when crossing mountpoints during
> lookup or NFSv4 readdir. If we don't already have that information
> cached, we may have to ask (and wait for) rpc.mountd.
>
> In both cases we currently hold the i_mutex on the parent of the
> directory we're asking rpc.mountd about. We've seen situations where
> rpc.mountd performs some operation on that directory that tries to take
> the i_mutex again, resulting in deadlock.
>
> With some care, we may be able to avoid that in rpc.mountd. But it
> seems better just to avoid holding a mutex while waiting on userspace.
>
> It appears that lookup_one_len is pretty much the only operation that
> needs the i_mutex. So we could just drop the i_mutex elsewhere and do
> something like
>
> mutex_lock()
> lookup_one_len()
> mutex_unlock()
>
> In may cases though the lookup would have been cached and not required
> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> that only takes the i_mutex when necessary.
>
> Signed-off-by: J. Bruce Fields <[email protected]>
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..5ec103d4775d 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> *
> * Note that this routine is purely a helper for filesystem usage and should
> * not be called by generic code.
> + *
> + * Must be called with the parented i_mutex held.
> */
> struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> {
> @@ -2182,6 +2184,68 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> }
> EXPORT_SYMBOL(lookup_one_len);
>
> +/**
> + * lookup_one_len - filesystem helper to lookup single pathname component
> + * @name: pathname component to lookup
> + * @base: base directory to lookup from
> + * @len: maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> + struct dentry *base, int len)
> +{
> + struct qstr this;
> + unsigned int c;
> + int err;
> + struct dentry *ret;
> +
> + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
Remove this line.
> +
> + this.name = name;
> + this.len = len;
> + this.hash = full_name_hash(name, len);
> + if (!len)
> + return ERR_PTR(-EACCES);
> +
> + if (unlikely(name[0] == '.')) {
> + if (len < 2 || (len == 2 && name[1] == '.'))
> + return ERR_PTR(-EACCES);
> + }
> +
> + while (len--) {
> + c = *(const unsigned char *)name++;
> + if (c == '/' || c == '\0')
> + return ERR_PTR(-EACCES);
> + }
> + /*
> + * See if the low-level filesystem might want
> + * to use its own hash..
> + */
> + if (base->d_flags & DCACHE_OP_HASH) {
> + int err = base->d_op->d_hash(base, &this);
> + if (err < 0)
> + return ERR_PTR(err);
> + }
> +
> + err = inode_permission(base->d_inode, MAY_EXEC);
> + if (err)
> + return ERR_PTR(err);
> +
> + ret = __d_lookup(base, &this);
> + if (ret)
> + return ret;
> + mutex_lock(&base->d_inode->i_mutex);
> + ret = __lookup_hash(&this, base, 0);
> + mutex_unlock(&base->d_inode->i_mutex);
> + return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
> int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> struct path *path, int *empty)
> {
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
> __be32 nfserr;
> int ignore_crossmnt = 0;
>
> - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
> if (IS_ERR(dentry))
> return nfserrno(PTR_ERR(dentry));
> if (d_really_is_negative(dentry)) {
> /*
> - * nfsd_buffered_readdir drops the i_mutex between
> - * readdir and calling this callback, leaving a window
> - * where this directory entry could have gone away.
> + * we're not holding the i_mutex here, so there's
> + * a window where this directory entry could have gone
> + * away.
> */
> dput(dentry);
> return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..cc7995762190 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> host_err = PTR_ERR(dentry);
> if (IS_ERR(dentry))
> goto out_nfserr;
> + if (!S_ISREG(d_inode(dentry)->i_mode)) {
Got a crash here tested by pynfs,
# ./testserver.py 127.0.0.1:/nfs/pnfs/ --rundeps --maketree open
PID: 2314 TASK: ffff88006ad2cfc0 CPU: 0 COMMAND: "nfsd"
#0 [ffff88006adc7870] machine_kexec at ffffffff8104debb
#1 [ffff88006adc78e0] crash_kexec at ffffffff810f92a2
#2 [ffff88006adc79b0] oops_end at ffffffff81015c28
#3 [ffff88006adc79e0] no_context at ffffffff8105a9af
#4 [ffff88006adc7a50] __bad_area_nosemaphore at ffffffff8105ac90
#5 [ffff88006adc7aa0] bad_area_nosemaphore at ffffffff8105ae23
#6 [ffff88006adc7ab0] __do_page_fault at ffffffff8105b13e
#7 [ffff88006adc7b10] do_page_fault at ffffffff8105b4df
#8 [ffff88006adc7b50] page_fault at ffffffff81713258
[exception RIP: nfsd_lookup_dentry+231]
RIP: ffffffffa0447b87 RSP: ffff88006adc7c08 RFLAGS: 00010283
RAX: ffff88006a9d2180 RBX: ffff88006ad80068 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88007fdf4eb0 RDI: ffff88006a9d2180
RBP: ffff88006adc7c88 R8: ffff88006b13dcc0 R9: 0000000000000000
R10: ffff88006adb6cc0 R11: ffff88006adb6cc0 R12: ffff88006a9d0cc0
R13: ffff88006adc7ca0 R14: ffff88006adc7ca8 R15: ffff88006ad640bc
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#9 [ffff88006adc7c90] nfsd_lookup at ffffffffa0448029 [nfsd]
#10 [ffff88006adc7cf0] nfsd4_open at ffffffffa0456cbb [nfsd]
#11 [ffff88006adc7d60] nfsd4_proc_compound at ffffffffa0457317 [nfsd]
#12 [ffff88006adc7dc0] nfsd_dispatch at ffffffffa0442f83 [nfsd]
#13 [ffff88006adc7e00] svc_process_common at ffffffffa0160c5b [sunrpc]
#14 [ffff88006adc7e70] svc_process at ffffffffa0161cc3 [sunrpc]
#15 [ffff88006adc7ea0] nfsd at ffffffffa044294f [nfsd]
#16 [ffff88006adc7ed0] kthread at ffffffff810a9d07
#17 [ffff88006adc7f50] ret_from_fork at ffffffff81711b62
thanks,
Kinglong Mee
On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > + * Unlike lookup_one_len, it should be called without the parent
> > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > + */
> > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > + struct dentry *base, int len)
> > > +{
> > > + struct qstr this;
> > > + unsigned int c;
> > > + int err;
> > > + struct dentry *ret;
> > > +
> > > + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> >
> > Remove this line.
>
> Whoops, thanks.
>
> > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > index a30e79900086..cc7995762190 100644
> > > --- a/fs/nfsd/vfs.c
> > > +++ b/fs/nfsd/vfs.c
> > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > host_err = PTR_ERR(dentry);
> > > if (IS_ERR(dentry))
> > > goto out_nfserr;
> > > + if (!S_ISREG(d_inode(dentry)->i_mode)) {
> >
> > Got a crash here tested by pynfs,
>
> OK, I guess I just forgot to take into account negative dentries.
> Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
And I also forgot nfs3xdr.c.
--b.
commit 6c942d342f9f
Author: NeilBrown <[email protected]>
Date: Fri May 1 11:53:26 2015 +1000
nfsd: don't hold i_mutex over userspace upcalls
We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir. If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.
In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about. We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.
With some care, we may be able to avoid that in rpc.mountd. But it
seems better just to avoid holding a mutex while waiting on userspace.
It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex. So we could just drop the i_mutex elsewhere and do
something like
mutex_lock()
lookup_one_len()
mutex_unlock()
In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.
Signed-off-by: J. Bruce Fields <[email protected]>
diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..0f554bf41dea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * Must be called with the parented i_mutex held.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2182,6 +2184,66 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = __d_lookup(base, &this);
+ if (ret)
+ return ret;
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..4accdb00976b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
+ if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
+ /*
+ * Never mind, we're not going to open this
+ * anyway. And we don't want to hold it over
+ * the userspace upcalls in nfsd_mountpoint. */
+ fh_unlock(fhp);
+ }
/*
* check if we have crossed a mount point ...
*/
@@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > + * Unlike lookup_one_len, it should be called without the parent
> > + * i_mutex held, and will take the i_mutex itself if necessary.
> > + */
> > +struct dentry *lookup_one_len_unlocked(const char *name,
> > + struct dentry *base, int len)
> > +{
> > + struct qstr this;
> > + unsigned int c;
> > + int err;
> > + struct dentry *ret;
> > +
> > + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
>
> Remove this line.
Whoops, thanks.
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a30e79900086..cc7995762190 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > host_err = PTR_ERR(dentry);
> > if (IS_ERR(dentry))
> > goto out_nfserr;
> > + if (!S_ISREG(d_inode(dentry)->i_mode)) {
>
> Got a crash here tested by pynfs,
OK, I guess I just forgot to take into account negative dentries.
Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
--b.
> # ./testserver.py 127.0.0.1:/nfs/pnfs/ --rundeps --maketree open
>
> PID: 2314 TASK: ffff88006ad2cfc0 CPU: 0 COMMAND: "nfsd"
> #0 [ffff88006adc7870] machine_kexec at ffffffff8104debb
> #1 [ffff88006adc78e0] crash_kexec at ffffffff810f92a2
> #2 [ffff88006adc79b0] oops_end at ffffffff81015c28
> #3 [ffff88006adc79e0] no_context at ffffffff8105a9af
> #4 [ffff88006adc7a50] __bad_area_nosemaphore at ffffffff8105ac90
> #5 [ffff88006adc7aa0] bad_area_nosemaphore at ffffffff8105ae23
> #6 [ffff88006adc7ab0] __do_page_fault at ffffffff8105b13e
> #7 [ffff88006adc7b10] do_page_fault at ffffffff8105b4df
> #8 [ffff88006adc7b50] page_fault at ffffffff81713258
> [exception RIP: nfsd_lookup_dentry+231]
> RIP: ffffffffa0447b87 RSP: ffff88006adc7c08 RFLAGS: 00010283
> RAX: ffff88006a9d2180 RBX: ffff88006ad80068 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: ffff88007fdf4eb0 RDI: ffff88006a9d2180
> RBP: ffff88006adc7c88 R8: ffff88006b13dcc0 R9: 0000000000000000
> R10: ffff88006adb6cc0 R11: ffff88006adb6cc0 R12: ffff88006a9d0cc0
> R13: ffff88006adc7ca0 R14: ffff88006adc7ca8 R15: ffff88006ad640bc
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
> #9 [ffff88006adc7c90] nfsd_lookup at ffffffffa0448029 [nfsd]
> #10 [ffff88006adc7cf0] nfsd4_open at ffffffffa0456cbb [nfsd]
> #11 [ffff88006adc7d60] nfsd4_proc_compound at ffffffffa0457317 [nfsd]
> #12 [ffff88006adc7dc0] nfsd_dispatch at ffffffffa0442f83 [nfsd]
> #13 [ffff88006adc7e00] svc_process_common at ffffffffa0160c5b [sunrpc]
> #14 [ffff88006adc7e70] svc_process at ffffffffa0161cc3 [sunrpc]
> #15 [ffff88006adc7ea0] nfsd at ffffffffa044294f [nfsd]
> #16 [ffff88006adc7ed0] kthread at ffffffff810a9d07
> #17 [ffff88006adc7f50] ret_from_fork at ffffffff81711b62
>
> thanks,
> Kinglong Mee
On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > + */
> > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > + struct dentry *base, int len)
> > > > +{
> > > > + struct qstr this;
> > > > + unsigned int c;
> > > > + int err;
> > > > + struct dentry *ret;
> > > > +
> > > > + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > >
> > > Remove this line.
> >
> > Whoops, thanks.
> >
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index a30e79900086..cc7995762190 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > host_err = PTR_ERR(dentry);
> > > > if (IS_ERR(dentry))
> > > > goto out_nfserr;
> > > > + if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > >
> > > Got a crash here tested by pynfs,
> >
> > OK, I guess I just forgot to take into account negative dentries.
> > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
>
> And I also forgot nfs3xdr.c.
>
> --b.
>
> commit 6c942d342f9f
> Author: NeilBrown <[email protected]>
> Date: Fri May 1 11:53:26 2015 +1000
>
> nfsd: don't hold i_mutex over userspace upcalls
>
> We need information about exports when crossing mountpoints during
> lookup or NFSv4 readdir. If we don't already have that information
> cached, we may have to ask (and wait for) rpc.mountd.
>
> In both cases we currently hold the i_mutex on the parent of the
> directory we're asking rpc.mountd about. We've seen situations where
> rpc.mountd performs some operation on that directory that tries to take
> the i_mutex again, resulting in deadlock.
>
> With some care, we may be able to avoid that in rpc.mountd. But it
> seems better just to avoid holding a mutex while waiting on userspace.
>
> It appears that lookup_one_len is pretty much the only operation that
> needs the i_mutex. So we could just drop the i_mutex elsewhere and do
> something like
>
> mutex_lock()
> lookup_one_len()
> mutex_unlock()
>
> In many cases though the lookup would have been cached and not required
> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> that only takes the i_mutex when necessary.
>
> Signed-off-by: J. Bruce Fields <[email protected]>
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..0f554bf41dea 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> *
> * Note that this routine is purely a helper for filesystem usage and should
> * not be called by generic code.
> + *
> + * Must be called with the parented i_mutex held.
"parented"?
* base->i_mutex must be held by caller.
??
> */
> struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> {
> @@ -2182,6 +2184,66 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> }
> EXPORT_SYMBOL(lookup_one_len);
>
> +/**
> + * lookup_one_len - filesystem helper to lookup single pathname component
lookup_one_len_unlocked - ..
> + * @name: pathname component to lookup
> + * @base: base directory to lookup from
> + * @len: maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> + struct dentry *base, int len)
> +{
> + struct qstr this;
> + unsigned int c;
> + int err;
> + struct dentry *ret;
> +
> + this.name = name;
> + this.len = len;
> + this.hash = full_name_hash(name, len);
> + if (!len)
> + return ERR_PTR(-EACCES);
> +
> + if (unlikely(name[0] == '.')) {
> + if (len < 2 || (len == 2 && name[1] == '.'))
> + return ERR_PTR(-EACCES);
> + }
> +
> + while (len--) {
> + c = *(const unsigned char *)name++;
> + if (c == '/' || c == '\0')
> + return ERR_PTR(-EACCES);
> + }
> + /*
> + * See if the low-level filesystem might want
> + * to use its own hash..
> + */
> + if (base->d_flags & DCACHE_OP_HASH) {
> + int err = base->d_op->d_hash(base, &this);
> + if (err < 0)
> + return ERR_PTR(err);
> + }
> +
> + err = inode_permission(base->d_inode, MAY_EXEC);
> + if (err)
> + return ERR_PTR(err);
> +
> + ret = __d_lookup(base, &this);
> + if (ret)
> + return ret;
__d_lookup() is a little too low level, as it doesn't call d_revalidate() and
it doesn't retry if the rename_lock seqlock says it should.
The latter doesn't really matter as we would fall through to the safe
mutex-protected version.
The former isn't much of an issue for most filesystems under nfsd, but still
needs to be handled to some extent.
Also, the comment for __d_lookup says
* __d_lookup callers must be commented.
The simplest resolution would be:
/* __d_lookup() is used to try to get a quick answer and avoid the
* mutex. A false-negative does no harm.
*/
ret = __d_lookup(base, &this);
if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
dput(ret);
ret = NULL;
}
if (ret)
return ret;
It probably wouldn't be too much harder to add the d_revalidate() call, and
call d_invalidate() if it returns zero.
Maybe.
if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
int err = d_revalidate(ret, 0);
if (err == 0) {
d_invalidate(ret);
dput(ret);
ret = NULL;
} else if (err < 0) {
dput(ret);
return ERR_PTR(err);
}
}
> + mutex_lock(&base->d_inode->i_mutex);
> + ret = __lookup_hash(&this, base, 0);
> + mutex_unlock(&base->d_inode->i_mutex);
> + return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
> int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> struct path *path, int *empty)
> {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> } else
> dchild = dget(dparent);
> } else
> - dchild = lookup_one_len(name, dparent, namlen);
> + dchild = lookup_one_len_unlocked(name, dparent, namlen);
> if (IS_ERR(dchild))
> return rv;
> if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
> __be32 nfserr;
> int ignore_crossmnt = 0;
>
> - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
> if (IS_ERR(dentry))
> return nfserrno(PTR_ERR(dentry));
> if (d_really_is_negative(dentry)) {
> /*
> - * nfsd_buffered_readdir drops the i_mutex between
> - * readdir and calling this callback, leaving a window
> - * where this directory entry could have gone away.
> + * we're not holding the i_mutex here, so there's
> + * a window where this directory entry could have gone
> + * away.
> */
> dput(dentry);
> return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..4accdb00976b 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> host_err = PTR_ERR(dentry);
> if (IS_ERR(dentry))
> goto out_nfserr;
> + if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> + /*
> + * Never mind, we're not going to open this
> + * anyway. And we don't want to hold it over
> + * the userspace upcalls in nfsd_mountpoint. */
> + fh_unlock(fhp);
> + }
You could avoid the test by just putting the fh_unlock(fhp) inside the
if (nfsd_mountpoint(dentry, exp)) {
block. It might also be appropriate for nfsd_mountpoint to fail on
non-directories.
Up to you though.
Thanks. Feel free to add
*-by: NeilBrown <[email protected]>
as you see fit.
NeilBrown
> /*
> * check if we have crossed a mount point ...
> */
> @@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> offset = *offsetp;
>
> while (1) {
> - struct inode *dir_inode = file_inode(file);
> unsigned int reclen;
>
> cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> if (!size)
> break;
>
> - /*
> - * Various filldir functions may end up calling back into
> - * lookup_one_len() and the file system's ->lookup() method.
> - * These expect i_mutex to be held, as it would within readdir.
> - */
> - host_err = mutex_lock_killable(&dir_inode->i_mutex);
> - if (host_err)
> - break;
> -
> de = (struct buffered_dirent *)buf.dirent;
> while (size > 0) {
> offset = de->offset;
> @@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> size -= reclen;
> de = (struct buffered_dirent *)((char *)de + reclen);
> }
> - mutex_unlock(&dir_inode->i_mutex);
> if (size > 0) /* We bailed out early */
> break;
>
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
> extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>
> extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>
> extern int follow_down_one(struct path *);
> extern int follow_down(struct path *);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 4 May 2015 17:48:22 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Sun, May 03, 2015 at 09:16:53AM +1000, NeilBrown wrote:
> > On Fri, 1 May 2015 09:29:53 -0400 "J. Bruce Fields" <[email protected]>
> > wrote:
> >
> > > On Fri, May 01, 2015 at 01:08:26PM +1000, NeilBrown wrote:
> > > > On Fri, 1 May 2015 03:29:40 +0100 Al Viro <[email protected]> wrote:
> > > >
> > > > > On Fri, May 01, 2015 at 12:23:33PM +1000, NeilBrown wrote:
> > > > > > > What kind of consistency warranties do callers expect, BTW? You do realize
> > > > > > > that between iterate_dir() and callbacks an entry might have been removed
> > > > > > > and/or replaced?
> > > > > >
> > > > > > For READDIR_PLUS, lookup_one_len is called on each name and it requires
> > > > > > i_mutex, so the code currently holds i_mutex over the whole sequence.
> > > > > > This is triggering a deadlock.
> > > > >
> > > > > Yes, I've seen the context. However, you are _not_ holding it between
> > > > > actual iterate_dir() and those callbacks, which opens a window when
> > > > > directory might have been changed.
> > > > >
> > > > > Again, what kind of consistency is expected by callers? Are they ready to
> > > > > cope with "there's no such entry anymore" or "inumber is nothing like
> > > > > what we'd put in ->ino, since it's no the same object" or "->d_type is
> > > > > completely unrelated to what we'd found, since the damn thing had been
> > > > > removed and created from scratch"?
> > > >
> > > > Ah, sorry.
> > > >
> > > > Yes, the callers are prepared for "there's no such entry anymore".
> > > > They don't use d_type, so don't care if it might be meaningless.
> > > > NFSv4 doesn't use ino either, but NFSv3 does and isn't properly cautious
> > > > about ino changing.
> > > >
> > > > In nfs3xdr, we should probably pass 'ino' to encode_entryplus_baggage() and
> > > > thence to compose_entry_fh() and it should report failure if
> > > > dchild->d_inode->i_ino doesn't match.
> > >
> > > Just to make sure I understand the concern..... So it shouldn't really
> > > be a problem if readdir and lookup find different objects for the same
> > > name, the problem is just when we mix attributes from the two objects,
> > > right? Looks like the v3 code could return an inode number derived from
> > > the readdir and a filehandle from the lookup, which is a problem. The
> > > v4 code will get everything from the result of the lookup, which should
> > > be OK.
> >
> > That agrees with my understanding, yes.
> >
> > I did wonder for a little while about the possibility of a directory
> > containing both 'a' and 'b', and NFSv4 doing the readdir and the stat of 'a',
> > and the a "mv a b" happening before the stat of 'b'.
> >
> > Then the readdir response will show both 'a' and 'b' referring to the same
> > object with a link count of 1.
> >
> > I can't quite decide if that is a problem or not.
> >
> >
> > >
> > > > Simply not returning the extra attributes is perfectly acceptable in NFSv3.
> > >
> > > Right, so no big deal anyway.--b.
> >
> > Not a big deal, but we should really add a patch like the following ("like"
> > as in "actually compile tested and documented" which this one isn't).
>
> Doesn't seem to break anything. Any second thoughts, or can I add a
> signed-off-by?
No second thoughts.
Signed-off-by: NeilBrown <[email protected]>
Thanks.
NeilBrown
>
> --b.
>
> commit e11f8acace69
> Author: NeilBrown <[email protected]>
> Date: Sun May 3 09:16:53 2015 +1000
>
> nfsd: stop READDIRPLUS returning inconsistent attributes
>
> The NFSv3 READDIRPLUS gets some of the returned attributes from the
> readdir, and some from an inode returned from a new lookup. The two
> objects could be different thanks to intervening renames.
>
> The attributes in READDIRPLUS are optional, so let's just skip them if
> we notice this case.
>
> Signed-off-by: J. Bruce Fields <[email protected]>
>
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index e4b2b4322553..f6e7cbabac5a 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
>
> static __be32
> compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> - const char *name, int namlen)
> + const char *name, int namlen, u64 ino)
> {
> struct svc_export *exp;
> struct dentry *dparent, *dchild;
> @@ -830,19 +830,21 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> goto out;
> if (d_really_is_negative(dchild))
> goto out;
> + if (dchild->d_inode->i_ino != ino)
> + goto out;
> rv = fh_compose(fhp, exp, dchild, &cd->fh);
> out:
> dput(dchild);
> return rv;
> }
>
> -static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
> +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
> {
> struct svc_fh *fh = &cd->scratch;
> __be32 err;
>
> fh_init(fh, NFS3_FHSIZE);
> - err = compose_entry_fh(cd, fh, name, namlen);
> + err = compose_entry_fh(cd, fh, name, namlen, ino);
> if (err) {
> *p++ = 0;
> *p++ = 0;
> @@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
> p = encode_entry_baggage(cd, p, name, namlen, ino);
>
> if (plus)
> - p = encode_entryplus_baggage(cd, p, name, namlen);
> + p = encode_entryplus_baggage(cd, p, name, namlen, ino);
> num_entry_words = p - cd->buffer;
> } else if (*(page+1) != NULL) {
> /* temporarily encode entry into next page, then move back to
> @@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
> p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
>
> if (plus)
> - p1 = encode_entryplus_baggage(cd, p1, name, namlen);
> + p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
>
> /* determine entry word length and lengths to go in pages */
> num_entry_words = p1 - tmp;
On Tue, May 05, 2015 at 11:52:03AM -0400, J. Bruce Fields wrote:
> On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > + */
> > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > + struct dentry *base, int len)
> > > > +{
> > > > + struct qstr this;
> > > > + unsigned int c;
> > > > + int err;
> > > > + struct dentry *ret;
> > > > +
> > > > + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > >
> > > Remove this line.
> >
> > Whoops, thanks.
> >
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index a30e79900086..cc7995762190 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > host_err = PTR_ERR(dentry);
> > > > if (IS_ERR(dentry))
> > > > goto out_nfserr;
> > > > + if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > >
> > > Got a crash here tested by pynfs,
> >
> > OK, I guess I just forgot to take into account negative dentries.
> > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
>
> And I also forgot nfs3xdr.c.
But, this doesn't look good.
OK, to be fair I'm not sure whether this was already happening before this
patch.
--b.
[57287.226846] ======================================================
[57287.227499] [ INFO: possible circular locking dependency detected ]
[57287.228009] 4.1.0-rc2-22946-g6c942d3 #174 Not tainted
[57287.228009] -------------------------------------------------------
[57287.228009] updatedb/19312 is trying to acquire lock:
[57287.228009] (&mm->mmap_sem){++++++}, at: [<ffffffff811980bf>] might_fault+0x5f/0xb0
[57287.228009]
but task is already holding lock:
[57287.228009] (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
[57287.228009]
which lock already depends on the new lock.
[57287.228009]
the existing dependency chain (in reverse order) is:
[57287.228009]
-> #2 (&xfs_dir_ilock_class){++++..}:
[57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009] [<ffffffff810c8e0d>] down_read_nested+0x4d/0x70
[57287.228009] [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
[57287.228009] [<ffffffff81400e68>] xfs_ilock_attr_map_shared+0x38/0x50
[57287.228009] [<ffffffff8139b9e1>] xfs_attr_get+0xc1/0x180
[57287.228009] [<ffffffff8140ff87>] xfs_xattr_get+0x37/0x50
[57287.228009] [<ffffffff811e866f>] generic_getxattr+0x4f/0x70
[57287.228009] [<ffffffff816535d2>] inode_doinit_with_dentry+0x152/0x670
[57287.228009] [<ffffffff81657f8b>] sb_finish_set_opts+0xdb/0x260
[57287.228009] [<ffffffff81658604>] selinux_set_mnt_opts+0x2c4/0x600
[57287.228009] [<ffffffff816589ff>] superblock_doinit+0xbf/0xd0
[57287.228009] [<ffffffff81658a6d>] selinux_sb_kern_mount+0x3d/0xa0
[57287.228009] [<ffffffff8164efa6>] security_sb_kern_mount+0x16/0x20
[57287.228009] [<ffffffff811c29dd>] mount_fs+0x7d/0x190
[57287.228009] [<ffffffff811e2258>] vfs_kern_mount+0x68/0x160
[57287.228009] [<ffffffff811e58b4>] do_mount+0x204/0xc60
[57287.228009] [<ffffffff811e660f>] SyS_mount+0x6f/0xb0
[57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
[57287.228009]
-> #1 (&isec->lock){+.+.+.}:
[57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009] [<ffffffff81b675f3>] mutex_lock_nested+0x63/0x400
[57287.228009] [<ffffffff81653525>] inode_doinit_with_dentry+0xa5/0x670
[57287.228009] [<ffffffff81653b0c>] selinux_d_instantiate+0x1c/0x20
[57287.228009] [<ffffffff8164eb2b>] security_d_instantiate+0x1b/0x30
[57287.228009] [<ffffffff811d9ce4>] d_instantiate+0x54/0x80
[57287.228009] [<ffffffff811894fe>] __shmem_file_setup+0xce/0x250
[57287.228009] [<ffffffff8118c5f8>] shmem_zero_setup+0x28/0x70
[57287.228009] [<ffffffff811a1818>] mmap_region+0x5c8/0x5e0
[57287.228009] [<ffffffff811a1ba3>] do_mmap_pgoff+0x373/0x420
[57287.228009] [<ffffffff8118cb00>] vm_mmap_pgoff+0x90/0xc0
[57287.228009] [<ffffffff811a0000>] SyS_mmap_pgoff+0x1b0/0x240
[57287.228009] [<ffffffff81008c92>] SyS_mmap+0x22/0x30
[57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
[57287.228009]
-> #0 (&mm->mmap_sem){++++++}:
[57287.228009] [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
[57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009] [<ffffffff811980ec>] might_fault+0x8c/0xb0
[57287.228009] [<ffffffff811d3ff2>] filldir+0x92/0x120
[57287.228009] [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
[57287.228009] [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
[57287.228009] [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
[57287.228009] [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
[57287.228009] [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
[57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
[57287.228009]
other info that might help us debug this:
[57287.228009] Chain exists of:
&mm->mmap_sem --> &isec->lock --> &xfs_dir_ilock_class
[57287.228009] Possible unsafe locking scenario:
[57287.228009] CPU0 CPU1
[57287.228009] ---- ----
[57287.228009] lock(&xfs_dir_ilock_class);
[57287.228009] lock(&isec->lock);
[57287.228009] lock(&xfs_dir_ilock_class);
[57287.228009] lock(&mm->mmap_sem);
[57287.228009]
*** DEADLOCK ***
[57287.228009] 2 locks held by updatedb/19312:
[57287.228009] #0: (&type->i_mutex_dir_key#5){+.+.+.}, at: [<ffffffff811d3d91>] iterate_dir+0x61/0x140
[57287.228009] #1: (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
[57287.228009]
stack backtrace:
[57287.228009] CPU: 0 PID: 19312 Comm: updatedb Not tainted 4.1.0-rc2-22946-g6c942d3 #174
[57287.228009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[57287.228009] ffffffff82c61ed0 ffff880060d0faa8 ffffffff81b61a1a 0000000080000001
[57287.228009] ffffffff82c345d0 ffff880060d0faf8 ffffffff810cb843 0000000000000000
[57287.228009] ffff880060d0fb28 0000000000000000 0000000000000001 0000000000000000
[57287.228009] Call Trace:
[57287.228009] [<ffffffff81b61a1a>] dump_stack+0x4f/0x7b
[57287.228009] [<ffffffff810cb843>] print_circular_bug+0x203/0x310
[57287.228009] [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
[57287.228009] [<ffffffff810ce018>] ? __lock_acquire+0x448/0x1920
[57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
[57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
[57287.228009] [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
[57287.228009] [<ffffffff811980ec>] might_fault+0x8c/0xb0
[57287.228009] [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
[57287.228009] [<ffffffff811d3ff2>] filldir+0x92/0x120
[57287.228009] [<ffffffff81400e24>] ? xfs_ilock_data_map_shared+0x34/0x40
[57287.228009] [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
[57287.228009] [<ffffffff81400c1a>] ? xfs_ilock+0x10a/0x2e0
[57287.228009] [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
[57287.228009] [<ffffffff810cd6b5>] ? mark_held_locks+0x75/0xa0
[57287.228009] [<ffffffff81b6800a>] ? mutex_lock_killable_nested+0x27a/0x4e0
[57287.228009] [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
[57287.228009] [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
[57287.228009] [<ffffffff811dfceb>] ? __fget_light+0x7b/0xa0
[57287.228009] [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
[57287.228009] [<ffffffff811d3f60>] ? fillonedir+0xf0/0xf0
[57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
On Thu, 7 May 2015 11:31:16 -0400 "J. Bruce Fields" <[email protected]>
wrote:
> On Tue, May 05, 2015 at 11:52:03AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > > + */
> > > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > > + struct dentry *base, int len)
> > > > > +{
> > > > > + struct qstr this;
> > > > > + unsigned int c;
> > > > > + int err;
> > > > > + struct dentry *ret;
> > > > > +
> > > > > + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > >
> > > > Remove this line.
> > >
> > > Whoops, thanks.
> > >
> > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > index a30e79900086..cc7995762190 100644
> > > > > --- a/fs/nfsd/vfs.c
> > > > > +++ b/fs/nfsd/vfs.c
> > > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > > host_err = PTR_ERR(dentry);
> > > > > if (IS_ERR(dentry))
> > > > > goto out_nfserr;
> > > > > + if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > >
> > > > Got a crash here tested by pynfs,
> > >
> > > OK, I guess I just forgot to take into account negative dentries.
> > > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> >
> > And I also forgot nfs3xdr.c.
>
> But, this doesn't look good.
>
> OK, to be fair I'm not sure whether this was already happening before this
> patch.
None of the stacks show any code that we changed, so I'm guessing it is a
separate issues as you hint at.
If I didn't have a list as long as my arm of things I really should be doing,
I'd go diving into the XFS code ....
NeilBrown
>
> --b.
>
> [57287.226846] ======================================================
> [57287.227499] [ INFO: possible circular locking dependency detected ]
> [57287.228009] 4.1.0-rc2-22946-g6c942d3 #174 Not tainted
> [57287.228009] -------------------------------------------------------
> [57287.228009] updatedb/19312 is trying to acquire lock:
> [57287.228009] (&mm->mmap_sem){++++++}, at: [<ffffffff811980bf>] might_fault+0x5f/0xb0
> [57287.228009]
> but task is already holding lock:
> [57287.228009] (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
> [57287.228009]
> which lock already depends on the new lock.
>
> [57287.228009]
> the existing dependency chain (in reverse order) is:
> [57287.228009]
> -> #2 (&xfs_dir_ilock_class){++++..}:
> [57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009] [<ffffffff810c8e0d>] down_read_nested+0x4d/0x70
> [57287.228009] [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
> [57287.228009] [<ffffffff81400e68>] xfs_ilock_attr_map_shared+0x38/0x50
> [57287.228009] [<ffffffff8139b9e1>] xfs_attr_get+0xc1/0x180
> [57287.228009] [<ffffffff8140ff87>] xfs_xattr_get+0x37/0x50
> [57287.228009] [<ffffffff811e866f>] generic_getxattr+0x4f/0x70
> [57287.228009] [<ffffffff816535d2>] inode_doinit_with_dentry+0x152/0x670
> [57287.228009] [<ffffffff81657f8b>] sb_finish_set_opts+0xdb/0x260
> [57287.228009] [<ffffffff81658604>] selinux_set_mnt_opts+0x2c4/0x600
> [57287.228009] [<ffffffff816589ff>] superblock_doinit+0xbf/0xd0
> [57287.228009] [<ffffffff81658a6d>] selinux_sb_kern_mount+0x3d/0xa0
> [57287.228009] [<ffffffff8164efa6>] security_sb_kern_mount+0x16/0x20
> [57287.228009] [<ffffffff811c29dd>] mount_fs+0x7d/0x190
> [57287.228009] [<ffffffff811e2258>] vfs_kern_mount+0x68/0x160
> [57287.228009] [<ffffffff811e58b4>] do_mount+0x204/0xc60
> [57287.228009] [<ffffffff811e660f>] SyS_mount+0x6f/0xb0
> [57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> [57287.228009]
> -> #1 (&isec->lock){+.+.+.}:
> [57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009] [<ffffffff81b675f3>] mutex_lock_nested+0x63/0x400
> [57287.228009] [<ffffffff81653525>] inode_doinit_with_dentry+0xa5/0x670
> [57287.228009] [<ffffffff81653b0c>] selinux_d_instantiate+0x1c/0x20
> [57287.228009] [<ffffffff8164eb2b>] security_d_instantiate+0x1b/0x30
> [57287.228009] [<ffffffff811d9ce4>] d_instantiate+0x54/0x80
> [57287.228009] [<ffffffff811894fe>] __shmem_file_setup+0xce/0x250
> [57287.228009] [<ffffffff8118c5f8>] shmem_zero_setup+0x28/0x70
> [57287.228009] [<ffffffff811a1818>] mmap_region+0x5c8/0x5e0
> [57287.228009] [<ffffffff811a1ba3>] do_mmap_pgoff+0x373/0x420
> [57287.228009] [<ffffffff8118cb00>] vm_mmap_pgoff+0x90/0xc0
> [57287.228009] [<ffffffff811a0000>] SyS_mmap_pgoff+0x1b0/0x240
> [57287.228009] [<ffffffff81008c92>] SyS_mmap+0x22/0x30
> [57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> [57287.228009]
> -> #0 (&mm->mmap_sem){++++++}:
> [57287.228009] [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
> [57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009] [<ffffffff811980ec>] might_fault+0x8c/0xb0
> [57287.228009] [<ffffffff811d3ff2>] filldir+0x92/0x120
> [57287.228009] [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
> [57287.228009] [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
> [57287.228009] [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
> [57287.228009] [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
> [57287.228009] [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
> [57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
> [57287.228009]
> other info that might help us debug this:
>
> [57287.228009] Chain exists of:
> &mm->mmap_sem --> &isec->lock --> &xfs_dir_ilock_class
>
> [57287.228009] Possible unsafe locking scenario:
>
> [57287.228009] CPU0 CPU1
> [57287.228009] ---- ----
> [57287.228009] lock(&xfs_dir_ilock_class);
> [57287.228009] lock(&isec->lock);
> [57287.228009] lock(&xfs_dir_ilock_class);
> [57287.228009] lock(&mm->mmap_sem);
> [57287.228009]
> *** DEADLOCK ***
>
> [57287.228009] 2 locks held by updatedb/19312:
> [57287.228009] #0: (&type->i_mutex_dir_key#5){+.+.+.}, at: [<ffffffff811d3d91>] iterate_dir+0x61/0x140
> [57287.228009] #1: (&xfs_dir_ilock_class){++++..}, at: [<ffffffff81400c1a>] xfs_ilock+0x10a/0x2e0
> [57287.228009]
> stack backtrace:
> [57287.228009] CPU: 0 PID: 19312 Comm: updatedb Not tainted 4.1.0-rc2-22946-g6c942d3 #174
> [57287.228009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
> [57287.228009] ffffffff82c61ed0 ffff880060d0faa8 ffffffff81b61a1a 0000000080000001
> [57287.228009] ffffffff82c345d0 ffff880060d0faf8 ffffffff810cb843 0000000000000000
> [57287.228009] ffff880060d0fb28 0000000000000000 0000000000000001 0000000000000000
> [57287.228009] Call Trace:
> [57287.228009] [<ffffffff81b61a1a>] dump_stack+0x4f/0x7b
> [57287.228009] [<ffffffff810cb843>] print_circular_bug+0x203/0x310
> [57287.228009] [<ffffffff810ccafb>] check_prev_add+0x51b/0x860
> [57287.228009] [<ffffffff810ce018>] ? __lock_acquire+0x448/0x1920
> [57287.228009] [<ffffffff810cf1c2>] __lock_acquire+0x15f2/0x1920
> [57287.228009] [<ffffffff810cfe40>] lock_acquire+0xc0/0x280
> [57287.228009] [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
> [57287.228009] [<ffffffff811980ec>] might_fault+0x8c/0xb0
> [57287.228009] [<ffffffff811980bf>] ? might_fault+0x5f/0xb0
> [57287.228009] [<ffffffff811d3ff2>] filldir+0x92/0x120
> [57287.228009] [<ffffffff81400e24>] ? xfs_ilock_data_map_shared+0x34/0x40
> [57287.228009] [<ffffffff813ec51a>] xfs_dir2_sf_getdents.isra.10+0x1ba/0x220
> [57287.228009] [<ffffffff81400c1a>] ? xfs_ilock+0x10a/0x2e0
> [57287.228009] [<ffffffff813ed0de>] xfs_readdir+0x15e/0x300
> [57287.228009] [<ffffffff810cd6b5>] ? mark_held_locks+0x75/0xa0
> [57287.228009] [<ffffffff81b6800a>] ? mutex_lock_killable_nested+0x27a/0x4e0
> [57287.228009] [<ffffffff813f116b>] xfs_file_readdir+0x2b/0x30
> [57287.228009] [<ffffffff811d3dca>] iterate_dir+0x9a/0x140
> [57287.228009] [<ffffffff811dfceb>] ? __fget_light+0x7b/0xa0
> [57287.228009] [<ffffffff811d42b1>] SyS_getdents+0x81/0x100
> [57287.228009] [<ffffffff811d3f60>] ? fillonedir+0xf0/0xf0
> [57287.228009] [<ffffffff81b6b697>] system_call_fastpath+0x12/0x6f
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 08, 2015 at 08:42:30AM +1000, NeilBrown wrote:
> On Thu, 7 May 2015 11:31:16 -0400 "J. Bruce Fields" <[email protected]>
> wrote:
>
> > On Tue, May 05, 2015 at 11:52:03AM -0400, J. Bruce Fields wrote:
> > > On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > > > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > > > + */
> > > > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > > > + struct dentry *base, int len)
> > > > > > +{
> > > > > > + struct qstr this;
> > > > > > + unsigned int c;
> > > > > > + int err;
> > > > > > + struct dentry *ret;
> > > > > > +
> > > > > > + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > > >
> > > > > Remove this line.
> > > >
> > > > Whoops, thanks.
> > > >
> > > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > > index a30e79900086..cc7995762190 100644
> > > > > > --- a/fs/nfsd/vfs.c
> > > > > > +++ b/fs/nfsd/vfs.c
> > > > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > > > host_err = PTR_ERR(dentry);
> > > > > > if (IS_ERR(dentry))
> > > > > > goto out_nfserr;
> > > > > > + if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > > >
> > > > > Got a crash here tested by pynfs,
> > > >
> > > > OK, I guess I just forgot to take into account negative dentries.
> > > > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> > >
> > > And I also forgot nfs3xdr.c.
> >
> > But, this doesn't look good.
> >
> > OK, to be fair I'm not sure whether this was already happening before this
> > patch.
>
> None of the stacks show any code that we changed, so I'm guessing it is a
> separate issues as you hint at.
>
> If I didn't have a list as long as my arm of things I really should be doing,
> I'd go diving into the XFS code ....
Yeah. Looks like it might be intermittent. I'll keep an eye on it and
report to xfs people, I guess....
--b.
On Wed, May 06, 2015 at 08:26:28AM +1000, NeilBrown wrote:
> On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <[email protected]>
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 4a8d998b7274..0f554bf41dea 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> > *
> > * Note that this routine is purely a helper for filesystem usage and should
> > * not be called by generic code.
> > + *
> > + * Must be called with the parented i_mutex held.
>
> "parented"?
>
> * base->i_mutex must be held by caller.
>
> ??
Thanks.
> > +struct dentry *lookup_one_len_unlocked(const char *name,
...
> > + ret = __d_lookup(base, &this);
> > + if (ret)
> > + return ret;
>
> __d_lookup() is a little too low level, as it doesn't call d_revalidate() and
> it doesn't retry if the rename_lock seqlock says it should.
>
> The latter doesn't really matter as we would fall through to the safe
> mutex-protected version.
> The former isn't much of an issue for most filesystems under nfsd, but still
> needs to be handled to some extent.
> Also, the comment for __d_lookup says
>
> * __d_lookup callers must be commented.
>
> The simplest resolution would be:
>
> /* __d_lookup() is used to try to get a quick answer and avoid the
> * mutex. A false-negative does no harm.
> */
> ret = __d_lookup(base, &this);
> if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> dput(ret);
> ret = NULL;
> }
> if (ret)
> return ret;
That looks right to me.... I'll probably take the simple option.
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a30e79900086..4accdb00976b 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > host_err = PTR_ERR(dentry);
> > if (IS_ERR(dentry))
> > goto out_nfserr;
> > + if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> > + /*
> > + * Never mind, we're not going to open this
> > + * anyway. And we don't want to hold it over
> > + * the userspace upcalls in nfsd_mountpoint. */
> > + fh_unlock(fhp);
> > + }
>
> You could avoid the test by just putting the fh_unlock(fhp) inside the
>
> if (nfsd_mountpoint(dentry, exp)) {
>
> block. It might also be appropriate for nfsd_mountpoint to fail on
> non-directories.
If check_export()'s to be believed, our support for regular-file
exports is intentional. So even if nfsd_mountpoint() is true, it's
possible we might still be about to open the result.
But I think we're OK:
The only reason for keeping the i_mutex here in the open case is to
avoid the situation where a client does OPEN(dirfh, "foo") and gets a
reply that grants a delegation even though the file has actually been
renamed to "bar". (Thus violating the protocol, since the delegation's
supposed to guarantee you'll be notified before the file's renamed.)
(So the i_mutex is actually overkill, it would be enough to detect a
rename and then refuse the delegation (which we're always allowed to
do).)
But actually, if this is a mountpoint then I suppose we're safe from a
rename. So, right, we can just move that unlock into the mountpoint
case.
--b.
From: NeilBrown <[email protected]>
We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir. If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.
In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about. We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.
With some care, we may be able to avoid that in rpc.mountd. But it
seems better just to avoid holding a mutex while waiting on userspace.
It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex. So we could just drop the i_mutex elsewhere and do
something like
mutex_lock()
lookup_one_len()
mutex_unlock()
In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.
Signed-off-by: J. Bruce Fields <[email protected]>
---
fs/namei.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nfsd/nfs3xdr.c | 2 +-
fs/nfsd/nfs4xdr.c | 8 +++---
fs/nfsd/vfs.c | 23 +++++++---------
include/linux/namei.h | 1 +
5 files changed, 89 insertions(+), 19 deletions(-)
Here's an updated patch.
diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..8b866d79c5b7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = __d_lookup(base, &this);
+ if (ret)
+ return ret;
+ /*
+ * __d_lookup() is used to try to get a quick answer and avoid the
+ * mutex. A false-negative does no harm.
+ */
+ ret = __d_lookup(base, &this);
+ if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+ dput(ret);
+ ret = NULL;
+ }
+ if (ret)
+ return ret;
+
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..6d5b33458e91 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
- /*
- * check if we have crossed a mount point ...
- */
if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
@@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
--
1.9.3
This passes my review, but it needs an ACK from Al or someone for the
addition of the new lookup_one_len_unlocked (which is the same as
lookup_one_len except that it takes the i_mutex itself when required
instead of requiring the caller to).
--b.
On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
> From: NeilBrown <[email protected]>
>
> We need information about exports when crossing mountpoints during
> lookup or NFSv4 readdir. If we don't already have that information
> cached, we may have to ask (and wait for) rpc.mountd.
>
> In both cases we currently hold the i_mutex on the parent of the
> directory we're asking rpc.mountd about. We've seen situations where
> rpc.mountd performs some operation on that directory that tries to take
> the i_mutex again, resulting in deadlock.
>
> With some care, we may be able to avoid that in rpc.mountd. But it
> seems better just to avoid holding a mutex while waiting on userspace.
>
> It appears that lookup_one_len is pretty much the only operation that
> needs the i_mutex. So we could just drop the i_mutex elsewhere and do
> something like
>
> mutex_lock()
> lookup_one_len()
> mutex_unlock()
>
> In many cases though the lookup would have been cached and not required
> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> that only takes the i_mutex when necessary.
>
> Signed-off-by: J. Bruce Fields <[email protected]>
> ---
> fs/namei.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/nfsd/nfs3xdr.c | 2 +-
> fs/nfsd/nfs4xdr.c | 8 +++---
> fs/nfsd/vfs.c | 23 +++++++---------
> include/linux/namei.h | 1 +
> 5 files changed, 89 insertions(+), 19 deletions(-)
>
> Here's an updated patch.
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..8b866d79c5b7 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> *
> * Note that this routine is purely a helper for filesystem usage and should
> * not be called by generic code.
> + *
> + * The caller must hold base->i_mutex.
> */
> struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> {
> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> }
> EXPORT_SYMBOL(lookup_one_len);
>
> +/**
> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
> + * @name: pathname component to lookup
> + * @base: base directory to lookup from
> + * @len: maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> + struct dentry *base, int len)
> +{
> + struct qstr this;
> + unsigned int c;
> + int err;
> + struct dentry *ret;
> +
> + this.name = name;
> + this.len = len;
> + this.hash = full_name_hash(name, len);
> + if (!len)
> + return ERR_PTR(-EACCES);
> +
> + if (unlikely(name[0] == '.')) {
> + if (len < 2 || (len == 2 && name[1] == '.'))
> + return ERR_PTR(-EACCES);
> + }
> +
> + while (len--) {
> + c = *(const unsigned char *)name++;
> + if (c == '/' || c == '\0')
> + return ERR_PTR(-EACCES);
> + }
> + /*
> + * See if the low-level filesystem might want
> + * to use its own hash..
> + */
> + if (base->d_flags & DCACHE_OP_HASH) {
> + int err = base->d_op->d_hash(base, &this);
> + if (err < 0)
> + return ERR_PTR(err);
> + }
> +
> + err = inode_permission(base->d_inode, MAY_EXEC);
> + if (err)
> + return ERR_PTR(err);
> +
> + ret = __d_lookup(base, &this);
> + if (ret)
> + return ret;
> + /*
> + * __d_lookup() is used to try to get a quick answer and avoid the
> + * mutex. A false-negative does no harm.
> + */
> + ret = __d_lookup(base, &this);
> + if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> + dput(ret);
> + ret = NULL;
> + }
> + if (ret)
> + return ret;
> +
> + mutex_lock(&base->d_inode->i_mutex);
> + ret = __lookup_hash(&this, base, 0);
> + mutex_unlock(&base->d_inode->i_mutex);
> + return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
> int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> struct path *path, int *empty)
> {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> } else
> dchild = dget(dparent);
> } else
> - dchild = lookup_one_len(name, dparent, namlen);
> + dchild = lookup_one_len_unlocked(name, dparent, namlen);
> if (IS_ERR(dchild))
> return rv;
> if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
> __be32 nfserr;
> int ignore_crossmnt = 0;
>
> - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
> if (IS_ERR(dentry))
> return nfserrno(PTR_ERR(dentry));
> if (d_really_is_negative(dentry)) {
> /*
> - * nfsd_buffered_readdir drops the i_mutex between
> - * readdir and calling this callback, leaving a window
> - * where this directory entry could have gone away.
> + * we're not holding the i_mutex here, so there's
> + * a window where this directory entry could have gone
> + * away.
> */
> dput(dentry);
> return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..6d5b33458e91 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> host_err = PTR_ERR(dentry);
> if (IS_ERR(dentry))
> goto out_nfserr;
> - /*
> - * check if we have crossed a mount point ...
> - */
> if (nfsd_mountpoint(dentry, exp)) {
> + /*
> + * We don't need the i_mutex after all. It's
> + * still possible we could open this (regular
> + * files can be mountpoints too), but the
> + * i_mutex is just there to prevent renames of
> + * something that we might be about to delegate,
> + * and a mountpoint won't be renamed:
> + */
> + fh_unlock(fhp);
> if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
> dput(dentry);
> goto out_nfserr;
> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> offset = *offsetp;
>
> while (1) {
> - struct inode *dir_inode = file_inode(file);
> unsigned int reclen;
>
> cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> if (!size)
> break;
>
> - /*
> - * Various filldir functions may end up calling back into
> - * lookup_one_len() and the file system's ->lookup() method.
> - * These expect i_mutex to be held, as it would within readdir.
> - */
> - host_err = mutex_lock_killable(&dir_inode->i_mutex);
> - if (host_err)
> - break;
> -
> de = (struct buffered_dirent *)buf.dirent;
> while (size > 0) {
> offset = de->offset;
> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> size -= reclen;
> de = (struct buffered_dirent *)((char *)de + reclen);
> }
> - mutex_unlock(&dir_inode->i_mutex);
> if (size > 0) /* We bailed out early */
> break;
>
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
> extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>
> extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>
> extern int follow_down_one(struct path *);
> extern int follow_down(struct path *);
> --
> 1.9.3
>
ping ....
> -----------------------------------------------------------------------------------
>>From d831154bb7e527f9003e16ac049526be5ed90228 Mon Sep 17 00:00:00 2001
> From: Kinglong Mee <[email protected]>
> Date: Tue, 5 May 2015 16:24:16 +0800
> Subject: [PATCH] mountd: Case-insensitive path length must equals to parent
>
> Commit 6091c0a4c4 (mountd: add support for case-insensitive file names)
> introduces looking up bad path which is easy to trigger a present mutex race.
>
> Signed-off-by: Kinglong Mee <[email protected]>
> ---
> utils/mountd/cache.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/utils/mountd/cache.c b/utils/mountd/cache.c
> index 7d250f9..155695a 100644
> --- a/utils/mountd/cache.c
> +++ b/utils/mountd/cache.c
> @@ -478,7 +478,7 @@ static int is_subdirectory(char *child, char *parent)
> if (strcmp(parent, "/") == 0 && child[1] != 0)
> return 1;
>
> - return (same_path(child, parent, l) && child[l] == '/');
> + return (child[l] == '/' && same_path(child, parent, l));
> }
>
> static int path_matches(nfs_export *exp, char *path)
>
On Sat, 27 Jun 2015 07:14:00 +0800 Kinglong Mee <[email protected]>
wrote:
> ping ....
>
> > -----------------------------------------------------------------------------------
> >>From d831154bb7e527f9003e16ac049526be5ed90228 Mon Sep 17 00:00:00 2001
> > From: Kinglong Mee <[email protected]>
> > Date: Tue, 5 May 2015 16:24:16 +0800
> > Subject: [PATCH] mountd: Case-insensitive path length must equals to parent
> >
> > Commit 6091c0a4c4 (mountd: add support for case-insensitive file names)
> > introduces looking up bad path which is easy to trigger a present mutex race.
> >
How do you know that every file system that treats multiple different
strings as the same, imposes the rule that the strings must be the same
length?
Given how complex unicode case rules can be, I certainly wouldn't be
certain of that.
If the only purpose of this patch is to avoid triggering a bug in a
separate piece of code, then I am not in favour of it.
Thanks,
NeilBrown
> > Signed-off-by: Kinglong Mee <[email protected]>
> > ---
> > utils/mountd/cache.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/utils/mountd/cache.c b/utils/mountd/cache.c
> > index 7d250f9..155695a 100644
> > --- a/utils/mountd/cache.c
> > +++ b/utils/mountd/cache.c
> > @@ -478,7 +478,7 @@ static int is_subdirectory(char *child, char *parent)
> > if (strcmp(parent, "/") == 0 && child[1] != 0)
> > return 1;
> >
> > - return (same_path(child, parent, l) && child[l] == '/');
> > + return (child[l] == '/' && same_path(child, parent, l));
> > }
> >
> > static int path_matches(nfs_export *exp, char *path)
> >
On 6/27/2015 07:35, NeilBrown wrote:
> On Sat, 27 Jun 2015 07:14:00 +0800 Kinglong Mee <[email protected]>
> wrote:
>
>> ping ....
>>
>>> -----------------------------------------------------------------------------------
>>> >From d831154bb7e527f9003e16ac049526be5ed90228 Mon Sep 17 00:00:00 2001
>>> From: Kinglong Mee <[email protected]>
>>> Date: Tue, 5 May 2015 16:24:16 +0800
>>> Subject: [PATCH] mountd: Case-insensitive path length must equals to parent
>>>
>>> Commit 6091c0a4c4 (mountd: add support for case-insensitive file names)
>>> introduces looking up bad path which is easy to trigger a present mutex race.
>>>
>
> How do you know that every file system that treats multiple different
> strings as the same, imposes the rule that the strings must be the same
> length?
Agree with you.
With your suggestion, there is a bug exist.
static int is_subdirectory(char *child, char *parent)
{
/* Check is child is strictly a subdirectory of
* parent or a more distant descendant.
*/
size_t l = strlen(parent);
# rpc.mountd get parent's length.
if (strcmp(parent, "/") == 0 && child[1] != 0)
return 1;
return (same_path(child, parent, l) && child[l] == '/');
# will truncate child's name by parent's length, also checking child[l].
# I think the length should be calculated by the count of '/'.
}
>
> Given how complex unicode case rules can be, I certainly wouldn't be
> certain of that.
>
> If the only purpose of this patch is to avoid triggering a bug in a
> separate piece of code, then I am not in favour of it.
Yes,
This patch just change two condition's position.
Please ignore this patch,
maybe a new patch for NeilBrown's suggestion will be post.
thanks,
Kinglong Mee
>
> Thanks,
> NeilBrown
>
>
>>> Signed-off-by: Kinglong Mee <[email protected]>
>>> ---
>>> utils/mountd/cache.c | 2 +-
>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/utils/mountd/cache.c b/utils/mountd/cache.c
>>> index 7d250f9..155695a 100644
>>> --- a/utils/mountd/cache.c
>>> +++ b/utils/mountd/cache.c
>>> @@ -478,7 +478,7 @@ static int is_subdirectory(char *child, char *parent)
>>> if (strcmp(parent, "/") == 0 && child[1] != 0)
>>> return 1;
>>>
>>> - return (same_path(child, parent, l) && child[l] == '/');
>>> + return (child[l] == '/' && same_path(child, parent, l));
>>> }
>>>
>>> static int path_matches(nfs_export *exp, char *path)
>>>
>
>
Ping...
What's the state of this patch ?
Without modify nfs-utils, this one is make sense.
thanks,
Kinglong Mee
On 6/3/2015 23:18, J. Bruce Fields wrote:
> This passes my review, but it needs an ACK from Al or someone for the
> addition of the new lookup_one_len_unlocked (which is the same as
> lookup_one_len except that it takes the i_mutex itself when required
> instead of requiring the caller to).
>
> --b.
>
> On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
>> From: NeilBrown <[email protected]>
>>
>> We need information about exports when crossing mountpoints during
>> lookup or NFSv4 readdir. If we don't already have that information
>> cached, we may have to ask (and wait for) rpc.mountd.
>>
>> In both cases we currently hold the i_mutex on the parent of the
>> directory we're asking rpc.mountd about. We've seen situations where
>> rpc.mountd performs some operation on that directory that tries to take
>> the i_mutex again, resulting in deadlock.
>>
>> With some care, we may be able to avoid that in rpc.mountd. But it
>> seems better just to avoid holding a mutex while waiting on userspace.
>>
>> It appears that lookup_one_len is pretty much the only operation that
>> needs the i_mutex. So we could just drop the i_mutex elsewhere and do
>> something like
>>
>> mutex_lock()
>> lookup_one_len()
>> mutex_unlock()
>>
>> In many cases though the lookup would have been cached and not required
>> the i_mutex, so it's more efficient to create a lookup_one_len() variant
>> that only takes the i_mutex when necessary.
>>
>> Signed-off-by: J. Bruce Fields <[email protected]>
>> ---
>> fs/namei.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/nfsd/nfs3xdr.c | 2 +-
>> fs/nfsd/nfs4xdr.c | 8 +++---
>> fs/nfsd/vfs.c | 23 +++++++---------
>> include/linux/namei.h | 1 +
>> 5 files changed, 89 insertions(+), 19 deletions(-)
>>
>> Here's an updated patch.
>>
>> diff --git a/fs/namei.c b/fs/namei.c
>> index 4a8d998b7274..8b866d79c5b7 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>> *
>> * Note that this routine is purely a helper for filesystem usage and should
>> * not be called by generic code.
>> + *
>> + * The caller must hold base->i_mutex.
>> */
>> struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>> {
>> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>> }
>> EXPORT_SYMBOL(lookup_one_len);
>>
>> +/**
>> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
>> + * @name: pathname component to lookup
>> + * @base: base directory to lookup from
>> + * @len: maximum length @len should be interpreted to
>> + *
>> + * Note that this routine is purely a helper for filesystem usage and should
>> + * not be called by generic code.
>> + *
>> + * Unlike lookup_one_len, it should be called without the parent
>> + * i_mutex held, and will take the i_mutex itself if necessary.
>> + */
>> +struct dentry *lookup_one_len_unlocked(const char *name,
>> + struct dentry *base, int len)
>> +{
>> + struct qstr this;
>> + unsigned int c;
>> + int err;
>> + struct dentry *ret;
>> +
>> + this.name = name;
>> + this.len = len;
>> + this.hash = full_name_hash(name, len);
>> + if (!len)
>> + return ERR_PTR(-EACCES);
>> +
>> + if (unlikely(name[0] == '.')) {
>> + if (len < 2 || (len == 2 && name[1] == '.'))
>> + return ERR_PTR(-EACCES);
>> + }
>> +
>> + while (len--) {
>> + c = *(const unsigned char *)name++;
>> + if (c == '/' || c == '\0')
>> + return ERR_PTR(-EACCES);
>> + }
>> + /*
>> + * See if the low-level filesystem might want
>> + * to use its own hash..
>> + */
>> + if (base->d_flags & DCACHE_OP_HASH) {
>> + int err = base->d_op->d_hash(base, &this);
>> + if (err < 0)
>> + return ERR_PTR(err);
>> + }
>> +
>> + err = inode_permission(base->d_inode, MAY_EXEC);
>> + if (err)
>> + return ERR_PTR(err);
>> +
>> + ret = __d_lookup(base, &this);
>> + if (ret)
>> + return ret;
>> + /*
>> + * __d_lookup() is used to try to get a quick answer and avoid the
>> + * mutex. A false-negative does no harm.
>> + */
>> + ret = __d_lookup(base, &this);
>> + if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
>> + dput(ret);
>> + ret = NULL;
>> + }
>> + if (ret)
>> + return ret;
>> +
>> + mutex_lock(&base->d_inode->i_mutex);
>> + ret = __lookup_hash(&this, base, 0);
>> + mutex_unlock(&base->d_inode->i_mutex);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(lookup_one_len_unlocked);
>> +
>> int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>> struct path *path, int *empty)
>> {
>> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> index f6e7cbabac5a..01dcd494f781 100644
>> --- a/fs/nfsd/nfs3xdr.c
>> +++ b/fs/nfsd/nfs3xdr.c
>> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>> } else
>> dchild = dget(dparent);
>> } else
>> - dchild = lookup_one_len(name, dparent, namlen);
>> + dchild = lookup_one_len_unlocked(name, dparent, namlen);
>> if (IS_ERR(dchild))
>> return rv;
>> if (d_mountpoint(dchild))
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 158badf945df..2c1adaa0bd2f 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>> __be32 nfserr;
>> int ignore_crossmnt = 0;
>>
>> - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
>> + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>> if (IS_ERR(dentry))
>> return nfserrno(PTR_ERR(dentry));
>> if (d_really_is_negative(dentry)) {
>> /*
>> - * nfsd_buffered_readdir drops the i_mutex between
>> - * readdir and calling this callback, leaving a window
>> - * where this directory entry could have gone away.
>> + * we're not holding the i_mutex here, so there's
>> + * a window where this directory entry could have gone
>> + * away.
>> */
>> dput(dentry);
>> return nfserr_noent;
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index a30e79900086..6d5b33458e91 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> host_err = PTR_ERR(dentry);
>> if (IS_ERR(dentry))
>> goto out_nfserr;
>> - /*
>> - * check if we have crossed a mount point ...
>> - */
>> if (nfsd_mountpoint(dentry, exp)) {
>> + /*
>> + * We don't need the i_mutex after all. It's
>> + * still possible we could open this (regular
>> + * files can be mountpoints too), but the
>> + * i_mutex is just there to prevent renames of
>> + * something that we might be about to delegate,
>> + * and a mountpoint won't be renamed:
>> + */
>> + fh_unlock(fhp);
>> if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
>> dput(dentry);
>> goto out_nfserr;
>> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>> offset = *offsetp;
>>
>> while (1) {
>> - struct inode *dir_inode = file_inode(file);
>> unsigned int reclen;
>>
>> cdp->err = nfserr_eof; /* will be cleared on successful read */
>> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>> if (!size)
>> break;
>>
>> - /*
>> - * Various filldir functions may end up calling back into
>> - * lookup_one_len() and the file system's ->lookup() method.
>> - * These expect i_mutex to be held, as it would within readdir.
>> - */
>> - host_err = mutex_lock_killable(&dir_inode->i_mutex);
>> - if (host_err)
>> - break;
>> -
>> de = (struct buffered_dirent *)buf.dirent;
>> while (size > 0) {
>> offset = de->offset;
>> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>> size -= reclen;
>> de = (struct buffered_dirent *)((char *)de + reclen);
>> }
>> - mutex_unlock(&dir_inode->i_mutex);
>> if (size > 0) /* We bailed out early */
>> break;
>>
>> diff --git a/include/linux/namei.h b/include/linux/namei.h
>> index c8990779f0c3..bb3a2f7cca67 100644
>> --- a/include/linux/namei.h
>> +++ b/include/linux/namei.h
>> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>> extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>>
>> extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
>> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>>
>> extern int follow_down_one(struct path *);
>> extern int follow_down(struct path *);
>> --
>> 1.9.3
>>
>
On Sun, Jul 05, 2015 at 07:27:25PM +0800, Kinglong Mee wrote:
> Ping...
>
> What's the state of this patch ?
I think I still need an ACK from Al for the addition of
lookup_one_len_unlocked.
--b.
> On 6/3/2015 23:18, J. Bruce Fields wrote:
> > This passes my review, but it needs an ACK from Al or someone for the
> > addition of the new lookup_one_len_unlocked (which is the same as
> > lookup_one_len except that it takes the i_mutex itself when required
> > instead of requiring the caller to).
> >
> > --b.
> >
> > On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
> >> From: NeilBrown <[email protected]>
> >>
> >> We need information about exports when crossing mountpoints during
> >> lookup or NFSv4 readdir. If we don't already have that information
> >> cached, we may have to ask (and wait for) rpc.mountd.
> >>
> >> In both cases we currently hold the i_mutex on the parent of the
> >> directory we're asking rpc.mountd about. We've seen situations where
> >> rpc.mountd performs some operation on that directory that tries to take
> >> the i_mutex again, resulting in deadlock.
> >>
> >> With some care, we may be able to avoid that in rpc.mountd. But it
> >> seems better just to avoid holding a mutex while waiting on userspace.
> >>
> >> It appears that lookup_one_len is pretty much the only operation that
> >> needs the i_mutex. So we could just drop the i_mutex elsewhere and do
> >> something like
> >>
> >> mutex_lock()
> >> lookup_one_len()
> >> mutex_unlock()
> >>
> >> In many cases though the lookup would have been cached and not required
> >> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> >> that only takes the i_mutex when necessary.
> >>
> >> Signed-off-by: J. Bruce Fields <[email protected]>
> >> ---
> >> fs/namei.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >> fs/nfsd/nfs3xdr.c | 2 +-
> >> fs/nfsd/nfs4xdr.c | 8 +++---
> >> fs/nfsd/vfs.c | 23 +++++++---------
> >> include/linux/namei.h | 1 +
> >> 5 files changed, 89 insertions(+), 19 deletions(-)
> >>
> >> Here's an updated patch.
> >>
> >> diff --git a/fs/namei.c b/fs/namei.c
> >> index 4a8d998b7274..8b866d79c5b7 100644
> >> --- a/fs/namei.c
> >> +++ b/fs/namei.c
> >> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> >> *
> >> * Note that this routine is purely a helper for filesystem usage and should
> >> * not be called by generic code.
> >> + *
> >> + * The caller must hold base->i_mutex.
> >> */
> >> struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> >> {
> >> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> >> }
> >> EXPORT_SYMBOL(lookup_one_len);
> >>
> >> +/**
> >> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
> >> + * @name: pathname component to lookup
> >> + * @base: base directory to lookup from
> >> + * @len: maximum length @len should be interpreted to
> >> + *
> >> + * Note that this routine is purely a helper for filesystem usage and should
> >> + * not be called by generic code.
> >> + *
> >> + * Unlike lookup_one_len, it should be called without the parent
> >> + * i_mutex held, and will take the i_mutex itself if necessary.
> >> + */
> >> +struct dentry *lookup_one_len_unlocked(const char *name,
> >> + struct dentry *base, int len)
> >> +{
> >> + struct qstr this;
> >> + unsigned int c;
> >> + int err;
> >> + struct dentry *ret;
> >> +
> >> + this.name = name;
> >> + this.len = len;
> >> + this.hash = full_name_hash(name, len);
> >> + if (!len)
> >> + return ERR_PTR(-EACCES);
> >> +
> >> + if (unlikely(name[0] == '.')) {
> >> + if (len < 2 || (len == 2 && name[1] == '.'))
> >> + return ERR_PTR(-EACCES);
> >> + }
> >> +
> >> + while (len--) {
> >> + c = *(const unsigned char *)name++;
> >> + if (c == '/' || c == '\0')
> >> + return ERR_PTR(-EACCES);
> >> + }
> >> + /*
> >> + * See if the low-level filesystem might want
> >> + * to use its own hash..
> >> + */
> >> + if (base->d_flags & DCACHE_OP_HASH) {
> >> + int err = base->d_op->d_hash(base, &this);
> >> + if (err < 0)
> >> + return ERR_PTR(err);
> >> + }
> >> +
> >> + err = inode_permission(base->d_inode, MAY_EXEC);
> >> + if (err)
> >> + return ERR_PTR(err);
> >> +
> >> + ret = __d_lookup(base, &this);
> >> + if (ret)
> >> + return ret;
> >> + /*
> >> + * __d_lookup() is used to try to get a quick answer and avoid the
> >> + * mutex. A false-negative does no harm.
> >> + */
> >> + ret = __d_lookup(base, &this);
> >> + if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> >> + dput(ret);
> >> + ret = NULL;
> >> + }
> >> + if (ret)
> >> + return ret;
> >> +
> >> + mutex_lock(&base->d_inode->i_mutex);
> >> + ret = __lookup_hash(&this, base, 0);
> >> + mutex_unlock(&base->d_inode->i_mutex);
> >> + return ret;
> >> +}
> >> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> >> +
> >> int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> >> struct path *path, int *empty)
> >> {
> >> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> >> index f6e7cbabac5a..01dcd494f781 100644
> >> --- a/fs/nfsd/nfs3xdr.c
> >> +++ b/fs/nfsd/nfs3xdr.c
> >> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> >> } else
> >> dchild = dget(dparent);
> >> } else
> >> - dchild = lookup_one_len(name, dparent, namlen);
> >> + dchild = lookup_one_len_unlocked(name, dparent, namlen);
> >> if (IS_ERR(dchild))
> >> return rv;
> >> if (d_mountpoint(dchild))
> >> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> >> index 158badf945df..2c1adaa0bd2f 100644
> >> --- a/fs/nfsd/nfs4xdr.c
> >> +++ b/fs/nfsd/nfs4xdr.c
> >> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
> >> __be32 nfserr;
> >> int ignore_crossmnt = 0;
> >>
> >> - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> >> + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
> >> if (IS_ERR(dentry))
> >> return nfserrno(PTR_ERR(dentry));
> >> if (d_really_is_negative(dentry)) {
> >> /*
> >> - * nfsd_buffered_readdir drops the i_mutex between
> >> - * readdir and calling this callback, leaving a window
> >> - * where this directory entry could have gone away.
> >> + * we're not holding the i_mutex here, so there's
> >> + * a window where this directory entry could have gone
> >> + * away.
> >> */
> >> dput(dentry);
> >> return nfserr_noent;
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index a30e79900086..6d5b33458e91 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> host_err = PTR_ERR(dentry);
> >> if (IS_ERR(dentry))
> >> goto out_nfserr;
> >> - /*
> >> - * check if we have crossed a mount point ...
> >> - */
> >> if (nfsd_mountpoint(dentry, exp)) {
> >> + /*
> >> + * We don't need the i_mutex after all. It's
> >> + * still possible we could open this (regular
> >> + * files can be mountpoints too), but the
> >> + * i_mutex is just there to prevent renames of
> >> + * something that we might be about to delegate,
> >> + * and a mountpoint won't be renamed:
> >> + */
> >> + fh_unlock(fhp);
> >> if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
> >> dput(dentry);
> >> goto out_nfserr;
> >> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >> offset = *offsetp;
> >>
> >> while (1) {
> >> - struct inode *dir_inode = file_inode(file);
> >> unsigned int reclen;
> >>
> >> cdp->err = nfserr_eof; /* will be cleared on successful read */
> >> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >> if (!size)
> >> break;
> >>
> >> - /*
> >> - * Various filldir functions may end up calling back into
> >> - * lookup_one_len() and the file system's ->lookup() method.
> >> - * These expect i_mutex to be held, as it would within readdir.
> >> - */
> >> - host_err = mutex_lock_killable(&dir_inode->i_mutex);
> >> - if (host_err)
> >> - break;
> >> -
> >> de = (struct buffered_dirent *)buf.dirent;
> >> while (size > 0) {
> >> offset = de->offset;
> >> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >> size -= reclen;
> >> de = (struct buffered_dirent *)((char *)de + reclen);
> >> }
> >> - mutex_unlock(&dir_inode->i_mutex);
> >> if (size > 0) /* We bailed out early */
> >> break;
> >>
> >> diff --git a/include/linux/namei.h b/include/linux/namei.h
> >> index c8990779f0c3..bb3a2f7cca67 100644
> >> --- a/include/linux/namei.h
> >> +++ b/include/linux/namei.h
> >> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
> >> extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
> >>
> >> extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> >> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
> >>
> >> extern int follow_down_one(struct path *);
> >> extern int follow_down(struct path *);
> >> --
> >> 1.9.3
> >>
> >
Al, can I get an ACK/NACK for this?
--b.
commit 5494e10316e3
Author: NeilBrown <[email protected]>
Date: Fri May 1 11:53:26 2015 +1000
nfsd: don't hold i_mutex over userspace upcalls
We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir. If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.
In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about. We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.
With some care, we may be able to avoid that in rpc.mountd. But it
seems better just to avoid holding a mutex while waiting on userspace.
It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex. So we could just drop the i_mutex elsewhere and do
something like
mutex_lock()
lookup_one_len()
mutex_unlock()
In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.
Signed-off-by: NeilBrown <[email protected]>
Signed-off-by: J. Bruce Fields <[email protected]>
diff --git a/fs/namei.c b/fs/namei.c
index ae4e4c18b2ac..1627d0302a59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2250,6 +2250,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2293,6 +2295,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = __d_lookup(base, &this);
+ if (ret)
+ return ret;
+ /*
+ * __d_lookup() is used to try to get a quick answer and avoid the
+ * mutex. A false-negative does no harm.
+ */
+ ret = __d_lookup(base, &this);
+ if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+ dput(ret);
+ ret = NULL;
+ }
+ if (ret)
+ return ret;
+
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..2ea6c6a37364 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
- /*
- * check if we have crossed a mount point ...
- */
if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
From: NeilBrown <[email protected]>
Subject: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir. If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.
In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about. We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.
With some care, we may be able to avoid that in rpc.mountd. But it
seems better just to avoid holding a mutex while waiting on userspace.
It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex. So we could just drop the i_mutex elsewhere and do
something like
mutex_lock()
lookup_one_len()
mutex_unlock()
In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.
Signed-off-by: NeilBrown <[email protected]>
Signed-off-by: J. Bruce Fields <[email protected]>
---
fs/namei.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nfsd/nfs3xdr.c | 2 +-
fs/nfsd/nfs4xdr.c | 8 +++---
fs/nfsd/vfs.c | 23 +++++++---------
include/linux/namei.h | 1 +
5 files changed, 89 insertions(+), 19 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 726d211db484..d68c21fe4598 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2278,6 +2278,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2321,6 +2323,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = __d_lookup(base, &this);
+ if (ret)
+ return ret;
+ /*
+ * __d_lookup() is used to try to get a quick answer and avoid the
+ * mutex. A false-negative does no harm.
+ */
+ ret = __d_lookup(base, &this);
+ if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+ dput(ret);
+ ret = NULL;
+ }
+ if (ret)
+ return ret;
+
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776d91..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb446..4212aaacbb55 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
- /*
- * check if we have crossed a mount point ...
- */
if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
--
2.5.0