Here's a series fixing issues with d_drop() on a directory dentry with
children, and adding support for such dropped directories in fuse.
I tested the fuse part, but not the other filesystems (AFS, GFS2, NFS, SYSFS).
The changes to filesystems are pretty trivial, though.
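Roughly, each filesystem's ->d_revalidate() error path changes along these
lines (a schematic sketch only, not taken verbatim from any one patch; the
label and the surrounding error handling are whatever the filesystem already
has):

	/* dentry found to be invalid */
	if (inode && S_ISDIR(inode->i_mode)) {
		/*
		 * Prune the subtree and unhash the dentry in one atomic
		 * operation.  This fails with -EBUSY if there are submounts,
		 * in which case the dentry is kept so that the mountpoint
		 * stays reachable.
		 */
		if (check_submounts_and_drop(dentry) != 0)
			goto still_valid;
	} else {
		d_drop(dentry);
	}
	return 0;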
Thanks,
Miklos
---
Anand Avati (1):
fuse: drop dentry on failed revalidate
Miklos Szeredi (3):
vfs: check submounts and drop atomically
vfs: check unlinked ancestors before mount
fuse: use d_materialise_unique()
---
fs/afs/dir.c | 6 +-
fs/dcache.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/fuse/dir.c | 76 ++++++++++-------------
fs/gfs2/dentry.c | 6 +-
fs/internal.h | 1 +
fs/namespace.c | 9 +++
fs/nfs/dir.c | 10 ++--
fs/sysfs/dir.c | 6 +-
include/linux/dcache.h | 1 +
9 files changed, 216 insertions(+), 58 deletions(-)
From: Anand Avati <[email protected]>
Drop a subtree when we find that it has moved or been deleted. This can be
done as long as there are no submounts under this location.
If the directory was moved and we come across the same directory in a
future lookup it will be reconnected by d_materialise_unique().
Signed-off-by: Anand Avati <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/dir.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 131d14b..4ba5893 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -226,8 +226,13 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (!err) {
struct fuse_inode *fi = get_fuse_inode(inode);
if (outarg.nodeid != get_node_id(inode)) {
+ int ret = 0;
+
+ if (check_submounts_and_drop(entry) != 0)
+ ret = 1;
+
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
- return 0;
+ return ret;
}
spin_lock(&fc->lock);
fi->nlookup++;
--
1.8.1.4
From: Miklos Szeredi <[email protected]>
Use d_materialise_unique() instead of d_splice_alias(). This allows dentry
subtrees to be moved to their new place if they have moved, even if something is
referencing a dentry in the subtree (open fd, cwd, etc.).
This will also allow us to drop a subtree if it is found to be replaced by
something else. In this case the disconnected subtree can later be
reconnected to its new location.
d_materialise_unique() ensures that a directory inode only ever has one
alias. We keep fc->inst_mutex around the calls to d_materialise_unique()
on directories to prevent a race with mkdir "stealing" the inode.
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/dir.c | 69 ++++++++++++++++++++++-------------------------------------
1 file changed, 26 insertions(+), 43 deletions(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 72a5d5b..131d14b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -267,26 +267,6 @@ int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
-/*
- * Add a directory inode to a dentry, ensuring that no other dentry
- * refers to this inode. Called with fc->inst_mutex.
- */
-static struct dentry *fuse_d_add_directory(struct dentry *entry,
- struct inode *inode)
-{
- struct dentry *alias = d_find_alias(inode);
- if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
- /* This tries to shrink the subtree below alias */
- fuse_invalidate_entry(alias);
- dput(alias);
- if (!hlist_empty(&inode->i_dentry))
- return ERR_PTR(-EBUSY);
- } else {
- dput(alias);
- }
- return d_splice_alias(inode, entry);
-}
-
int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
@@ -345,6 +325,24 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
return err;
}
+static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
+ struct inode *inode)
+{
+ struct dentry *newent;
+
+ if (inode && S_ISDIR(inode->i_mode)) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ mutex_lock(&fc->inst_mutex);
+ newent = d_materialise_unique(dentry, inode);
+ mutex_unlock(&fc->inst_mutex);
+ } else {
+ newent = d_materialise_unique(dentry, inode);
+ }
+
+ return newent;
+}
+
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
unsigned int flags)
{
@@ -352,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
struct fuse_entry_out outarg;
struct inode *inode;
struct dentry *newent;
- struct fuse_conn *fc = get_fuse_conn(dir);
bool outarg_valid = true;
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
@@ -368,16 +365,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (inode && get_node_id(inode) == FUSE_ROOT_ID)
goto out_iput;
- if (inode && S_ISDIR(inode->i_mode)) {
- mutex_lock(&fc->inst_mutex);
- newent = fuse_d_add_directory(entry, inode);
- mutex_unlock(&fc->inst_mutex);
- err = PTR_ERR(newent);
- if (IS_ERR(newent))
- goto out_iput;
- } else {
- newent = d_splice_alias(inode, entry);
- }
+ newent = fuse_materialise_dentry(entry, inode);
+ err = PTR_ERR(newent);
+ if (IS_ERR(newent))
+ goto out_err;
entry = newent ? newent : entry;
if (outarg_valid)
@@ -1275,18 +1266,10 @@ static int fuse_direntplus_link(struct file *file,
if (!inode)
goto out;
- if (S_ISDIR(inode->i_mode)) {
- mutex_lock(&fc->inst_mutex);
- alias = fuse_d_add_directory(dentry, inode);
- mutex_unlock(&fc->inst_mutex);
- err = PTR_ERR(alias);
- if (IS_ERR(alias)) {
- iput(inode);
- goto out;
- }
- } else {
- alias = d_splice_alias(inode, dentry);
- }
+ alias = fuse_materialise_dentry(dentry, inode);
+ err = PTR_ERR(alias);
+ if (IS_ERR(alias))
+ goto out;
if (alias) {
dput(dentry);
--
1.8.1.4
From: Miklos Szeredi <[email protected]>
We check submounts before doing d_drop() on a non-empty directory dentry in
NFS (have_submounts()). But we do not exclude a racing mount. Nor do we
prevent mounts from being added to the disconnected subtree using relative paths
after the d_drop().
This patch fixes these issues by checking for unlinked (unhashed, non-root)
ancestors before proceeding with the mount. This is done after setting
DCACHE_MOUNTED on the soon-to-be mountpoint and with the rename seqlock
taken for write. This ensures that only one of have_submounts() or
has_unlinked_ancestor() can succeed.
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/dcache.c | 28 ++++++++++++++++++++++++++++
fs/internal.h | 1 +
fs/namespace.c | 9 +++++++++
3 files changed, 38 insertions(+)
diff --git a/fs/dcache.c b/fs/dcache.c
index ba429d9..eae7cc1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1103,6 +1103,34 @@ rename_retry:
}
EXPORT_SYMBOL(have_submounts);
+static bool __has_unlinked_ancestor(struct dentry *dentry)
+{
+ struct dentry *this;
+
+ for (this = dentry; !IS_ROOT(this); this = this->d_parent) {
+ if (d_unhashed(this))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Called by mount code to check if the mountpoint is reachable (e.g. NFS can
+ * unhash a directory dentry and then the complete subtree can become
+ * unreachable).
+ */
+bool has_unlinked_ancestor(struct dentry *dentry)
+{
+ bool found;
+
+ /* Need exclusion wrt. check_submounts_and_drop() */
+ write_seqlock(&rename_lock);
+ found = __has_unlinked_ancestor(dentry);
+ write_sequnlock(&rename_lock);
+
+ return found;
+}
+
/*
* Search the dentry child list of the specified parent,
* and move any unused dentries to the end of the unused
diff --git a/fs/internal.h b/fs/internal.h
index 7c5f01c..d232355 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -126,6 +126,7 @@ extern int invalidate_inodes(struct super_block *, bool);
* dcache.c
*/
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
+extern bool has_unlinked_ancestor(struct dentry *dentry);
/*
* read_write.c
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b1ca9b..bb92a9c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -634,6 +634,15 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
}
dentry->d_flags |= DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
+
+ if (has_unlinked_ancestor(dentry)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_MOUNTED;
+ spin_unlock(&dentry->d_lock);
+ kfree(mp);
+ return ERR_PTR(-ENOENT);
+ }
+
mp->m_dentry = dentry;
mp->m_count = 1;
list_add(&mp->m_hash, chain);
--
1.8.1.4
From: Miklos Szeredi <[email protected]>
We check submounts before doing d_drop() on a non-empty directory dentry in
NFS (have_submounts()), but we do not exclude a racing mount.
Process A: have_submounts() -> returns false
Process B: mount() -> success
Process A: d_drop()
This patch prepares the ground for the fix by doing the following
operations all under the same rename lock:
have_submounts()
shrink_dcache_parent()
d_drop()
This is actually an optimization since have_submounts() and
shrink_dcache_parent() both traverse the same dentry tree separately.
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/afs/dir.c | 6 +--
fs/dcache.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/gfs2/dentry.c | 6 +--
fs/nfs/dir.c | 10 ++--
fs/sysfs/dir.c | 6 +--
include/linux/dcache.h | 1 +
6 files changed, 146 insertions(+), 14 deletions(-)
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 34494fb..968f50d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -687,14 +687,14 @@ not_found:
out_bad:
if (dentry->d_inode) {
/* don't unhash if we have submounts */
- if (have_submounts(dentry))
+ if (check_submounts_and_drop(dentry) != 0)
goto out_skip;
+ } else {
+ d_drop(dentry);
}
_debug("dropping dentry %s/%s",
parent->d_name.name, dentry->d_name.name);
- shrink_dcache_parent(dentry);
- d_drop(dentry);
dput(parent);
key_put(key);
diff --git a/fs/dcache.c b/fs/dcache.c
index 87bdb53..ba429d9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1224,6 +1224,137 @@ void shrink_dcache_parent(struct dentry * parent)
}
EXPORT_SYMBOL(shrink_dcache_parent);
+static int __check_submounts_and_drop(struct dentry *parent,
+ struct list_head *dispose)
+{
+ struct dentry *this_parent;
+ struct list_head *next;
+ unsigned seq;
+ int found = 0;
+ int locked = 0;
+
+ seq = read_seqbegin(&rename_lock);
+again:
+ this_parent = parent;
+ spin_lock(&this_parent->d_lock);
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry;
+
+ dentry = list_entry(tmp, struct dentry, d_u.d_child);
+ next = tmp->next;
+
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ if (d_mountpoint(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ found = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * move only zero ref count dentries to the dispose list.
+ *
+ * Those which are presently on the shrink list, being processed
+ * by shrink_dentry_list(), shouldn't be moved. Otherwise the
+ * loop in shrink_dcache_parent() might not make any progress
+ * and loop forever.
+ */
+ if (dentry->d_count) {
+ dentry_lru_del(dentry);
+ } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+ dentry_lru_move_list(dentry, dispose);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ found++;
+ }
+ /*
+ * We can return to the caller if we have found some (this
+ * ensures forward progress). We'll be coming back to find
+ * the rest.
+ */
+ if (found && need_resched()) {
+ spin_unlock(&dentry->d_lock);
+ goto out;
+ }
+
+ /*
+ * Descend a level if the d_subdirs list is non-empty.
+ */
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+ this_parent = dentry;
+ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
+ goto repeat;
+ }
+
+ spin_unlock(&dentry->d_lock);
+ }
+ /*
+ * All done at this level ... ascend and resume the search.
+ */
+ if (this_parent != parent) {
+ struct dentry *child = this_parent;
+ this_parent = try_to_ascend(this_parent, locked, seq);
+ if (!this_parent)
+ goto rename_retry;
+ next = child->d_u.d_child.next;
+ goto resume;
+ }
+out:
+ if (!locked && read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&this_parent->d_lock);
+ goto rename_retry;
+ }
+ __d_drop(this_parent);
+ spin_unlock(&this_parent->d_lock);
+
+ if (locked)
+ write_sequnlock(&rename_lock);
+ return found;
+
+
+rename_retry:
+ if (found)
+ return found;
+ if (locked)
+ goto again;
+ locked = 1;
+ write_seqlock(&rename_lock);
+ goto again;
+}
+
+/**
+ * check_submounts_and_drop - prune dcache, check for submounts and drop
+ *
+ * All done as a single atomic operation relative to has_unlinked_ancestor().
+ * Returns 0 if successfully unhashed @parent. If there were submounts then
+ * return -EBUSY.
+ *
+ * @dentry: dentry to prune and drop
+ */
+int check_submounts_and_drop(struct dentry *dentry)
+{
+ int found;
+
+ for (;;) {
+ LIST_HEAD(dispose);
+ found = __check_submounts_and_drop(dentry, &dispose);
+ if (!list_empty(&dispose))
+ shrink_dentry_list(&dispose);
+
+ if (found <= 0)
+ break;
+
+ cond_resched();
+ }
+
+ return found;
+}
+EXPORT_SYMBOL(check_submounts_and_drop);
+
/**
* __d_alloc - allocate a dcache entry
* @sb: filesystem it will belong to
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index f2448ab..6964725 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -94,11 +94,11 @@ invalid_gunlock:
gfs2_glock_dq_uninit(&d_gh);
invalid:
if (inode && S_ISDIR(inode->i_mode)) {
- if (have_submounts(dentry))
+ if (check_submounts_and_drop(dentry) != 0)
goto valid;
- shrink_dcache_parent(dentry);
+ } else {
+ d_drop(dentry);
}
- d_drop(dentry);
dput(parent);
return 0;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e474ca2b..a2fd681 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1135,14 +1135,14 @@ out_zap_parent:
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
nfs_zap_caches(inode);
- /* If we have submounts, don't unhash ! */
- if (have_submounts(dentry))
- goto out_valid;
if (dentry->d_flags & DCACHE_DISCONNECTED)
goto out_valid;
- shrink_dcache_parent(dentry);
+ /* If we have submounts, don't unhash ! */
+ if (check_submounts_and_drop(dentry) != 0)
+ goto out_valid;
+ } else {
+ d_drop(dentry);
}
- d_drop(dentry);
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e068e74..1778320 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -348,11 +348,11 @@ out_bad:
* to lie about the state of the filesystem to prevent
* leaks and other nasty things.
*/
- if (have_submounts(dentry))
+ if (check_submounts_and_drop(dentry) != 0)
goto out_valid;
- shrink_dcache_parent(dentry);
+ } else {
+ d_drop(dentry);
}
- d_drop(dentry);
return 0;
}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b90337c..41b21ca 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -251,6 +251,7 @@ extern void d_prune_aliases(struct inode *);
/* test whether we have any submounts in a subdir tree */
extern int have_submounts(struct dentry *);
+extern int check_submounts_and_drop(struct dentry *);
/*
* This adds the entry to the hash queues.
--
1.8.1.4
On Tue, Aug 06, 2013 at 04:30:01PM +0200, Miklos Szeredi wrote:
> From: Miklos Szeredi <[email protected]>
>
> We check submounts before doing d_drop() on a non-empty directory dentry in
> NFS (have_submounts()). But we do not exclude a racing mount. Nor do we
> prevent mounts from being added to the disconnected subtree using relative paths
> after the d_drop().
>
> This patch fixes these issues by checking for unlinked (unhashed, non-root)
> ancestors before proceeding with the mount. This is done after setting
> DCACHE_MOUNTED on the soon-to-be mountpoint and with the rename seqlock
> taken for write. This ensures that only one of have_submounts() or
> has_unlinked_ancestor() can succeed.
>
> Signed-off-by: Miklos Szeredi <[email protected]>
> ---
> fs/dcache.c | 28 ++++++++++++++++++++++++++++
> fs/internal.h | 1 +
> fs/namespace.c | 9 +++++++++
> 3 files changed, 38 insertions(+)
>
> diff --git a/fs/dcache.c b/fs/dcache.c
> index ba429d9..eae7cc1 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -1103,6 +1103,34 @@ rename_retry:
> }
> EXPORT_SYMBOL(have_submounts);
>
> +static bool __has_unlinked_ancestor(struct dentry *dentry)
> +{
> + struct dentry *this;
> +
> + for (this = dentry; !IS_ROOT(this); this = this->d_parent) {
> + if (d_unhashed(this))
Actually we need to take ->d_lock around this d_unhashed(), since the __d_drop()
is done outside the read_seqbegin/read_seqretry() region but with d_lock
held.
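Something like the following (an untested sketch on top of this patch) is what
I have in mind; the walk stays the same, only the check moves under ->d_lock:

	static bool __has_unlinked_ancestor(struct dentry *dentry)
	{
		struct dentry *this;

		for (this = dentry; !IS_ROOT(this); this = this->d_parent) {
			bool unhashed;

			/* __d_drop() unhashes under ->d_lock, so test under it too */
			spin_lock(&this->d_lock);
			unhashed = d_unhashed(this);
			spin_unlock(&this->d_lock);

			if (unhashed)
				return true;
		}
		return false;
	}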
Thanks,
Miklos
> + return true;
> + }
> + return false;
> +}
> +
> +/*
> + * Called by mount code to check if the mountpoint is reachable (e.g. NFS can
> + * unhash a directory dentry and then the complete subtree can become
> + * unreachable).
> + */
> +bool has_unlinked_ancestor(struct dentry *dentry)
> +{
> + bool found;
> +
> + /* Need exclusion wrt. check_submounts_and_drop() */
> + write_seqlock(&rename_lock);
> + found = __has_unlinked_ancestor(dentry);
> + write_sequnlock(&rename_lock);
> +
> + return found;
> +}
> +
> /*
> * Search the dentry child list of the specified parent,
> * and move any unused dentries to the end of the unused
> diff --git a/fs/internal.h b/fs/internal.h
> index 7c5f01c..d232355 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -126,6 +126,7 @@ extern int invalidate_inodes(struct super_block *, bool);
> * dcache.c
> */
> extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
> +extern bool has_unlinked_ancestor(struct dentry *dentry);
>
> /*
> * read_write.c
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 7b1ca9b..bb92a9c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -634,6 +634,15 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
> }
> dentry->d_flags |= DCACHE_MOUNTED;
> spin_unlock(&dentry->d_lock);
> +
> + if (has_unlinked_ancestor(dentry)) {
> + spin_lock(&dentry->d_lock);
> + dentry->d_flags &= ~DCACHE_MOUNTED;
> + spin_unlock(&dentry->d_lock);
> + kfree(mp);
> + return ERR_PTR(-ENOENT);
> + }
> +
> mp->m_dentry = dentry;
> mp->m_count = 1;
> list_add(&mp->m_hash, chain);
> --
> 1.8.1.4
>
On 8/6/13 7:30 AM, Miklos Szeredi wrote:
> From: Anand Avati <[email protected]>
>
> Drop a subtree when we find that it has moved or been deleted. This can be
> done as long as there are no submounts under this location.
>
> If the directory was moved and we come across the same directory in a
> future lookup it will be reconnected by d_materialise_unique().
>
> Signed-off-by: Anand Avati <[email protected]>
> Signed-off-by: Miklos Szeredi <[email protected]>
> ---
> fs/fuse/dir.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 131d14b..4ba5893 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -226,8 +226,13 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
> if (!err) {
> struct fuse_inode *fi = get_fuse_inode(inode);
> if (outarg.nodeid != get_node_id(inode)) {
> + int ret = 0;
> +
> + if (check_submounts_and_drop(entry) != 0)
> + ret = 1;
> +
> fuse_queue_forget(fc, forget, outarg.nodeid, 1);
> - return 0;
> + return ret;
If outarg.nodeid != get_node_id(inode), then we have to return 0 no
matter what (whether we successfully dropped the entry or not), no? Or
are you trying to forcefully keep the path to reach the submount alive?
If so, we still fail in inode_permission() .. -> getattr() of the dir
inode, no?
Avati
> }
> spin_lock(&fc->lock);
> fi->nlookup++;
>
On Tue, Aug 6, 2013 at 10:06 PM, Anand Avati <[email protected]> wrote:
> On 8/6/13 7:30 AM, Miklos Szeredi wrote:
>>
>> From: Anand Avati <[email protected]>
>>
>> Drop a subtree when we find that it has moved or been deleted. This can
>> be
>> done as long as there are no submounts under this location.
>>
>> If the directory was moved and we come across the same directory in a
>> future lookup it will be reconnected by d_materialise_unique().
>>
>> Signed-off-by: Anand Avati <[email protected]>
>> Signed-off-by: Miklos Szeredi <[email protected]>
>> ---
>> fs/fuse/dir.c | 7 ++++++-
>> 1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
>> index 131d14b..4ba5893 100644
>> --- a/fs/fuse/dir.c
>> +++ b/fs/fuse/dir.c
>> @@ -226,8 +226,13 @@ static int fuse_dentry_revalidate(struct dentry
>> *entry, unsigned int flags)
>> if (!err) {
>> struct fuse_inode *fi = get_fuse_inode(inode);
>> if (outarg.nodeid != get_node_id(inode)) {
>> + int ret = 0;
>> +
>> + if (check_submounts_and_drop(entry) != 0)
>> + ret = 1;
>> +
>> fuse_queue_forget(fc, forget,
>> outarg.nodeid, 1);
>> - return 0;
>> + return ret;
>
>
> If outarg.nodeid != get_node_id(inode), then we have to return 0 no matter
> what (whether we successfully dropped the entry or not), no?
If we return 0 in that case (we failed to invalidate the dentry), then
the VFS will call d_invalidate() which will fail. The result is the
same...
> Or are you
> trying to forcefully keep the path to reach the submount alive? If so, we
> still fail in inode_permission() .. -> getattr() of the dir inode, no?
Yes. But the path to the mountpoint should still be reachable (for
the purpose of unmounting for example). I'm including an interesting
discussion between Al and Linus about this (mailing lists weren't
CC-d, but I don't think they'd mind).
BTW, the issue that non-directory mountpoints are dropped by NFS and
friends is not addressed by my previous patchset. Updated patches
coming up.
Thanks,
Miklos
Subject: [heads-up] breakage with revalidate on NFS and elsewhere
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
1) In NFS ->d_revalidate() we blindly evict non-directories from
dcache. So does d_invalidate(). Which will leave anything bound on
the file in question unreachable. It's not a complete leak (e.g. umount -l
or death of namespace will still evict those), but it's certainly a bug
and one with potential for rather unhappy admin.
Note that there's no reason whatsoever to do that d_drop() in case of
non-directories; the only possible caller (do_revalidate(); the other
call site is for directories only) will call d_invalidate(), which will
drop them itself.
d_invalidate() is more interesting; the minimal fix is to have it check
d_mounted and if it's non-zero - grab namespace_sem, find all vfsmounts
with this ->mnt_root, umount_tree() for all of those, drop namespace_sem,
then release all collected vfsmounts. What's more, we probably want to
extend that to directories; the same thing could be done for all children
with non-zero d_mounted, killing the "has submounts" logics in NFS revalidate.
It's not even hard to implement - all we need is a secondary hash chains
going through vfsmounts, keyed by ->mnt_mountpoint alone. That would be
enough (alternative would be to put them on a cyclic list anchored in dentry,
but that'd lead to much worse memory waste since for almost all dentries
the list would be empty).
_However_, there's a secondary issue with d_invalidate() callers. What
happens to the "case-insensitive" crap? Suppose we have something mounted
on /mnt/foo/bar, with /mnt/foo/bar being on VFAT. Somebody wants to open
/mnt/foo/BaR; what should that do to mountpoint?
Current behaviour is
a) if it's a directory, have lookup return /mnt/foo/bar, case be
damned.
b) if it's a non-directory, leak the vfsmount(s), return dentry
with new name.
IMO we should _NOT_ make any vfsmounts unreachable in that case; too obvious
abuse potential. The only question is whether to have invalidation simply
fail (i.e. case (a) for everything) or to try and flip ->mnt_mountpoint in
them to the "replacement" dentry. I think that the former is the right
answer. In any case, this means splitting d_invalidate() in two variants
(unmounting and non-unmounting).
We also need to review other __d_drop()/d_drop() users - potentially they
might need the same kind of treatment ;-/
2) NFS4 ->d_revalidate() is too bloody eager to bypass everything
bypassable; as the result, if you have a something bound on top of file
and attempt to open it, the damn thing will blindly try to open _underlying_
file. You either get that file opened (and nameidata_to_filp() will return it,
nevermind where had that binding lead pathname resolution) or fail (e.g. if
it's not readable for you) and get open() failure.
That one is nasty. We certainly can notice that there's a vfsmount
on top of the sucker - this happens only on the last step of pathname
resolution in do_filp_open(). However, what would we do once we find it?
We can't just skip ->d_revalidate(). I suspect that the cheapest way is
to indicate in nd->flags that it shouldn't attempt atomic open (e.g. by
removing LOOKUP_OPEN for the duration) and leave the rest alone. Longer
term solution, of course, is to split "atomic open" path away into a
separate method, with fallback to ->d_revalidate/->lookup/->create/->open
combination if it's not there *OR* if we have something bound on top of
the sucker (and just want to see if it's stale and binding should be
kicked out). In either case, we really have to split the last step of
lookup away from path_lookup_open(); O_CREAT path is already split
that way.
3) while we are at it (== sorting out the intents patch pile),
there's fun with passing flags through nameidata; on MIPS O_SYNC == FMODE_EXEC,
with everything that implies. Namely, open() a file on nfs4 with O_SYNC,
get it to fail with -EACCES if it's not executable. Moral: don't mix f_flags
with f_mode. FWIW, as an intermediate step I'm
a) adding LOOKUP_EXCL in addition to LOOKUP_CREATE/LOOKUP_OPEN, with
callers that set O_EXCL in ->intent.flags setting it in ->flags.
b) setting ->f_mode and ->f_flags of intent.file from the very
beginning and switching the checks in filesystems to _that_; longer term
we'll simply pass that struct file * directly to methods. This particular
check (in nfs4_intent_set_file()) goes to ->f_mode, of course...
4) more in the same pile: open_exec() ignores leases. Solution:
switch the damn thing to do_filp_open(); makes life interesting wrt the
arguments that need to be passed to the latter, but that's survivable.
That's probably a less immediate mess, though...
5) v9fs_vfs_create() ignores implicit O_EXCL when called from
mknod(2) (i.e. without LOOKUP_OPEN). IMO that's a clear bug, fortunately
easy to fix.
Comments?
PS: ->d_revalidate()/d_invalidate() stuff is probably the oldest surviving
serious mess in VFS, with fs/locks.c being a distant second; I'd never
quite got the courage to sort it out until now and AFAICS neither did
Bill Hawes before my time. So it had festered for more than a decade...
-------------------------------------------------------------------------------
From: Linus Torvalds <[email protected]>
On Tue, 5 Aug 2008, Al Viro wrote:
>
> 1) In NFS ->d_revalidate() we blindly evict non-directories from
> dcache. So does d_invalidate(). Which will leave anything bound on
> the file in question unreachable.
Hmm. If something is bound on top of it, how would we ever even _reach_
->d_revalidate()? We never revalidate mount-points, afaicr ("can
remember", I'm too dang lazy to look it up).
So in _practice_, this can only happen with private mounts and namespaces
(ie we don't happen to have it bound in one namespace, so we revalidate,
and thus screw up the other namespace), right?
> d_invalidate() is more interesting; the minimal fix is to have it check
> d_mounted and if it's non-zero - grab namespace_sem, find all vfsmounts
> with this ->mnt_root, umount_tree() for all of those, drop namespace_sem,
> then release all collected vfsmounts. What's more, we probably want to
> extend that to directories; the same thing could be done for all children
> with non-zero d_mounted, killing the "has submounts" logics in NFS revalidate.
Wouldn't it be better to do it the other way around instead, and extend
the "don't even bother to revalidate" to anything that is mounted on in
any namespace?
In particular, "d_invalidate()" is *not* designed to be "this dentry is
bad, we have to throw it away". No, it's designed for "this dentry _may_
be stale, throw it away and try to look it up again".
And that means that doing a unmount_tree() on them is WRONG. Because
you're throwing out mount-points not on knowledge that they have to be
thrown out, but on just the suspicion that maybe we should update some
cached data.
Linus
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
On Tue, Aug 05, 2008 at 10:31:06AM -0700, Linus Torvalds wrote:
>
>
> On Tue, 5 Aug 2008, Al Viro wrote:
> >
> > 1) In NFS ->d_revalidate() we blindly evict non-directories from
> > dcache. So does d_invalidate(). Which will leave anything bound on
> > the file in question unreachable.
>
> Hmm. If something is bound on top of it, how would we ever even _reach_
> ->d_revalidate()? We never revalidate mount-points, afaicr ("can
> remember", I'm too dang lazy to look it up).
>
> So in _practice_, this can only happen with private mounts and namespaces
> (ie we don't happen to have it bound in one namespace, so we revalidate,
> and thus screw up the other namespace), right?
No. We do lookup (d_lookup + revalidate + ->lookup, possibly) and only
then try to cross the mountpoint.
> Wouldn't it be better to do it the other way around instead, and extend
> the "don't even bother to revalidate" to anything that is mounted on in
> any namespace?
>
> In particular, "d_invalidate()" is *not* designed to be "this dentry is
> bad, we have to throw it away". No, it's designed for "this dentry _may_
> be stale, throw it away adn try to look it up again".
>
> And that means that doing a unmount_tree() on them is WRONG. Because
> you're throwing out mount-points not on knowledge that they have to be
> thrown out, but on just the suspicion that maybe we should update some
> cached data.
That sure as hell doesn't match what ->d_revalidate() instances are doing...
It's not "I suspect it might be bogus"; it's pretty hard "I checked if
it's still OK and it's buggered, dead and buried, not necessary in that
order".
-------------------------------------------------------------------------------
From: Linus Torvalds <[email protected]>
On Tue, 5 Aug 2008, Al Viro wrote:
> >
> > So in _practice_, this can only happen with private mounts and namespaces
> > (ie we don't happen to have it bound in one namespace, so we revalidate,
> > and thus screw up the other namespace), right?
>
> No. We do lookup (d_lookup + revalidate + ->lookup, possibly) and only
> then try to cross the mountpoint.
Ouch. So mounts are already broken on NFS, but nobody probably noticed
because it only triggers for regular files (ie single-file bind mounts) by
default.
> That sure as hell doesn't match what ->d_revalidate() instances are doing...
> It's not "I suspect it might be bogus"; it's pretty hard "I checked if
> it's still OK and it's buggered, dead and buried, not necessary in that
> order".
I agree that the ones that do "d_drop" are clearly broken, and they should
just return the right error value and depend on the caller to DTRT. I was
just arguing against your suggested fix. Yes, we should check whether it's
a mount-point, and just effectively ignore the d_revalidate() if so.
(And yes, we should only do this _after_ d_revalidate() has returned
"don't use it", so 99% of your suggested fix is correct - I just don't
think that the "unmount it" is the correct action, we should just ignore
the d_revalidate failure)
Linus
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
On Tue, Aug 05, 2008 at 11:32:37AM -0700, Linus Torvalds wrote:
> > That sure as hell doesn't match what ->d_revalidate() instances are doing...
> > It's not "I suspect it might be bogus"; it's pretty hard "I checked if
> > it's still OK and it's buggered, dead and buried, not necessary in that
> > order".
>
> I agree that the ones that do "d_drop" are clearly broken, and they should
> just return the right error value and depend on the caller to DTRT. I was
> just arguing against your suggested fix. Yes, we should check whether it's
> a mount-point, and just effectively ignore the d_revalidate() if so.
>
> (And yes, we should only do this _after_ d_revalidate() has returned
> "don't use it", so 99% of your suggested fix is correct - I just don't
> think that the "unmount it" is the correct action, we should just ignore
> the d_revalidate failure)
Either I'm misunderstanding you, or NFS folks will be very unhappy with
that. Think of the following scenario: something is mounted over NFS
in two places (/mnt/something and /jail/something_else). We have /dev/null
bound on top of /jail/something_else/sensitive. That file is removed on
server; we attempt to open /mnt/something/sensitive and instead of expected
-ENOENT we get 100% persistent -ESTALE, simply because there's nothing
to open. Worse, after the file had been recreated on server we _still_ get
the same error. Indefinitely.
And yes, we have the same issue for directories. If server's /home/foo
is mounted on /mnt/foo, some other fs is mounted on /mnt/foo/bar/baz and
server does rm -rf /home/foo/bar, we are stuck with /mnt/foo/bar giving
us -ESTALE. Moreover, if server does mkdir /home/foo/bar and we do
ls /mnt/foo, we'll see bar. But attempt to stat it will keep giving us
-ESTALE. _MOREOVER_, umount(8) on /mnt/foo/bar/baz will be screwed
*anyway*, without bindings, namespaces or anything of that kind. And
the only way out of that will be umount -l /mnt/foo or direct call of
umount(2), with no stat() or anything of that kind; nothing short of
that will make /mnt/foo go away and nothing short of that will make whatever
we had mounted on /mnt/foo/bar/baz not busy.
-------------------------------------------------------------------------------
From: Linus Torvalds <[email protected]>
On Tue, 5 Aug 2008, Al Viro wrote:
>
> Either I'm misunderstanding you, or NFS folks will be very unhappy with
> that. Think of the following scenario: something is mounted over NFS
> in two places (/mnt/something and /jail/something_else). We have /dev/null
> bound on top of /jail/something_else/sensitive. That file is removed on
> server; we attempt to open /mnt/something/sensitive and instead of expected
> -ENOENT we get 100% persistent -ESTALE, simply because there's nothing
> to open. Worse, after the file had been recreated on server we _still_ get
> the same error. Indefinitely.
Umm. If somebody uses it as a mount-point, that's still better than
dropping the mount.
Deal with it. If people think it's problematic, then they shouldn't have
mounted something on top of it.
We cannot just auto-unmount. That's _worse_ than the ESTALE issue.
Linus
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
On Tue, Aug 05, 2008 at 12:07:07PM -0700, Linus Torvalds wrote:
>
>
> On Tue, 5 Aug 2008, Al Viro wrote:
> >
> > Either I'm misunderstanding you, or NFS folks will be very unhappy with
> > that. Think of the following scenario: something is mounted over NFS
> > in two places (/mnt/something and /jail/something_else). We have /dev/null
> > bound on top of /jail/something_else/sensitive. That file is removed on
> > server; we attempt to open /mnt/something/sensitive and instead of expected
> > -ENOENT we get 100% persistent -ESTALE, simply because there's nothing
> > to open. Worse, after the file had been recreated on server we _still_ get
> > the same error. Indefinitely.
>
> Umm. If somebody uses it as a mount-point, that's still better than
> dropping the mount.
>
> Deal with it. If people think it's problematic, then they shouldn't have
> mounted something on top of it.
>
> We cannot just auto-unmount. That's _worse_ than the ESTALE issue.
Why? If anything, I can buy something along the lines of "postpone killing
until lookup from vfsmount that doesn't have anything mounted on it, then
do normal ->lookup() and relocate whatever had been mounted on it to new
dentry, killing the old one". That would give obviously good behaviour
(stuff is visible where bound and remains visible all along, lookup in
any other instance gives expected behaviour and as soon as that happens we
get the situation completely resolved).
However, it still leaves us lousy behaviour for case without any bindings.
What do you expect to happen in such case, anyway? A chain of zombie
directories leading to mountpoint? With any operation on those except
the lookup by exact path resulting in -ESTALE? Including ls on any of
those suckers, BTW.
So why is auto-unmount bad in case when directories in question are
really, honestly dead and gone?
FWIW, what _is_ the semantics you expect from ->d_revalidate()? Perhaps
we'd be better off if we simply get rid of cases when ->d_revalidate()
returns 0 just in case; I see only VFAT playing such games on positive
dentries.
Note that _negative_ ones are different; we often return "redo lookup just
in case", but that's fine - nothing is mounted in them or under them.
-------------------------------------------------------------------------------
From: Linus Torvalds <[email protected]>
On Tue, 5 Aug 2008, Al Viro wrote:
> On Tue, Aug 05, 2008 at 12:07:07PM -0700, Linus Torvalds wrote:
> >
> > We cannot just auto-unmount. That's _worse_ than the ESTALE issue.
>
> Why?
.. because the kernel shouldn't undo the kinds of decisions that the user
has explicitly asked it to do.
If the user mounted something, the kernel should not just unmount it
willy nilly. That's *FUNDAMENTALLY WRONG*. It's so wrong that I don't
even understand how you can _possibly_ ask "why?". It's obviously wrong.
For example, you did a totally made-up example of something that nobody
would actually ever do, and used it as an example of why we'd have to work
the way you suggest. I will counter-act with
- first off, even in your unrealistic example, if somebody mounted it,
you can't just remove it - let the other place get ESTALE, and let the
_user_ unmount it if required, but it's very possible that the thing
was mounted on top of a file because of security issues, for example.
Let's say that you have some http server, and the file that got
over-mounted was /etc/{password/shadow} for the server's namespace.
Let's say that the ESTALE happened because somebody changed the
passwords on the server (which does a "rename" on top of the old
files). Does that mean that suddenly the client that had been set up to
explicitly hide those files should suddenly see the real contents?
You're much better off getting a failure on all points than to expose
one of them!
See? I can top your unrealistic example with another one - and it all
boils down to the fact that you should _never_ _ever_ override user choice
in the kernel (and that 'mount' was very much a user choice).
But the more _realistic_ case is simply the case where the thing was just
mounted in one place to begin with, and the ESTALE never happens at all!
So the other case is:
- mount in just one place. That one place gets ESTALE, but who the f*ck
cares, because nobody will see the native file underneath _anyway_,
because it's overmounted by something else!
In this case, your "solution" breaks the overmount, with no actual
advantages. There are no ESTALE returns to even protect against. The
ESTALE is immaterial - what is (once more) material is that the user
mounted a file on top of another one, and we want to see the _mounted_
contents.
> So why is auto-unmount bad in case when directories in question are
> really, honestly dead and gone?
.. because they aren't gone locally. We can happily continue to look
through them because the _mounted_ stuff still works. The fact that the
mount-point is gone doesn't change the fact that we have a local mount.
Linus
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
On Tue, Aug 05, 2008 at 12:38:35PM -0700, Linus Torvalds wrote:
> > So why is auto-unmount bad in case when directories in question are
> > really, honestly dead and gone?
>
> .. because they aren't gone locally. We can happily continue to look
> through them because they _mounted_ stuff still works. The fact that the
> mount-point is gone doesn't change the fact that we have a local mount.
Um. Again, just one directory can be taken care of; I agree with that.
But what do you do when _ancestors_ are gone?
The mountpoint is covered; we don't care about its contents. Fine. What
about 5 levels of directories leading to it, all crapped out? We can
walk through them, but we can't do anything _else_ - not stat(2), not
readdir(2), nothing. Again, in this case you have no bindings, etc. -
just something mounted on directory deep in NFS tree, with all the
branch leading to mountpoint being dead and gone on server.
What kind of state do you want to get after that?
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
BTW, there's another fun side to it. What do you do when the
kernel itself removes the mountpoint?
I'm not even talking about the fun with somebody creating mounts
under /proc/<pid>; results seem to be amusing, to put it mildly. But under
/sys it's not that unthinkable and it leads to the same kind of mess -
forced d_drop(), this time regardless of whether it's a directory or not.
And then there are callers of d_delete() similar to those (configfs, etc.).
What do you suggest for those?
-------------------------------------------------------------------------------
From: Al Viro <[email protected]>
On Wed, Aug 06, 2008 at 02:16:55AM +0100, Al Viro wrote:
> BTW, there's another fun side to it. What do you do when the
> kernel itself removes the mountpoint?
>
> I'm not even talking about the fun with somebody creating mounts
> under /proc/<pid>; results seem to be amusing, to put it mildly. But under
> /sys it's not that unthinkable and it leads to the same kind of mess -
> forced d_drop(), this time regardless of whether it's a directory or not.
> And then there are callers of d_delete() similar to those (configfs, etc.).
>
> What do you suggest for those?
BTW, after looking at that stuff I'd say that there's a couple of common
helpers asking to be added - for adding and removing objects in such
trees, with callback for setting up an inode in case of addition. Would
cut down on the amount of weird callers of lookup_one_len(), while we
are at it...
Another thing: nfs4 handling of open_flags is _brittle_. I've done a bit
of digging today, dumped a couple of found bugs on steved, but more serious
code review will have to wait for a while... I'd really appreciate a
braindump on locking for states and delegations, BTW.
-------------------------------------------------------------------------------
From: Linus Torvalds <[email protected]>
On Tue, 5 Aug 2008, Al Viro wrote:
>
> Um. Again, just one directory can be taken care of; I agree with that.
> But what do you do when _ancestors_ are gone?
There's one pretty simple approach: we could just add a "pinning count" to
the dentry, and make any mount increment the count for the target and for
all ancestors. Of course, that would complicate reparenting a bit etc, but
it would certainly trivially make sure that the mount structure is always
around.
That said, no, I'm not convinced it's at all worth it, especially if the
end result ends up being that yes, you can walk that particular path, but
not do anything else with it. On the other hand, maybe that end result
isn't so horrible. After all, what _is_ the meaning of a mounted thing
when the directory structure has been changed without the kernel knowing
what happened?
Linus
On 8/7/13 8:44 AM, Miklos Szeredi wrote:
> On Tue, Aug 6, 2013 at 10:06 PM, Anand Avati <[email protected]> wrote:
>> On 8/6/13 7:30 AM, Miklos Szeredi wrote:
>>>
>>> From: Anand Avati <[email protected]>
>>>
>>> Drop a subtree when we find that it has moved or been deleted. This can
>>> be
>>> done as long as there are no submounts under this location.
>>>
>>> If the directory was moved and we come across the same directory in a
>>> future lookup it will be reconnected by d_materialise_unique().
>>>
>>> Signed-off-by: Anand Avati <[email protected]>
>>> Signed-off-by: Miklos Szeredi <[email protected]>
>>> ---
>>> fs/fuse/dir.c | 7 ++++++-
>>> 1 file changed, 6 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
>>> index 131d14b..4ba5893 100644
>>> --- a/fs/fuse/dir.c
>>> +++ b/fs/fuse/dir.c
>>> @@ -226,8 +226,13 @@ static int fuse_dentry_revalidate(struct dentry
>>> *entry, unsigned int flags)
>>> if (!err) {
>>> struct fuse_inode *fi = get_fuse_inode(inode);
>>> if (outarg.nodeid != get_node_id(inode)) {
>>> + int ret = 0;
>>> +
>>> + if (check_submounts_and_drop(entry) != 0)
>>> + ret = 1;
>>> +
>>> fuse_queue_forget(fc, forget,
>>> outarg.nodeid, 1);
>>> - return 0;
>>> + return ret;
>>
>>
>> If outarg.nodeid != get_node_id(inode), then we have to return 0 no matter
>> what (whether we successfully dropped the entry or not), no?
>
> If we return 0 in that case (we failed to invalidate the dentry), then
> the VFS will call d_invalidate() which will fail. The result is the
> same...
>
>> Or are you
>> trying to forcefully keep the path to reach the submount alive? If so, we
>> still fail in inode_permission() .. -> getattr() of the dir inode, no?
>
> Yes. But the path to the mountpoint should still be reachable (for
> the purpose of unmounting for example). I'm including an interesting
> discussion between Al and Linus about this (mailing lists weren't
> CC-d, but I don't think they'd mind).
Thanks for attaching the thread. Was very educative! I still do not
quite understand - will umount() still work when
inode_permission()[->getattr()] on the ancestors fails (with ESTALE
etc.)? Wouldn't path resolution itself abort and fail, and therefore
do_umount() would never be called? I understand that the path to the mountpoint
being reachable through the dentry chain is a necessity for umounting,
but is just that really sufficient?
Avati
On Wed, Aug 7, 2013 at 6:31 PM, Anand Avati <[email protected]> wrote:
> Thanks for attaching the thread. Was very educative! I still do not quite
> understand - will umount() still work when inode_permission()[->getattr()]
> on the ancestors fail (with ESTALE etc.)?
Several cases:
1) mountpoint was removed
2) mountpoint's ancestor removed
3) mountpoint or ancestor moved
1) and 3) would only cause revalidate to fail, not inode operations
(permission or getattr) since the inodes of ancestors still exist.
2) could error out if the ->permission got to the server. I'm not
sure it will and I don't have an NFS setup handy to test.
So it doesn't necessarily work but it can be made to work, and as
Linus pointed out, it might still be the least worst option.
BTW, attaching a little fuse filesystem that I've been testing this
with. It's read-only, but the point here is that the tree is modified
*externally*, so that's not an issue.
Thanks,
Miklos