2022-08-01 00:34:06

by NeilBrown

[permalink] [raw]
Subject: [PATCH] NFS: don't unhash dentry during unlink/rename


NFS unlink() (and rename over existing target) must determine if the
file is open, and must perform a "silly rename" instead of an unlink (or
before rename) if it is. Otherwise the client might hold a file open
which has been removed on the server.

Consequently if it determines that the file isn't open, it must block
any subsequent opens until the unlink/rename has been completed on the
server.

This is currently achieved by unhashing the dentry. This forces any
open attempt to the slow-path for lookup which will block on i_rwsem on
the directory until the unlink/rename completes. A future patch will
change the VFS to only get a shared lock on i_rwsem for unlink, so this
will no longer work.

Instead we introduce an explicit interlock. A special value is stored
in dentry->d_fsdata while the unlink/rename is running and
->d_revalidate blocks while that value is present. When ->d_revalidate
unblocks, the dentry will be invalid. This closes the race
without requiring exclusion on i_rwsem.

d_fsdata is already used in two different ways.
1/ an IS_ROOT directory dentry might have a "devname" stored in
d_fsdata. Such a dentry doesn't have a name and so cannot be the
target of unlink or rename. For safety we check if an old devname
is still stored, and remove it if it is.
2/ a dentry with DCACHE_NFSFS_RENAMED set will have a 'struct
nfs_unlinkdata' stored in d_fsdata. While this is set maydelete()
will fail, so an unlink or rename will never proceed on such
a dentry.

Neither of these can be in effect when a dentry is the target of unlink
or rename. So we can expect d_fsdata to be NULL, and store a special
value ((void*)1) which is given the name NFS_FSDATA_BLOCKED to indicate
that any lookup will be blocked.

The d_count() is incremented under d_lock() when a lookup finds the
dentry, so we check d_count() is low, and set NFS_FSDATA_BLOCKED under
the same lock to avoid any races.

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++-----------
include/linux/nfs_fs.h | 9 ++++++
2 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0c4e8dd6aa96..a988dc9a5c92 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1778,6 +1778,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
int ret;

if (flags & LOOKUP_RCU) {
+ if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
+ return -ECHILD;
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
@@ -1786,6 +1788,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else {
+ /* Wait for unlink to complete */
+ wait_var_event(&dentry->d_fsdata,
+ dentry->d_fsdata != NFS_FSDATA_BLOCKED);
parent = dget_parent(dentry);
ret = reval(d_inode(parent), dentry, flags);
dput(parent);
@@ -2454,7 +2459,6 @@ static int nfs_safe_remove(struct dentry *dentry)
int nfs_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
- int need_rehash = 0;

dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
dir->i_ino, dentry);
@@ -2469,15 +2473,25 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
error = nfs_sillyrename(dir, dentry);
goto out;
}
- if (!d_unhashed(dentry)) {
- __d_drop(dentry);
- need_rehash = 1;
- }
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
+ */
+ error = -ETXTBSY;
+ if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+ goto out;
+ if (dentry->d_fsdata)
+ /* old devname */
+ kfree(dentry->d_fsdata);
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
nfs_dentry_remove_handle_error(dir, dentry, error);
- if (need_rehash)
- d_rehash(dentry);
+ dentry->d_fsdata = NULL;
+ wake_up_var(&dentry->d_fsdata);
out:
trace_nfs_unlink_exit(dir, dentry, error);
return error;
@@ -2584,6 +2598,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(nfs_link);

+static void
+nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *new_dentry = data->new_dentry;
+
+ new_dentry->d_fsdata = NULL;
+ wake_up_var(&new_dentry->d_fsdata);
+}
+
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2614,8 +2637,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct dentry *dentry = NULL, *rehash = NULL;
+ struct dentry *dentry = NULL;
struct rpc_task *task;
+ bool must_unblock = false;
int error = -EBUSY;

if (flags)
@@ -2633,18 +2657,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
* the new target.
*/
if (new_inode && !S_ISDIR(new_inode->i_mode)) {
- /*
- * To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- rehash = new_dentry;
+ error = -ETXTBSY;
+ if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+ goto out;
+ if (new_dentry->d_fsdata) {
+ /* old devname */
+ kfree(new_dentry->d_fsdata);
+ new_dentry->d_fsdata = NULL;
}

+ spin_lock(&new_dentry->d_lock);
if (d_count(new_dentry) > 2) {
int err;

+ spin_unlock(&new_dentry->d_lock);
+
/* copy the target dentry's name */
dentry = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
@@ -2657,14 +2690,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
goto out;

new_dentry = dentry;
- rehash = NULL;
new_inode = NULL;
+ } else {
+ new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ must_unblock = true;
+ spin_unlock(&new_dentry->d_lock);
}
+
}

if (S_ISREG(old_inode->i_mode))
nfs_sync_inode(old_inode);
- task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
+ must_unblock ? nfs_unblock_rename : NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
goto out;
@@ -2688,8 +2726,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
spin_unlock(&old_inode->i_lock);
}
out:
- if (rehash)
- d_rehash(rehash);
trace_nfs_rename_exit(old_dir, old_dentry,
new_dir, new_dentry, error);
if (!error) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a17c337dbdf1..b32ed68e7dc4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -617,6 +617,15 @@ nfs_fileid_to_ino_t(u64 fileid)

#define NFS_JUKEBOX_RETRY_TIME (5 * HZ)

+/* We need to block new opens while a file is being unlinked.
+ * If it is opened *before* we decide to unlink, we will silly-rename
+ * instead. If it is opened *after*, then we need to create or will fail.
+ * If we allow the two to race, we could end up with a file that is open
+ * but deleted on the server resulting in ESTALE.
+ * So use ->d_fsdata to record when the unlink is happening
+ * and block dentry revalidation while it is set.
+ */
+#define NFS_FSDATA_BLOCKED ((void*)1)

# undef ifdebug
# ifdef NFS_DEBUG
--
2.36.1



2022-08-12 03:21:46

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] NFS: don't unhash dentry during unlink/rename


Ping ... any thoughts?

Thanks,
NeilBrown


On Mon, 01 Aug 2022, NeilBrown wrote:
> NFS unlink() (and rename over existing target) must determine if the
> file is open, and must perform a "silly rename" instead of an unlink (or
> before rename) if it is. Otherwise the client might hold a file open
> which has been removed on the server.
>
> Consequently if it determines that the file isn't open, it must block
> any subsequent opens until the unlink/rename has been completed on the
> server.
>
> This is currently achieved by unhashing the dentry. This forces any
> open attempt to the slow-path for lookup which will block on i_rwsem on
> the directory until the unlink/rename completes. A future patch will
> change the VFS to only get a shared lock on i_rwsem for unlink, so this
> will no longer work.
>
> Instead we introduce an explicit interlock. A special value is stored
> in dentry->d_fsdata while the unlink/rename is running and
> ->d_revalidate blocks while that value is present. When ->d_revalidate
> unblocks, the dentry will be invalid. This closes the race
> without requiring exclusion on i_rwsem.
>
> d_fsdata is already used in two different ways.
> 1/ an IS_ROOT directory dentry might have a "devname" stored in
> d_fsdata. Such a dentry doesn't have a name and so cannot be the
> target of unlink or rename. For safety we check if an old devname
> is still stored, and remove it if it is.
> 2/ a dentry with DCACHE_NFSFS_RENAMED set will have a 'struct
> nfs_unlinkdata' stored in d_fsdata. While this is set maydelete()
> will fail, so an unlink or rename will never proceed on such
> a dentry.
>
> Neither of these can be in effect when a dentry is the target of unlink
> or rename. So we can expect d_fsdata to be NULL, and store a special
> value ((void*)1) which is given the name NFS_FSDATA_BLOCKED to indicate
> that any lookup will be blocked.
>
> The d_count() is incremented under d_lock() when a lookup finds the
> dentry, so we check d_count() is low, and set NFS_FSDATA_BLOCKED under
> the same lock to avoid any races.
>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++-----------
> include/linux/nfs_fs.h | 9 ++++++
> 2 files changed, 63 insertions(+), 18 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 0c4e8dd6aa96..a988dc9a5c92 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -1778,6 +1778,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
> int ret;
>
> if (flags & LOOKUP_RCU) {
> + if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
> + return -ECHILD;
> parent = READ_ONCE(dentry->d_parent);
> dir = d_inode_rcu(parent);
> if (!dir)
> @@ -1786,6 +1788,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
> if (parent != READ_ONCE(dentry->d_parent))
> return -ECHILD;
> } else {
> + /* Wait for unlink to complete */
> + wait_var_event(&dentry->d_fsdata,
> + dentry->d_fsdata != NFS_FSDATA_BLOCKED);
> parent = dget_parent(dentry);
> ret = reval(d_inode(parent), dentry, flags);
> dput(parent);
> @@ -2454,7 +2459,6 @@ static int nfs_safe_remove(struct dentry *dentry)
> int nfs_unlink(struct inode *dir, struct dentry *dentry)
> {
> int error;
> - int need_rehash = 0;
>
> dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
> dir->i_ino, dentry);
> @@ -2469,15 +2473,25 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
> error = nfs_sillyrename(dir, dentry);
> goto out;
> }
> - if (!d_unhashed(dentry)) {
> - __d_drop(dentry);
> - need_rehash = 1;
> - }
> + /* We must prevent any concurrent open until the unlink
> + * completes. ->d_revalidate will wait for ->d_fsdata
> + * to clear. We set it here to ensure no lookup succeeds until
> + * the unlink is complete on the server.
> + */
> + error = -ETXTBSY;
> + if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
> + WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED))
> + goto out;
> + if (dentry->d_fsdata)
> + /* old devname */
> + kfree(dentry->d_fsdata);
> + dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> +
> spin_unlock(&dentry->d_lock);
> error = nfs_safe_remove(dentry);
> nfs_dentry_remove_handle_error(dir, dentry, error);
> - if (need_rehash)
> - d_rehash(dentry);
> + dentry->d_fsdata = NULL;
> + wake_up_var(&dentry->d_fsdata);
> out:
> trace_nfs_unlink_exit(dir, dentry, error);
> return error;
> @@ -2584,6 +2598,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
> }
> EXPORT_SYMBOL_GPL(nfs_link);
>
> +static void
> +nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
> +{
> + struct dentry *new_dentry = data->new_dentry;
> +
> + new_dentry->d_fsdata = NULL;
> + wake_up_var(&new_dentry->d_fsdata);
> +}
> +
> /*
> * RENAME
> * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
> @@ -2614,8 +2637,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
> {
> struct inode *old_inode = d_inode(old_dentry);
> struct inode *new_inode = d_inode(new_dentry);
> - struct dentry *dentry = NULL, *rehash = NULL;
> + struct dentry *dentry = NULL;
> struct rpc_task *task;
> + bool must_unblock = false;
> int error = -EBUSY;
>
> if (flags)
> @@ -2633,18 +2657,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
> * the new target.
> */
> if (new_inode && !S_ISDIR(new_inode->i_mode)) {
> - /*
> - * To prevent any new references to the target during the
> - * rename, we unhash the dentry in advance.
> + /* We must prevent any concurrent open until the unlink
> + * completes. ->d_revalidate will wait for ->d_fsdata
> + * to clear. We set it here to ensure no lookup succeeds until
> + * the unlink is complete on the server.
> */
> - if (!d_unhashed(new_dentry)) {
> - d_drop(new_dentry);
> - rehash = new_dentry;
> + error = -ETXTBSY;
> + if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
> + WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
> + goto out;
> + if (new_dentry->d_fsdata) {
> + /* old devname */
> + kfree(new_dentry->d_fsdata);
> + new_dentry->d_fsdata = NULL;
> }
>
> + spin_lock(&new_dentry->d_lock);
> if (d_count(new_dentry) > 2) {
> int err;
>
> + spin_unlock(&new_dentry->d_lock);
> +
> /* copy the target dentry's name */
> dentry = d_alloc(new_dentry->d_parent,
> &new_dentry->d_name);
> @@ -2657,14 +2690,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
> goto out;
>
> new_dentry = dentry;
> - rehash = NULL;
> new_inode = NULL;
> + } else {
> + new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> + must_unblock = true;
> + spin_unlock(&new_dentry->d_lock);
> }
> +
> }
>
> if (S_ISREG(old_inode->i_mode))
> nfs_sync_inode(old_inode);
> - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
> + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
> + must_unblock ? nfs_unblock_rename : NULL);
> if (IS_ERR(task)) {
> error = PTR_ERR(task);
> goto out;
> @@ -2688,8 +2726,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
> spin_unlock(&old_inode->i_lock);
> }
> out:
> - if (rehash)
> - d_rehash(rehash);
> trace_nfs_rename_exit(old_dir, old_dentry,
> new_dir, new_dentry, error);
> if (!error) {
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index a17c337dbdf1..b32ed68e7dc4 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -617,6 +617,15 @@ nfs_fileid_to_ino_t(u64 fileid)
>
> #define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
>
> +/* We need to block new opens while a file is being unlinked.
> + * If it is opened *before* we decide to unlink, we will silly-rename
> + * instead. If it is opened *after*, then we need to create or will fail.
> + * If we allow the two to race, we could end up with a file that is open
> + * but deleted on the server resulting in ESTALE.
> + * So use ->d_fsdata to record when the unlink is happening
> + * and block dentry revalidation while it is set.
> + */
> +#define NFS_FSDATA_BLOCKED ((void*)1)
>
> # undef ifdebug
> # ifdef NFS_DEBUG
> --
> 2.36.1
>
>