From: Anna Schumaker <Anna.Schumaker@netapp.com>
Subject: Re: [PATCH v2 5/8] nfs42: add NFS_IOC_CLONE ioctl
To: Peng Tao <tao.peng@primarydata.com>, <linux-nfs@vger.kernel.org>
References: <1436778368-65447-1-git-send-email-tao.peng@primarydata.com>
 <1436778368-65447-6-git-send-email-tao.peng@primarydata.com>
CC: Trond Myklebust <trond.myklebust@primarydata.com>,
        Anna Schumaker <anna.schumaker@netapp.com>
Message-ID: <55A3C2EA.1060901@Netapp.com>
Date: Mon, 13 Jul 2015 09:53:46 -0400
MIME-Version: 1.0
In-Reply-To: <1436778368-65447-6-git-send-email-tao.peng@primarydata.com>
Content-Type: text/plain; charset="utf-8"
Sender: linux-nfs-owner@vger.kernel.org

Hi Tao,

One question inline (below):

On 07/13/2015 05:06 AM, Peng Tao wrote:
> It can be called by user space to CLONE two files.
> Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE.
> Thus we don't mess up userspace with too many ioctls.
> 
> Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> ---
>  fs/nfs/nfs4file.c        | 102 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/nfs.h |   4 ++
>  2 files changed, 106 insertions(+)
> 
> diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
> index dcd39d4..dfa6620 100644
> --- a/fs/nfs/nfs4file.c
> +++ b/fs/nfs/nfs4file.c
> @@ -4,6 +4,7 @@
>   *  Copyright (C) 1992  Rick Sladkey
>   */
>  #include <linux/fs.h>
> +#include <linux/file.h>
>  #include <linux/falloc.h>
>  #include <linux/nfs_fs.h>
>  #include "internal.h"
> @@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
>  		return nfs42_proc_deallocate(filep, offset, len);
>  	return nfs42_proc_allocate(filep, offset, len);
>  }
> +
> +static noinline long
> +nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
> +		  u64 src_off, u64 dst_off, u64 count)
> +{
> +	struct inode *dst_inode = file_inode(dst_file);
> +	struct fd src_file;
> +	struct inode *src_inode;
> +	int ret;
> +
> +	/* dst file must be opened for writing */
> +	if (!(dst_file->f_mode & FMODE_WRITE))
> +		return -EINVAL;
> +
> +	ret = mnt_want_write_file(dst_file);
> +	if (ret)
> +		return ret;
> +
> +	src_file = fdget(srcfd);
> +	if (!src_file.file) {
> +		ret = -EBADF;
> +		goto out_drop_write;
> +	}
> +
> +	src_inode = file_inode(src_file.file);
> +
> +	/* src and dst must be different files */
> +	ret = -EINVAL;
> +	if (src_inode == dst_inode)
> +		goto out_fput;
> +
> +	/* src file must be opened for reading */
> +	if (!(src_file.file->f_mode & FMODE_READ))
> +		goto out_fput;
> +
> +	/* src and dst must be regular files */
> +	ret = -EISDIR;
> +	if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
> +		goto out_fput;
> +
> +	ret = -EXDEV;
> +	if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
> +	    src_inode->i_sb != dst_inode->i_sb)
> +		goto out_fput;
> +
> +	/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
> +	if (dst_inode < src_inode) {

Why is the order of inode numbers important?

Thanks,
Anna

> +		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
> +		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
> +	} else {
> +		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
> +		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
> +	}
> +
> +	/* flush all pending writes on both src and dst so that server
> +	 * has the latest data */
> +	ret = nfs_sync_inode(src_inode);
> +	if (ret)
> +		goto out_unlock;
> +	ret = nfs_sync_inode(dst_inode);
> +	if (ret)
> +		goto out_unlock;
> +
> +	ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
> +
> +	/* truncate inode page cache of the dst range so that future reads can fetch
> +	 * new data from server */
> +	if (!ret)
> +		truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
> +
> +out_unlock:
> +	if (dst_inode < src_inode) {
> +		mutex_unlock(&src_inode->i_mutex);
> +		mutex_unlock(&dst_inode->i_mutex);
> +	} else {
> +		mutex_unlock(&dst_inode->i_mutex);
> +		mutex_unlock(&src_inode->i_mutex);
> +	}
> +out_fput:
> +	fdput(src_file);
> +out_drop_write:
> +	mnt_drop_write_file(dst_file);
> +	return ret;
> +}
>  #endif /* CONFIG_NFS_V4_2 */
>  
> +long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +	switch (cmd) {
> +#ifdef CONFIG_NFS_V4_2
> +	case NFS_IOC_CLONE:
> +		return nfs42_ioctl_clone(file, arg, 0, 0, 0);
> +#endif
> +	}
> +
> +	return -ENOTTY;
> +}
> +
>  const struct file_operations nfs4_file_operations = {
>  #ifdef CONFIG_NFS_V4_2
>  	.llseek		= nfs4_file_llseek,
> @@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = {
>  #endif /* CONFIG_NFS_V4_2 */
>  	.check_flags	= nfs_check_flags,
>  	.setlease	= simple_nosetlease,
> +#ifdef CONFIG_COMPAT
> +	.unlocked_ioctl = nfs4_ioctl,
> +#else
> +	.compat_ioctl	= nfs4_ioctl,
> +#endif /* CONFIG_COMPAT */
>  };
> diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
> index 5199a36..d85748d 100644
> --- a/include/uapi/linux/nfs.h
> +++ b/include/uapi/linux/nfs.h
> @@ -31,6 +31,10 @@
>  
>  #define NFS_PIPE_DIRNAME "nfs"
>  
> +/* NFS ioctls */
> +/* Let's follow btrfs lead on CLONE to avoid messing userspace */
> +#define NFS_IOC_CLONE	_IOW(0x94, 9, int)
> +
>  /*
>   * NFS stats. The good thing with these values is that NFSv3 errors are
>   * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
>