MIME-Version: 1.0
In-Reply-To: <20161012133326.GD31239@veci.piliscsaba.szeredi.hu>
References: <20161012133326.GD31239@veci.piliscsaba.szeredi.hu>
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 13 Oct 2016 21:45:51 +0300
Message-ID: <CAOQ4uxibnWDsSdfZg=d=Sp-Hz06ZdcGxhoVp19HTY0ksyzo1ug@mail.gmail.com>
Subject: Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
To: Miklos Szeredi <miklos@szeredi.hu>
Cc: linux-unionfs@vger.kernel.org,
        linux-fsdevel <linux-fsdevel@vger.kernel.org>,
        linux-kernel <linux-kernel@vger.kernel.org>,
        Jeremy Eder <jeder@redhat.com>, David Howells <dhowells@redhat.com>,
        Ratna Bolla <rbolla@portworx.com>, Gou Rao <grao@portworx.com>,
        Vinod Jayaraman <jv@portworx.com>, Al Viro <viro@zeniv.linux.org.uk>,
        Vivek Goyal <vgoyal@redhat.com>, Dave Chinner <david@fromorbit.com>
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 14570
Lines: 391

On Wed, Oct 12, 2016 at 4:33 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> This is a proof of concept patch to fix the following.
>
> /ovl is in overlay mount and /ovl/foo exists on the lower layer only.
>
>  rofd = open("/ovl/foo", O_RDONLY);
>  rwfd = open("/ovl/foo", O_WRONLY); /* this causes copy up */
>  write(rwfd, "bar", 3);
>  read(rofd, buf, 3);
>  assert(memcmp(buf, "bar", 3) == 0);
>
> Similar problem exists with an MAP_SHARED mmap created from rofd.
>
> While this has only caused few problems (yum/dnf failure is the only one I know
> of) and easily worked around in userspace, many see it as a proof that overlayfs
> can never be a proper "POSIX" filesystem.
>
> To quell those worries, here's a simple patch that should address the above.
>
> The only VFS change is that f_op is initialized from f_path.dentry->d_inode
> instead of file_inode(filp) in open.  The effect of this is that overlayfs can
> intercept open and other file operations, while the file still effectively
> belongs to the underlying fs.
>
> The patch does not give up on the nice properties of overlayfs, like sharing the
> page cache with the underlying files.  It does cause copy up in one case where
> previously there wasn't one and that's the O_RDONLY/MAP_SHARED case.  I haven't
> done much research into this, but running some tests in chroot didn't trigger
> this.
>
> Comments, testing are welcome.

I ran the -g quick set of xfstest (overlay over ext4)
and there are no regressions - the same set of tests are failing.

I am trying to get to adding more xfstests for overlay and/or to
improve the way that the generic tests run on overlay.

>
> Thanks,
> Miklos
>
> ---
>  fs/internal.h            |   1 -
>  fs/open.c                |   2 +-
>  fs/overlayfs/dir.c       |   2 +-
>  fs/overlayfs/inode.c     | 175 +++++++++++++++++++++++++++++++++++++++++++----
>  fs/overlayfs/overlayfs.h |   2 +-
>  fs/overlayfs/super.c     |   7 +-
>  include/linux/fs.h       |   1 +
>  7 files changed, 169 insertions(+), 21 deletions(-)
>
> diff --git a/fs/internal.h b/fs/internal.h
> index f4da3341b4a3..361ba1c12698 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -112,7 +112,6 @@ extern long do_handle_open(int mountdirfd,
>                            struct file_handle __user *ufh, int open_flag);
>  extern int open_check_o_direct(struct file *f);
>  extern int vfs_open(const struct path *, struct file *, const struct cred *);
> -extern struct file *filp_clone_open(struct file *);
>
>  /*
>   * inode.c
> diff --git a/fs/open.c b/fs/open.c
> index a7719cfb7257..e21f1a6f77b7 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -728,7 +728,7 @@ static int do_dentry_open(struct file *f,
>         if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
>                 f->f_mode |= FMODE_ATOMIC_POS;
>
> -       f->f_op = fops_get(inode->i_fop);
> +       f->f_op = fops_get(f->f_path.dentry->d_inode->i_fop);

I said it before, I think we need to find a good macro name for this construct
maybe file_dentry() := f->f_path.dentry ?
and the few places that really need d_real should use a new macro
file_real_dentry()??


>         if (unlikely(WARN_ON(!f->f_op))) {
>                 error = -ENODEV;
>                 goto cleanup_all;
> diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
> index 5f90ddf778ba..1ea511be6e37 100644
> --- a/fs/overlayfs/dir.c
> +++ b/fs/overlayfs/dir.c
> @@ -534,7 +534,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
>                 goto out;
>
>         err = -ENOMEM;
> -       inode = ovl_new_inode(dentry->d_sb, mode);
> +       inode = ovl_new_inode(dentry->d_sb, mode, rdev);
>         if (!inode)
>                 goto out_drop_write;
>
> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
> index c18d6a4ff456..744c8eb7e947 100644
> --- a/fs/overlayfs/inode.c
> +++ b/fs/overlayfs/inode.c
> @@ -11,6 +11,9 @@
>  #include <linux/slab.h>
>  #include <linux/xattr.h>
>  #include <linux/posix_acl.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/file.h>
>  #include "overlayfs.h"
>
>  static int ovl_copy_up_truncate(struct dentry *dentry)
> @@ -381,7 +384,154 @@ static const struct inode_operations ovl_symlink_inode_operations = {
>         .update_time    = ovl_update_time,
>  };
>
> -static void ovl_fill_inode(struct inode *inode, umode_t mode)
> +static const struct file_operations ovl_file_operations;
> +
> +static const struct file_operations *ovl_real_fop(struct file *file)
> +{
> +       return file_inode(file)->i_fop;
> +}
> +
> +static int ovl_open(struct inode *realinode, struct file *file)
> +{
> +       int ret = 0;
> +       const struct file_operations *fop = ovl_real_fop(file);
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +
> +       /* Don't intercept upper file operations */
> +       if (isupper)
> +               replace_fops(file, fop);
> +
> +       if (fop->open)
> +               ret = fop->open(realinode, file);
> +
> +       if (!isupper && WARN_ON(file->f_op != &ovl_file_operations)) {
> +               if (fop->release)
> +                       fop->release(realinode, file);
> +               return -EINVAL;
> +       }
> +
> +       return ret;
> +}
> +
> +static int ovl_release(struct inode *realinode, struct file *file)
> +{
> +       const struct file_operations *fop = ovl_real_fop(file);
> +
> +       if (fop->release)
> +               return fop->release(realinode, file);
> +
> +       return 0;
> +}
> +
> +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +       struct file *file = iocb->ki_filp;
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +       ssize_t ret = -EINVAL;
> +
> +       if (likely(!isupper)) {
> +               const struct file_operations *fop = ovl_real_fop(file);
> +
> +               if (likely(fop->read_iter))
> +                       ret = fop->read_iter(iocb, to);
> +       } else {
> +               struct file *upperfile = filp_clone_open(file);
> +
> +               if (IS_ERR(upperfile)) {
> +                       ret = PTR_ERR(upperfile);
> +               } else {
> +                       ret = vfs_iter_read(upperfile, to, &iocb->ki_pos);
> +                       fput(upperfile);
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
> +{
> +       const struct file_operations *fop = ovl_real_fop(file);
> +
> +       if (fop->llseek)
> +               return fop->llseek(file, offset, whence);
> +
> +       return -EINVAL;
> +}
> +
> +static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       const struct file_operations *fop = ovl_real_fop(file);
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +       int err;
> +
> +       /*
> +        * Treat MAP_SHARED as hint about future writes to the file (through
> +        * another file descriptor).  Caller might not have had such an intent,
> +        * but we hope MAP_PRIVATE will be used in most such cases.
> +        *
> +        * If we don't copy up now and the file is modified, it becomes really
> +        * difficult to change the mapping to match that of the file's content
> +        * later.
> +        */
> +       if (unlikely(isupper || vma->vm_flags & VM_MAYSHARE)) {
> +               if (!isupper) {
> +                       err = ovl_copy_up(file->f_path.dentry);
> +                       if (err)
> +                               goto out;
> +               }
> +
> +               file = filp_clone_open(file);
> +               err = PTR_ERR(file);
> +               if (IS_ERR(file))
> +                       goto out;
> +
> +               fput(vma->vm_file);
> +               /* transfer ref: */
> +               vma->vm_file = file;
> +               fop = file->f_op;
> +       }
> +       err = -EINVAL;
> +       if (fop->mmap)
> +               err = fop->mmap(file, vma);
> +out:
> +       return err;
> +}
> +
> +static int ovl_fsync(struct file *file, loff_t start, loff_t end,
> +                    int datasync)

I'm confused. Can fsync be called on a RO file?
I do not see that it can't, but I wonder what is the rational.

> +{
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +       int ret = -EINVAL;
> +
> +       if (likely(!isupper)) {
> +               const struct file_operations *fop = ovl_real_fop(file);
> +
> +               if (likely(fop->fsync))
> +                      ret = fop->fsync(file, start, end, datasync);
> +       } else {
> +               struct file *upperfile = filp_clone_open(file);
> +
> +               if (IS_ERR(upperfile)) {
> +                       ret = PTR_ERR(upperfile);
> +               } else {
> +                       ret = vfs_fsync_range(upperfile, start, end, datasync);
> +                       fput(upperfile);
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +static const struct file_operations ovl_file_operations = {
> +       .open           = ovl_open,
> +       .release        = ovl_release,
> +       .read_iter      = ovl_read_iter,
> +       .llseek         = ovl_llseek,
> +       .mmap           = ovl_mmap,
> +       .fsync          = ovl_fsync,
> +};
> +
> +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
>  {
>         inode->i_ino = get_next_ino();
>         inode->i_mode = mode;
> @@ -390,8 +540,12 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
>         inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
>  #endif
>
> -       mode &= S_IFMT;
> -       switch (mode) {
> +       switch (mode & S_IFMT) {
> +       case S_IFREG:
> +               inode->i_op = &ovl_file_inode_operations;
> +               inode->i_fop = &ovl_file_operations;
> +               break;
> +
>         case S_IFDIR:
>                 inode->i_op = &ovl_dir_inode_operations;
>                 inode->i_fop = &ovl_dir_operations;
> @@ -402,26 +556,19 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
>                 break;
>
>         default:
> -               WARN(1, "illegal file type: %i\n", mode);
> -               /* Fall through */
> -
> -       case S_IFREG:
> -       case S_IFSOCK:
> -       case S_IFBLK:
> -       case S_IFCHR:
> -       case S_IFIFO:
>                 inode->i_op = &ovl_file_inode_operations;
> +               init_special_inode(inode, mode, rdev);
>                 break;
>         }
>  }
>
> -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode)
> +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
>  {
>         struct inode *inode;
>
>         inode = new_inode(sb);
>         if (inode)
> -               ovl_fill_inode(inode, mode);
> +               ovl_fill_inode(inode, mode, rdev);
>
>         return inode;
>  }
> @@ -445,7 +592,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode)
>         inode = iget5_locked(sb, (unsigned long) realinode,
>                              ovl_inode_test, ovl_inode_set, realinode);
>         if (inode && inode->i_state & I_NEW) {
> -               ovl_fill_inode(inode, realinode->i_mode);
> +               ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
>                 set_nlink(inode, realinode->i_nlink);
>                 unlock_new_inode(inode);
>         }
> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
> index e218e741cb99..95d0d86c2d54 100644
> --- a/fs/overlayfs/overlayfs.h
> +++ b/fs/overlayfs/overlayfs.h
> @@ -195,7 +195,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
>  int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
>  bool ovl_is_private_xattr(const char *name);
>
> -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
> +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
>  struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
>  static inline void ovl_copyattr(struct inode *from, struct inode *to)
>  {
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index 7e3f0127fc1a..daed68d13467 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -305,7 +305,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
>  {
>         struct dentry *real;
>
> -       if (d_is_dir(dentry)) {
> +       if (!d_is_reg(dentry)) {
>                 if (!inode || inode == d_inode(dentry))
>                         return dentry;
>                 goto bug;
> @@ -579,7 +579,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
>                 if (upperdentry && !d_is_dir(upperdentry)) {

Shouldn't this be && !d_is_reg(dentry) to align with the change in
ovl_d_real() above?


>                         inode = ovl_get_inode(dentry->d_sb, realinode);
>                 } else {
> -                       inode = ovl_new_inode(dentry->d_sb, realinode->i_mode);
> +                       inode = ovl_new_inode(dentry->d_sb, realinode->i_mode,
> +                                             realinode->i_rdev);
>                         if (inode)
>                                 ovl_inode_init(inode, realinode, !!upperdentry);
>                 }
> @@ -1292,7 +1293,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
>         if (!oe)
>                 goto out_put_cred;
>
> -       root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
> +       root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
>         if (!root_dentry)
>                 goto out_free_oe;
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index bc65d5918140..cc7d1203b846 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2342,6 +2342,7 @@ extern struct file *filp_open(const char *, int, umode_t);
>  extern struct file *file_open_root(struct dentry *, struct vfsmount *,
>                                    const char *, int, umode_t);
>  extern struct file * dentry_open(const struct path *, int, const struct cred *);
> +extern struct file *filp_clone_open(struct file *);
>  extern int filp_close(struct file *, fl_owner_t id);
>
>  extern struct filename *getname_flags(const char __user *, int, int *);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-unionfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html