The following is a series of patches related to Unionfs. The main change
here is that we dropped ->sync_page and rewritten ->writepage.
These patches were tested (where appropriate) on Linus's 2.6.24 latest code
(as of v2.6.24-rc1-521-g54866f0), as well as the backports to
2.6.{23,22,21,20,19,18,9} on ext2/3/4, xfs, reiserfs, nfs2/3/4, jffs2,
ramfs, tmpfs, cramfs, and squashfs (where available). See
http://unionfs.filesystems.org/ to download backported unionfs code.
Please pull from the 'master' branch of
git://git.kernel.org/pub/scm/linux/kernel/git/ezk/unionfs.git
to receive the following:
Erez Zadok (8):
Unionfs: delete whiteouts in sticky directories
Unionfs: cleanup permission checking code
Unionfs: update usage.txt documentation
Unionfs: mmap updates
Unionfs: avoid a deadlock during branch-management on a pivot_root'ed union
Unionfs: don't bother validating inode if it has no lower branches
Unionfs: don't printk an error if it's due to common copyup
Unionfs/VFS: no need to export 2 symbols in security/security.c
Documentation/filesystems/unionfs/usage.txt | 17 +++
fs/unionfs/commonfops.c | 4
fs/unionfs/debug.c | 6 +
fs/unionfs/inode.c | 96 ++++++-----------
fs/unionfs/mmap.c | 156 ++++++++++------------------
fs/unionfs/rename.c | 2
fs/unionfs/subr.c | 8 +
fs/unionfs/union.h | 39 +++++--
security/security.c | 2
9 files changed, 158 insertions(+), 172 deletions(-)
---
Erez Zadok
[email protected]
This is needed to maintain Unix semantics.
Signed-off-by: Erez Zadok <[email protected]>
---
fs/unionfs/inode.c | 17 +++++++++++++++++
1 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/fs/unionfs/inode.c b/fs/unionfs/inode.c
index 6ca52f4..f4facf4 100644
--- a/fs/unionfs/inode.c
+++ b/fs/unionfs/inode.c
@@ -90,6 +90,23 @@ static int unionfs_create(struct inode *parent, struct dentry *dentry,
err = vfs_unlink(lower_dir_dentry->d_inode, wh_dentry);
unlock_dir(lower_dir_dentry);
+ /*
+ * Whiteouts are special files and should be deleted
+ * no matter what (as if they never existed), in
+ * order to allow this create operation to succeed.
+ * This is especially important in sticky
+ * directories: a whiteout may have been created by
+ * one user, but the newly created file may be
+ * created by another user. Therefore, in order to
+ * maintain Unix semantics, if the vfs_unlink above
+ * ailed, then we have to try to directly unlink the
+ * whiteout. Note: in the ODF version of unionfs,
+ * whiteout are handled much more cleanly.
+ */
+ if (err == -EPERM) {
+ struct inode *inode = lower_dir_dentry->d_inode;
+ err = inode->i_op->unlink(inode, wh_dentry);
+ }
if (err) {
printk(KERN_ERR "unionfs: create: could not "
"unlink whiteout, err = %d\n", err);
--
1.5.2.2
Use vfs helpers and avoid redundant checks performed by the VFS already.
Signed-off-by: Erez Zadok <[email protected]>
---
fs/unionfs/commonfops.c | 4 ---
fs/unionfs/inode.c | 70 +++++++++--------------------------------------
2 files changed, 13 insertions(+), 61 deletions(-)
diff --git a/fs/unionfs/commonfops.c b/fs/unionfs/commonfops.c
index 7654bcb..50e5775 100644
--- a/fs/unionfs/commonfops.c
+++ b/fs/unionfs/commonfops.c
@@ -669,10 +669,6 @@ static long do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
lower_file = unionfs_lower_file(file);
- err = security_file_ioctl(lower_file, cmd, arg);
- if (err)
- goto out;
-
err = -ENOTTY;
if (!lower_file || !lower_file->f_op)
goto out;
diff --git a/fs/unionfs/inode.c b/fs/unionfs/inode.c
index f4facf4..169365c 100644
--- a/fs/unionfs/inode.c
+++ b/fs/unionfs/inode.c
@@ -914,59 +914,6 @@ static void unionfs_put_link(struct dentry *dentry, struct nameidata *nd,
}
/*
- * Basically copied from the kernel vfs permission(), but we've changed
- * the following:
- * (1) the IS_RDONLY check is skipped, and
- * (2) We return 0 (success) if the non-leftmost branch is mounted
- * readonly, to allow copyup to work.
- * (3) we do call security_inode_permission, and therefore security inside
- * SELinux, etc. are performed.
- *
- * @inode: the lower inode we're checking permission on
- */
-static int inode_permission(struct super_block *sb, struct inode *inode,
- int mask, struct nameidata *nd, int bindex)
-{
- int retval, submask;
-
- if (mask & MAY_WRITE) {
- umode_t mode = inode->i_mode;
- /* The first branch is allowed to be really readonly. */
- if (bindex == 0 &&
- IS_RDONLY(inode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- return -EROFS;
- /*
- * Nobody gets write access to an immutable file.
- */
- if (IS_IMMUTABLE(inode))
- return -EACCES;
- /*
- * For all other branches than the first one, we ignore
- * EROFS or if the branch is mounted as readonly, to let
- * copyup take place.
- */
- if (bindex > 0 &&
- is_robranch_super(sb, bindex) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- return 0;
- }
-
- /* Ordinary permission routines do not understand MAY_APPEND. */
- submask = mask & ~MAY_APPEND;
- if (inode->i_op && inode->i_op->permission)
- retval = inode->i_op->permission(inode, submask, nd);
- else
- retval = generic_permission(inode, submask, NULL);
-
- if (retval && retval != -EROFS) /* ignore EROFS */
- return retval;
-
- retval = security_inode_permission(inode, mask, nd);
- return ((retval == -EROFS) ? 0 : retval); /* ignore EROFS */
-}
-
-/*
* Don't grab the superblock read-lock in unionfs_permission, which prevents
* a deadlock with the branch-management "add branch" code (which grabbed
* the write lock). It is safe to not grab the read lock here, because even
@@ -1011,11 +958,20 @@ static int unionfs_permission(struct inode *inode, int mask,
continue;
/*
- * We use our own special version of permission, such that
- * only the first branch returns -EROFS.
+ * We check basic permissions, but we ignore any conditions
+ * such as readonly file systems or branches marked as
+ * readonly, because those conditions should lead to a
+ * copyup taking place later on.
*/
- err = inode_permission(inode->i_sb, lower_inode, mask, nd,
- bindex);
+ err = permission(lower_inode, mask, nd);
+ if (err && bindex > 0) {
+ umode_t mode = lower_inode->i_mode;
+ if (is_robranch_super(inode->i_sb, bindex) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ err = 0;
+ if (IS_COPYUP_ERR(err))
+ err = 0;
+ }
/*
* The permissions are an intersection of the overall directory
--
1.5.2.2
Signed-off-by: Erez Zadok <[email protected]>
---
fs/unionfs/debug.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/fs/unionfs/debug.c b/fs/unionfs/debug.c
index 894bf7c..0066ccd 100644
--- a/fs/unionfs/debug.c
+++ b/fs/unionfs/debug.c
@@ -49,6 +49,9 @@ void __unionfs_check_inode(const struct inode *inode,
sb = inode->i_sb;
istart = ibstart(inode);
iend = ibend(inode);
+ /* don't check inode if no lower branches */
+ if (istart < 0 && iend < 0)
+ return;
if (unlikely(istart > iend)) {
PRINT_CALLER(fname, fxn, line);
pr_debug(" Ci0: inode=%p istart/end=%d:%d\n",
@@ -221,6 +224,9 @@ check_inode:
return;
istart = ibstart(inode);
iend = ibend(inode);
+ /* don't check inode if no lower branches */
+ if (istart < 0 && iend < 0)
+ return;
BUG_ON(istart > iend);
if (unlikely((istart == -1 && iend != -1) ||
(istart != -1 && iend == -1))) {
--
1.5.2.2
Signed-off-by: Erez Zadok <[email protected]>
Acked-by: Herton Ronaldo Krzesinski <[email protected]>
---
fs/unionfs/union.h | 39 +++++++++++++++++++++++++++++++++------
1 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/fs/unionfs/union.h b/fs/unionfs/union.h
index 6333488..0476f97 100644
--- a/fs/unionfs/union.h
+++ b/fs/unionfs/union.h
@@ -142,11 +142,18 @@ struct unionfs_sb_info {
* This rwsem is used to make sure that a branch management
* operation...
* 1) will not begin before all currently in-flight operations
- * complete
+ * complete.
* 2) any new operations do not execute until the currently
- * running branch management operation completes
+ * running branch management operation completes.
+ *
+ * The write_lock_owner records the PID of the task which grabbed
+ * the rw_sem for writing. If the same task also tries to grab the
+ * read lock, we allow it. This prevents a self-deadlock when
+ * branch-management is used on a pivot_root'ed union, because we
+ * have to ->lookup paths which belong to the same union.
*/
struct rw_semaphore rwsem;
+ pid_t write_lock_owner; /* PID of rw_sem owner (write lock) */
int high_branch_id; /* last unique branch ID given */
struct unionfs_data *data;
};
@@ -234,10 +241,30 @@ static inline off_t rdstate2offset(struct unionfs_dir_state *buf)
return tmp;
}
-#define unionfs_read_lock(sb) down_read(&UNIONFS_SB(sb)->rwsem)
-#define unionfs_read_unlock(sb) up_read(&UNIONFS_SB(sb)->rwsem)
-#define unionfs_write_lock(sb) down_write(&UNIONFS_SB(sb)->rwsem)
-#define unionfs_write_unlock(sb) up_write(&UNIONFS_SB(sb)->rwsem)
+static inline void unionfs_read_lock(struct super_block *sb)
+{
+ if (UNIONFS_SB(sb)->write_lock_owner &&
+ UNIONFS_SB(sb)->write_lock_owner == current->pid)
+ return;
+ down_read(&UNIONFS_SB(sb)->rwsem);
+}
+static inline void unionfs_read_unlock(struct super_block *sb)
+{
+ if (UNIONFS_SB(sb)->write_lock_owner &&
+ UNIONFS_SB(sb)->write_lock_owner == current->pid)
+ return;
+ up_read(&UNIONFS_SB(sb)->rwsem);
+}
+static inline void unionfs_write_lock(struct super_block *sb)
+{
+ down_write(&UNIONFS_SB(sb)->rwsem);
+ UNIONFS_SB(sb)->write_lock_owner = current->pid;
+}
+static inline void unionfs_write_unlock(struct super_block *sb)
+{
+ up_write(&UNIONFS_SB(sb)->rwsem);
+ UNIONFS_SB(sb)->write_lock_owner = 0;
+}
static inline void unionfs_double_lock_dentry(struct dentry *d1,
struct dentry *d2)
--
1.5.2.2
Signed-off-by: Erez Zadok <[email protected]>
---
Documentation/filesystems/unionfs/usage.txt | 17 +++++++++++++++++
1 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/Documentation/filesystems/unionfs/usage.txt b/Documentation/filesystems/unionfs/usage.txt
index d8c15de..a6b1aca 100644
--- a/Documentation/filesystems/unionfs/usage.txt
+++ b/Documentation/filesystems/unionfs/usage.txt
@@ -74,6 +74,23 @@ to read-write, and change /bar from read-write to read-only:
# mount -t unionfs -o remount,mode=/foo=rw,mode=/bar=ro none MOUNTPOINT
+Note: in Unionfs 2.x, you cannot set the leftmost branch to readonly because
+then Unionfs won't have any writable place for copyups to take place.
+Moreover, the VFS can get confused when it tries to modify something in a
+file system mounted read-write, but isn't permitted to write to it.
+Instead, you should set the whole union as readonly, as described above.
+If, however, you must set the leftmost branch as readonly, perhaps so you
+can get a snapshot of it at a point in time, then you should insert a new
+writable top-level branch, and mark the one you want as readonly. This can
+be accomplished as follows, assuming that /foo is your current leftmost
+branch:
+
+# mount -t tmpfs -o size=NNN /new
+# mount -t unionfs -o remount,add=/new,mode=/foo=ro none MOUNTPOINT
+<do what you want safely in /foo>
+# mount -t unionfs -o remount,del=/new,mode=/foo=rw none MOUNTPOINT
+<check if there's anything in /new you want to preserve>
+# umount /new
CACHE CONSISTENCY
=================
--
1.5.2.2
Signed-off-by: Erez Zadok <[email protected]>
---
fs/unionfs/inode.c | 9 +++++----
fs/unionfs/rename.c | 2 +-
fs/unionfs/subr.c | 8 ++++++--
3 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/fs/unionfs/inode.c b/fs/unionfs/inode.c
index 169365c..bcefe45 100644
--- a/fs/unionfs/inode.c
+++ b/fs/unionfs/inode.c
@@ -465,10 +465,11 @@ static int unionfs_symlink(struct inode *dir, struct dentry *dentry,
if (!lower_dentry || IS_ERR(lower_dentry)) {
if (IS_ERR(lower_dentry))
err = PTR_ERR(lower_dentry);
-
- printk(KERN_ERR "unionfs: lower dentry "
- "NULL (or error) for bindex = %d\n",
- bindex);
+ if (!IS_COPYUP_ERR(err))
+ printk(KERN_ERR
+ "unionfs: create_parents for "
+ "symlink failed: bindex=%d "
+ "err=%d\n", bindex, err);
continue;
}
}
diff --git a/fs/unionfs/rename.c b/fs/unionfs/rename.c
index 1ab474f..4a35b5e 100644
--- a/fs/unionfs/rename.c
+++ b/fs/unionfs/rename.c
@@ -41,7 +41,7 @@ static int __unionfs_rename(struct inode *old_dir, struct dentry *old_dentry,
bindex);
if (IS_ERR(lower_new_dentry)) {
err = PTR_ERR(lower_new_dentry);
- if (err == -EROFS)
+ if (IS_COPYUP_ERR(err))
goto out;
printk(KERN_ERR "unionfs: error creating directory "
"tree for rename, bindex=%d err=%d\n",
diff --git a/fs/unionfs/subr.c b/fs/unionfs/subr.c
index d97086a..968ee8c 100644
--- a/fs/unionfs/subr.c
+++ b/fs/unionfs/subr.c
@@ -61,8 +61,12 @@ int create_whiteout(struct dentry *dentry, int start)
dentry->d_name.name,
bindex);
if (!lower_dentry || IS_ERR(lower_dentry)) {
- printk(KERN_ERR "unionfs: create_parents "
- "failed for bindex = %d\n", bindex);
+ int ret = PTR_ERR(lower_dentry);
+ if (!IS_COPYUP_ERR(ret))
+ printk(KERN_ERR
+ "unionfs: create_parents for "
+ "whiteout failed: bindex=%d "
+ "err=%d\n", bindex, ret);
continue;
}
}
--
1.5.2.2
Rewrite unionfs_writepage to minimize dependence on AOP_WRITEPAGE_ACTIVEATE,
handle memory pressure better, and update documentation. Remove
unionfs_sync_page because it's not needed.
CC: Hugh Dickins <[email protected]>
CC: Pekka Enberg <[email protected]>
Signed-off-by: Erez Zadok <[email protected]>
---
fs/unionfs/mmap.c | 156 ++++++++++++++++++++--------------------------------
1 files changed, 60 insertions(+), 96 deletions(-)
diff --git a/fs/unionfs/mmap.c b/fs/unionfs/mmap.c
index bed11c3..4b00b98 100644
--- a/fs/unionfs/mmap.c
+++ b/fs/unionfs/mmap.c
@@ -25,83 +25,91 @@ static int unionfs_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode;
struct inode *lower_inode;
struct page *lower_page;
- char *kaddr, *lower_kaddr;
+ struct address_space *lower_mapping; /* lower inode mapping */
+ gfp_t mask;
inode = page->mapping->host;
lower_inode = unionfs_lower_inode(inode);
+ lower_mapping = lower_inode->i_mapping;
/*
* find lower page (returns a locked page)
*
- * NOTE: we used to call grab_cache_page(), but that was unnecessary
- * as it would have tried to create a new lower page if it didn't
- * exist, leading to deadlocks (esp. under memory-pressure
- * conditions, when it is really a bad idea to *consume* more
- * memory). Instead, we assume the lower page exists, and if we can
- * find it, then we ->writepage on it; if we can't find it, then it
- * couldn't have disappeared unless the kernel already flushed it,
- * in which case we're still OK. This is especially correct if
- * wbc->sync_mode is WB_SYNC_NONE (as per
- * Documentation/filesystems/vfs.txt). If we can't flush our page
- * because we can't find a lower page, then at least we re-mark our
- * page as dirty, and return AOP_WRITEPAGE_ACTIVATE as the VFS
- * expects us to. (Note, if in the future it'd turn out that we
- * have to find a lower page no matter what, then we'd have to
- * resort to RAIF's page pointer flipping trick.)
+ * We turn off __GFP_FS while we look for or create a new lower
+ * page. This prevents a recursion into the file system code, which
+ * under memory pressure conditions could lead to a deadlock. This
+ * is similar to how the loop driver behaves (see loop_set_fd in
+ * drivers/block/loop.c). If we can't find the lower page, we
+ * redirty our page and return "success" so that the VM will call us
+ * again in the (hopefully near) future.
*/
- lower_page = find_lock_page(lower_inode->i_mapping, page->index);
+ mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS);
+ lower_page = find_or_create_page(lower_mapping, page->index, mask);
if (!lower_page) {
- err = AOP_WRITEPAGE_ACTIVATE;
+ err = 0;
set_page_dirty(page);
goto out;
}
- /* get page address, and encode it */
- kaddr = kmap(page);
- lower_kaddr = kmap(lower_page);
+ /* copy page data from our upper page to the lower page */
+ copy_highpage(lower_page, page);
- memcpy(lower_kaddr, kaddr, PAGE_CACHE_SIZE);
-
- kunmap(page);
- kunmap(lower_page);
-
- BUG_ON(!lower_inode->i_mapping->a_ops->writepage);
-
- /* call lower writepage (expects locked page) */
- clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
- err = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
-
- /* b/c find_lock_page locked it and ->writepage unlocks on success */
- if (err)
+ /*
+ * Call lower writepage (expects locked page). However, if we are
+ * called with wbc->for_reclaim, then the VFS/VM just wants to
+ * reclaim our page. Therefore, we don't need to call the lower
+ * ->writepage: just copy our data to the lower page (already done
+ * above), then mark the lower page dirty and unlock it, and return
+ * success.
+ */
+ if (wbc->for_reclaim) {
+ set_page_dirty(lower_page);
unlock_page(lower_page);
- /* b/c grab_cache_page increased refcnt */
- page_cache_release(lower_page);
-
+ goto out_release;
+ }
+ BUG_ON(!lower_mapping->a_ops->writepage);
+ clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
+ err = lower_mapping->a_ops->writepage(lower_page, wbc);
if (err < 0) {
ClearPageUptodate(page);
- goto out;
+ goto out_release;
}
+
+ /*
+ * Lower file systems such as ramfs and tmpfs, may return
+ * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly)
+ * write the page again for a while. But those lower file systems
+ * also set the page dirty bit back again. Since we successfully
+ * copied our page data to the lower page, then the VM will come
+ * back to the lower page (directly) and try to flush it. So we can
+ * save the VM the hassle of coming back to our page and trying to
+ * flush too. Therefore, we don't re-dirty our own page, and we
+ * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider
+ * this a success).
+ *
+ * We also unlock the lower page if the lower ->writepage returned
+ * AOP_WRITEPAGE_ACTIVATE. (This "anomalous" behaviour may be
+ * addressed in future shmem/VM code.)
+ */
if (err == AOP_WRITEPAGE_ACTIVATE) {
- /*
- * Lower file systems such as ramfs and tmpfs, may return
- * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to
- * (pointlessly) write the page again for a while. But
- * those lower file systems also set the page dirty bit back
- * again. So we mimic that behaviour here.
- */
- if (PageDirty(lower_page))
- set_page_dirty(page);
- goto out;
+ err = 0;
+ unlock_page(lower_page);
}
/* all is well */
SetPageUptodate(page);
- /* lower mtimes has changed: update ours */
+ /* lower mtimes have changed: update ours */
unionfs_copy_attr_times(inode);
- unlock_page(page);
-
+out_release:
+ /* b/c find_or_create_page increased refcnt */
+ page_cache_release(lower_page);
out:
+ /*
+ * We unlock our page unconditionally, because we never return
+ * AOP_WRITEPAGE_ACTIVATE.
+ */
+ unlock_page(page);
return err;
}
@@ -119,9 +127,8 @@ static int unionfs_writepages(struct address_space *mapping,
if (!mapping_cap_writeback_dirty(lower_inode->i_mapping))
goto out;
- /* Note: generic_writepages may return AOP_WRITEPAGE_ACTIVATE */
err = generic_writepages(mapping, wbc);
- if (err == 0)
+ if (!err)
unionfs_copy_attr_times(inode);
out:
return err;
@@ -313,53 +320,10 @@ out:
return err; /* assume all is ok */
}
-static void unionfs_sync_page(struct page *page)
-{
- struct inode *inode;
- struct inode *lower_inode;
- struct page *lower_page;
- struct address_space *mapping;
-
- inode = page->mapping->host;
- lower_inode = unionfs_lower_inode(inode);
-
- /*
- * Find lower page (returns a locked page).
- *
- * NOTE: we used to call grab_cache_page(), but that was unnecessary
- * as it would have tried to create a new lower page if it didn't
- * exist, leading to deadlocks. All our sync_page method needs to
- * do is ensure that pending I/O gets done.
- */
- lower_page = find_lock_page(lower_inode->i_mapping, page->index);
- if (!lower_page) {
- printk(KERN_ERR "unionfs: find_lock_page failed\n");
- goto out;
- }
-
- /* do the actual sync */
- mapping = lower_page->mapping;
- /*
- * XXX: can we optimize ala RAIF and set the lower page to be
- * discarded after a successful sync_page?
- */
- if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- mapping->a_ops->sync_page(lower_page);
-
- /* b/c find_lock_page locked it */
- unlock_page(lower_page);
- /* b/c find_lock_page increased refcnt */
- page_cache_release(lower_page);
-
-out:
- return;
-}
-
struct address_space_operations unionfs_aops = {
.writepage = unionfs_writepage,
.writepages = unionfs_writepages,
.readpage = unionfs_readpage,
.prepare_write = unionfs_prepare_write,
.commit_write = unionfs_commit_write,
- .sync_page = unionfs_sync_page,
};
--
1.5.2.2
Signed-off-by: Erez Zadok <[email protected]>
---
security/security.c | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)
diff --git a/security/security.c b/security/security.c
index 95a6733..0e1f1f1 100644
--- a/security/security.c
+++ b/security/security.c
@@ -409,7 +409,6 @@ int security_inode_permission(struct inode *inode, int mask, struct nameidata *n
return 0;
return security_ops->inode_permission(inode, mask, nd);
}
-EXPORT_SYMBOL(security_inode_permission);
int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -519,7 +518,6 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
return security_ops->file_ioctl(file, cmd, arg);
}
-EXPORT_SYMBOL(security_file_ioctl);
int security_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags,
--
1.5.2.2