2004-03-20 08:34:38

by Jörn Engel

[permalink] [raw]
Subject: [PATCH] cowlinks v2

Hi!

Version 2 of my cowlink patch, tested and currently running on my
machine.

Al, I'd especially like your opinion on it. Would you accept
something like this?

Changes since v1:
o moved break_cow_link() check to get_write_access()
o added inode locking when changing flags
o switched to mark_inode_dirty_sync()

TODO:
o Disallow fcntl() for filesystems without support
o Proper support for ext[23]
o Switch to mark_inode_dirty() without sync?
o Library support for
o copyfile() (link and set cow-flag)
o cow_open() (break link if open() fails)

J?rn

--
Premature optimization is the root of all evil.
-- Donald Knuth


fs/ext2/inode.c | 3 +-
fs/ext3/inode.c | 4 ++
fs/fcntl.c | 21 +++++++++++++++
fs/namei.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fcntl.h | 3 ++
include/linux/fs.h | 3 ++
6 files changed, 102 insertions(+), 2 deletions(-)

--- linux-2.6.4/include/linux/fcntl.h~cowlink 2004-03-19 17:38:48.000000000 +0100
+++ linux-2.6.4/include/linux/fcntl.h 2004-03-19 17:52:49.000000000 +0100
@@ -23,6 +23,9 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */

+#define F_SETCOW (F_LINUX_SPECIFIC_BASE+3)
+#define F_GETCOW (F_LINUX_SPECIFIC_BASE+4)
+
#ifdef __KERNEL__

#if BITS_PER_LONG == 32
--- linux-2.6.4/include/linux/fs.h~cowlink 2004-03-19 17:47:29.000000000 +0100
+++ linux-2.6.4/include/linux/fs.h 2004-03-19 17:52:49.000000000 +0100
@@ -137,6 +137,9 @@
#define S_DEAD 32 /* removed, but still open directory */
#define S_NOQUOTA 64 /* Inode is not counted to quota */
#define S_DIRSYNC 128 /* Directory modifications are synchronous */
+#define S_COWLINK 256 /* Hard links have copy on write semantics.
+ * This flag has no meaning for directories,
+ * but is inherited to directory children */

/*
* Note that nosuid etc flags are inode-specific: setting some file-system
--- linux-2.6.4/fs/fcntl.c~cowlink 2004-03-19 17:47:15.000000000 +0100
+++ linux-2.6.4/fs/fcntl.c 2004-03-19 17:59:20.000000000 +0100
@@ -282,6 +282,20 @@

EXPORT_SYMBOL(f_delown);

+static long fcntl_setcow(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (arg)
+ inode->i_flags |= S_COWLINK;
+ else
+ inode->i_flags &= ~S_COWLINK;
+ mark_inode_dirty_sync(inode);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
static long do_fcntl(unsigned int fd, unsigned int cmd,
unsigned long arg, struct file * filp)
{
@@ -346,6 +360,13 @@
case F_NOTIFY:
err = fcntl_dirnotify(fd, filp, arg);
break;
+ case F_SETCOW:
+ err = fcntl_setcow(filp, arg);
+ break;
+ case F_GETCOW:
+ err = (filp->f_dentry->d_inode->i_flags & S_COWLINK) /
+ S_COWLINK;
+ break;
default:
break;
}
--- linux-2.6.4/fs/namei.c~cowlink 2004-03-19 17:47:19.000000000 +0100
+++ linux-2.6.4/fs/namei.c 2004-03-19 18:10:00.000000000 +0100
@@ -224,6 +224,33 @@
}

/*
+ * Files with the S_COWLINK flag set cannot be written to, if more
+ * than one hard link to them exists. Ultimately, this function
+ * should copy the inode, assign the copy to the dentry and lower use
+ * count of the old inode - one day.
+ * For now, it is sufficient to return an error and let userspace
+ * deal with the messy part. Not exactly the meaning of
+ * copy-on-write, but much better than writing to fifty files at once
+ * and noticing month later.
+ *
+ * Yes, this breaks the kernel interface and is simply wrong. This
+ * is intended behaviour, so Linus will not merge the code before
+ * it is complete. Or will he?
+ */
+static int break_cow_link(struct inode *inode)
+{
+ if (!(inode->i_flags & S_COWLINK))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (inode->i_nlink < 2)
+ return 0;
+ /* TODO: As soon as sendfile can do normal file copies, use that
+ * and always return 0 */
+ return -EMLINK;
+}
+
+/*
* get_write_access() gets write permission for a file.
* put_write_access() releases this write permission.
* This is used for regular files.
@@ -243,7 +270,14 @@

int get_write_access(struct inode * inode)
{
+ int error;
+
spin_lock(&inode->i_lock);
+ error = break_cow_link(inode);
+ if (error) {
+ spin_unlock(&inode->i_lock);
+ return error;
+ }
if (atomic_read(&inode->i_writecount) < 0) {
spin_unlock(&inode->i_lock);
return -ETXTBSY;
@@ -1148,6 +1182,10 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_create(dir, dentry, mode);
+ spin_lock(&inode->i_lock);
+ dentry->d_inode->i_flags |= dir->i_flags & S_COWLINK;
+ mark_inode_dirty_sync(inode);
+ spin_unlock(&inode->i_lock);
}
return error;
}
@@ -1522,6 +1560,9 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_mkdir(dir,dentry, mode);
+ spin_lock(&inode->i_lock);
+ dentry->d_inode->i_flags |= dir->i_flags & S_COWLINK;
+ spin_unlock(&inode->i_lock);
}
return error;
}
@@ -1820,6 +1861,13 @@
return -EXDEV;

/*
+ * Cowlink attribute is inherited from directory, but here,
+ * the inode already has one. If they don't match, bail out.
+ */
+ if ((dir->i_flags ^ old_dentry->d_inode->i_flags) & S_COWLINK)
+ return -EMLINK;
+
+ /*
* A link to an append-only or immutable file cannot be created.
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
@@ -1997,6 +2045,24 @@
return error;
}

+static int cow_allow_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir)
+{
+ /* source and target share directory: allow */
+ if (old_dir == new_dir)
+ return 0;
+ /* source and target directory have identical cowlink flag: allow */
+ if (! ((old_dentry->d_inode->i_flags ^ new_dir->i_flags) & S_COWLINK))
+ return 0;
+ /* We could always fail here, but cowlink flag is only defined for
+ * files and directories, so let's allow special files */
+ if (!S_ISREG(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ return 0;
+}
+
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -2020,6 +2086,10 @@
if (!old_dir->i_op || !old_dir->i_op->rename)
return -EPERM;

+ error = cow_allow_rename(old_dir, old_dentry, new_dir);
+ if (error)
+ return error;
+
DQUOT_INIT(old_dir);
DQUOT_INIT(new_dir);

--- linux-2.6.4/fs/ext2/inode.c~cowlink 2004-03-19 17:44:02.000000000 +0100
+++ linux-2.6.4/fs/ext2/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -1020,6 +1020,7 @@
{
unsigned int flags = EXT2_I(inode)->i_flags;

+ inode->i_flags = flags;
inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
@@ -1191,7 +1192,7 @@

raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags = cpu_to_le32(inode->i_flags);
raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
raw_inode->i_frag = ei->i_frag_no;
raw_inode->i_fsize = ei->i_frag_size;
--- linux-2.6.4/fs/ext3/inode.c~cowlink 2004-03-19 17:44:02.000000000 +0100
+++ linux-2.6.4/fs/ext3/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -2447,6 +2447,7 @@
{
unsigned int flags = EXT3_I(inode)->i_flags;

+ inode->i_flags = flags;
inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT3_SYNC_FL)
inode->i_flags |= S_SYNC;
@@ -2629,7 +2630,8 @@
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags = cpu_to_le32((ei->i_flags & ~S_COWLINK) |
+ (inode->i_flags & S_COWLINK));
#ifdef EXT3_FRAGMENTS
raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
raw_inode->i_frag = ei->i_frag_no;


2004-03-20 08:49:05

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel <[email protected]> wrote:
>
> +static long fcntl_setcow(struct file *filp, unsigned long arg)
> +{
> + struct inode *inode = filp->f_dentry->d_inode;
> +
> + spin_lock(&inode->i_lock);
> + if (arg)
> + inode->i_flags |= S_COWLINK;
> + else
> + inode->i_flags &= ~S_COWLINK;
> + mark_inode_dirty_sync(inode);
> + spin_unlock(&inode->i_lock);
> + return 0;
> +}
> +

i_lock is an innermost lock. No locks should be taken inside i_lock.

Here, not only is inode_lock being taken inside i_lock but ->dirty_inode
may be called as well, and dirty_inode() may not be called under any
spinlock.

2004-03-20 11:27:55

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 20 March 2004 00:49:01 -0800, Andrew Morton wrote:
> J?rn Engel <[email protected]> wrote:
> >
> > +static long fcntl_setcow(struct file *filp, unsigned long arg)
> > +{
> > + struct inode *inode = filp->f_dentry->d_inode;
> > +
> > + spin_lock(&inode->i_lock);
> > + if (arg)
> > + inode->i_flags |= S_COWLINK;
> > + else
> > + inode->i_flags &= ~S_COWLINK;
> > + mark_inode_dirty_sync(inode);
> > + spin_unlock(&inode->i_lock);
> > + return 0;
> > +}
> > +
>
> i_lock is an innermost lock. No locks should be taken inside i_lock.
>
> Here, not only is inode_lock being taken inside i_lock but ->dirty_inode
> may be called as well, and dirty_inode() may not be called under any
> spinlock.

Is it enough to move the mark_inode_dirty outside of i_lock?

New patch below.

J?rn

--
The cheapest, fastest and most reliable components of a computer
system are those that aren't there.
-- Gordon Bell, DEC labratories

--- linux-2.6.4/include/linux/fcntl.h~cowlink 2004-03-19 17:38:48.000000000 +0100
+++ linux-2.6.4/include/linux/fcntl.h 2004-03-19 17:52:49.000000000 +0100
@@ -23,6 +23,9 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */

+#define F_SETCOW (F_LINUX_SPECIFIC_BASE+3)
+#define F_GETCOW (F_LINUX_SPECIFIC_BASE+4)
+
#ifdef __KERNEL__

#if BITS_PER_LONG == 32
--- linux-2.6.4/include/linux/fs.h~cowlink 2004-03-19 17:47:29.000000000 +0100
+++ linux-2.6.4/include/linux/fs.h 2004-03-19 17:52:49.000000000 +0100
@@ -137,6 +137,9 @@
#define S_DEAD 32 /* removed, but still open directory */
#define S_NOQUOTA 64 /* Inode is not counted to quota */
#define S_DIRSYNC 128 /* Directory modifications are synchronous */
+#define S_COWLINK 256 /* Hard links have copy on write semantics.
+ * This flag has no meaning for directories,
+ * but is inherited to directory children */

/*
* Note that nosuid etc flags are inode-specific: setting some file-system
--- linux-2.6.4/fs/fcntl.c~cowlink 2004-03-19 17:47:15.000000000 +0100
+++ linux-2.6.4/fs/fcntl.c 2004-03-20 13:29:29.000000000 +0100
@@ -282,6 +282,20 @@

EXPORT_SYMBOL(f_delown);

+static long fcntl_setcow(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (arg)
+ inode->i_flags |= S_COWLINK;
+ else
+ inode->i_flags &= ~S_COWLINK;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ return 0;
+}
+
static long do_fcntl(unsigned int fd, unsigned int cmd,
unsigned long arg, struct file * filp)
{
@@ -346,6 +360,13 @@
case F_NOTIFY:
err = fcntl_dirnotify(fd, filp, arg);
break;
+ case F_SETCOW:
+ err = fcntl_setcow(filp, arg);
+ break;
+ case F_GETCOW:
+ err = (filp->f_dentry->d_inode->i_flags & S_COWLINK) /
+ S_COWLINK;
+ break;
default:
break;
}
--- linux-2.6.4/fs/namei.c~cowlink 2004-03-19 17:47:19.000000000 +0100
+++ linux-2.6.4/fs/namei.c 2004-03-20 11:59:59.000000000 +0100
@@ -223,6 +223,40 @@
return security_inode_permission(inode, mask, nd);
}

+static inline void set_cowflag(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode->i_flags |= S_COWLINK;
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Files with the S_COWLINK flag set cannot be written to, if more
+ * than one hard link to them exists. Ultimately, this function
+ * should copy the inode, assign the copy to the dentry and lower use
+ * count of the old inode - one day.
+ * For now, it is sufficient to return an error and let userspace
+ * deal with the messy part. Not exactly the meaning of
+ * copy-on-write, but much better than writing to fifty files at once
+ * and noticing month later.
+ *
+ * Yes, this breaks the kernel interface and is simply wrong. This
+ * is intended behaviour, so Linus will not merge the code before
+ * it is complete. Or will he?
+ */
+static int break_cow_link(struct inode *inode)
+{
+ if (!(inode->i_flags & S_COWLINK))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (inode->i_nlink < 2)
+ return 0;
+ /* TODO: As soon as sendfile can do normal file copies, use that
+ * and always return 0 */
+ return -EMLINK;
+}
+
/*
* get_write_access() gets write permission for a file.
* put_write_access() releases this write permission.
@@ -243,7 +277,14 @@

int get_write_access(struct inode * inode)
{
+ int error;
+
spin_lock(&inode->i_lock);
+ error = break_cow_link(inode);
+ if (error) {
+ spin_unlock(&inode->i_lock);
+ return error;
+ }
if (atomic_read(&inode->i_writecount) < 0) {
spin_unlock(&inode->i_lock);
return -ETXTBSY;
@@ -1148,6 +1189,10 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_create(dir, dentry, mode);
+ if (dir->i_flags & S_COWLINK) {
+ set_cowflag(dentry->d_inode);
+ mark_inode_dirty_sync(dentry->d_inode);
+ }
}
return error;
}
@@ -1522,6 +1567,9 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_mkdir(dir,dentry, mode);
+ if (dir->i_flags & S_COWLINK) {
+ set_cowflag(dentry->d_inode);
+ }
}
return error;
}
@@ -1820,6 +1868,13 @@
return -EXDEV;

/*
+ * Cowlink attribute is inherited from directory, but here,
+ * the inode already has one. If they don't match, bail out.
+ */
+ if ((dir->i_flags ^ old_dentry->d_inode->i_flags) & S_COWLINK)
+ return -EMLINK;
+
+ /*
* A link to an append-only or immutable file cannot be created.
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
@@ -1997,6 +2052,24 @@
return error;
}

+static int cow_allow_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir)
+{
+ /* source and target share directory: allow */
+ if (old_dir == new_dir)
+ return 0;
+ /* source and target directory have identical cowlink flag: allow */
+ if (! ((old_dentry->d_inode->i_flags ^ new_dir->i_flags) & S_COWLINK))
+ return 0;
+ /* We could always fail here, but cowlink flag is only defined for
+ * files and directories, so let's allow special files */
+ if (!S_ISREG(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ return 0;
+}
+
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -2020,6 +2093,10 @@
if (!old_dir->i_op || !old_dir->i_op->rename)
return -EPERM;

+ error = cow_allow_rename(old_dir, old_dentry, new_dir);
+ if (error)
+ return error;
+
DQUOT_INIT(old_dir);
DQUOT_INIT(new_dir);

--- linux-2.6.4/fs/ext2/inode.c~cowlink 2004-03-19 17:44:02.000000000 +0100
+++ linux-2.6.4/fs/ext2/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -1020,6 +1020,7 @@
{
unsigned int flags = EXT2_I(inode)->i_flags;

+ inode->i_flags = flags;
inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
@@ -1191,7 +1192,7 @@

raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags = cpu_to_le32(inode->i_flags);
raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
raw_inode->i_frag = ei->i_frag_no;
raw_inode->i_fsize = ei->i_frag_size;
--- linux-2.6.4/fs/ext3/inode.c~cowlink 2004-03-19 17:44:02.000000000 +0100
+++ linux-2.6.4/fs/ext3/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -2447,6 +2447,7 @@
{
unsigned int flags = EXT3_I(inode)->i_flags;

+ inode->i_flags = flags;
inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT3_SYNC_FL)
inode->i_flags |= S_SYNC;
@@ -2629,7 +2630,8 @@
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags = cpu_to_le32((ei->i_flags & ~S_COWLINK) |
+ (inode->i_flags & S_COWLINK));
#ifdef EXT3_FRAGMENTS
raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
raw_inode->i_frag = ei->i_frag_no;

2004-03-20 15:03:10

by Patrick J. LoPresti

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Neat stuff! But...

J?rn Engel <[email protected]> writes:

> + * Files with the S_COWLINK flag set cannot be written to, if more
> + * than one hard link to them exists. Ultimately, this function
> + * should copy the inode, assign the copy to the dentry and lower use
> + * count of the old inode - one day.

What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?

I do not think you can implement this without changing the interface
to open(2). Which means applications have to be made aware of it
anyway. Which means you might as well leave your implementation as-is
and let userspace worry about creating the copy (and dealing with the
resulting errors).

- Pat

2004-03-20 15:23:44

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 20 March 2004 10:03:05 -0500, Patrick J. LoPresti wrote:
>
> What happens if the disk fills while you are making the copy? Will
> open(2) on an *existing file* then return ENOSPC?

Correct. It would be possible to always succeed and return -ENOSPC
on every write(). But then mmap() has the same problem again, so
serious headache would be the only gain from this little excercise.

> I do not think you can implement this without changing the interface
> to open(2). Which means applications have to be made aware of it
> anyway. Which means you might as well leave your implementation as-is
> and let userspace worry about creating the copy (and dealing with the
> resulting errors).

Good point. Vote is now 2:0 for the simple approach.

J?rn

--
"Translations are and will always be problematic. They inflict violence
upon two languages." (translation from German)

2004-03-20 16:48:48

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On 20 Mar 2004, Patrick J. LoPresti wrote:

> What happens if the disk fills while you are making the copy? Will
> open(2) on an *existing file* then return ENOSPC?
>
> I do not think you can implement this without changing the interface
> to open(2). Which means applications have to be made aware of it
> anyway. Which means you might as well leave your implementation as-is
> and let userspace worry about creating the copy (and dealing with the
> resulting errors).

FWIW I did this quite some time ago to speed up copy+diff linux kernel
trees:

http://www.xmailserver.org/flcow.html

It is entirely userspace and uses LD_PRELOAD on my dev shell.



- Davide


2004-03-20 19:28:32

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel <[email protected]> wrote:
>
> > i_lock is an innermost lock. No locks should be taken inside i_lock.
> >
> > Here, not only is inode_lock being taken inside i_lock but ->dirty_inode
> > may be called as well, and dirty_inode() may not be called under any
> > spinlock.
>
> Is it enough to move the mark_inode_dirty outside of i_lock?

yup. Now you're using i_lock to protect i_flags (which seems reasonable)
you'll need to hnt down all the other i_flags users and make them use
i_lock too. Currently they appear to be using i_sem.

2004-03-21 12:48:30

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 20 March 2004 11:28:28 -0800, Andrew Morton wrote:
>
> yup. Now you're using i_lock to protect i_flags (which seems reasonable)
> you'll need to hnt down all the other i_flags users and make them use
> i_lock too. Currently they appear to be using i_sem.

Interesting task. A few of those users are filesystems that fill
their own flags into inode->i_flags. Here is an example from
fs/ext3/ialloc.c:

static int find_group_orlov(struct super_block *sb, struct inode *parent)
{
...
if ((parent == sb->s_root->d_inode) ||
(parent->i_flags & EXT3_TOPDIR_FL)) {

Shouldn't this rather be:

if ((parent == sb->s_root->d_inode) ||
(EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {

J?rn

--
And spam is a useful source of entropy for /dev/random too!
-- Jasmine Strong

2004-03-21 12:57:46

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 20 March 2004 08:48:43 -0800, Davide Libenzi wrote:
> On 20 Mar 2004, Patrick J. LoPresti wrote:
>
> > What happens if the disk fills while you are making the copy? Will
> > open(2) on an *existing file* then return ENOSPC?
> >
> > I do not think you can implement this without changing the interface
> > to open(2). Which means applications have to be made aware of it
> > anyway. Which means you might as well leave your implementation as-is
> > and let userspace worry about creating the copy (and dealing with the
> > resulting errors).
>
> FWIW I did this quite some time ago to speed up copy+diff linux kernel
> trees:
>
> http://www.xmailserver.org/flcow.html
>
> It is entirely userspace and uses LD_PRELOAD on my dev shell.

Nice work. I was thinking about something like that as an
intermediate solution (my goal is libc inclusion), just with slightly
different checks:

int ret = open(...);
if (ret == -EMLINK)
ret = cow_open(...);
return ret;

J?rn

--
He that composes himself is wiser than he that composes a book.
-- B. Franklin

2004-03-21 17:59:41

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sun, 21 Mar 2004, J?rn Engel wrote:

> On Sat, 20 March 2004 08:48:43 -0800, Davide Libenzi wrote:
> >
> > FWIW I did this quite some time ago to speed up copy+diff linux kernel
> > trees:
> >
> > http://www.xmailserver.org/flcow.html
> >
> > It is entirely userspace and uses LD_PRELOAD on my dev shell.
>
> Nice work. I was thinking about something like that as an
> intermediate solution (my goal is libc inclusion), just with slightly
> different checks:
>
> int ret = open(...);
> if (ret == -EMLINK)
> ret = cow_open(...);
> return ret;

When I did that, fumes of an in-kernel implementation invaded my head for
a little while. Then you start thinking that you have to teach apps of new
open(2) semantics, you have to bloat kernel code a little bit and you have
to deal with a new set of errors cases that open(2) is not expected to
deal with. A fully userspace implementation did fit my needs at that time,
even if the LD_PRELOAD trick might break if weak aliases setup for open
functions change inside glibc.



- Davide


2004-03-21 18:14:51

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sun, 21 March 2004 09:59:39 -0800, Davide Libenzi wrote:
>
> When I did that, fumes of an in-kernel implementation invaded my head for
> a little while. Then you start thinking that you have to teach apps of new
> open(2) semantics, you have to bloat kernel code a little bit and you have
> to deal with a new set of errors cases that open(2) is not expected to
> deal with. A fully userspace implementation did fit my needs at that time,
> even if the LD_PRELOAD trick might break if weak aliases setup for open
> functions change inside glibc.

209 fairly simple lines definitely have more appear than a full
in-kernel implementation with many new corner-cases, yes. But it
looks as if you ignore the -ENOSPC case, so you cheated a little. ;)

No matter how you try, there is no way around an additional return
code for open(), so we have to break compatibility anyway. The good
news is that a) people not using this feature won't notice and b) all
programs I tried so far can deal with the problem. Vim even has a
decent error message - as if my patch was anticipated already.

J?rn

--
Everything should be made as simple as possible, but not simpler.
-- Albert Einstein

2004-03-21 18:54:07

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 20 March 2004 11:28:28 -0800, Andrew Morton wrote:
>
> yup. Now you're using i_lock to protect i_flags (which seems reasonable)
> you'll need to hnt down all the other i_flags users and make them use
> i_lock too. Currently they appear to be using i_sem.

Or in some cases nothing at all. Here is the patch against 2.6.4,
sans five non-trivial cases. I'll have to take a closer look at those
first.

J?rn

--
To recognize individual spam features you have to try to get into the
mind of the spammer, and frankly I want to spend as little time inside
the minds of spammers as possible.
-- Paul Graham

Omissions:
o fs/befs/linuxvfs.c abused i_flags, needs closer inspection
o fs/ext2/ialloc.c dito
o fs/ext3/ialloc.c dito
o fs/smbfs/file.c debug output - harmless
o fs/dquot.c vfs_quota_on() is racy, needs a better fix

drivers/usb/core/inode.c | 2 ++
fs/dquot.c | 9 ++++++++-
fs/ext2/ialloc.c | 2 ++
fs/ext2/inode.c | 2 ++
fs/ext3/ialloc.c | 2 ++
fs/ext3/inode.c | 2 ++
fs/fat/inode.c | 5 ++++-
fs/hfs/inode.c | 2 +-
fs/hfsplus/catalog.c | 4 ++--
fs/hfsplus/dir.c | 10 ++++++++--
fs/hfsplus/inode.c | 6 +++++-
fs/hfsplus/ioctl.c | 2 ++
fs/intermezzo/vfs.c | 7 ++++++-
fs/namei.c | 10 ++++++++--
fs/nfs/inode.c | 2 ++
fs/proc/base.c | 4 ++++
fs/reiserfs/inode.c | 9 ++++++++-
fs/udf/ialloc.c | 2 ++
fs/ufs/ialloc.c | 2 ++
fs/xfs/linux/xfs_ioctl.c | 2 +-
fs/xfs/linux/xfs_super.c | 2 ++
fs/xfs/linux/xfs_vnode.c | 2 ++
fs/xfs/xfs_acl.c | 2 +-
fs/xfs/xfs_cap.c | 2 +-
include/linux/fs.h | 30 ++++++++++++++++++++----------
25 files changed, 99 insertions(+), 25 deletions(-)

--- linux-2.6.4/include/linux/fs.h~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/include/linux/fs.h 2004-03-21 17:50:38.000000000 +0100
@@ -153,23 +153,24 @@
*/
#define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))

-#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
+#define IS_RDONLY(inode) (__IS_FLG(inode, MS_RDONLY))
#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \
- ((inode)->i_flags & S_SYNC))
+ inode_flags((inode), S_SYNC))
#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
- ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
+ inode_flags((inode), (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)

-#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA)
-#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
-#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
-#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
-#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
+#define IS_QUOTAINIT(inode) (inode_flags((inode), S_QUOTA))
+#define IS_NOQUOTA(inode) (inode_flags((inode), S_NOQUOTA))
+#define IS_APPEND(inode) (inode_flags((inode), S_APPEND))
+#define IS_IMMUTABLE(inode) (inode_flags((inode), S_IMMUTABLE))
+#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || \
+ inode_flags((inode), S_NOATIME))
#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME)
#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
#define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND)

-#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
+#define IS_DEADDIR(inode) (inode_flags((inode), S_DEAD))

/* the read-only stuff doesn't really belong here, but any other place is
probably as bad and I don't want to create yet another include file. */
@@ -414,7 +415,7 @@

unsigned long i_state;

- unsigned int i_flags;
+ unsigned int i_flags; /* protected by i_lock */
unsigned char i_sock;

atomic_t i_writecount;
@@ -428,6 +429,15 @@
#endif
};

+static inline unsigned int inode_flags(struct inode *inode, unsigned int mask)
+{
+ int ret;
+ spin_lock(inode->i_lock);
+ ret = inode->i_flags & mask;
+ spin_unlock(inode->i_lock);
+ return ret;
+}
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
--- linux-2.6.4/drivers/usb/core/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/drivers/usb/core/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -277,7 +277,9 @@
if (usbfs_empty(dentry)) {
dentry->d_inode->i_nlink -= 2;
dput(dentry);
+ spin_lock(inode->i_lock);
inode->i_flags |= S_DEAD;
+ spin_unlock(inode->i_lock);
dir->i_nlink--;
error = 0;
}
--- linux-2.6.4/fs/xfs/linux/xfs_vnode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/xfs/linux/xfs_vnode.c 2004-03-21 17:44:03.000000000 +0100
@@ -213,6 +213,7 @@
inode->i_mtime = va.va_mtime;
inode->i_ctime = va.va_ctime;
inode->i_atime = va.va_atime;
+ spin_lock(inode->i_lock);
if (va.va_xflags & XFS_XFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
@@ -229,6 +230,7 @@
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
+ spin_unlock(inode->i_lock);
VUNMODIFY(vp);
}
return -error;
--- linux-2.6.4/fs/xfs/linux/xfs_ioctl.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/xfs/linux/xfs_ioctl.c 2004-03-21 17:44:03.000000000 +0100
@@ -879,7 +879,7 @@
int attr_flags = 0;
int error;

- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+ if (inode_flags(vp->v_inode, (S_IMMUTABLE|S_APPEND)))
return -XFS_ERROR(EPERM);

if (filp->f_flags & O_RDONLY)
--- linux-2.6.4/fs/xfs/linux/xfs_super.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/xfs/linux/xfs_super.c 2004-03-21 17:44:03.000000000 +0100
@@ -187,6 +187,7 @@
inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+ spin_lock(inode->i_lock);
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
@@ -203,6 +204,7 @@
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
+ spin_unlock(inode->i_lock);
vp->v_flag &= ~VMODIFIED;
}

--- linux-2.6.4/fs/xfs/xfs_acl.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/xfs/xfs_acl.c 2004-03-21 17:44:03.000000000 +0100
@@ -388,7 +388,7 @@
vattr_t va;
int error;

- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+ if (inode_flags(&vp->v_inode, (S_IMMUTABLE|S_APPEND)))
return EPERM;
if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
return ENOTDIR;
--- linux-2.6.4/fs/xfs/xfs_cap.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/xfs/xfs_cap.c 2004-03-21 17:44:03.000000000 +0100
@@ -192,7 +192,7 @@

if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
return EROFS;
- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+ if (inode_flags(&vp->v_inode, (S_IMMUTABLE|S_APPEND)))
return EPERM;
if ((error = _MAC_VACCESS(vp, NULL, VWRITE)))
return error;
--- linux-2.6.4/fs/nfs/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/nfs/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -729,7 +729,9 @@
inode->i_ino = hash;

/* We can't support update_atime(), since the server will reset it */
+ spin_lock(inode->i_lock);
inode->i_flags |= S_NOATIME;
+ spin_unlock(inode->i_lock);
inode->i_mode = fattr->mode;
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
--- linux-2.6.4/fs/ufs/ialloc.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/ufs/ialloc.c 2004-03-21 17:44:03.000000000 +0100
@@ -283,7 +283,9 @@

if (DQUOT_ALLOC_INODE(inode)) {
DQUOT_DROP(inode);
+ spin_lock(inode->i_lock);
inode->i_flags |= S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink = 0;
iput(inode);
return ERR_PTR(-EDQUOT);
--- linux-2.6.4/fs/hfs/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/hfs/inode.c 2004-03-21 18:50:48.000000000 +0100
@@ -540,7 +540,7 @@
if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
down(&inode->i_sem);
hfs_file_truncate(inode);
- //if (inode->i_flags & S_DEAD) {
+ //if (IS_DEADDIR(inode) {
// hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
// hfs_delete_inode(inode);
//}
--- linux-2.6.4/fs/intermezzo/vfs.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/intermezzo/vfs.c 2004-03-21 18:45:10.000000000 +0100
@@ -1445,7 +1445,9 @@
error = iops->rmdir(dir->d_inode, dentry);
unlock_kernel();
if (!error) {
+ spin_lock(dentry->d_inode->i_lock);
dentry->d_inode->i_flags |= S_DEAD;
+ spin_unlock(dentry->d_inode->i_lock);
error = presto_settime(fset, NULL, NULL, dir, info,
ATTR_CTIME | ATTR_MTIME);
}
@@ -1842,8 +1844,11 @@
error = do_rename(fset, old_parent, old_dentry,
new_parent, new_dentry, info);
if (target) {
- if (!error)
+ if (!error) {
+ spin_lock(target);
target->i_flags |= S_DEAD;
+ spin_unlock(target);
+ }
// triple_up(&old_dir->i_zombie,
// &new_dir->i_zombie,
// &target->i_zombie);
--- linux-2.6.4/fs/udf/ialloc.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/udf/ialloc.c 2004-03-21 17:44:03.000000000 +0100
@@ -165,7 +165,9 @@
if (DQUOT_ALLOC_INODE(inode))
{
DQUOT_DROP(inode);
+ spin_lock(inode->i_lock);
inode->i_flags |= S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink = 0;
iput(inode);
*err = -EDQUOT;
--- linux-2.6.4/fs/reiserfs/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/reiserfs/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -1612,8 +1612,11 @@
/* uid and gid must already be set by the caller for quota init */

/* symlink cannot be immutable or append only, right? */
- if( S_ISLNK( inode -> i_mode ) )
+ if( S_ISLNK( inode -> i_mode ) ) {
+ spin_lock(inode->i_lock);
inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
+ spin_unlock(inode->i_lock);
+ }

inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_size = i_size;
@@ -2287,6 +2290,7 @@
void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
{
if( reiserfs_attrs( inode -> i_sb ) ) {
+ spin_lock(inode->i_lock);
if( sd_attrs & REISERFS_SYNC_FL )
inode -> i_flags |= S_SYNC;
else
@@ -2307,12 +2311,14 @@
REISERFS_I(inode)->i_flags |= i_nopack_mask;
else
REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
+ spin_unlock(inode->i_lock);
}
}

void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
{
if( reiserfs_attrs( inode -> i_sb ) ) {
+ spin_lock(inode->i_lock);
if( inode -> i_flags & S_IMMUTABLE )
*sd_attrs |= REISERFS_IMMUTABLE_FL;
else
@@ -2329,6 +2335,7 @@
*sd_attrs |= REISERFS_NOTAIL_FL;
else
*sd_attrs &= ~REISERFS_NOTAIL_FL;
+ spin_unlock(inode->i_lock);
}
}

--- linux-2.6.4/fs/fat/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/fat/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -1192,8 +1192,11 @@
MSDOS_I(inode)->mmu_private = inode->i_size;
}
if(de->attr & ATTR_SYS)
- if (sbi->options.sys_immutable)
+ if (sbi->options.sys_immutable) {
+ spin_lock(inode->i_lock);
inode->i_flags |= S_IMMUTABLE;
+ spin_unlock(inode->i_lock);
+ }
MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
/* this is as close to the truth as we can get ... */
inode->i_blksize = sbi->cluster_size;
--- linux-2.6.4/fs/ext3/ialloc.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/ext3/ialloc.c 2004-03-21 17:44:03.000000000 +0100
@@ -627,7 +627,9 @@
return ret;

fail2:
+ spin_lock(inode->i_lock);
inode->i_flags |= S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink = 0;
iput(inode);
brelse(bitmap_bh);
--- linux-2.6.4/fs/ext3/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/ext3/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -2447,6 +2447,7 @@
{
unsigned int flags = EXT3_I(inode)->i_flags;

+ spin_lock(inode->i_lock);
inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT3_SYNC_FL)
inode->i_flags |= S_SYNC;
@@ -2458,6 +2459,7 @@
inode->i_flags |= S_NOATIME;
if (flags & EXT3_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
+ spin_unlock(inode->i_lock);
}

void ext3_read_inode(struct inode * inode)
--- linux-2.6.4/fs/proc/base.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/proc/base.c 2004-03-21 17:44:03.000000000 +0100
@@ -1594,7 +1594,9 @@
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_nlink = 3;
+ spin_lock(inode->i_lock);
inode->i_flags|=S_IMMUTABLE;
+ spin_unlock(inode->i_lock);

dentry->d_op = &pid_base_dentry_operations;

@@ -1649,7 +1651,9 @@
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
inode->i_nlink = 3;
+ spin_lock(inode->i_lock);
inode->i_flags|=S_IMMUTABLE;
+ spin_unlock(inode->i_lock);

dentry->d_op = &pid_base_dentry_operations;

--- linux-2.6.4/fs/ext2/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/ext2/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -1020,6 +1020,7 @@
{
unsigned int flags = EXT2_I(inode)->i_flags;

+ spin_lock(inode->i_lock);
inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
@@ -1031,6 +1032,7 @@
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
+ spin_unlock(inode->i_lock);
}

void ext2_read_inode (struct inode * inode)
--- linux-2.6.4/fs/ext2/ialloc.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/ext2/ialloc.c 2004-03-21 18:43:39.000000000 +0100
@@ -620,7 +620,9 @@
return inode;

fail2:
+ spin_lock(inode->i_lock);
inode->i_flags |= S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink = 0;
iput(inode);
return ERR_PTR(err);
--- linux-2.6.4/fs/namei.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/namei.c 2004-03-21 18:33:10.000000000 +0100
@@ -1609,8 +1609,11 @@
error = security_inode_rmdir(dir, dentry);
if (!error) {
error = dir->i_op->rmdir(dir, dentry);
- if (!error)
+ if (!error) {
+ spin_lock(dentry->d_inode->i_lock);
dentry->d_inode->i_flags |= S_DEAD;
+ spin_unlock(dentry->d_inode->i_lock);
+ }
}
}
up(&dentry->d_inode->i_sem);
@@ -1952,8 +1955,11 @@
else
error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
if (target) {
- if (!error)
+ if (!error) {
+ spin_lock(target->i_lock);
target->i_flags |= S_DEAD;
+ spin_unlock(target->i_lock);
+ }
up(&target->i_sem);
if (d_unhashed(new_dentry))
d_rehash(new_dentry);
--- linux-2.6.4/fs/dquot.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/dquot.c 2004-03-21 17:44:03.000000000 +0100
@@ -573,7 +573,9 @@
if (inode->i_dquot[cnt] != NODQUOT)
goto put_it;
}
+ spin_lock(inode->i_lock);
inode->i_flags &= ~S_QUOTA;
+ spin_unlock(inode->i_lock);
put_it:
if (dquot != NODQUOT) {
if (dqput_blocks(dquot)) {
@@ -824,8 +826,11 @@
break;
}
inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt);
- if (inode->i_dquot[cnt])
+ if (inode->i_dquot[cnt]) {
+ spin_lock(inode->i_lock);
inode->i_flags |= S_QUOTA;
+ spin_unlock(inode->i_lock);
+ }
}
}
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -839,7 +844,9 @@
{
int cnt;

+ spin_lock(inode->i_lock);
inode->i_flags &= ~S_QUOTA;
+ spin_unlock(inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
to_drop[cnt] = inode->i_dquot[cnt];
inode->i_dquot[cnt] = NODQUOT;
--- linux-2.6.4/fs/hfsplus/catalog.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/hfsplus/catalog.c 2004-03-21 18:49:52.000000000 +0100
@@ -54,11 +54,11 @@

static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
{
- if (inode->i_flags & S_IMMUTABLE)
+ if (IS_IMMUTABLE(inode))
perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
else
perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
- if (inode->i_flags & S_APPEND)
+ if (IS_APPEND(inode))
perms->rootflags |= HFSPLUS_FLG_APPEND;
else
perms->rootflags &= ~HFSPLUS_FLG_APPEND;
--- linux-2.6.4/fs/hfsplus/dir.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/hfsplus/dir.c 2004-03-21 17:44:03.000000000 +0100
@@ -315,8 +315,11 @@
res = hfsplus_rename_cat(inode->i_ino,
dir, &dentry->d_name,
HFSPLUS_SB(sb).hidden_dir, &str);
- if (!res)
+ if (!res) {
+ spin_lock(inode->i_lock);
inode->i_flags |= S_DEAD;
+ spin_unlock(inode->i_lock);
+ }
return res;
}
res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
@@ -330,8 +333,11 @@
res = hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
if (!res)
hfsplus_delete_inode(inode);
- } else
+ } else {
+ spin_lock(inode->i_lock);
inode->i_flags |= S_DEAD;
+ spin_unlock(inode->i_lock);
+ }
}
inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
--- linux-2.6.4/fs/hfsplus/inode.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/hfsplus/inode.c 2004-03-21 18:48:11.000000000 +0100
@@ -225,6 +225,7 @@

HFSPLUS_I(inode).rootflags = perms->rootflags;
HFSPLUS_I(inode).userflags = perms->userflags;
+ spin_lock(inode->i_lock);
if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
@@ -233,10 +234,12 @@
inode->i_flags |= S_APPEND;
else
inode->i_flags &= ~S_APPEND;
+ spin_unlock(inode->i_lock);
}

static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
{
+ spin_lock(inode->i_lock);
if (inode->i_flags & S_IMMUTABLE)
perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
else
@@ -245,6 +248,7 @@
perms->rootflags |= HFSPLUS_FLG_APPEND;
else
perms->rootflags &= ~HFSPLUS_FLG_APPEND;
+ spin_unlock(inode->i_lock);
perms->userflags = HFSPLUS_I(inode).userflags;
perms->mode = cpu_to_be16(inode->i_mode);
perms->owner = cpu_to_be32(inode->i_uid);
@@ -285,7 +289,7 @@
if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
down(&inode->i_sem);
hfsplus_file_truncate(inode);
- if (inode->i_flags & S_DEAD) {
+ if (IS_DEADDIR(inode)) {
hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
hfsplus_delete_inode(inode);
}
--- linux-2.6.4/fs/hfsplus/ioctl.c~lock_flags 2004-03-21 17:43:58.000000000 +0100
+++ linux-2.6.4/fs/hfsplus/ioctl.c 2004-03-21 17:44:03.000000000 +0100
@@ -53,6 +53,7 @@
EXT2_FLAG_NODUMP))
return -EOPNOTSUPP;

+ spin_lock(inode->i_lock);
if (flags & EXT2_FLAG_IMMUTABLE) { /* EXT2_IMMUTABLE_FL */
inode->i_flags |= S_IMMUTABLE;
HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -67,6 +68,7 @@
inode->i_flags &= ~S_APPEND;
HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
}
+ spin_unlock(inode->i_lock);
if (flags & EXT2_FLAG_NODUMP) /* EXT2_NODUMP_FL */
HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
else

2004-03-21 20:27:08

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sun, 21 Mar 2004, J?rn Engel wrote:

> On Sun, 21 March 2004 09:59:39 -0800, Davide Libenzi wrote:
> >
> > When I did that, fumes of an in-kernel implementation invaded my head for
> > a little while. Then you start thinking that you have to teach apps of new
> > open(2) semantics, you have to bloat kernel code a little bit and you have
> > to deal with a new set of errors cases that open(2) is not expected to
> > deal with. A fully userspace implementation did fit my needs at that time,
> > even if the LD_PRELOAD trick might break if weak aliases setup for open
> > functions change inside glibc.
>
> 209 fairly simple lines definitely have more appear than a full
> in-kernel implementation with many new corner-cases, yes. But it
> looks as if you ignore the -ENOSPC case, so you cheated a little. ;)

Yes, at that time I preferred to fall back to the caller open(2) if any
error happened during the COW (a more picky implementation would simply
bounce an error to the application). Look BTW that there is a difference
between the error handling when you have an in-kernel solution or a
completely userspace solution. If you push this inside the kernel you have
at least to define a new open(2) flag and a new set of errors that might
arise when doing the COW. You are basically changing the open(2) API. You
have also to "teach" apps to use the new flag, since obviously you cannot
make COW a default behavior. The fl-cow approach let you define a set of
paths where you want COW to happen (in my case typically /usr/src/lk), and
only apps writing to hard-linked files inside such path gets COWed. The
open(2) API does not change. OTOH there is the LD_PRELOAD trick that is
weak alias dependent.



- Davide



2004-03-21 20:36:24

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sun, 21 March 2004 12:26:57 -0800, Davide Libenzi wrote:
>
> Yes, at that time I preferred to fall back to the caller open(2) if any
> error happened during the COW (a more picky implementation would simply
> bounce an error to the application). Look BTW that there is a difference
> between the error handling when you have an in-kernel solution or a
> completely userspace solution. If you push this inside the kernel you have
> at least to define a new open(2) flag and a new set of errors that might
> arise when doing the COW. You are basically changing the open(2) API. You
> have also to "teach" apps to use the new flag, since obviously you cannot
> make COW a default behavior. The fl-cow approach let you define a set of
> paths where you want COW to happen (in my case typically /usr/src/lk), and
> only apps writing to hard-linked files inside such path gets COWed. The
> open(2) API does not change. OTOH there is the LD_PRELOAD trick that is
> weak alias dependent.

My "solution" to this paradox (leaving the interface unchanged, even
though cow is impossible without changes) is a new flag to files. The
user has to manually set the flag and is now in charge of anything
that might break. Scared? Don't set the flag.

It is a compromise, just like yours. Imo it is a little nicer, but
there just isn't any good solution anymore, 1967 would have been the
right time for that.

J?rn

--
To announce that there must be no criticism of the President, or that we
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918

2004-03-22 00:18:57

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel <[email protected]> writes:

> On Sun, 21 March 2004 09:59:39 -0800, Davide Libenzi wrote:
> >
> > When I did that, fumes of an in-kernel implementation invaded my head for
> > a little while. Then you start thinking that you have to teach apps of new
> > open(2) semantics, you have to bloat kernel code a little bit and you have
> > to deal with a new set of errors cases that open(2) is not expected to
> > deal with. A fully userspace implementation did fit my needs at that time,
> > even if the LD_PRELOAD trick might break if weak aliases setup for open
> > functions change inside glibc.
>
> 209 fairly simple lines definitely have more appear than a full
> in-kernel implementation with many new corner-cases, yes. But it
> looks as if you ignore the -ENOSPC case, so you cheated a little. ;)
>
> No matter how you try, there is no way around an additional return
> code for open(), so we have to break compatibility anyway. The good
> news is that a) people not using this feature won't notice and b) all
> programs I tried so far can deal with the problem. Vim even has a
> decent error message - as if my patch was anticipated already.

Actually there is... You don't do the copy until an actual write occurs.
Some files are opened read/write when there is simply the chance they might
be written to so delaying the copy is generally a win.

A coworker of mine implemented a version of this idea as a filesystem.
It did the copy in the kernel, it handled directories, and could be
used to atomically snapshot your filesystem. The only case that was
still a little sketchy was how do you handle cow to a file with hard
links.

The interesting case for us is when you have multiple machines sharing
the same root filesystem.

Eric

2004-03-22 00:25:12

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On 21 Mar 2004, Eric W. Biederman wrote:

> J?rn Engel <[email protected]> writes:
>
> > On Sun, 21 March 2004 09:59:39 -0800, Davide Libenzi wrote:
> > >
> > > When I did that, fumes of an in-kernel implementation invaded my head for
> > > a little while. Then you start thinking that you have to teach apps of new
> > > open(2) semantics, you have to bloat kernel code a little bit and you have
> > > to deal with a new set of errors cases that open(2) is not expected to
> > > deal with. A fully userspace implementation did fit my needs at that time,
> > > even if the LD_PRELOAD trick might break if weak aliases setup for open
> > > functions change inside glibc.
> >
> > 209 fairly simple lines definitely have more appear than a full
> > in-kernel implementation with many new corner-cases, yes. But it
> > looks as if you ignore the -ENOSPC case, so you cheated a little. ;)
> >
> > No matter how you try, there is no way around an additional return
> > code for open(), so we have to break compatibility anyway. The good
> > news is that a) people not using this feature won't notice and b) all
> > programs I tried so far can deal with the problem. Vim even has a
> > decent error message - as if my patch was anticipated already.
>
> Actually there is... You don't do the copy until an actual write occurs.
> Some files are opened read/write when there is simply the chance they might
> be written to so delaying the copy is generally a win.

What about open+mmap?



- Davide


2004-03-22 05:07:56

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Davide Libenzi <[email protected]> writes:

> On 21 Mar 2004, Eric W. Biederman wrote:
>
> > J?rn Engel <[email protected]> writes:
> >
> > > On Sun, 21 March 2004 09:59:39 -0800, Davide Libenzi wrote:
> > > >
> > > > When I did that, fumes of an in-kernel implementation invaded my head for
>
> > > > a little while. Then you start thinking that you have to teach apps of new
>
> > > > open(2) semantics, you have to bloat kernel code a little bit and you have
>
> > > > to deal with a new set of errors cases that open(2) is not expected to
> > > > deal with. A fully userspace implementation did fit my needs at that time,
>
> > > > even if the LD_PRELOAD trick might break if weak aliases setup for open
> > > > functions change inside glibc.
> > >
> > > 209 fairly simple lines definitely have more appear than a full
> > > in-kernel implementation with many new corner-cases, yes. But it
> > > looks as if you ignore the -ENOSPC case, so you cheated a little. ;)
> > >
> > > No matter how you try, there is no way around an additional return
> > > code for open(), so we have to break compatibility anyway. The good
> > > news is that a) people not using this feature won't notice and b) all
> > > programs I tried so far can deal with the problem. Vim even has a
> > > decent error message - as if my patch was anticipated already.
> >
> > Actually there is... You don't do the copy until an actual write occurs.
> > Some files are opened read/write when there is simply the chance they might
> > be written to so delaying the copy is generally a win.
>
> What about open+mmap?

The case is nothing really different from having a hole in your file.

There are two pieces to implementing this. First you create separate
page cache entries for the cow file and it's original, so the
laziness of mmapped file writes will not bite you.. Second you make
aops -> writepage trigger the actual copy of the file, and have it
return -ENOSPC if you can't do the copy.

If cow links became sufficiently common you might want to dig into
the VFS and make it possible to do the copy when a write-fault occurs.
At which point you could share the page cache until you do a copy.

Eric

2004-03-22 05:11:51

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On 21 Mar 2004, Eric W. Biederman wrote:

> Davide Libenzi <[email protected]> writes:
>
> > > Actually there is... You don't do the copy until an actual write occurs.
> > > Some files are opened read/write when there is simply the chance they might
> > > be written to so delaying the copy is generally a win.
> >
> > What about open+mmap?
>
> The case is nothing really different from having a hole in your file.
>
> There are two pieces to implementing this. First you create separate
> page cache entries for the cow file and it's original, so the
> laziness of mmapped file writes will not bite you.. Second you make
> aops -> writepage trigger the actual copy of the file, and have it
> return -ENOSPC if you can't do the copy.

There has been a misunderstanding. I thought you were talking about a
userspace solution ala fl-cow. Of course if you are inside the kernel you
can catch both explicit writes and page syncs.



- Davide


2004-03-22 11:20:36

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Davide Libenzi <[email protected]> writes:

> On 21 Mar 2004, Eric W. Biederman wrote:
>
> > Davide Libenzi <[email protected]> writes:
> >
> > > > Actually there is... You don't do the copy until an actual write occurs.
> > > > Some files are opened read/write when there is simply the chance they
> might
>
> > > > be written to so delaying the copy is generally a win.
> > >
> > > What about open+mmap?
> >
> > The case is nothing really different from having a hole in your file.
> >
> > There are two pieces to implementing this. First you create separate
> > page cache entries for the cow file and it's original, so the
> > laziness of mmapped file writes will not bite you.. Second you make
> > aops -> writepage trigger the actual copy of the file, and have it
> > return -ENOSPC if you can't do the copy.
>
> There has been a misunderstanding. I thought you were talking about a
> userspace solution ala fl-cow. Of course if you are inside the kernel you
> can catch both explicit writes and page syncs.

Right. Although there is nothing prevent the copy to be in user space
even with the trigger hooks down in the write path.

The nice features of having the hook at that point in the kernel are:
1) No new failure modes for user space to worry about.
2) With cow directory support instant fs level check pointing is achieved.

Eric

2004-03-22 16:02:32

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On 22 Mar 2004, Eric W. Biederman wrote:

> Davide Libenzi <[email protected]> writes:
>
> > There has been a misunderstanding. I thought you were talking about a
> > userspace solution ala fl-cow. Of course if you are inside the kernel you
> > can catch both explicit writes and page syncs.
>
> Right. Although there is nothing prevent the copy to be in user space
> even with the trigger hooks down in the write path.

How do you insert yourself before the first page fault to do the COW, from
userspace (open+mmap)? You can obviously hook mmap(2) and if a PROT_WRITE
is requested, you COW from there. But then you have an whole bunch of new
problems (again, when done from userspace) because, just to begin with,
you need a stateful interception layer (while fl-cow for example is stateless).
In my modest usage scenario for my fl-cow shell (emacs+patch+diff+gcc)
I've found that when something opens in RDWR, it really writes the file at
some point during the opened session. So moving the COW down in the write
path helps little or nothing. There may be as well other use cases where
applications do frequently open in RDWR even w/out ever touching the file.



- Davide


2004-03-25 17:49:54

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> Actually there is... You don't do the copy until an actual write occurs.
> Some files are opened read/write when there is simply the chance they might
> be written to so delaying the copy is generally a win.

Programs depend on the inode number returned by fstat() not changing,
and maybe in some other circumstances, even if they subsequently write
to the file.

(It's ok for open() to change the inode number, because that's
equivalent to another program changing the filesystem in parallel).

How do you handle that if COW occurs later than open()?
You could also force COW when fstat() is called, I suppose.

-- Jamie

2004-03-25 18:07:32

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > Actually there is... You don't do the copy until an actual write occurs.
> > Some files are opened read/write when there is simply the chance they might
> > be written to so delaying the copy is generally a win.
>
> Programs depend on the inode number returned by fstat() not changing,
> and maybe in some other circumstances, even if they subsequently write
> to the file.
>
> (It's ok for open() to change the inode number, because that's
> equivalent to another program changing the filesystem in parallel).
>
> How do you handle that if COW occurs later than open()?
> You could also force COW when fstat() is called, I suppose.

One of the rougher patches, in that we don't have persistent inode
numbers. Basically the two files never have the same inode number.
To the user they are always presented as two separate files.
Currently I believe the strategy is to assign an inode when the file
is read into the icache/dcache. I think it is just a sequential
counter.

This was all implemented as a stackable filesystem. Something
that gets down to the real filesystem could likely just reuse
the inode number of the cow link.

Eric

2004-03-25 19:43:14

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> One of the rougher patches, in that we don't have persistent inode
> numbers. Basically the two files never have the same inode number.
> To the user they are always presented as two separate files.

That is not useful for me or the other people who want to use this to
duplicate large source trees and run "diff" between trees.

"diff" depends on being able to check if files in the two trees are
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" between
two cloned trees the size of linux to be quite fast. Without that
optimisation, it's very slow indeed.

-- Jamie

2004-03-25 20:39:08

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2



On Thu, 25 Mar 2004, Jamie Lokier wrote:
>
> That is not useful for me or the other people who want to use this to
> duplicate large source trees and run "diff" between trees.
>
> "diff" depends on being able to check if files in the two trees are
> identical -- by checking whether the inode number and device (and
> maybe other stat data) are identical. This allows "diff -ur" between
> two cloned trees the size of linux to be quite fast. Without that
> optimisation, it's very slow indeed.

I think the correct thing to do is to just admit that cowlinks aren't
POSIX, and instead see the inode number as a way to see whether the link
has been broken or not. Ie just accept the inode number potentially
changing.

That would make "diff" (adn most other uses) ok with this, and anythign
that isn't, just couldn't be used with cowlinked files.

Linus

2004-03-25 21:46:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > One of the rougher patches, in that we don't have persistent inode
> > numbers. Basically the two files never have the same inode number.
> > To the user they are always presented as two separate files.
>
> That is not useful for me or the other people who want to use this to
> duplicate large source trees and run "diff" between trees.
>
> "diff" depends on being able to check if files in the two trees are
> identical -- by checking whether the inode number and device (and
> maybe other stat data) are identical. This allows "diff -ur" between
> two cloned trees the size of linux to be quite fast. Without that
> optimisation, it's very slow indeed.


In the case where cow is implemented as a stackable filesystem it is
easy to discover the changes by looking at the underlying fs instead
of at the cow view. If the file was not changed the file was not
copied.

The reason for the late copy is some programs open files O_RDRW and
only read the file. If those triggered a copy on write when you
opened the file, diff would still need to go to the work of manually
comparing the files.

The underlying idea of copy on write is that you have separate files
that happen to be storing the same data, in the same space. Anytime
you deviate from that is when you are going to get surprised.

The case I care about is sharing the same root filesystem on different
machines, and that must look like 2 separate filesystems.

It is easy to add something like a cowstat or a readcowlink and teach
the few programs that care (i.e. diff, tar,...) how to use it. So I
would rather concentrate on making cow links look like a separate copy
than early optimizations.

Eric

2004-03-25 22:19:32

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Linus Torvalds <[email protected]> writes:

> On Thu, 25 Mar 2004, Jamie Lokier wrote:
> >
> > That is not useful for me or the other people who want to use this to
> > duplicate large source trees and run "diff" between trees.
> >
> > "diff" depends on being able to check if files in the two trees are
> > identical -- by checking whether the inode number and device (and
> > maybe other stat data) are identical. This allows "diff -ur" between
> > two cloned trees the size of linux to be quite fast. Without that
> > optimisation, it's very slow indeed.
>
> I think the correct thing to do is to just admit that cowlinks aren't
> POSIX, and instead see the inode number as a way to see whether the link
> has been broken or not. Ie just accept the inode number potentially
> changing.
>
> That would make "diff" (adn most other uses) ok with this, and anythign
> that isn't, just couldn't be used with cowlinked files.

Really?

tar and cp still need to be taught about them. And if they are not taught
they will do the wrong thing and hard link the files removing the
copy on write semantics. Which would do ugly thing when you restore
from your backup.

I don't have a problem with the inode number changing when you write to
a file, because I can't think of much that would care either way.
But having the inode number of an open file change sounds like a very
difficult problem to track.

Maybe aiming cow links at things like a live cd filesystem is too
ambitious, but it sounds like a minimal clean way to handle all of
the dependencies on writeable files that show up.

Eric

2004-03-27 10:28:42

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> It is easy to add something like a cowstat or a readcowlink and teach
> the few programs that care (i.e. diff, tar,...) how to use it. So I
> would rather concentrate on making cow links look like a separate copy
> than early optimizations.

I agree, having each cowlink look like a separate copy, with separate
inode numbers, is best. That _is_ POSIX compatible -- the sharing is
just a storage optimisation, and programs which only use the POSIX API
won't see the difference.

I have no problem with adding cowstat() to diff, and I'm sure other
people will eventually extend rsync and tar to use it, if it becomes
widely used.

It's not the simplest solution, though. The filesystem changes are
non-trivial. (The simplest solution is just an ext2 attribute which
says you can't write to the file if it has >1 links).

-- Jamie

2004-03-27 21:01:16

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > It is easy to add something like a cowstat or a readcowlink and teach
> > the few programs that care (i.e. diff, tar,...) how to use it. So I
> > would rather concentrate on making cow links look like a separate copy
> > than early optimizations.
>
> I agree, having each cowlink look like a separate copy, with separate
> inode numbers, is best. That _is_ POSIX compatible -- the sharing is
> just a storage optimisation, and programs which only use the POSIX API
> won't see the difference.
>
> I have no problem with adding cowstat() to diff, and I'm sure other
> people will eventually extend rsync and tar to use it, if it becomes
> widely used.
>
> It's not the simplest solution, though. The filesystem changes are
> non-trivial. (The simplest solution is just an ext2 attribute which
> says you can't write to the file if it has >1 links).

There are two possible implementations strategies for implementing
cow files. You can either start as J?rn did with hardlinks, or you
can start with symlinks.

The set of tradeoffs is interesting. When using hardlinks the only
sane thing to do is to change the inode number when you do a copy.
You are limited to normal files, no directories, no symlinks, and the
original must resided on the same filesystem as the copy. In addition
a copy will always have a link count of one. So on that score I
see a hard time getting POSIX semantics out of the a hard link based
cow.

When you start with symlinks the tradeoffs are different. You only
trigger a copy on write when you go through the copy not when you
write through the original. You get distinct inodes for free. They
can be straight forwardly extended to work on symlinks and other node
types. You can have multiple links at the end of the copy, because
symlinks can be hard linked. The original can be stored on another
filesystem. If you don't change the original you get POSIX semantics.

As for simplicity I think the two approaches are roughly equal.

As my memory has it the proto implementation I saw using a stackable
fs and cow symlinks was about a 1000 lines. And it was that large
because it was complete (i.e. it did the copies including copying
directories.)

Eric

2004-03-27 21:42:56

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> There are two possible implementations strategies for implementing
> cow files. You can either start as J?rn did with hardlinks, or you
> can start with symlinks.

Symlinks have a _big_ problem: move one, or move it's target, and it
no longer links to the same file. That's nothing like the
transparency we need cowlinks to have.

There's a third implementation strategy. Since we're talking in all
cases about adding a new feature to the underlying filesystem, why not
implement separate inodes pointing to an underlying shared inode which
holds the data. (I think it was mentioned earlier in this thread).

The implementation is very similar to symbolic links, but instead of
having symlink inodes, you have cowlink inodes which point directly to
another inode and count as references to it.

That provides POSIX semantics and has none of the caveats you and I
have mentioned for hard links and symbolic links.

Implementation: Creating a cowlink to a non-cowlinked file creates a
new shared inode by cloning the file's inode, converts the original
inode to a cowlink-pointer inode, and creates a new cowlink-pointer
inode.

This provides 100% semantic equivalence to copying: all operations
including hard and symbolic links on the resulting cowlink files act
as if the cowlink operation was a copy, but faster and using less
space.

> As my memory has it the proto implementation I saw using a stackable
> fs and cow symlinks was about a 1000 lines. And it was that large
> because it was complete (i.e. it did the copies including copying
> directories.)

I can see a stackable fs being useful for live CD distributions, where
tmpfs or disk hold the modifications stacked over the original
filesystem.

But mounting an fs for each version of a source tree and keeping track
of all those mounts: that would be incredibly clumsy to use.

You could implement a stackable fs which is mounted once and provides
cowlink operations within the fs. That still be a bit clumsy, but not
so much as to make it unusable for source tree management.

-- Jamie

2004-03-27 23:46:34

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > There are two possible implementations strategies for implementing
> > cow files. You can either start as J?rn did with hardlinks, or you
> > can start with symlinks.
>
> Symlinks have a _big_ problem: move one, or move it's target, and it
> no longer links to the same file. That's nothing like the
> transparency we need cowlinks to have.

With absolute paths you can move the symlink. But the other side
is still problematic.

> There's a third implementation strategy. Since we're talking in all
> cases about adding a new feature to the underlying filesystem, why not
> implement separate inodes pointing to an underlying shared inode which
> holds the data. (I think it was mentioned earlier in this thread).
>
> The implementation is very similar to symbolic links, but instead of
> having symlink inodes, you have cowlink inodes which point directly to
> another inode and count as references to it.
>
> That provides POSIX semantics and has none of the caveats you and I
> have mentioned for hard links and symbolic links.
>
> Implementation: Creating a cowlink to a non-cowlinked file creates a
> new shared inode by cloning the file's inode, converts the original
> inode to a cowlink-pointer inode, and creates a new cowlink-pointer
> inode.

And to state the obvious, you must make certain your fs has a lot of
inodes because this dramatically changes the inode to fs block ratio.

> This provides 100% semantic equivalence to copying: all operations
> including hard and symbolic links on the resulting cowlink files act
> as if the cowlink operation was a copy, but faster and using less
> space.

I like it. It's not too hard and it yields a very nice result.
syscall wise we would need:
int cowlink(const char *oldpath, const char *newpath);
int cstat(const char *filename, struct stat *buf);

Which of course are non POSIX :)

> > As my memory has it the proto implementation I saw using a stackable
> > fs and cow symlinks was about a 1000 lines. And it was that large
> > because it was complete (i.e. it did the copies including copying
> > directories.)
>
> I can see a stackable fs being useful for live CD distributions, where
> tmpfs or disk hold the modifications stacked over the original
> filesystem.

The practical use was getting root on a network filesystem to scale,
and be manageable. But the problem is essentially the same. Although
to guard against network congestion interacting badly with paging we
were actually thinking about implementing copy on read.

> But mounting an fs for each version of a source tree and keeping track
> of all those mounts: that would be incredibly clumsy to use.
>
> You could implement a stackable fs which is mounted once and provides
> cowlink operations within the fs. That still be a bit clumsy, but not
> so much as to make it unusable for source tree management.

And that is what the prototype we were playing with was doing. You
could not see the original files at all unless you mounted the base
filesystem somewhere outside the cowfs mount. It was actually quite
close to your special cow inode idea except that it stored data in
symlinks instead of inodes. To the extent of freeing files if all of
the cow links to them were removed, and the files were store on the
same filesystem as the cow links.

The addictive thing about the prototype implementation was that you
could do ``ln --cow / /some/other/directory'' and you would have an
atomic snapshot of your filesystem. Definitely not a feature for the
first implementation but certainly something to dream about.

Eric

2004-03-28 00:43:34

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

[email protected] (Eric W. Biederman) writes:

> The addictive thing about the prototype implementation was that you
> could do ``ln --cow / /some/other/directory'' and you would have an
> atomic snapshot of your filesystem. Definitely not a feature for the
> first implementation but certainly something to dream about.

Addictive but broken by design. If any of the files inside your
directory tree have hard links outside of the tree there is no way
short of recursing through all of the subdirectories directories to
tell if a given inode has is in use. Except in the special case
where you are taking a cow copy of the entire filesystem. At which
point a magic mount option is likely a better interface.

Ok that simplifies the long term design a little more. cow
directories cannot work correctly in all cases even when implemented
in the kernel. So the directory walking must still be done in user
space.


Eric

2004-03-28 12:22:53

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> > The addictive thing about the prototype implementation was that you
> > could do ``ln --cow / /some/other/directory'' and you would have an
> > atomic snapshot of your filesystem. Definitely not a feature for the
> > first implementation but certainly something to dream about.
>
> Addictive but broken by design. If any of the files inside your
> directory tree have hard links outside of the tree there is no way
> short of recursing through all of the subdirectories directories to
> tell if a given inode has is in use. Except in the special case
> where you are taking a cow copy of the entire filesystem. At which
> point a magic mount option is likely a better interface.

I don't understand this explanation. Can you explain again? What is
the problem with inodes being in use?

-- Jamie

2004-03-28 20:07:48

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > > The addictive thing about the prototype implementation was that you
> > > could do ``ln --cow / /some/other/directory'' and you would have an
> > > atomic snapshot of your filesystem. Definitely not a feature for the
> > > first implementation but certainly something to dream about.
> >
> > Addictive but broken by design. If any of the files inside your
> > directory tree have hard links outside of the tree there is no way
> > short of recursing through all of the subdirectories directories to
> > tell if a given inode has is in use. Except in the special case
> > where you are taking a cow copy of the entire filesystem. At which
> > point a magic mount option is likely a better interface.
>
> I don't understand this explanation. Can you explain again? What is
> the problem with inodes being in use?

A COW on a directory implies a COW on everything in it. So the implication
is an atomic snapshot of that directory and everything below it.

Assuming no files are open and in use. The implementation would
create the cow link on the directory just like it would on a file.
Then when you open/modify a directory or file the cow copies would be
taken up to the point of the original cow directory so you have a
separate directory structure.

All of which works great until you have a file that has one hard link
in your cow directory structure and another hard link outside of
any cow. An application can come in and modify the file through that
second cow link causing problems.

So the problem is not really with open files at the time of the cow
although there is a variation of it there as well. At the time of the
cow it is possible to look through the dcache and find all of the
files that are in the cow directory or a subdirectory of the it. Then
you can make those cow files. But again you run into the problem of
an application using a file through a link that is not a subdirectory
of the cow directory.

So in the presence of hard links doing cow on a directory other than
the root directory can not be implemented correctly short of doing a
complete recursive directory copy at the time you create the cow. At
which point you might as well just copy the directories in user space.
At least the race conditions are easily apparent.

Eric

2004-03-28 23:55:41

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> A COW on a directory implies a COW on everything in it. So the implication
> is an atomic snapshot of that directory and everything below it.
>
> Assuming no files are open and in use. The implementation would
> create the cow link on the directory just like it would on a file.
> Then when you open/modify a directory or file the cow copies would be
> taken up to the point of the original cow directory so you have a
> separate directory structure.
>
> All of which works great until you have a file that has one hard link
> in your cow directory structure and another hard link outside of
> any cow. An application can come in and modify the file through that
> second cow link causing problems.

I don't see that problem (although I see another, see below). The
application will modify only one instance of the file, and it's the
correct instance. If the application writes through the link outside
both trees, or the link inside the original tree, it will only affect
the tree that was cowlinked _from_, which is correct. If the
application writes to the name inside the snapshot tree, it will only
affect that tree, which is also correct.

You cowlinked a directory. That converts the original directory inode
to a cowlink, creates another cowlink, and creates a shared inode
which now contains the directory.

Then you modify the directory or anything below it. That duplicates
the directory, breaking the directory cowlinks and duplicating the
shared directory inode -- so that the two directory cowlink inodes
become normal directory inodes. The directory duplication results in
two directory which are full of cowlinks -- every object in the
original directory is cowlinked by this operation.

A file which was originally hard-linked inside the tree and also
outside both trees retains the correct hard-link identity: the hard
link is simply two directory entries referring to the same inode,
which at all times is the inode visible inside the original tree and
not visible inside the snapshot, cowlinked tree. That inode changes
its underlying representation from file-inode to cowlink-inode
(pointing to a shared file-inode) and back again during these
operations. However, the hard link identity remains correct at all
times. Writing to a file won't ever modify the wrong file.

There is a different problem, though: cowlinking whole trees like that
doesn't preserve hard-linkage _within_ the tree being copied. Each
time a directory is lazily duplicated, every entry in it is cowlinked,
and if two or more entries are hard linked to the same inode, the
cowlinked copies won't share an inode. It's as if you did "cp -p"
instead of "cp -pd". This can be solved easily within a single
directory, but when there are hard linked within the tree from
different directories, it's too hard to solve with lazy directory
duplication, without keeping a lot of extra metadata. (That's the
same metadata that "cp --preserve=links" or "rsync -H" has to keep
track of when copying a whole tree: on my home system, there's so much
of it due to hard links that rsync can't copy my home directory).

So I don't see the problem you describe, where an application could
accidentally modify the wrong file. (Perhaps my imagined mechanism
for cowlinking directories is different from yours). In particular, I
see no problem at all with hard links outside the cowlinked trees:
they would be fine.

But I see a different problem: the equivalent of something
semantically equivalent to "cp -pr" is fine and fast, but "cp -dpr"
(aka. "cp -a") must, unless it's quite complicated with filesystem
metadata, duplicate the whole directories immediately rather than
lazily, or at least scan them looking for hard links.

-- Jamie

2004-03-29 01:32:13

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > All of which works great until you have a file that has one hard link
> > in your cow directory structure and another hard link outside of
> > any cow. An application can come in and modify the file through that
> > second cow link causing problems.
>
> I don't see that problem (although I see another, see below). The
> application will modify only one instance of the file, and it's the
> correct instance. If the application writes through the link outside
> both trees, or the link inside the original tree, it will only affect
> the tree that was cowlinked _from_, which is correct. If the
> application writes to the name inside the snapshot tree, it will only
> affect that tree, which is also correct.

What I see is a race. An application may write through the link outside
both trees before any of the links is marked cow. With the result
that you don't have a snapshot of your data.

> You cowlinked a directory. That converts the original directory inode
> to a cowlink, creates another cowlink, and creates a shared inode
> which now contains the directory.
>
> Then you modify the directory or anything below it. That duplicates
> the directory, breaking the directory cowlinks and duplicating the
> shared directory inode -- so that the two directory cowlink inodes
> become normal directory inodes. The directory duplication results in
> two directory which are full of cowlinks -- every object in the
> original directory is cowlinked by this operation.
>
> A file which was originally hard-linked inside the tree and also
> outside both trees retains the correct hard-link identity: the hard
> link is simply two directory entries referring to the same inode,
> which at all times is the inode visible inside the original tree and
> not visible inside the snapshot, cowlinked tree. That inode changes
> its underlying representation from file-inode to cowlink-inode
> (pointing to a shared file-inode) and back again during these
> operations. However, the hard link identity remains correct at all
> times. Writing to a file won't ever modify the wrong file.

Correct to a point. And we seem to be imagining the same operations.
However while you will always modify the correct file, as the metadata
is correct. There is no guarantee that the data will be correct. The
file will become a cow file only after it is modified or it's
containing directory is modified. Thus you can have data in the file
that was written after the snapshot operation finished, but before the
individual file itself is marked cow.

> There is a different problem, though: cowlinking whole trees like that
> doesn't preserve hard-linkage _within_ the tree being copied.

> I see a different problem: the equivalent of something
> semantically equivalent to "cp -pr" is fine and fast, but "cp -dpr"
> (aka. "cp -a") must, unless it's quite complicated with filesystem
> metadata, duplicate the whole directories immediately rather than
> lazily, or at least scan them looking for hard links.

Now that you bring it out I see that problem as well. I have seen it
in other proposed implementations as well. Keeping hard links linked
requires for some amount of context to be maintained for the entire
copy operation. If necessary you could keep that context where it is
available to the lazy copy but it is far from trivial.

In short lazy copying for creating snapshots is dangerous. The
data you are copying may be modified before you are done. It is
difficult to maintain state across the entire copy.

All of which sounds like a job for user space to me.

Eric

2004-03-29 08:00:52

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Monday 29 March 2004 01:55, Jamie Lokier wrote:
> Eric W. Biederman wrote:
> > A COW on a directory implies a COW on everything in it. So the
> > implication is an atomic snapshot of that directory and everything below
> > it.
> >
> > Assuming no files are open and in use. The implementation would
> > create the cow link on the directory just like it would on a file.
> > Then when you open/modify a directory or file the cow copies would be
> > taken up to the point of the original cow directory so you have a
> > separate directory structure.
> >
> > All of which works great until you have a file that has one hard link
> > in your cow directory structure and another hard link outside of
> > any cow. An application can come in and modify the file through that
> > second cow link causing problems.
>
> I don't see that problem (although I see another, see below). The
> application will modify only one instance of the file, and it's the
> correct instance. If the application writes through the link outside
> both trees, or the link inside the original tree, it will only affect
> the tree that was cowlinked _from_, which is correct. If the
> application writes to the name inside the snapshot tree, it will only
> affect that tree, which is also correct.
>
> You cowlinked a directory. That converts the original directory inode
> to a cowlink, creates another cowlink, and creates a shared inode
> which now contains the directory.
>
> Then you modify the directory or anything below it. That duplicates
> the directory, breaking the directory cowlinks and duplicating the
> shared directory inode -- so that the two directory cowlink inodes
> become normal directory inodes. The directory duplication results in
> two directory which are full of cowlinks -- every object in the
> original directory is cowlinked by this operation.
>
> A file which was originally hard-linked inside the tree and also
> outside both trees retains the correct hard-link identity: the hard
> link is simply two directory entries referring to the same inode,
> which at all times is the inode visible inside the original tree and
> not visible inside the snapshot, cowlinked tree. That inode changes
> its underlying representation from file-inode to cowlink-inode
> (pointing to a shared file-inode) and back again during these
> operations. However, the hard link identity remains correct at all
> times. Writing to a file won't ever modify the wrong file.
>
> There is a different problem, though: cowlinking whole trees like that
> doesn't preserve hard-linkage _within_ the tree being copied. Each
> time a directory is lazily duplicated, every entry in it is cowlinked,
> and if two or more entries are hard linked to the same inode, the
> cowlinked copies won't share an inode. It's as if you did "cp -p"
> instead of "cp -pd". This can be solved easily within a single
> directory, but when there are hard linked within the tree from
> different directories, it's too hard to solve with lazy directory
> duplication, without keeping a lot of extra metadata. (That's the
> same metadata that "cp --preserve=links" or "rsync -H" has to keep
> track of when copying a whole tree: on my home system, there's so much
> of it due to hard links that rsync can't copy my home directory).

Personally I'll dump hardlinks functionality entirely if I'll get
working cowlinks instead. Most of the time, hardlinks used to save space
and/or provide alternate names to programs/whatever,
but then you have to be damn careful to remember what is linked where.

Softlinks can be used insteal, and actually they are better for
alternate naming purpose, with them I easily see that it is
a link and where it points.

But cowlinks are *ultimate* space saving stuff because they never
make you fail and update wrong file, as it happens with hardlinks.
It just all works the Right Way.
--
vda

2004-03-29 12:39:30

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> The file will become a cow file only after it is modified or it's
> containing directory is modified.

Eh? The file (or directory) must be labelled as a cowlinked file the
moment you make the cowlink, not when the data is modified. It's
_breaking_ the cowlink that happens when the data (or directory
contents) are modified.

> Thus you can have data in the
> file that was written after the snapshot operation finished, but
> before the individual file itself is marked cow.

The creation of a cowlink should be atomic w.r.t. writing.

Specifically, the operation which moves the contents of a non-cowlink
inode to a newly created shared inode, and converts the original
non-cowlink inode to a cowlink inode, should be atomic.

Is there an unavoidable race condition? I don't see one.

-- Jamie

2004-03-29 12:43:03

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> Actually, there's 4th strategy, too. You could implement sharing at block level.
> Block free bitmap would become bigger, but you could do some tricks to keep it
> at ~8 bits per block...

For sharing source trees and such, and even for COW overlays of
/usr/lib and /usr/bin, that would have no benefit: you never write to
just part of a source or object file.

For databases (including things like the RPM database), and snapshots
of filesystems, it would be more useful.

-- Jamie

2004-03-29 13:00:42

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > There are two possible implementations strategies for implementing
> > cow files. You can either start as Jrn did with hardlinks, or you
> > can start with symlinks.
...
> There's a third implementation strategy. Since we're talking in all
> cases about adding a new feature to the underlying filesystem, why not
> implement separate inodes pointing to an underlying shared inode which
> holds the data. (I think it was mentioned earlier in this thread).

Actually, there's 4th strategy, too. You could implement sharing at block level.
Block free bitmap would become bigger, but you could do some tricks to keep it
at ~8 bits per block...
--
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms

2004-03-29 17:16:29

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > What happens if the disk fills while you are making the copy? Will
> > open(2) on an *existing file* then return ENOSPC?
>
> Correct. It would be possible to always succeed and return -ENOSPC
> on every write(). But then mmap() has the same problem again, so
> serious headache would be the only gain from this little excercise.

> > I do not think you can implement this without changing the interface
> > to open(2). Which means applications have to be made aware of it
> > anyway. Which means you might as well leave your implementation as-is
> > and let userspace worry about creating the copy (and dealing with the
> > resulting errors).
>
> Good point. Vote is now 2:0 for the simple approach.

Well, 99% need no special handling on ENOSPC during open just
now. However, implementing file copying to each one would be serious
headache.

Applications can not be sure that it is existing file. If you
do stat followed by open, someone may have removed the file in
between. So it is not so new case.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-03-29 19:37:22

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > The file will become a cow file only after it is modified or it's
> > containing directory is modified.
>
> Eh? The file (or directory) must be labelled as a cowlinked file the
> moment you make the cowlink, not when the data is modified. It's
> _breaking_ the cowlink that happens when the data (or directory
> contents) are modified.

We have the mechanism of:
> You cowlinked a directory. That converts the original directory inode
> to a cowlink, creates another cowlink, and creates a shared inode
> which now contains the directory.
>
> Then you modify the directory or anything below it. That duplicates
> the directory, breaking the directory cowlinks and duplicating the
> shared directory inode -- so that the two directory cowlink inodes
> become normal directory inodes. The directory duplication results in
> two directory which are full of cowlinks -- every object in the
> original directory is cowlinked by this operation.

This mechanism does not label all files below a directory as cowlinked
the moment the cowlink.

Therefore cowlink on a directory as described may be an interesting
operation but it is not an atomic snapshot of the directory and
it's contents.

> > Thus you can have data in the
> > file that was written after the snapshot operation finished, but
> > before the individual file itself is marked cow.
>
> The creation of a cowlink should be atomic w.r.t. writing.
>
> Specifically, the operation which moves the contents of a non-cowlink
> inode to a newly created shared inode, and converts the original
> non-cowlink inode to a cowlink inode, should be atomic.
>
> Is there an unavoidable race condition? I don't see one.

Only if the following apply.

- A correct cow on a directory is considered to be an atomic snapshot
of both the directory and everything below it.
- You break the cow on the directories on demand, as above.

Scenario. The directory tree looks like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2)/link2 (inode 10)

Then a cow link is performed:

ln --cow dir2 cow1

Resulting in a directory tree that looks like:
/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2) -> (inode 3)/link2 (inode 10)
/cow1 (inode 4) -> (inode 3)/link2 (inode 10)


Then we have several things that could happen.
Scenario A: fd = open(/cow1/link2); write(fd, ....);
Scenario B: fd = open(/dir2/link2); write(fd, ....);
Scenario C: fd = open(/dir1/link1); write(fd, ....);

**
In Scenario A the open breaks the cow on the directory, so that
the cow copy can have it's own inode as:

/dir1 (inode 1)/link1 (inode 10) -> (inode 11)
/dir2 (inode 2)/link2 (inode 10) -> (inode 11)
/cow1 (inode 3)/link2 (inode 12) -> (inode 11)

Then the write occurs the cow is broken and the tree looks like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2)/link2 (inode 10) -> (inode 11)
/cow1 (inode 3)/link2 (inode 12)

**
In Scenario B the open breaks the cow on the directory, so that
the cow copy can potentially have it's own inode as:

/dir1 (inode 1)/link1 (inode 10) -> (inode 11)
/dir2 (inode 2)/link2 (inode 10) -> (inode 11)
/cow1 (inode 3)/link2 (inode 12) -> (inode 11)

Then the write occurs the cow is broken and the tree looks like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2)/link2 (inode 10)
/cow1 (inode 3)/link2 (inode 12) -> (inode 11)


**
In Scenario C their is no open for the cow to break and
things proceed normally the directory tree remains looking like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2) -> (inode 3)/link2 (inode 10)
/cow1 (inode 3) -> (inode 3)/link2 (inode 10)

Then when the write occurs there is not cow to break, and the
tree remains looking like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2) -> (inode 3)/link2 (inode 10)
/cow1 (inode 3) -> (inode 3)/link2 (inode 10)

So I see a problem with Scenario C. Perhaps you can refute it.

Eric

2004-03-29 21:05:35

by Patrick J. LoPresti

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek <[email protected]> writes:

> > > What happens if the disk fills while you are making the copy? Will
> > > open(2) on an *existing file* then return ENOSPC?
>
> Applications can not be sure that it is existing file. If you
> do stat followed by open, someone may have removed the file in
> between. So it is not so new case.

I should have said, "Will open(2) without O_CREAT then return ENOSPC?"

This is definitely a new case.

For what it's worth, I agree with whoever (Jamie?) said that COW
should be primarily a space optimization, and that semantically the
two files should mostly behave like separate copies.

In fact, I think it is unfortunate, in some ways, that things like
permissions and timestamps are kept in the inode. This means that two
files may only be COW-linked if they also share ownership,
permissions, and timestamps, which makes COW links less useful for
some applications (e.g., sharing source trees among multiple
developers).

But sharing data blocks without sharing inodes is too horrible even to
contemplate, I suppose.

- Pat

2004-03-29 23:05:58

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> So I see a problem with Scenario C. Perhaps you can refute it.

Ick. You're right. I cannot refute it.

Fwiw, I would have broken the directory cows on the first write, not
the open.

Thus, snapshots using lazy directory copies requires either that there
are no hard links of the type you described (e.g. when snapshotting
the root of the filesystem), or rather complex metadata to track the
hard links, not dissimilar to the metadata needed to preserve hard
links _within_ the snapshot. They both seem far too complex to be worth it.

Hard links just don't play well with lazy cowlinked directories.

They are fine with non-lazy directory cowlinking, where the whole
directory tree is duplicated and only files are cow'd. Note that this
doesn't apply to the original implementation which used hard links
with a special flag: maintaining hard links in conjunction with
cowlinks requires the inode duplication we've been talking about.

Btw, if we have cowlinks implemented using inode duplication, then it
isn't necessary for both inodes to have the same metadata such as
mtime, ctime, mode etc. Only the data is shared. That means the
sendfile() system call could conceivably be overloaded, meaning to
copy the data, and let "cp --cow" decide whether it wants to copy the
metadata (same as as "cp -rp" or "cp -rpd"), or not (same as "cp -r").

-- Jamie

2004-03-29 23:16:51

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > > > What happens if the disk fills while you are making the copy? Will
> > > > open(2) on an *existing file* then return ENOSPC?
> >
> > Applications can not be sure that it is existing file. If you
> > do stat followed by open, someone may have removed the file in
> > between. So it is not so new case.
>
> I should have said, "Will open(2) without O_CREAT then return ENOSPC?"
>
> This is definitely a new case.
>
> For what it's worth, I agree with whoever (Jamie?) said that COW
> should be primarily a space optimization, and that semantically the
> two files should mostly behave like separate copies.
>
> In fact, I think it is unfortunate, in some ways, that things like
> permissions and timestamps are kept in the inode. This means that two
> files may only be COW-linked if they also share ownership,
> permissions, and timestamps, which makes COW links less useful for
> some applications (e.g., sharing source trees among multiple
> developers).

I think they *should* have separate permissions.

Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers normally
are should be enough (thinking ext2 here).

> But sharing data blocks without sharing inodes is too horrible even to
> contemplate, I suppose.

Why, btw?

Lets say we allocate 4 bits instead of one for block bitmap. Count
"15" is special, now it means "15 or higher". That means we have to
"garbage-collect" to free space that used to have more than 15 links,
but that should not happen too often...
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]


2004-03-29 23:59:06

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > So I see a problem with Scenario C. Perhaps you can refute it.
>
> Ick. You're right. I cannot refute it.

The upside is since we can't it makes the implementation much easier :)

> Fwiw, I would have broken the directory cows on the first write, not
> the open.

I did it only so that programs do not see inode numbers change under
them. For the copy that gets the original inode numbers delaying
until write is fine, for the copy with the new inode numbers to avoid
surprises you need to break the cow on the directory and move it to
the files on readdir, stat, and open.

> Thus, snapshots using lazy directory copies requires either that there
> are no hard links of the type you described (e.g. when snapshotting
> the root of the filesystem), or rather complex metadata to track the
> hard links, not dissimilar to the metadata needed to preserve hard
> links _within_ the snapshot. They both seem far too complex to be worth it.

Agreed.

> Hard links just don't play well with lazy cowlinked directories.
>
> They are fine with non-lazy directory cowlinking, where the whole
> directory tree is duplicated and only files are cow'd. Note that this
> doesn't apply to the original implementation which used hard links
> with a special flag: maintaining hard links in conjunction with
> cowlinks requires the inode duplication we've been talking about.

? The problem is lazy propagation of the cow flag. The implementation
for ordinary files does not matter. Only the implementation
for directories matters.

> Btw, if we have cowlinks implemented using inode duplication, then it
> isn't necessary for both inodes to have the same metadata such as
> mtime, ctime, mode etc. Only the data is shared. That means the
> sendfile() system call could conceivably be overloaded, meaning to
> copy the data, and let "cp --cow" decide whether it wants to copy the
> metadata (same as as "cp -rp" or "cp -rpd"), or not (same as "cp -r").

Sendfile feels about right but it has a few issues that complication
something like this. It works on file descriptors, and it takes
a length parameter.

There is a lot of room for things to go wrong when implementing
cowlink(const char *oldname, const char *newname) in user space.

Since the semantics are a delayed sendfile in other ways sendfile
feels like a good fit.

Eric

2004-03-31 14:34:31

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> Also it should be possible to have file with 2 hardlinks cowlinked
> somewhere, and possibly make more hardlinks of that one... Having
> pointer to another inode in place where direct block pointers normally
> are should be enough (thinking ext2 here).

Yes.

> > But sharing data blocks without sharing inodes is too horrible even to
> > contemplate, I suppose.
>
> Why, btw?
>
> Lets say we allocate 4 bits instead of one for block bitmap. Count
> "15" is special, now it means "15 or higher". That means we have to
> "garbage-collect" to free space that used to have more than 15 links,
> but that should not happen too often...

The garbage collection is what's horrible about it :)
Btw, 15 would be exceeded easily in my home directory.

IMHO, an inode whose block pointers points to another, so that whole
files only can be shared, would be fine.

Only one level of indirection would be allowed, so there'd be no
loops, just reference counted shared inodes.

-- Jamie

ps. Sharing individual data blocks is rather complicated. If it were
a very desirable feature, I'd be inclined to go for reference counted
shared extents or reference counted shared indirection blocks
(i.e. sharing at the level or 4Mb or whatever the first indirection
block size is).

2004-03-31 14:45:52

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > Also it should be possible to have file with 2 hardlinks cowlinked
> > somewhere, and possibly make more hardlinks of that one... Having
> > pointer to another inode in place where direct block pointers normally
> > are should be enough (thinking ext2 here).
>
> Yes.
>
> > > But sharing data blocks without sharing inodes is too horrible even to
> > > contemplate, I suppose.
> >
> > Why, btw?
> >
> > Lets say we allocate 4 bits instead of one for block bitmap. Count
> > "15" is special, now it means "15 or higher". That means we have to
> > "garbage-collect" to free space that used to have more than 15 links,
> > but that should not happen too often...
>
> The garbage collection is what's horrible about it :)
> Btw, 15 would be exceeded easily in my home directory.

Well, but chances are that you'll never unlink such files... Leaving
garbage collection to fsck would make it rather easy.

> IMHO, an inode whose block pointers points to another, so that whole
> files only can be shared, would be fine.

Yes, this is probably way better way to do that.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-03-31 15:20:19

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> > The garbage collection is what's horrible about it :)
> > Btw, 15 would be exceeded easily in my home directory.
>
> Well, but chances are that you'll never unlink such files... Leaving
> garbage collection to fsck would make it rather easy.

No, I'd unlink them often. Every time I clone a source tree, make
some changes, compile, maybe test :), then delete the tree.

I like to fsck maybe once every 6 months.

-- Jamie

2004-04-01 14:54:26

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Thu, 25 March 2004 15:16:48 -0700, Eric W. Biederman wrote:
> Linus Torvalds <[email protected]> writes:
>
> > On Thu, 25 Mar 2004, Jamie Lokier wrote:
> > >
> > > That is not useful for me or the other people who want to use this to
> > > duplicate large source trees and run "diff" between trees.
> > >
> > > "diff" depends on being able to check if files in the two trees are
> > > identical -- by checking whether the inode number and device (and
> > > maybe other stat data) are identical. This allows "diff -ur" between
> > > two cloned trees the size of linux to be quite fast. Without that
> > > optimisation, it's very slow indeed.
> >
> > I think the correct thing to do is to just admit that cowlinks aren't
> > POSIX, and instead see the inode number as a way to see whether the link
> > has been broken or not. Ie just accept the inode number potentially
> > changing.
> >
> > That would make "diff" (adn most other uses) ok with this, and anythign
> > that isn't, just couldn't be used with cowlinked files.
>
> Really?
>
> tar and cp still need to be taught about them. And if they are not taught
> they will do the wrong thing and hard link the files removing the
> copy on write semantics. Which would do ugly thing when you restore
> from your backup.
>
> I don't have a problem with the inode number changing when you write to
> a file, because I can't think of much that would care either way.
> But having the inode number of an open file change sounds like a very
> difficult problem to track.
>
> Maybe aiming cow links at things like a live cd filesystem is too
> ambitious, but it sounds like a minimal clean way to handle all of
> the dependencies on writeable files that show up.

Either method could work with some restrictions. Either way, some
userspace tools need to be taught about the changes. Personally I
don't care much either way, but the simple trick is already working on
my notebook and the problems are not too bad (not too good either).

BTW: sendfile() for ext[23] is finished, so the actual copy can be
done inside the kernel now. Patch will follow soon. (-EBUSY)

J?rn

--
To announce that there must be no criticism of the President, or that we
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918

2004-04-02 11:44:33

by Tim Connors

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek <[email protected]> said on Wed, 31 Mar 2004 16:45:37 +0200:
> Hi!
> > The garbage collection is what's horrible about it :)
> > Btw, 15 would be exceeded easily in my home directory.
>
> Well, but chances are that you'll never unlink such files... Leaving
> garbage collection to fsck would make it rather easy.

How often do you fsck? Sure, I fscked the other day, but that's
because some idiot unplugged my laptop.

I sure as hell delete copied trees more often than once every 6 months
though :)

--
TimC -- http://astronomy.swin.edu.au/staff/tconnors/
I'm sorry. The number you have reached is imaginary. Please rotate your
phone 90 degrees and try again.

2004-04-02 11:54:14

by Tim Connors

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

[email protected] (Eric W. Biederman) said on 25 Mar 2004 15:16:48 -0700:
> Linus Torvalds <[email protected]> writes:
>
> > On Thu, 25 Mar 2004, Jamie Lokier wrote:
> > >
> > > That is not useful for me or the other people who want to use this to
> > > duplicate large source trees and run "diff" between trees.
> > >
> > > "diff" depends on being able to check if files in the two trees are
> > > identical -- by checking whether the inode number and device (and
> > > maybe other stat data) are identical. This allows "diff -ur" between
> > > two cloned trees the size of linux to be quite fast. Without that
> > > optimisation, it's very slow indeed.
> >
> > I think the correct thing to do is to just admit that cowlinks aren't
> > POSIX, and instead see the inode number as a way to see whether the link
> > has been broken or not. Ie just accept the inode number potentially
> > changing.
> >
> > That would make "diff" (adn most other uses) ok with this, and anythign
> > that isn't, just couldn't be used with cowlinked files.
>
> Really?
>
> tar and cp still need to be taught about them. And if they are not taught
> they will do the wrong thing and hard link the files removing the
> copy on write semantics. Which would do ugly thing when you restore
> from your backup.
>
> I don't have a problem with the inode number changing when you write to
> a file, because I can't think of much that would care either way.
> But having the inode number of an open file change sounds like a very
> difficult problem to track.

If the inode doesn't change upon a write, the system is still POSIX,
so nothing breaks (the problem is, cowlinks would be very useful, and
POSIX is very useful - making them incompatible would be a major
bummer)

If you use sendfile() or cowcp() or similar to implement cowlinks,
then only rsync/tar/cp need be taught about its usage. Someone
mentioned that diff will not be optimal with different inodes, because
diff compared inodes. So add some diffstat() that keeps track of some
magical inode like thingy that stays the same for cowed files. Then
teach diff to use diffstat().

Now, you have a system where if your program doesn't explicitly use
cowcp() and diffstat(), everything still works perfectly (we want to
still be able to use old coreutils on a machine that has a new kernel
applied, right? And vice-versa?). All that happens is that your
program may or may not be sub-optimal - you might have to compare
entire files instead of just inodes. Or you may have to copy entire
files instead of just cowing them. But it all still Just Works.

My AU$0.02.

--
TimC -- http://astronomy.swin.edu.au/staff/tconnors/
I bet the human brain is a kludge.
-- Marvin Minsky

2004-04-02 16:54:57

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Tue, 30 March 2004 01:16:35 +0200, Pavel Machek wrote:
>
> I think they *should* have separate permissions.

That makes the count 2:2. I'll continue to follow the simple solution
for some time, but wouldn't like to have it included for now (or ever?)

> Also it should be possible to have file with 2 hardlinks cowlinked
> somewhere, and possibly make more hardlinks of that one... Having
> pointer to another inode in place where direct block pointers normally
> are should be enough (thinking ext2 here).

All right, you are proposing hell. I've tried to think through all
possibilities and was too scared to continue. So limitation is that
cowlinks and hardlinks are mutually exclusive, which eliminated all
problems.

If you really want cowlinks and hardlinks to be intermixed freely, I'd
happily agree with you as soon as you can define the behaviour for all
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.

J?rn

--
A quarrel is quickly settled when deserted by one party; there is
no battle unless there be two.
-- Seneca

2004-04-02 18:01:51

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > Also it should be possible to have file with 2 hardlinks cowlinked
> > somewhere, and possibly make more hardlinks of that one... Having
> > pointer to another inode in place where direct block pointers normally
> > are should be enough (thinking ext2 here).
>
> All right, you are proposing hell. I've tried to think through all
> possibilities and was too scared to continue. So limitation is that
> cowlinks and hardlinks are mutually exclusive, which eliminated all
> problems.

I do not think I'm proposing hell.

> If you really want cowlinks and hardlinks to be intermixed freely, I'd
> happily agree with you as soon as you can define the behaviour for all
> possible cases in a simple document and none of them make me scared
> again. Show me that it is possible and makes sense.

Okay:

User/kernel API modifications for cowlinks

open(..., O_RDWR) may return ENOSPACE

new syscall is introduced, copyfile(int fd_from, int fd_to). fd_to
must be empty, or it returns -EINVAL. copyfile() copies content of
file from one file to another. It may return success even through
there's not enough space on filesystem to actually do the copy. It is
also pretty fast.

another syscall is introduced for diff and friends, long long
get_data_id(int fd). It may only be used on non-zero-length regular
files. if get_data_id(fd1) == get_data_id(fd2), it means that files
fd1 and fd2 contain same data and you do not need to read them to
check it.

df might be overly optimistic

Implications

In my proposlal, diff will not automagically sense files contain same
stuff using (dev, ino); if you want speedups, you'll have to teach
diff to call get_data_id.

Users do not really know cowlinks exist. Disk space behaves funny, and
copyfile is somehow fast, but otherwise its normal UNIX system.

Trivial implementation does copyfile by real copying (=>slow, takes
lots of space), and returns error on get_data_id. (Or it might return
inode#, but returning error is probably better).

[And yes, I believe this can actually be implemented in usefull
way. Are you scared or should I continue?]

Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-02 18:17:18

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Fri, 2 April 2004 20:01:28 +0200, Pavel Machek wrote:
>
> I do not think I'm proposing hell.

Yet. :)

> > If you really want cowlinks and hardlinks to be intermixed freely, I'd
> > happily agree with you as soon as you can define the behaviour for all
> > possible cases in a simple document and none of them make me scared
> > again. Show me that it is possible and makes sense.
>
> Okay:
>
> User/kernel API modifications for cowlinks
>
> open(..., O_RDWR) may return ENOSPACE
>
> new syscall is introduced, copyfile(int fd_from, int fd_to). fd_to
> must be empty, or it returns -EINVAL. copyfile() copies content of
> file from one file to another. It may return success even through
> there's not enough space on filesystem to actually do the copy. It is
> also pretty fast.
>
> another syscall is introduced for diff and friends, long long
> get_data_id(int fd). It may only be used on non-zero-length regular
> files. if get_data_id(fd1) == get_data_id(fd2), it means that files
> fd1 and fd2 contain same data and you do not need to read them to
> check it.

Sounds good, but you missed the hell part.

What happens, if you copyfile() a file that has two links?
Then you link() the result.
Then you copyfile() one of those two links.
Then you link()...

Now you write to either one of the six files. What happens?

Give me a clean proposal how this is simple, defined and not too
dangerous for the unaware. Then I agree, there is no hell involved.

J?rn

--
When in doubt, use brute force.
-- Ken Thompson

2004-04-02 19:07:36

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > > If you really want cowlinks and hardlinks to be intermixed freely, I'd
> > > happily agree with you as soon as you can define the behaviour for all
> > > possible cases in a simple document and none of them make me scared
> > > again. Show me that it is possible and makes sense.
> >
> > Okay:
> >
> > User/kernel API modifications for cowlinks
> >
> > open(..., O_RDWR) may return ENOSPACE
> >
> > new syscall is introduced, copyfile(int fd_from, int fd_to). fd_to
> > must be empty, or it returns -EINVAL. copyfile() copies content of
> > file from one file to another. It may return success even through
> > there's not enough space on filesystem to actually do the copy. It is
> > also pretty fast.
> >
> > another syscall is introduced for diff and friends, long long
> > get_data_id(int fd). It may only be used on non-zero-length regular
> > files. if get_data_id(fd1) == get_data_id(fd2), it means that files
> > fd1 and fd2 contain same data and you do not need to read them to
> > check it.
>
> Sounds good, but you missed the hell part.

:-).

> Now you write to either one of the six files. What happens?
>
> Give me a clean proposal how this is simple, defined and not too
> dangerous for the unaware. Then I agree, there is no hell involved.

Okay, now I have to start talking about implementation. Assume ext2 as
a base. Theres new object "cowid" which contains, well, id for
get_data_id() and usage count. Each inode either has pointer to
"cowid" object, or it is plain old regular file.

INODE123 Usage count = 2.

> What happens, if you copyfile() a file that has two links?

copyfile results in:

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 2
INODE124 Usage count = 1, pointer to cowid 567

> Then you link() the result.

No, problem just increase usage count on inode124:

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 2
INODE124 Usage count = 2, pointer to cowid 567

> Then you copyfile() one of those two links.

Creates another inode pointing at same old cowid:

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 3
INODE124 Usage count = 2, pointer to cowid 567
INODE125 Usage count = 1, pointer to cowid 567

> Then you link()...

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 3
INODE124 Usage count = 2, pointer to cowid 567
INODE125 Usage count = 2, pointer to cowid 567

Now, if I write to any inode with has cowid, data have to be copied,
and pointer to cowid deleted from that inode .

Pavel

--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-02 19:35:44

by Ross Biro

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Fri, 2 Apr 2004 20:23:58 +0200, Pavel Machek <[email protected]> wrote:
> > > > If you really want cowlinks and hardlinks to be intermixed freely, I'd
> > > > happily agree with you as soon as you can define the behaviour for all
> > > > possible cases in a simple document and none of them make me scared
> > > > again. Show me that it is possible and makes sense.

Maybe it's easiest to view the proposed copyfile() as being
semantically equivalent to cp from the point of view of anything above
the actual file system (modulo running out of space at weird times)

Then all the questions are easy to answer, and it would also be
possible to implement copyfile at the VFS layer as cp for file systems
that don't support it.

Of course, it gets more interesting if you try to do it at the block
level instead of at the file level. For ext2, you could just reserve
a block #, say -1, to mean take the data from the master cow file, and
anything else is treated normally. You would need a deamon to make
sure you were still saving space though.

2004-04-02 20:09:38

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> Okay, now I have to start talking about implementation. Assume ext2 as
> a base. Theres new object "cowid" which contains, well, id for
> get_data_id() and usage count. Each inode either has pointer to
> "cowid" object, or it is plain old regular file.

Pavel has it exactly right.

A simple way to store COWID objects in the filesystem itself is as
another ordinary inode. The attributes of that inode (mtime, mode
etc.) aren't important (except to fsck), only the size and data
pointers are important. The files which point to a COWID need a flag
to indicate that, too.

Someone said that a problem with using sendfile() to create cowlinks
is that sendfile() takes a length parameter. It's a size_t, which
isn't large enough to copy Large files.

Actually that isn't a problem. The COWID inodes contain the real
length of the shared data. It would be possible for the individual
cowlink inodes to have a smaller length, indicating that they don't
have all the data. Then a cp implementation which calls sendfile()
repeatedly could copy large files and still share the data. Provided
the offset and length are compatible with sharing, the first
sendfile() would create the COWID (if it doesn't exist already), and
subsequent ones would simply enlarge the individual cowlink's length
attribute. It's not the cleanest interface; I only mention it because
sendfile() exists already and could be used.

get_data_id() is one way to detect equivalent files. Another would be
a function files_equal(fd1, fd2) which returns a boolean.
get_data_id() has the advantage that it can report immediately whether
a file has _any_ cowlink peers, which is important for programs that
scan trees. Perhaps getxattr() would be reasonable interface, using a
named attribute "data-id".

-- Jamie

2004-04-02 21:37:34

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

On P? 02-04-04 11:28:55, Ross Biro wrote:
> On Fri, 2 Apr 2004 20:23:58 +0200, Pavel Machek <[email protected]> wrote:
> > > > > If you really want cowlinks and hardlinks to be intermixed freely, I'd
> > > > > happily agree with you as soon as you can define the behaviour for all
> > > > > possible cases in a simple document and none of them make me scared
> > > > > again. Show me that it is possible and makes sense.
>
> Maybe it's easiest to view the proposed copyfile() as being
> semantically equivalent to cp from the point of view of anything above
> the actual file system (modulo running out of space at weird times)

Yes, this is what I was trying to propose.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-02 21:40:53

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > Okay, now I have to start talking about implementation. Assume ext2 as
> > a base. Theres new object "cowid" which contains, well, id for
> > get_data_id() and usage count. Each inode either has pointer to
> > "cowid" object, or it is plain old regular file.
>
> Pavel has it exactly right.
>
> A simple way to store COWID objects in the filesystem itself is as
> another ordinary inode. The attributes of that inode (mtime, mode
> etc.) aren't important (except to fsck), only the size and data
> pointers are important. The files which point to a COWID need a flag
> to indicate that, too.

Actually, my solution has one weirdness...

> a
copyfile a b
rm a

...now b has pointer to cowid with usage count of 1. Which is slightly
ugly (and wastes one cowid entry), but should be harmless.

> get_data_id() is one way to detect equivalent files. Another would be
> a function files_equal(fd1, fd2) which returns a boolean.

files_equal(...) would lead to quadratic number of calls, no?

> get_data_id() has the advantage that it can report immediately whether
> a file has _any_ cowlink peers, which is important for programs that
> scan trees. Perhaps getxattr() would be reasonable interface, using a
> named attribute "data-id".

Yes, get_data_id() is extremely ugly name.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-02 22:01:00

by Chris Friesen

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:

> Actually, my solution has one weirdness...
>
>
>>a
>
> copyfile a b
> rm a
>
> ...now b has pointer to cowid with usage count of 1. Which is slightly
> ugly (and wastes one cowid entry), but should be harmless.

Could you not change it back to a normal inode when refcount becomes 1?
Or if you didn't want to do that always (say if you knew there would
be more references being created soon) you could at least have some kind
of cleanup tool that you could manually run on a filesystem to clean it up?

Chris

2004-04-03 00:46:47

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> Actually, my solution has one weirdness...
>
> > a
> copyfile a b
> rm a
>
> ...now b has pointer to cowid with usage count of 1. Which is slightly
> ugly (and wastes one cowid entry), but should be harmless.

That's necessary, unless the cowid object has a linked list of all the
inodes which point to it, a bit like inodes having a linke list of all
parent directories which point to them. That's not impossible, but
leaving the unnecessary cowid object is much simpler and will result
in less I/O (no doubly-linked list to update). It can be garbage
collected when the last reference is followed to it.

> > get_data_id() is one way to detect equivalent files. Another would be
> > a function files_equal(fd1, fd2) which returns a boolean.
>
> files_equal(...) would lead to quadratic number of calls, no?

Yes. </blush>

-- Jamie

2004-04-03 00:49:57

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Chris Friesen wrote:
> Could you not change it back to a normal inode when refcount becomes 1?

You can only do that if the cowid object has a pointer to the last
remaining reference to it. That's possible, but more complicated and
would incur a little more I/O per cow operation.

> Or if you didn't want to do that always (say if you knew there would
> be more references being created soon) you could at least have some kind
> of cleanup tool that you could manually run on a filesystem to clean it up?

fsck could do it. It's not a big deal though: simply looking up the
inode through the last remaining path can also clean it up. Until
them, it's very little space used: the same as a short symlink.

-- Jamie

2004-04-03 01:04:30

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Here's a tricky situation:

1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.

2. At this point both mappings share the same pages in RAM.

3. Then one of the cowlinks is written to...

-- Jamie

2004-04-03 01:21:39

by Erik Andersen

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat Apr 03, 2004 at 02:04:25AM +0100, Jamie Lokier wrote:
> Here's a tricky situation:
>
> 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
>
> 2. At this point both mappings share the same pages in RAM.
>
> 3. Then one of the cowlinks is written to...

Using mmap with PROT_WRITE on a cowlink must preemptively
break the link.

-Erik

--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--

2004-04-03 01:59:29

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Erik Andersen wrote:
> > Here's a tricky situation:
> >
> > 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
> >
> > 2. At this point both mappings share the same pages in RAM.
> >
> > 3. Then one of the cowlinks is written to...
>
> Using mmap with PROT_WRITE on a cowlink must preemptively
> break the link.

I forget to mention, they are PROT_READ shared mappings.

-- Jamie

2004-04-03 03:55:17

by Ross Biro

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 3 Apr 2004 02:59:20 +0100, Jamie Lokier <[email protected]> wrote:
>
> Erik Andersen wrote:
> > > Here's a tricky situation:
> > >
> > > 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
> > >
> > > 2. At this point both mappings share the same pages in RAM.
> > >
> > > 3. Then one of the cowlinks is written to...
> >
> > Using mmap with PROT_WRITE on a cowlink must preemptively
> > break the link.
>
> I forget to mention, they are PROT_READ shared mappings.

Or just also make the page cow and not break the link until someone
actually writes to it. Really depends on how you view the performance
impact, whether you do cow at the block of inode level, and how much
you want to touch (or copy parts of) the mm.

2004-04-03 08:23:18

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > Could you not change it back to a normal inode when refcount becomes 1?
>
> You can only do that if the cowid object has a pointer to the last
> remaining reference to it. That's possible, but more complicated and
> would incur a little more I/O per cow operation.

You'd have to have pointers to all references to it... because you
can't tell in advance which one will be the last to go away.

But I agree its not a big problem.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-03 09:09:37

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> Erik Andersen wrote:
> > > Here's a tricky situation:
> > >
> > > 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
> > >
> > > 2. At this point both mappings share the same pages in RAM.
> > >
> > > 3. Then one of the cowlinks is written to...
> >
> > Using mmap with PROT_WRITE on a cowlink must preemptively
> > break the link.
>
> I forget to mention, they are PROT_READ shared mappings.

I'm not mm guru, but... with rmap, we should be able to find all the
users of that shared memory, and unmap their pages, right?

Until copy is done, we don't do anything, because write is not allowed
to progress until copy is done. When copy is done we should unmap all
the pages that still point to "old" copy, let write progress, and make
users fault in.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-03 13:15:52

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> > > Could you not change it back to a normal inode when refcount becomes 1?
> >
> > You can only do that if the cowid object has a pointer to the last
> > remaining reference to it. That's possible, but more complicated and
> > would incur a little more I/O per cow operation.
>
> You'd have to have pointers to all references to it... because you
> can't tell in advance which one will be the last to go away.

Exactly. Each of the cow pointers would need to be linked in a doubly
linked list containing them all.

-- Jamie

2004-04-03 13:27:38

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> > > > Here's a tricky situation:
> > > >
> > > > 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
> > > >
> > > > 2. At this point both mappings share the same pages in RAM.
> > > >
> > > > 3. Then one of the cowlinks is written to...
> > >
> > > Using mmap with PROT_WRITE on a cowlink must preemptively
> > > break the link.
> >
> > I forget to mention, they are PROT_READ shared mappings.
>
> I'm not mm guru, but... with rmap, we should be able to find all the
> users of that shared memory, and unmap their pages, right?

Yes. I bring it up only because it's tricky, and the simple cowlink
implementations so far don't deal with it.

A page can only exist in one address_space. So if pages are shared
before the cow is broken, the address_space must be of the shared
cowid object, not an individual address_space per cowlink.
Afterwards, the copied pages are in the non-shared cowlink object's
address_space.

> Until copy is done, we don't do anything, because write is not allowed
> to progress until copy is done. When copy is done we should unmap all
> the pages that still point to "old" copy, let write progress, and make
> users fault in.

I agree.

(Ross suggested using COW pages. While technically possible, that
would be pretty complicated to implemented as it implies pages shared
among more than one address_space, and the facility for write() to
break COW sharing in the page cache, and update page tables when that
happens.)

-- Jamie

2004-04-03 18:40:32

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Here's a tricky situation:
>
> 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
>
> 2. At this point both mappings share the same pages in RAM.

Why they have different inodes?

> 3. Then one of the cowlinks is written to...

I would not worry about sharing page cache entries unless this becomes
a common case. If you want to avoid the hit of rereading the file when
you have a cow copy it should be simple enough to walk through the list
of cow copies and see if anyone else has it open.

Eric

2004-04-03 19:43:51

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> > Here's a tricky situation:
> >
> > 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
> >
> > 2. At this point both mappings share the same pages in RAM.
>
> Why they have different inodes?

Did you miss the last 20 or so messages in this thread?

We'd like cowlinks that are an invisible filesystem optimisation.
That means you "copy" a file and it behaves the same as if you copy a file.

> > 3. Then one of the cowlinks is written to...
>
> I would not worry about sharing page cache entries unless this becomes
> a common case. If you want to avoid the hit of rereading the file when
> you have a cow copy it should be simple enough to walk through the list
> of cow copies and see if anyone else has it open.

It is not a question of performance, it's correctness. Either you
have cowlinks that behave like copied files, or you accept that when
cowlinked files are mmapped and written to, they don't behave like
regular files (not even the original file prior to cowlinking does).

Btw, I'm not suggesting sharing page cache entries.

-- Jamie

2004-04-03 19:48:35

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel <[email protected]> writes:

> On Tue, 30 March 2004 01:16:35 +0200, Pavel Machek wrote:
> >
> > I think they *should* have separate permissions.
>
> That makes the count 2:2. I'll continue to follow the simple solution
> for some time, but wouldn't like to have it included for now (or ever?)
>
> > Also it should be possible to have file with 2 hardlinks cowlinked
> > somewhere, and possibly make more hardlinks of that one... Having
> > pointer to another inode in place where direct block pointers normally
> > are should be enough (thinking ext2 here).
>
> All right, you are proposing hell. I've tried to think through all
> possibilities and was too scared to continue. So limitation is that
> cowlinks and hardlinks are mutually exclusive, which eliminated all
> problems.

Sounds like a simple place to start.

> If you really want cowlinks and hardlinks to be intermixed freely, I'd
> happily agree with you as soon as you can define the behaviour for all
> possible cases in a simple document and none of them make me scared
> again. Show me that it is possible and makes sense.

Concise summary:
- (indirection inodes) solve the hard link problem.
- Don't share the page cache between cow copies.

For intermixing hard links and cow copy operations a single extra layer
of indirection solves the problem. The cow files point to an (indirection)
inode that holds the hard link count, the real inode number and possibly
a few extra things like permissions. Except for never being pointed
at by a directory entry the real inode works like normal. The link
count of the real inode is the number of indirection inodes pointing
at it.

Caching issues are more subtle. The pathological case is a read only
mapping of the cow file, that expects to see the mapping changed by
some other process writing to the file. Not sharing page cache
entries between cow copies solves this problem.

Once page cache entries are distinct it is possible to move the
trigger down from an open with intent to write, to the first
actual write operation. In aops->prepare_write from sys_write,
and aops->writepage from the mmap version.

And a few scenarios to hopefully make things clear.

So your fs starts out as:

/file1 -> ino1 (link count 2) -> data block #1
/file2 -> ino1 (link count 2) -> data block #1

file1 is only 4K long, so I only need to describe one data block.

Actions:

cowcopy(file2, file3):

/file1 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
/file2 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 2) -> data block #1


copyfile(file3, file4):

/file1 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
/file2 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1

unlink(file2):

/file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1

link(file4, file5):

/file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1
/file5 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1

write(file3):

/file1 -> ino1 (link count 1) -> ino2 (link count 2) -> data block #1
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1
/file5 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1

write(file5):

/file1 -> ino1 (link count 1) -> ino2 (link count 1) -> data block #1
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> data block #3
/file5 -> ino4 (link count 2) -> data block #3

write(file1):

/file1 -> ino1 (link count 1) -> data block #1 (with modified contents)
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> data block #3
/file5 -> ino4 (link count 2) -> data block #3


Does this make things clear?

Eric

2004-04-03 20:31:01

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > > Here's a tricky situation:
> > >
> > > 1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
> > >
> > > 2. At this point both mappings share the same pages in RAM.
> >
> > Why they have different inodes?
>
> Did you miss the last 20 or so messages in this thread?
>
> We'd like cowlinks that are an invisible filesystem optimisation.
> That means you "copy" a file and it behaves the same as if you copy a file.

Exactly so they would not share the same pages in RAM.

> > > 3. Then one of the cowlinks is written to...
> >
> > I would not worry about sharing page cache entries unless this becomes
> > a common case. If you want to avoid the hit of rereading the file when
> > you have a cow copy it should be simple enough to walk through the list
> > of cow copies and see if anyone else has it open.
>
> It is not a question of performance, it's correctness. Either you
> have cowlinks that behave like copied files, or you accept that when
> cowlinked files are mmapped and written to, they don't behave like
> regular files (not even the original file prior to cowlinking does).
>
> Btw, I'm not suggesting sharing page cache entries.

It sounded like you assumed sharing of page cache entries above.
How do you get to step 2 if the cow copies don't share the same page
cache entries?

Eric

2004-04-03 22:00:24

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> > We'd like cowlinks that are an invisible filesystem optimisation.
> > That means you "copy" a file and it behaves the same as if you copy a file.
>
> Exactly so they would not share the same pages in RAM.

That is one way to implement it.

> > Btw, I'm not suggesting sharing page cache entries.
>
> It sounded like you assumed sharing of page cache entries above.
> How do you get to step 2 if the cow copies don't share the same page
> cache entries?

Ah. A misunderstanding on my part.

I mean not sharing page cache entries between different
address_spaces, but sharing between different cowlinks which use the
same underlying address_space.

I had in mind that since each cowlink is a separate inode, but both
inodes point to a shared data structure in the filesystem, they would
map pages out of a shared address_space representing that data
structure. You've pointed out that it isn't necessary to do that, and
it's probably simpler not to.

Now I see your point. Page sharing could be avoided completely, if
when mapping a cowlink the page was _copied_ from the shared
address_space to the cowlink's own address_space. Copying also solves
the mlock() problem. (A shared address_space is still required, because
you may cowlink a file which has dirty pages in RAM).

Copying raises a different problem: what to do when a non-cowlink file
is mapped (PROT_READ), and then it's cowlinked while the mapping is in
place. The non-cowlink inode gets converted to a cowlink inode. The
pages are hashed in the original address_space, and you now have a
mapping of a cowlink file where the mapped pages are _not copies_ of
pages in the shared address_space.

-- Jamie

2004-04-04 08:16:31

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Jamie Lokier <[email protected]> writes:

> Eric W. Biederman wrote:
> > > We'd like cowlinks that are an invisible filesystem optimisation.
> > > That means you "copy" a file and it behaves the same as if you copy a file.
> >
> > Exactly so they would not share the same pages in RAM.
>
> That is one way to implement it.
>
> > > Btw, I'm not suggesting sharing page cache entries.
> >
> > It sounded like you assumed sharing of page cache entries above.
> > How do you get to step 2 if the cow copies don't share the same page
> > cache entries?
>
> Ah. A misunderstanding on my part.
>
> I mean not sharing page cache entries between different
> address_spaces, but sharing between different cowlinks which use the
> same underlying address_space.
>
> I had in mind that since each cowlink is a separate inode, but both
> inodes point to a shared data structure in the filesystem, they would
> map pages out of a shared address_space representing that data
> structure. You've pointed out that it isn't necessary to do that, and
> it's probably simpler not to.

Especially since the VM layer has no concept of COW except for private
anonymous pages. Which does not map to a POSIX cow on files.

> Now I see your point. Page sharing could be avoided completely, if
> when mapping a cowlink the page was _copied_ from the shared
> address_space to the cowlink's own address_space. Copying also solves
> the mlock() problem. (A shared address_space is still required, because
> you may cowlink a file which has dirty pages in RAM).

Either that your you call fsync(file) as part of generating the cow
copy.

> Copying raises a different problem: what to do when a non-cowlink file
> is mapped (PROT_READ), and then it's cowlinked while the mapping is in
> place. The non-cowlink inode gets converted to a cowlink inode. The
> pages are hashed in the original address_space, and you now have a
> mapping of a cowlink file where the mapped pages are _not copies_ of
> pages in the shared address_space.

If you don't flush things to disk first there are certainly some sharing
issues. Flushing the data to the address_space of the shared inode
should work just as well though.

What you must have is a clear state change from caching a normal file
to cache a cow file. Where certain parts of the behavior changes.

What this needs to do depends on the VFS at the time.

If sharing is introduced past that state change things get tricky to
manage. I just don't want to think about those issues just now...

Eric




2004-04-05 08:13:09

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Fri, 2 April 2004 11:28:55 -0800, Ross Biro wrote:
>
> Of course, it gets more interesting if you try to do it at the block
> level instead of at the file level. For ext2, you could just reserve
> a block #, say -1, to mean take the data from the master cow file, and
> anything else is treated normally. You would need a deamon to make
> sure you were still saving space though.

More interesting is correct. I see the advantages and proposed this
myself some time ago, but there are downsides. Basically, for each
block you need additional data, at least a counter telling you the
number of users it currently has. Eats up memory.

If it really has to make sense, you also have to detect duplicated
blocks at runtime. So you need a checksum for each block and a
balanced tree containing those checksums or some other means of quick
access. Eats up 40 bytes (16 checksum, 3*8 tree pointers). With 4k
blocks, that's 1% memory overhead.

Runtime is even worse. Unless the tree fits into memory, you have 1-3
disk reads for each write. Most filesystem developers don't like to
hear such news.

Frankly, the disadvantages still outweigh the advantages today. With
64k blocks, more memory and more diskspace and in comparison less
unique blocks, it might make sense. Later. :)

J?rn

--
Ninety percent of everything is crap.
-- Sturgeon's Law

2004-04-05 08:19:46

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 3 April 2004 14:15:39 +0100, Jamie Lokier wrote:
> Pavel Machek wrote:
> > > > Could you not change it back to a normal inode when refcount becomes 1?
> > >
> > > You can only do that if the cowid object has a pointer to the last
> > > remaining reference to it. That's possible, but more complicated and
> > > would incur a little more I/O per cow operation.
> >
> > You'd have to have pointers to all references to it... because you
> > can't tell in advance which one will be the last to go away.
>
> Exactly. Each of the cow pointers would need to be linked in a doubly
> linked list containing them all.

I don't like the list idea. Having the extra cowid (I prefer inode)
indirection costs a few bytes and one lookup, not much. The list is
way too much overhead to get rid of so little in a few cases.

If you really want to, create a new syscall foldfile() that will
remove the indirection for one file, if possible. Then userspace can
do the ugly work of scanning for single-linked cowids (or just leave
it).

J?rn

--
The competent programmer is fully aware of the strictly limited size of
his own skull; therefore he approaches the programming task in full
humility, and among other things he avoids clever tricks like the plague.
-- Edsger W. Dijkstra

2004-04-05 08:19:27

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > Of course, it gets more interesting if you try to do it at the block
> > level instead of at the file level. For ext2, you could just reserve
> > a block #, say -1, to mean take the data from the master cow file, and
> > anything else is treated normally. You would need a deamon to make
> > sure you were still saving space though.
>
> More interesting is correct. I see the advantages and proposed this
> myself some time ago, but there are downsides. Basically, for each
> block you need additional data, at least a counter telling you the
> number of users it currently has. Eats up memory.
>
> If it really has to make sense, you also have to detect duplicated
> blocks at runtime. So you need a checksum for each block and a
> balanced tree containing those checksums or some other means of quick
> access. Eats up 40 bytes (16 checksum, 3*8 tree pointers). With 4k
> blocks, that's 1% memory overhead.

Well, you could do this in userspace, do it only if system is idle,
and use "scan" type approach.

But I agree that's far away.

BTW what about the "mix hardlinks with cowlinks" proposal? You said it
leads to hell and then I did not hear from you. Did it scare you that
much? ;-)
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-05 08:23:19

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > > > > Could you not change it back to a normal inode when refcount becomes 1?
> > > >
> > > > You can only do that if the cowid object has a pointer to the last
> > > > remaining reference to it. That's possible, but more complicated and
> > > > would incur a little more I/O per cow operation.
> > >
> > > You'd have to have pointers to all references to it... because you
> > > can't tell in advance which one will be the last to go away.
> >
> > Exactly. Each of the cow pointers would need to be linked in a doubly
> > linked list containing them all.
>
> I don't like the list idea. Having the extra cowid (I prefer inode)
> indirection costs a few bytes and one lookup, not much. The list is
> way too much overhead to get rid of so little in a few cases.

Nobody likes the "list idea".

> If you really want to, create a new syscall foldfile() that will
> remove the indirection for one file, if possible. Then userspace can
> do the ugly work of scanning for single-linked cowids (or just leave
> it).

We could automaticaly remove them when we see them, or on first open(
O_RDWR) or something, or just leave them alone. Definitely leave them
alone for first version.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

2004-04-05 08:35:59

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 3 April 2004 20:43:44 +0100, Jamie Lokier wrote:
>
> Btw, I'm not suggesting sharing page cache entries.

But I am!!!

Sharing the page cache is more important to me than sharing disk
space. Disk space is cheap, but increasing memory beyond 1GiB in my
notebook is not and 1GiB is too little, so memory is the real
constraint.

And it looks like Pavel already found the solution. Whenever doing
something fishy that would confuse the page cache, we
1. lock
2. invalidate page cache for all files belonging to that cow entity
3. copyfile(), write(), or whatever
4. unlock

This is always possibly, because page cache for cow-files is never
read-write. If it was, we would have done 1-4 before and now have a
regular (non-cow) file.

Did I miss something?

J?rn

--
Premature optimization is the root of all evil.
-- Donald Knuth

2004-04-05 08:43:37

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Fri, 2 April 2004 20:23:58 +0200, Pavel Machek wrote:
>
> > Then you link()...
>
> INODE123 Usage count = 2, pointer to cowid 567
> COWID 567: Usage count = 3
> INODE124 Usage count = 2, pointer to cowid 567
> INODE125 Usage count = 2, pointer to cowid 567
>
> Now, if I write to any inode with has cowid, data have to be copied,
> and pointer to cowid deleted from that inode .

Ok, you win. Next time I get scare, I should ask you first. :)

In a single picture, links currently look like this:

Symlink can point to inodes or cowlinks or hardlinks
Hardlink can point to inodes or cowlinks
Cowlink can point to inodes

I like it.

Not sure about the current count, but it looks like most people favor
the indirect approach now.

J?rn

--
"Security vulnerabilities are here to stay."
-- Scott Culp, Manager of the Microsoft Security Response Center, 2001

2004-04-05 08:46:46

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Mon, 5 April 2004 10:19:08 +0200, Pavel Machek wrote:
>
> BTW what about the "mix hardlinks with cowlinks" proposal? You said it
> leads to hell and then I did not hear from you. Did it scare you that
> much? ;-)

It did in the beginning. Over the weekend (without mail access) I
pretty much found the same solution you did, but without your
insistence, I hadn't even though about it. All glory to you! ;)

J?rn

--
A surrounded army must be given a way out.
-- Sun Tzu

2004-04-05 08:54:33

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Sat, 3 April 2004 12:47:58 -0700, Eric W. Biederman wrote:
>
> And a few scenarios to hopefully make things clear.
>
> So your fs starts out as:
>
> /file1 -> ino1 (link count 2) -> data block #1
> /file2 -> ino1 (link count 2) -> data block #1
>
> file1 is only 4K long, so I only need to describe one data block.
>
> Actions:
>
> cowcopy(file2, file3):
>
> /file1 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
> /file2 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
> /file3 -> ino3 (link count 1) -> ino2 (link count 2) -> data block #1
>
>
> copyfile(file3, file4):
>
> /file1 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
> /file2 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
> /file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
> /file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1
>
> unlink(file2):
>
> /file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
> /file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
> /file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1
>
> link(file4, file5):
>
> /file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
> /file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
> /file4 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1
> /file5 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1
>
> write(file3):
>
> /file1 -> ino1 (link count 1) -> ino2 (link count 2) -> data block #1
> /file3 -> ino3 (link count 1) -> data block #2
> /file4 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1
> /file5 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1
>
> write(file5):
>
> /file1 -> ino1 (link count 1) -> ino2 (link count 1) -> data block #1
> /file3 -> ino3 (link count 1) -> data block #2
> /file4 -> ino4 (link count 2) -> data block #3
> /file5 -> ino4 (link count 2) -> data block #3
>
> write(file1):
>
> /file1 -> ino1 (link count 1) -> data block #1 (with modified contents)
> /file3 -> ino3 (link count 1) -> data block #2
> /file4 -> ino4 (link count 2) -> data block #3
> /file5 -> ino4 (link count 2) -> data block #3
>
> Does this make things clear?

Almost. What exactly do cowcopy() and copyfile() do? They look the
same, so why the difference?

J?rn

--
The story so far:
In the beginning the Universe was created. This has made a lot
of people very angry and been widely regarded as a bad move.
-- Douglas Adams?

2004-04-05 09:08:18

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel <[email protected]> writes:

> Almost. What exactly do cowcopy() and copyfile() do? They look the
> same, so why the difference?

A typo on my part from the looks of it. I mean the same operation.

Eric

2004-04-05 09:15:39

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel <[email protected]> writes:

> On Sat, 3 April 2004 20:43:44 +0100, Jamie Lokier wrote:
> >
> > Btw, I'm not suggesting sharing page cache entries.
>
> But I am!!!
>
> Sharing the page cache is more important to me than sharing disk
> space. Disk space is cheap, but increasing memory beyond 1GiB in my
> notebook is not and 1GiB is too little, so memory is the real
> constraint.
>
> And it looks like Pavel already found the solution. Whenever doing
> something fishy that would confuse the page cache, we
> 1. lock
> 2. invalidate page cache for all files belonging to that cow entity
> 3. copyfile(), write(), or whatever
> 4. unlock
>
> This is always possibly, because page cache for cow-files is never
> read-write. If it was, we would have done 1-4 before and now have a
> regular (non-cow) file.
>
> Did I miss something?

I know a writable mmap needs to trigger a copy in that case.
And then are fun cases with MAP_FIXED which may mean invalidation
is not allowed.

As scheme that does not isolate the invalidate to the new copy worries me.

Eric


2004-04-05 09:18:59

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Mon, 5 April 2004 03:15:01 -0600, Eric W. Biederman wrote:
>
> I know a writable mmap needs to trigger a copy in that case.
> And then are fun cases with MAP_FIXED which may mean invalidation
> is not allowed.

Sounds fishy, so it will trigger cow. Done. :)

> As scheme that does not isolate the invalidate to the new copy worries me.

If you can come up with a better way, please do. Right now I cannot
think of anything better, but Pavel already showed how little that
means.

J?rn

--
The wise man seeks everything in himself; the ignorant man tries to get
everything from somebody else.
-- unknown

2004-04-05 11:10:54

by jlnance

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Fri, Apr 02, 2004 at 11:39:34PM +0200, Pavel Machek wrote:

> > get_data_id() is one way to detect equivalent files. Another would be
> > a function files_equal(fd1, fd2) which returns a boolean.
>
> files_equal(...) would lead to quadratic number of calls, no?
>
> > get_data_id() has the advantage that it can report immediately whether
> > a file has _any_ cowlink peers, which is important for programs that
> > scan trees. Perhaps getxattr() would be reasonable interface, using a
> > named attribute "data-id".
>
> Yes, get_data_id() is extremely ugly name.

I think it is worth asking if we really want to give userspace a way of
doing this or not. It exposes fairly low level FS details to userspace,
and this will limit our ability to change the implementation of the FS
in the future (partially shared files?). Certainly there has been some
pain caused over the years because userspace can ask for the inode number,
and people have written file systems which do not use inodes. Then they
have to kluge around this and make something up. I would hate to see
us implement an interface that causes long term pain.

I also cant really think of anyone who would need this information. I have
seen diff and tar used as examples. Perhaps diff would run faster but that
seems like a very special case thing, and diff will certainly work w/o it.
Tar might also be faster creating archives if it had this information
available. However to make tar useful wrt cowlinks, it will need to be
able to create these links at extract time from tarfiles which were created
on non-cowlink filesystems, so I don't think there is a pressing need.

Of course, I am willing to believe that I am wrong. Hope you all have a
great day.

Thanks,

Jim

--
http://www.jeweltran.com

2004-04-05 11:43:21

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > And it looks like Pavel already found the solution. Whenever doing
> > something fishy that would confuse the page cache, we
> > 1. lock
> > 2. invalidate page cache for all files belonging to that cow entity
> > 3. copyfile(), write(), or whatever
> > 4. unlock
> >
> > This is always possibly, because page cache for cow-files is never
> > read-write. If it was, we would have done 1-4 before and now have a
> > regular (non-cow) file.
> >
> > Did I miss something?
>
> I know a writable mmap needs to trigger a copy in that case.
> And then are fun cases with MAP_FIXED which may mean invalidation
> is not allowed.

How is "invalidation not allowed" for MAP_FIXED? Application will
never see the fault...

Pavel
--
Horseback riding is like software...
...vgf orggre jura vgf serr.

2004-04-05 11:46:41

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Hi!

> > > get_data_id() is one way to detect equivalent files. Another would be
> > > a function files_equal(fd1, fd2) which returns a boolean.
> >
> > files_equal(...) would lead to quadratic number of calls, no?
> >
> > > get_data_id() has the advantage that it can report immediately whether
> > > a file has _any_ cowlink peers, which is important for programs that
> > > scan trees. Perhaps getxattr() would be reasonable interface, using a
> > > named attribute "data-id".
> >
> > Yes, get_data_id() is extremely ugly name.
>
> I think it is worth asking if we really want to give userspace a way of
> doing this or not. It exposes fairly low level FS details to userspace,
> and this will limit our ability to change the implementation of the FS
> in the future (partially shared files?). Certainly there has been some
> pain caused over the years because userspace can ask for the inode number,
> and people have written file systems which do not use inodes. Then they
> have to kluge around this and make something up. I would hate to see
> us implement an interface that causes long term pain.

It should not be painfull, get_data_id() can allways return
-Esomething, meaning "I do not know".

> I also cant really think of anyone who would need this information. I have
> seen diff and tar used as examples. Perhaps diff would run faster but that
> seems like a very special case thing, and diff will certainly work w/o it.

Speeding up diff was one of main cowlinks motivations.
Pavel
--
Horseback riding is like software...
...vgf orggre jura vgf serr.

2004-04-05 12:18:24

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Pavel Machek wrote:
> > I know a writable mmap needs to trigger a copy in that case.
> > And then are fun cases with MAP_FIXED which may mean invalidation
> > is not allowed.
>
> How is "invalidation not allowed" for MAP_FIXED? Application will
> never see the fault...

I think Eric secretly meant MAP_LOCKED and/or mlock().

Even if the file isn't copied, when you have two different mappings of
different cowlinks both mapped with MAP_LOCKED, the pages need to be
different in RAM or we break locked page expectations.

-- Jamie

2004-04-05 12:36:06

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

[email protected] wrote:
> Perhaps diff would run faster but that
> seems like a very special case thing, and diff will certainly work w/o it.

We are talking about a difference between 20 minutes and 1 second.
It's quite significant, when it's a regular part of your diffing &
patching day.

I agree with your general sentiment that we shouldn't expose
filesystem details, e.g. a 32-bit integer. See below for an
alternative interface.

> Tar might also be faster creating archives if it had this information
> available. However to make tar useful wrt cowlinks, it will need to be
> able to create these links at extract time from tarfiles which were created
> on non-cowlink filesystems, so I don't think there is a pressing need.

I agree. The purpose of cowlinks is to be semantically invisible. If tar
or some other archiver/transferer wanted to use this information, it
should really be checking for equivalent files in general (like cmp)
and use this call as an optimisation only.

Btw, when we treat cowlinks as a semantically invisible, there is no
problem searching an entire filesystem for files with identical
content and linking them together to save space, in a cron job. It's
invisible to applications, except that space is saved and sometimes
the first write takes longer.

That still permits the get_data_id() optimisation, but that now
strictly means "kernel knows and returns a unique id of the data
(unique in this filesystem)".

Instead of get_data_id(), we'd use a POSIX attribute called "data-id"
returned by getxattr(). An absence of the attribute indicates that no
data-id is known. Otherwise, it's a unique id for that data in the
current filesystem.

It's a short byte string (another reason for making it a POSIX
attribute). On ext2/ext3, it's just the bytes of the shared inode
number plus a filesystem-wide generation number. On a hypothetical
httpfs, it could be the host name and ETag (a strong validator). On
any filesystem, it could be the SHA1 digest if that is known. It
would have the nice property of working over NFSv4, too.

-- Jamie

2004-04-05 12:39:39

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

Eric W. Biederman wrote:
> As scheme that does not isolate the invalidate to the new copy worries me.

It is possible to isolate the invalidate to mappings of the newly
broken cowlink only.

There is still the problem of MAP_LOCKED, or more realistically
mlockall() used with mappings of glibc etc. on a filesystem snapshot
made using cowlinks. The easiest thing is obviously to break the
cowlink when a mapping is locked, but it's not nice.

-- Jamie

2004-04-05 12:45:50

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

J?rn Engel wrote:
> Sharing the page cache is more important to me than sharing disk
> space. Disk space is cheap, but increasing memory beyond 1GiB in my
> notebook is not and 1GiB is too little, so memory is the real
> constraint.

Lucky you! I have 192MB, that's as much as it can take.

> And it looks like Pavel already found the solution. Whenever doing
> something fishy that would confuse the page cache, we
> 1. lock
> 2. invalidate page cache for all files belonging to that cow entity
> 3. copyfile(), write(), or whatever
> 4. unlock
>
> This is always possibly, because page cache for cow-files is never
> read-write. If it was, we would have done 1-4 before and now have a
> regular (non-cow) file.
>
> Did I miss something?

Just some interesting indirection or substitution of address_space
objects needed in the vmas, to map the right pages.

-- Jamie

2004-04-05 18:03:25

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH] cowlinks v2

On Mon, 5 April 2004 13:41:58 +0100, Jamie Lokier wrote:
> J?rn Engel wrote:
> > Sharing the page cache is more important to me than sharing disk
> > space. Disk space is cheap, but increasing memory beyond 1GiB in my
> > notebook is not and 1GiB is too little, so memory is the real
> > constraint.
>
> Lucky you! I have 192MB, that's as much as it can take.

I know your pain, last month my machine had just 160MiB.

That month I also didn't care much about page cache awareness of cow.
A grep over one kernel tree would thrash the cache just as a grep over
ten. Now I can tell the difference again.

> > And it looks like Pavel already found the solution. Whenever doing
> > something fishy that would confuse the page cache, we
> > 1. lock
> > 2. invalidate page cache for all files belonging to that cow entity
> > 3. copyfile(), write(), or whatever
> > 4. unlock
> >
> > This is always possibly, because page cache for cow-files is never
> > read-write. If it was, we would have done 1-4 before and now have a
> > regular (non-cow) file.
> >
> > Did I miss something?
>
> Just some interesting indirection or substitution of address_space
> objects needed in the vmas, to map the right pages.

That is already covered by the nonexistent invalidate_inode() function
in point 2. :)

You are right, implementation is more interesting, but it *should*
work.

J?rn

--
To announce that there must be no criticism of the President, or that we
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918