LinuxLists.cc - [PATCH 14/38] fallthru: ext2 fallthru support

2010-08-06 22:35:00

Subject: [PATCH 14/38] fallthru: ext2 fallthru support

Add support for fallthru directory entries to ext2.

XXX What to do for d_ino for fallthrus? If we return the inode from
the underlying file system, it comes from a different inode
"namespace" and that will produce spurious matches. This argues for
implementation of fallthrus as symlinks because they have to allocate
an inode (and inode number) anyway, and we can later reuse it if we
copy the file up.

Cc: Theodore Tso <[email protected]>
Cc: [email protected]
Signed-off-by: Valerie Aurora <[email protected]>
Signed-off-by: Jan Blunck <[email protected]>
---
fs/ext2/dir.c | 92 ++++++++++++++++++++++++++++++++++++++++++++--
fs/ext2/ext2.h | 1 +
fs/ext2/namei.c | 22 +++++++++++
fs/ext2/super.c | 2 +
include/linux/ext2_fs.h | 4 ++
5 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 030bd46..f3b4aff 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -219,7 +219,8 @@ static inline int ext2_match (int len, const char * const name,
{
if (len != de->name_len)
return 0;
- if (!de->inode && (de->file_type != EXT2_FT_WHT))
+ if (!de->inode && ((de->file_type != EXT2_FT_WHT) &&
+ (de->file_type != EXT2_FT_FALLTHRU)))
return 0;
return !memcmp(name, de->name, len);
}
@@ -256,6 +257,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
[EXT2_FT_SOCK] = DT_SOCK,
[EXT2_FT_SYMLINK] = DT_LNK,
[EXT2_FT_WHT] = DT_WHT,
+ [EXT2_FT_FALLTHRU] = DT_UNKNOWN,
};

#define S_SHIFT 12
@@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
ext2_put_page(page);
return 0;
}
+ } else if (de->file_type == EXT2_FT_FALLTHRU) {
+ int over;
+ unsigned char d_type = DT_UNKNOWN;
+
+ offset = (char *)de - kaddr;
+ /* XXX We don't know the inode number
+ * of the directory entry in the
+ * underlying file system. Should
+ * look it up, either on fallthru
+ * creation at first readdir or now at
+ * filldir time. */
+ over = filldir(dirent, de->name, de->name_len,
+ (n<<PAGE_CACHE_SHIFT) | offset,
+ 123 /* Made up ino */, d_type);
+ if (over) {
+ ext2_put_page(page);
+ return 0;
+ }
}
filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
}
@@ -463,6 +483,10 @@ ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_WHITEOUT;
spin_unlock(&dentry->d_lock);
+ } else if(!res && de->file_type == EXT2_FT_FALLTHRU) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_FALLTHRU;
+ spin_unlock(&dentry->d_lock);
}
ext2_put_page(page);
}
@@ -532,6 +556,7 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
de->name_len = 0;
de->rec_len = ext2_rec_len_to_disk(chunk_size);
de->inode = 0;
+ de->file_type = 0;
goto got_it;
}
if (de->rec_len == 0) {
@@ -545,6 +570,7 @@ static ext2_dirent * ext2_append_entry(struct dentry * dentry,
name_len = EXT2_DIR_REC_LEN(de->name_len);
rec_len = ext2_rec_len_from_disk(de->rec_len);
if (!de->inode && (de->file_type != EXT2_FT_WHT) &&
+ (de->file_type != EXT2_FT_FALLTHRU) &&
(rec_len >= reclen))
goto got_it;
if (rec_len >= name_len + reclen)
@@ -587,7 +613,8 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)

err = -EEXIST;
if (ext2_match (namelen, name, de)) {
- if (de->file_type == EXT2_FT_WHT)
+ if ((de->file_type == EXT2_FT_WHT) ||
+ (de->file_type == EXT2_FT_FALLTHRU))
goto got_it;
goto out_unlock;
}
@@ -602,7 +629,8 @@ got_it:
&page, NULL);
if (err)
goto out_unlock;
- if (de->inode || ((de->file_type == EXT2_FT_WHT) &&
+ if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
+ (de->file_type == EXT2_FT_FALLTHRU)) &&
!ext2_match (namelen, name, de))) {
ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
@@ -627,6 +655,60 @@ out_unlock:
}

/*
+ * Create a fallthru entry.
+ */
+int ext2_fallthru_entry (struct inode *dir, struct dentry *dentry)
+{
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned short rec_len, name_len;
+ ext2_dirent * de;
+ struct page *page;
+ loff_t pos;
+ int err;
+
+ de = ext2_append_entry(dentry, &page);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
+
+ err = -EEXIST;
+ if (ext2_match (namelen, name, de))
+ goto out_unlock;
+
+ name_len = EXT2_DIR_REC_LEN(de->name_len);
+ rec_len = ext2_rec_len_from_disk(de->rec_len);
+
+ pos = page_offset(page) +
+ (char*)de - (char*)page_address(page);
+ err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
+ &page, NULL);
+ if (err)
+ goto out_unlock;
+ if (de->inode || (de->file_type == EXT2_FT_WHT) ||
+ (de->file_type == EXT2_FT_FALLTHRU)) {
+ ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
+ de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
+ de->rec_len = ext2_rec_len_to_disk(name_len);
+ de = de1;
+ }
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+ de->inode = 0;
+ de->file_type = EXT2_FT_FALLTHRU;
+ err = ext2_commit_chunk(page, pos, rec_len);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+ mark_inode_dirty(dir);
+ /* OFFSET_CACHE */
+out_put:
+ ext2_put_page(page);
+ return err;
+out_unlock:
+ unlock_page(page);
+ goto out_put;
+}
+
+/*
* ext2_delete_entry deletes a directory entry by merging it with the
* previous entry. Page is up-to-date. Releases the page.
*/
@@ -711,7 +793,9 @@ int ext2_whiteout_entry (struct inode * dir, struct dentry * dentry,
*/
if (ext2_match (namelen, name, de))
de->inode = 0;
- if (de->inode || (de->file_type == EXT2_FT_WHT)) {
+ if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
+ (de->file_type == EXT2_FT_FALLTHRU)) &&
+ !ext2_match (namelen, name, de))) {
ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
de->rec_len = ext2_rec_len_to_disk(name_len);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 89ab2f7..1504814 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -108,6 +108,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
extern int ext2_whiteout_entry (struct inode *, struct dentry *,
struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_fallthru_entry (struct inode *, struct dentry *);
extern int ext2_empty_dir (struct inode *);
extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 8f92dd0..af4052f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -345,6 +345,7 @@ static int ext2_whiteout(struct inode *dir, struct dentry *dentry,
goto out;

spin_lock(&new_dentry->d_lock);
+ new_dentry->d_flags &= ~DCACHE_FALLTHRU;
new_dentry->d_flags |= DCACHE_WHITEOUT;
spin_unlock(&new_dentry->d_lock);
d_add(new_dentry, NULL);
@@ -363,6 +364,26 @@ out:
return err;
}

+/*
+ * Create a fallthru entry.
+ */
+static int ext2_fallthru (struct inode *dir, struct dentry *dentry)
+{
+ int err;
+
+ dquot_initialize(dir);
+
+ err = ext2_fallthru_entry(dir, dentry);
+ if (err)
+ return err;
+
+ d_instantiate(dentry, NULL);
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_FALLTHRU;
+ spin_unlock(&dentry->d_lock);
+ return 0;
+}
+
static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct inode * new_dir, struct dentry * new_dentry )
{
@@ -466,6 +487,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.rmdir = ext2_rmdir,
.mknod = ext2_mknod,
.whiteout = ext2_whiteout,
+ .fallthru = ext2_fallthru,
.rename = ext2_rename,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 704521b..76eba1e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1095,6 +1095,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)

if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_WHITEOUT))
sb->s_flags |= MS_WHITEOUT;
+ if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FALLTHRU))
+ sb->s_flags |= MS_FALLTHRU;

if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
sb->s_flags |= MS_RDONLY;
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index b0fb356..1a6f929 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -505,11 +505,14 @@ struct ext2_super_block {
#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
#define EXT2_FEATURE_INCOMPAT_META_BG 0x0010
#define EXT2_FEATURE_INCOMPAT_WHITEOUT 0x0020
+/* ext3/4 incompat flags take up the intervening constants */
+#define EXT2_FEATURE_INCOMPAT_FALLTHRU 0x2000
#define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff

#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \
EXT2_FEATURE_INCOMPAT_WHITEOUT| \
+ EXT2_FEATURE_INCOMPAT_FALLTHRU| \
EXT2_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
@@ -577,6 +580,7 @@ enum {
EXT2_FT_SOCK = 6,
EXT2_FT_SYMLINK = 7,
EXT2_FT_WHT = 8,
+ EXT2_FT_FALLTHRU = 9,
EXT2_FT_MAX
};

--
1.6.3.3

2010-08-07 00:28:31

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH 14/38] fallthru: ext2 fallthru support

On 2010-08-06, at 16:35, Valerie Aurora wrote:
> XXX What to do for d_ino for fallthrus? If we return the inode from
> the underlying file system, it comes from a different inode
> "namespace" and that will produce spurious matches. This argues for
> implementation of fallthrus as symlinks because they have to allocate
> an inode (and inode number) anyway, and we can later reuse it if we
> copy the file up.
>
> @@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
> + /* XXX We don't know the inode number
> + * of the directory entry in the
> + * underlying file system. Should
> + * look it up, either on fallthru
> + * creation at first readdir or now at
> + * filldir time. */
> + over = filldir(dirent, de->name, de->name_len,
> + (n<<PAGE_CACHE_SHIFT) | offset,
> + 123 /* Made up ino */, d_type);

I don't think it makes sense to use "123" for the inode number. This is a valid inode number, and almost certainly one that will be in use in most filesystems. One option for extN is to use EXT2_BAD_INO (1).

Cheers, Andreas

2010-08-08 16:41:07

by Valerie Aurora

[permalink] [raw]

Subject: Re: [PATCH 14/38] fallthru: ext2 fallthru support

On Fri, Aug 06, 2010 at 06:28:29PM -0600, Andreas Dilger wrote:
> On 2010-08-06, at 16:35, Valerie Aurora wrote:
> > XXX What to do for d_ino for fallthrus? If we return the inode from
> > the underlying file system, it comes from a different inode
> > "namespace" and that will produce spurious matches. This argues for
> > implementation of fallthrus as symlinks because they have to allocate
> > an inode (and inode number) anyway, and we can later reuse it if we
> > copy the file up.
> >
> > @@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
> > + /* XXX We don't know the inode number
> > + * of the directory entry in the
> > + * underlying file system. Should
> > + * look it up, either on fallthru
> > + * creation at first readdir or now at
> > + * filldir time. */
> > + over = filldir(dirent, de->name, de->name_len,
> > + (n<<PAGE_CACHE_SHIFT) | offset,
> > + 123 /* Made up ino */, d_type);
>
> I don't think it makes sense to use "123" for the inode number. This is a valid inode number, and almost certainly one that will be in use in most filesystems. One option for extN is to use EXT2_BAD_INO (1).

The next version (Subject: Union mounts - return d_ino from lower fs)
fixed this. Take a look and tell me what you think?

-VAL

2010-08-18 23:24:17

by Bodo Eggert

[permalink] [raw]

Subject: Re: [PATCH 14/38] fallthru: ext2 fallthru support

Miklos Szeredi <[email protected]> wrote:
> On Tue, 17 Aug 2010, Valerie Aurora wrote:

>> > - hard links to make sure a separate inode is not necessary for each
>> > whiteout/fallthrough entry
>>
>> The problem with hard links is that you run into hard link limits. I
>> don't think we can do hard links for whiteouts and fallthrus. Each
>> whiteout or fallthru will cost an inode if we implement them as
>> extended attributes. This cost has to be balanced against the cost of
>> implementing them as dentries, which is mainly code complexity in
>> individual file systems.

Not knowing the details, I'd suggest implementing a generic function to
create an attributed inode and letting the fs override it to create an
unlinked-file-dentry instead.

Benefit: All fs supporting extended attributes will be able to support
whiteout. If the fs has other means of supporting whiteout, they may fake
the attribute.

Possible problems:
- Having two ways of reporting a whiteout? Or can it be reported using a
(static) fake inode?
- How do you un-whiteout while (not) having an overlaying fs?

> get_unlinked_inode() is a great idea. But I feel that individual
> inodes for each fallthrough is excessive. It'll make the first
> readdir() really really expensive and wastes a lot of disk and memory
> for no good reason.
>
> Not sure how to fix the hard link limits problem though...

Do a hardlink if you can create a hard link, otherwise use a fresh inode
and use that for the next hardlink(s).

2010-08-19 02:03:20

by J. R. Okajima

[permalink] [raw]

Subject: Re: [PATCH 14/38] fallthru: ext2 fallthru support

Bodo Eggert:
> Do a hardlink if you can create a hard link, otherwise use a fresh inode
> and use that for the next hardlink(s).

Exactly.
That's the approach aufs takes for whiteout and its brothers.

J. R. Okajima

2010-08-24 17:21:43

by Valerie Aurora

[permalink] [raw]

Subject: Re: [PATCH 14/38] fallthru: ext2 fallthru support

On Thu, Aug 19, 2010 at 01:24:07AM +0200, Bodo Eggert wrote:
> Miklos Szeredi <[email protected]> wrote:
> > On Tue, 17 Aug 2010, Valerie Aurora wrote:
>
> >> > - hard links to make sure a separate inode is not necessary for each
> >> > whiteout/fallthrough entry
> >>
> >> The problem with hard links is that you run into hard link limits. I
> >> don't think we can do hard links for whiteouts and fallthrus. Each
> >> whiteout or fallthru will cost an inode if we implement them as
> >> extended attributes. This cost has to be balanced against the cost of
> >> implementing them as dentries, which is mainly code complexity in
> >> individual file systems.
>
> Not knowing the details, I'd suggest to implement a generic function to
> create an attributed inode and let the fs override it to create an
> unlinked-file-dentry instead.
>
> Benefit: All fs supporting extended attributes will be able to support
> whiteout. If the fs has other means of supporting whiteout, they may fake
> the attribute.

Yeah, I think that's the way to go.

> Possible problems:
> - Having two ways of reporting a whiteout? Or can it be reported using a
> (static) fake inode?

They are going to look the same at the VFS level and higher.

> - How do you un-whiteout while (not) having an overlaying fs?

The current version of whiteout support always hides DT_WHT dentries
from userspace. Perhaps a start is to only hide DT_WHT entries when
the file system is union mounted. Applications usually ignore all
dentries with d_ino == 0 so it might not cause problems.

Right now, you have to remove whiteouts offline using fsck.

> > get_unlinked_inode() is a great idea. But I feel that individual
> > inodes for each fallthrough is excessive. It'll make the first
> > readdir() really really expensive and wastes a lot of disk and memory
> > for no good reason.
> >
> > Not sure how to fix the hard link limits problem though...
>
> Do a hardlink if you can create a hard link, otherwise use a fresh inode
> and use that for the next hardlink(s).

Bleah! Then you have a code path that is only tested when you hit
LINK_MAX. Sounds like a recipe for bugs to me.

-VAL

2010-08-26 09:53:59

by Bodo Eggert

[permalink] [raw]

Subject: Re: [PATCH 14/38] fallthru: ext2 fallthru support

On Tue, 24 Aug 2010, Valerie Aurora wrote:
> On Thu, Aug 19, 2010 at 01:24:07AM +0200, Bodo Eggert wrote:
>> Miklos Szeredi <[email protected]> wrote:
>>> On Tue, 17 Aug 2010, Valerie Aurora wrote:

>>> get_unlinked_inode() is a great idea. But I feel that individual
>>> inodes for each fallthrough is excessive. It'll make the first
>>> readdir() really really expensive and wastes a lot of disk and memory
>>> for no good reason.
>>>
>>> Not sure how to fix the hard link limits problem though...
>>
>> Do a hardlink if you can create a hard link, otherwise use a fresh inode
>> and use that for the next hardlink(s).
>
> Bleah! Then you have a code path that is only tested when you hit
> LINK_MAX. Sounds like a recipe for bugs for me.

You'll also hit it when creating the very first whiteout, possibly when
creating the first whiteout after each mount, and on filesystems that do
not support hardlinks (are there any that support attributes but not
hardlinks?). Maybe it will be possible to create immutable whiteout
inodes, too.