This adds inode operations for directory, symlink, and special inodes.
Signed-off-by: Changman Lee <[email protected]>
Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/namei.c | 549 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 549 insertions(+)
create mode 100644 fs/f2fs/namei.c
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
new file mode 100644
index 0000000..eaf5b7b
--- /dev/null
+++ b/fs/f2fs/namei.c
@@ -0,0 +1,549 @@
+/**
+ * fs/f2fs/namei.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+#define F2FS_LINK_MAX 32000
+
+const char *media_ext_lists[] = {
+ "jpg",
+ "gif",
+ "png",
+ "avi",
+ "divx",
+ "mp4",
+ "mp3",
+ "3gp",
+ "wmv",
+ "wma",
+ "mpeg",
+ "mkv",
+ "mov",
+ "asx",
+ "asf",
+ "wmx",
+ "svi",
+ "wvx",
+ "wm",
+ "mpg",
+ "mpe",
+ "rm",
+ "ogg",
+ NULL
+};
+
+static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ nid_t ino;
+ struct inode *inode;
+ bool nid_free = false;
+ int err;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock_op(sbi, NODE_NEW);
+ if (!alloc_nid(sbi, &ino)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = -ENOSPC;
+ goto fail;
+ }
+ mutex_unlock_op(sbi, NODE_NEW);
+
+ inode->i_uid = current_fsuid();
+
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ inode->i_gid = current_fsgid();
+ }
+
+ inode->i_ino = ino;
+ inode->i_mode = mode;
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+
+ err = insert_inode_locked(inode);
+ if (err) {
+ err = -EINVAL;
+ nid_free = true;
+ goto out;
+ }
+
+ mark_inode_dirty(inode);
+ return inode;
+
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+fail:
+ iput(inode);
+ if (nid_free)
+ alloc_nid_failed(sbi, ino);
+ return ERR_PTR(err);
+}
+
+static int is_multimedia_file(const unsigned char *s, const char *sub)
+{
+ int slen = strlen(s);
+ int sublen = strlen(sub);
+ int ret;
+
+ if (sublen > slen)
+ return 1;
+
+ ret = memcmp(s + slen - sublen, sub, sublen);
+ if (ret) { /* compare upper case */
+ int i;
+ char upper_sub[6];
+ for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
+ upper_sub[i] = toupper(sub[i]);
+ return memcmp(s + slen - sublen, upper_sub, sublen);
+ }
+
+ return ret;
+}
+
+/**
+ * Set multimedia files as cold files for hot/cold data separation
+ */
+static inline void set_cold_file(struct inode *inode, const unsigned char *name)
+{
+ const char **extlist = media_ext_lists;
+
+ while (*extlist) {
+ if (!is_multimedia_file(name, *extlist)) {
+ F2FS_I(inode)->is_cold = 1;
+ break;
+ }
+ extlist++;
+ }
+}
+
+static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ nid_t ino = 0;
+ int err;
+
+ if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ set_cold_file(inode, dentry->d_name.name);
+
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ ino = inode->i_ino;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ alloc_nid_done(sbi, ino);
+
+ if (!sbi->por_doing)
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, ino);
+ return err;
+}
+
+static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err;
+
+ if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ if (inode->i_nlink >= F2FS_LINK_MAX)
+ return -EMLINK;
+
+ inode->i_ctime = CURRENT_TIME;
+ atomic_inc(&inode->i_count);
+
+ set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ d_instantiate(dentry, inode);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+out:
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ iput(inode);
+ return err;
+}
+
+static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode = NULL;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ de = f2fs_find_entry(dir, &dentry->d_name, &page);
+ if (de) {
+ nid_t ino = le32_to_cpu(de->ino);
+ kunmap(page);
+ f2fs_put_page(page, 0);
+
+ inode = f2fs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ }
+
+ return d_splice_alias(inode, dentry);
+}
+
+static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode = dentry->d_inode;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+ int err = -ENOENT;
+
+ de = f2fs_find_entry(dir, &dentry->d_name, &page);
+ if (!de)
+ goto fail;
+
+ err = check_orphan_space(sbi);
+ if (err) {
+ kunmap(page);
+ f2fs_put_page(page, 0);
+ goto fail;
+ }
+
+ f2fs_delete_entry(de, page, inode);
+
+ /* In order to evict this inode, we set it dirty */
+ mark_inode_dirty(inode);
+ f2fs_balance_fs(sbi);
+fail:
+ return err;
+}
+
+static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ unsigned symlen = strlen(symname) + 1;
+ int err;
+
+ if (symlen > sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+ inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &f2fs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ err = page_symlink(inode, symname, symlen);
+ alloc_nid_done(sbi, inode->i_ino);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+
+ return err;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, inode->i_ino);
+ return err;
+}
+
+static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct inode *inode;
+ int err = -EMLINK;
+
+ if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ if (dir->i_nlink >= F2FS_LINK_MAX)
+ return err;
+
+ inode = f2fs_new_inode(dir, S_IFDIR | mode);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ return err;
+
+ inode->i_op = &f2fs_dir_inode_operations;
+ inode->i_fop = &f2fs_dir_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_ZERO);
+
+ set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out_fail;
+
+ alloc_nid_done(sbi, inode->i_ino);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+
+out_fail:
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, inode->i_ino);
+ return err;
+}
+
+static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ if (f2fs_empty_dir(inode))
+ return f2fs_unlink(dir, dentry);
+ return -ENOTEMPTY;
+}
+
+static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ int err = 0;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ init_special_inode(inode, inode->i_mode, rdev);
+ inode->i_op = &f2fs_special_inode_operations;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ alloc_nid_done(sbi, inode->i_ino);
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+
+ return 0;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, inode->i_ino);
+ return err;
+}
+
+static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct super_block *sb = old_dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct page *old_dir_page;
+ struct page *old_page;
+ struct f2fs_dir_entry *old_dir_entry = NULL;
+ struct f2fs_dir_entry *old_entry;
+ struct f2fs_dir_entry *new_entry;
+ int err = -ENOENT;
+
+ if (new_dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ if (!old_entry)
+ goto out;
+
+ if (S_ISDIR(old_inode->i_mode)) {
+ err = -EIO;
+ old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
+ if (!old_dir_entry)
+ goto out_old;
+ }
+
+ mutex_lock_op(sbi, RENAME);
+
+ if (new_inode) {
+ struct page *new_page;
+
+ err = -ENOTEMPTY;
+ if (old_dir_entry && !f2fs_empty_dir(new_inode))
+ goto out_dir;
+
+ err = -ENOENT;
+ new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
+ &new_page);
+ if (!new_entry)
+ goto out_dir;
+
+ f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+
+ new_inode->i_ctime = CURRENT_TIME_SEC;
+ if (old_dir_entry)
+ drop_nlink(new_inode);
+ drop_nlink(new_inode);
+ if (!new_inode->i_nlink)
+ add_orphan_inode(sbi, new_inode->i_ino);
+ f2fs_write_inode(new_inode, NULL);
+ } else {
+ if (old_dir_entry) {
+ err = -EMLINK;
+ if (new_dir->i_nlink >= F2FS_LINK_MAX)
+ goto out_dir;
+ }
+ err = f2fs_add_link(new_dentry, old_inode);
+ if (err)
+ goto out_dir;
+
+ if (old_dir_entry) {
+ inc_nlink(new_dir);
+ f2fs_write_inode(new_dir, NULL);
+ }
+ }
+
+ old_inode->i_ctime = CURRENT_TIME_SEC;
+ set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
+ mark_inode_dirty(old_inode);
+
+ f2fs_delete_entry(old_entry, old_page, NULL);
+
+ if (old_dir_entry) {
+ if (old_dir != new_dir) {
+ f2fs_set_link(old_inode, old_dir_entry,
+ old_dir_page, new_dir);
+ } else {
+ kunmap(old_dir_page);
+ f2fs_put_page(old_dir_page, 0);
+ }
+ drop_nlink(old_dir);
+ f2fs_write_inode(old_dir, NULL);
+ }
+
+ mutex_unlock_op(sbi, RENAME);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+
+out_dir:
+ if (old_dir_entry) {
+ kunmap(old_dir_page);
+ f2fs_put_page(old_dir_page, 0);
+ }
+ mutex_unlock_op(sbi, RENAME);
+out_old:
+ kunmap(old_page);
+ f2fs_put_page(old_page, 0);
+out:
+ return err;
+}
+
+const struct inode_operations f2fs_dir_inode_operations = {
+ .create = f2fs_create,
+ .lookup = f2fs_lookup,
+ .link = f2fs_link,
+ .unlink = f2fs_unlink,
+ .symlink = f2fs_symlink,
+ .mkdir = f2fs_mkdir,
+ .rmdir = f2fs_rmdir,
+ .mknod = f2fs_mknod,
+ .rename = f2fs_rename,
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_special_inode_operations = {
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
--
1.7.9.5
---
Jaegeuk Kim
Samsung
On Fri, Oct 05, 2012 at 09:03:09PM +0900, ????????? wrote:
> +static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
> + bool excl)
> +{
> + struct super_block *sb = dir->i_sb;
> + struct f2fs_sb_info *sbi = F2FS_SB(sb);
> + struct inode *inode;
> + nid_t ino = 0;
> + int err;
> +
> + if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
> + return -ENAMETOOLONG;
Pointless - failing those on ->lookup() with ENAMETOOLONG is enough.
The same goes for all entry creation methods.
> + if (inode->i_nlink >= F2FS_LINK_MAX)
> + return -EMLINK;
Just set ->s_max_links and be done with that.
> + if (dir->i_nlink >= F2FS_LINK_MAX)
> + return err;
Ditto.
> + if (old_dir_entry) {
> + err = -EMLINK;
> + if (new_dir->i_nlink >= F2FS_LINK_MAX)
> + goto out_dir;
... and here as well.
On Friday 05 October 2012, 김재극 wrote:
> +const char *media_ext_lists[] = {
> + "jpg",
> + "gif",
> + "png",
> + "avi",
> + "divx",
> + "mp4",
> + "mp3",
> ...
> + * Set multimedia files as cold files for hot/cold data separation
> + */
> +static inline void set_cold_file(struct inode *inode, const unsigned char *name)
> +{
> + const char **extlist = media_ext_lists;
> +
> + while (*extlist) {
> + if (!is_multimedia_file(name, *extlist)) {
> + F2FS_I(inode)->is_cold = 1;
> + break;
> + }
> + extlist++;
> + }
> +}
This is a very clever way of categorizing files by their name, but I wonder if hardcoding
the list of file name extensions at in the kernel source is the best strategy. Generally
I would consider this to be a policy that should be configurable by the user.
Unfortunately I can't think of a good interface to configure this, but maybe someone
else has a useful idea. Maybe the list can be stored in the superblock and get written
at mkfs time from the same defaults, but with the option of overriding it using
a debugfs tool.
Arnd
On Oct 14, 2012, at 12:52 AM, Arnd Bergmann wrote:
> On Friday 05 October 2012, 김재극 wrote:
>> +const char *media_ext_lists[] = {
>> + "jpg",
>> + "gif",
>> + "png",
>> + "avi",
>> + "divx",
>> + "mp4",
>> + "mp3",
>> ...
>
>> + * Set multimedia files as cold files for hot/cold data separation
>> + */
>> +static inline void set_cold_file(struct inode *inode, const unsigned char *name)
>> +{
>> + const char **extlist = media_ext_lists;
>> +
>> + while (*extlist) {
>> + if (!is_multimedia_file(name, *extlist)) {
>> + F2FS_I(inode)->is_cold = 1;
>> + break;
>> + }
>> + extlist++;
>> + }
>> +}
>
> This is a very clever way of categorizing files by their name, but I wonder if hardcoding
> the list of file name extensions at in the kernel source is the best strategy. Generally
> I would consider this to be a policy that should be configurable by the user.
>
I think that file extensions can't be a steady basis for categorization. It is possible that user can use any extension as you want during file naming (for example, save text file with png extension). Or it is possible to use file without any extension. Only magics in a file structure can be a steady basis. But analyzing of file structure on the file system driver level is a breaking of some fundamentals, from my point of view.
With the best regards,
Vyacheslav Dubeyko.
> Unfortunately I can't think of a good interface to configure this, but maybe someone
> else has a useful idea. Maybe the list can be stored in the superblock and get written
> at mkfs time from the same defaults, but with the option of overriding it using
> a debugfs tool.
>
> Arnd
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Oct 14, 2012, at 12:52 AM, Arnd Bergmann wrote:
> On Friday 05 October 2012, 김재극 wrote:
>> +const char *media_ext_lists[] = {
>> + "jpg",
>> + "gif",
>> + "png",
>> + "avi",
>> + "divx",
>> + "mp4",
>> + "mp3",
>> ...
>
>> + * Set multimedia files as cold files for hot/cold data separation
>> + */
>> +static inline void set_cold_file(struct inode *inode, const unsigned char *name)
>> +{
>> + const char **extlist = media_ext_lists;
>> +
>> + while (*extlist) {
>> + if (!is_multimedia_file(name, *extlist)) {
>> + F2FS_I(inode)->is_cold = 1;
>> + break;
>> + }
>> + extlist++;
>> + }
>> +}
>
> This is a very clever way of categorizing files by their name, but I wonder if hardcoding
> the list of file name extensions at in the kernel source is the best strategy. Generally
> I would consider this to be a policy that should be configurable by the user.
>
> Unfortunately I can't think of a good interface to configure this, but maybe someone
> else has a useful idea. Maybe the list can be stored in the superblock and get written
> at mkfs time from the same defaults, but with the option of overriding it using
> a debugfs tool.
>
By the way, how about extended attributes? It is possible to save in extended attribute a hint about file's content nature during creation operation or later. Moreover, extended attribute gives possibility to change hint after renaming operation, for example.
With the best regards,
Vyacheslav Dubeyko.
> Arnd
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
2012-10-13 (토), 20:52 +0000, Arnd Bergmann:
> On Friday 05 October 2012, 김재극 wrote:
> > +const char *media_ext_lists[] = {
> > + "jpg",
> > + "gif",
> > + "png",
> > + "avi",
> > + "divx",
> > + "mp4",
> > + "mp3",
> > ...
>
> > + * Set multimedia files as cold files for hot/cold data separation
> > + */
> > +static inline void set_cold_file(struct inode *inode, const unsigned char *name)
> > +{
> > + const char **extlist = media_ext_lists;
> > +
> > + while (*extlist) {
> > + if (!is_multimedia_file(name, *extlist)) {
> > + F2FS_I(inode)->is_cold = 1;
> > + break;
> > + }
> > + extlist++;
> > + }
> > +}
>
> This is a very clever way of categorizing files by their name, but I wonder if hardcoding
> the list of file name extensions at in the kernel source is the best strategy. Generally
> I would consider this to be a policy that should be configurable by the user.
>
> Unfortunately I can't think of a good interface to configure this, but maybe someone
> else has a useful idea. Maybe the list can be stored in the superblock and get written
> at mkfs time from the same defaults, but with the option of overriding it using
> a debugfs tool.
>
Good point!
I'll think about a user-made list.
Thanks,
> Arnd
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Jaegeuk Kim
Samsung
2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> On Oct 14, 2012, at 12:52 AM, Arnd Bergmann wrote:
>
> > On Friday 05 October 2012, 김재극 wrote:
> >> +const char *media_ext_lists[] = {
> >> + "jpg",
> >> + "gif",
> >> + "png",
> >> + "avi",
> >> + "divx",
> >> + "mp4",
> >> + "mp3",
> >> ...
> >
> >> + * Set multimedia files as cold files for hot/cold data separation
> >> + */
> >> +static inline void set_cold_file(struct inode *inode, const unsigned char *name)
> >> +{
> >> + const char **extlist = media_ext_lists;
> >> +
> >> + while (*extlist) {
> >> + if (!is_multimedia_file(name, *extlist)) {
> >> + F2FS_I(inode)->is_cold = 1;
> >> + break;
> >> + }
> >> + extlist++;
> >> + }
> >> +}
> >
> > This is a very clever way of categorizing files by their name, but I wonder if hardcoding
> > the list of file name extensions at in the kernel source is the best strategy. Generally
> > I would consider this to be a policy that should be configurable by the user.
> >
> > Unfortunately I can't think of a good interface to configure this, but maybe someone
> > else has a useful idea. Maybe the list can be stored in the superblock and get written
> > at mkfs time from the same defaults, but with the option of overriding it using
> > a debugfs tool.
> >
>
> By the way, how about extended attributes? It is possible to save in extended attribute a hint about file's content nature during creation operation or later. Moreover, extended attribute gives possibility to change hint after renaming operation, for example.
>
I think xattr is not a proper way to communicate between file system and
users.
How about fadvise()?
> With the best regards,
> Vyacheslav Dubeyko.
>
> > Arnd
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Jaegeuk Kim
Samsung
On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
>> On Oct 14, 2012, at 12:52 AM, Arnd Bergmann wrote:
>>
>>> On Friday 05 October 2012, 김재극 wrote:
>>>> +const char *media_ext_lists[] = {
>>>> + "jpg",
>>>> + "gif",
>>>> + "png",
>>>> + "avi",
>>>> + "divx",
>>>> + "mp4",
>>>> + "mp3",
>>>> ...
>>>
>>>> + * Set multimedia files as cold files for hot/cold data separation
>>>> + */
>>>> +static inline void set_cold_file(struct inode *inode, const unsigned char *name)
>>>> +{
>>>> + const char **extlist = media_ext_lists;
>>>> +
>>>> + while (*extlist) {
>>>> + if (!is_multimedia_file(name, *extlist)) {
>>>> + F2FS_I(inode)->is_cold = 1;
>>>> + break;
>>>> + }
>>>> + extlist++;
>>>> + }
>>>> +}
>>>
>>> This is a very clever way of categorizing files by their name, but I wonder if hardcoding
>>> the list of file name extensions at in the kernel source is the best strategy. Generally
>>> I would consider this to be a policy that should be configurable by the user.
>>>
>>> Unfortunately I can't think of a good interface to configure this, but maybe someone
>>> else has a useful idea. Maybe the list can be stored in the superblock and get written
>>> at mkfs time from the same defaults, but with the option of overriding it using
>>> a debugfs tool.
>>>
>>
>> By the way, how about extended attributes? It is possible to save in extended attribute a hint about file's content nature during creation operation or later. Moreover, extended attribute gives possibility to change hint after renaming operation, for example.
>>
>
> I think xattr is not a proper way to communicate between file system and
> users.
I don't understand why you think that xattr is not proper way. Extended attributes are the way of adding some additional properties to filesystem object, from my point of view.
> How about fadvise()?
>
The fadvise() is a good suggestion. But, as I can understand, such solution requires using fadvise() during application implementation. So, from one point of view, it exists many applications that doesn't use fadvise() and, from another point of view, developers change style of coding not so easy.
Extended attributes are more flexible way, from my point of view. The xattr gives possibility to make hint to filesystem at any time and without any dependencies with application's functional opportunities. Documented way of using such extended attributes gives to user flexible way of manipulation of filesystem behavior (but I remember that you don't believe in an user :-)).
So, I think that fadvise() and extended attributes can be complementary solutions.
Anyway, hardcoding or saving in filesystem list of file extensions is a nasty way. It can be not safe or hardly understandable by users the way of reconfiguration filesystem by means of tunefs or debugfs with the purpose of file extensions addition in such "black-box" as TV or smartphones, from my point of view.
With the best regards,
Vyacheslav Dubeyko.
>> With the best regards,
>> Vyacheslav Dubeyko.
>>
>>> Arnd
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to [email protected]
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at http://www.tux.org/lkml/
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>
> --
> Jaegeuk Kim
> Samsung
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Sunday 14 October 2012, Vyacheslav Dubeyko wrote:
> On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> > 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> >>
> >> By the way, how about extended attributes? It is possible to save in extended attribute
> >> a hint about file's content nature during creation operation or later. Moreover, extended
'> >> attribute gives possibility to change hint after renaming operation, for example.
> >>
> >
> > I think xattr is not a proper way to communicate between file system and
> > users.
>
> I don't understand why you think that xattr is not proper way. Extended attributes are the
> way of adding some additional properties to filesystem object, from my point of view.
There are different kinds of extended attributes, as described by
http://linux.die.net/man/5/attr
I would think that the system namespace can hold an attribute that can be used for this.
> > How about fadvise()?
> >
>
> The fadvise() is a good suggestion. But, as I can understand, such solution requires
> using fadvise() during application implementation. So, from one point of view, it exists
> many applications that doesn't use fadvise() and, from another point of view, developers
> change style of coding not so easy.
Most importantly, fadvise is about accessing an open file, and I would expect anything
passed in there to be forgotten after the file is closed, while an attribute is associated
with the inode and should persist across open/close as well as mount/umount cycles.
> Extended attributes are more flexible way, from my point of view. The xattr gives
> possibility to make hint to filesystem at any time and without any dependencies with
> application's functional opportunities. Documented way of using such extended attributes
> gives to user flexible way of manipulation of filesystem behavior (but I remember that
> you don't believe in an user :-)).
>
> So, I think that fadvise() and extended attributes can be complementary solutions.
Right. Another option is to have ext4 style attributes, see
http://linux.die.net/man/1/chattr
Unlike extended attributes, there is a limited number of those, and they can
only be boolean flags, but that might be enough for this particular use case.
The main reason I can see against extended attributes is that they are not stored
very efficiently in f2fs, unless a lot of work is put into coming up with a good
implementation. A single flags bit can trivially be added to the inode in
comparison (if it's not there already).
> Anyway, hardcoding or saving in filesystem list of file extensions is a nasty way. It
> can be not safe or hardly understandable by users the way of reconfiguration filesystem
> by means of tunefs or debugfs with the purpose of file extensions addition in such
> "black-box" as TV or smartphones, from my point of view.
It is only a performance hint though, so it is not a correctness issue the
file system gets it wrong. In order to do efficient garbage collection, a log
structured file system should take all the information it can get about the
expected life of data it writes. I agree that the list, even in the form of
mkfs time settings, is not a clean abstraction, but in the place of an Android
phone manufacturer I would still enable it if it promises a significant
performance advantage over not using it. I guess it would be nice if this
could be overridden in some form, e.g. using an ioctl on the file as ext4 does.
We should also take the kinds of access we have seen on a file into account.
E.g. if someone opens a file O_RDWR and performs seek or pwrite on it, we can
assume that it's not in the category of typical media files, and a file that
gets written to disk linearly in multiple megabytes might belong into the
category even if it is named otherwise.
Arnd
On Monday 15 October 2012, Changman Lee wrote:
> 2012년 10월 15일 월요일에 Arnd Bergmann<[email protected]>님이 작성:
> > It is only a performance hint though, so it is not a correctness issue the
> > file system gets it wrong. In order to do efficient garbage collection, a log
> > structured file system should take all the information it can get about the
> > expected life of data it writes. I agree that the list, even in the form of
> > mkfs time settings, is not a clean abstraction, but in the place of an Android
> > phone manufacturer I would still enable it if it promises a significant
> > performance advantage over not using it. I guess it would be nice if this
> > could be overridden in some form, e.g. using an ioctl on the file as ext4 does.
> >
> Right. This is related with HOT/COLD separation policy of f2fs. If we know
> that data is COLD, we can manage gc effectively.
> I think that ext lists are placed in sb is better like your advice because
> it's difficult to fix user app. Although it's nasty way.
Ok. I think you should adapt the terminology though. Right now, the optimization
is to mark the data as COLD because we expect it to be written less often than
other kinds of data. However, the hot/cold terms are usually only applied to
data that we assume is going to be written soon or not based on how often
the same data has been accessed in the past.
Anything you detect from the file name is not really a hint on hot/cold
files, but rather on the expected access pattern: These files are going
to be written once, and will be read-only after that, they are probably
multiple megabytes in size, and if you have a lot of them, they are likely
to live for the same time.
It may well be possible that we later decide to use the hint in a different
way, e.g. to put these files into yet another separate log, aside from
other hot or cold files.
> > We should also take the kinds of access we have seen on a file into account.
> > E.g. if someone opens a file O_RDWR and performs seek or pwrite on it, we can
> > assume that it's not in the category of typical media files, and a file that
> > gets written to disk linearly in multiple megabytes might belong into the
> > category even if it is named otherwise.
> >
> This is more general but it's hard to adapt now.
I think it's important to leave the option open for a future optimization.
Right now, what we have to get agreement on is the on-disk format, because
we absolutely don't want to make incompatible changes to that once f2fs
has been merged into the kernel and is getting used on real systems.
This is independent of how the code is implemented at the moment, and
any tuning regarding how to group different kinds of data into the six
logs is completely up to how things work out in practice. But you should
definitely ensure that those changes don't require changing the format
if we decide to use a different number of logs in the future, or to
use the logs differently.
The split between logs for nodes on the one hand and data on the other
is something that can well be hardcoded, and it's ok to have a hard
upper bound on the number of logs in the file system, possibly higher
than 6.
Arnd
On Sun, Oct 14, 2012 at 03:19:37PM +0000, Arnd Bergmann wrote:
> On Sunday 14 October 2012, Vyacheslav Dubeyko wrote:
> > On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> > > 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> > Extended attributes are more flexible way, from my point of view. The xattr gives
> > possibility to make hint to filesystem at any time and without any dependencies with
> > application's functional opportunities. Documented way of using such extended attributes
> > gives to user flexible way of manipulation of filesystem behavior (but I remember that
> > you don't believe in an user :-)).
> >
> > So, I think that fadvise() and extended attributes can be complementary solutions.
>
> Right. Another option is to have ext4 style attributes, see
> http://linux.die.net/man/1/chattr
Xattrs are much prefered to more "ext4 style" flags because xattrs
are filesystem independent. Indeed, some filesystems can't store any
new "ext4 style" flags without a change of disk format or
internally mapping them to an xattr. So really, xattrs are the best
way forward for such hints.
> Unlike extended attributes, there is a limited number of those,
> and they can only be boolean flags, but that might be enough for
> this particular use case.
A boolean is not sufficient for access policy hints. An extensible
xattr format is probably the best approach to take here, so that we
can easily introduce new access policy hints as functionality is
required. Indeed, an extensible xattr could start with just a
hot/cold boolean, and grow from there....
> The main reason I can see against extended attributes is that they are not stored
> very efficiently in f2fs, unless a lot of work is put into coming up with a good
> implementation. A single flags bit can trivially be added to the inode in
> comparison (if it's not there already).
That's a deficiency that should be corrected, then, because xattrs
are very common these days.
And given that stuff like access frequency tracking is being
implemented at the VFS level, access policy hints should also be VFS
functionality. A bad filesystem implementation should not dictate
the interface for generically useful functionality....
> > Anyway, hardcoding or saving in filesystem list of file extensions is a nasty way. It
> > can be not safe or hardly understandable by users the way of reconfiguration filesystem
> > by means of tunefs or debugfs with the purpose of file extensions addition in such
> > "black-box" as TV or smartphones, from my point of view.
>
> It is only a performance hint though, so it is not a correctness issue the
> file system gets it wrong. In order to do efficient garbage collection, a log
> structured file system should take all the information it can get about the
> expected life of data it writes. I agree that the list, even in the form of
> mkfs time settings, is not a clean abstraction, but in the place of an Android
> phone manufacturer I would still enable it if it promises a significant
> performance advantage over not using it. I guess it would be nice if this
> could be overridden in some form, e.g. using an ioctl on the file as ext4 does.
An xattr on the root inode that holds a list like this is something
that could be set at mkfs time, but then also updated easily by new
software packages that are installed...
> We should also take the kinds of access we have seen on a file into account.
Yes, but it should be done at the VFS level, not in the filesystem
itself. Integrated into the current hot inode/range tracking that is
being worked on right now, I'd suggest.
IOWs, these access policy issues are not unique to F2FS or it's use
case. Anything to do with access hints, policy, tracking, file
classification, etc that can influence data locality, reclaim,
migration, etc need to be dealt with at the VFS, independently of a
specific filesystem. Filesystems can make use of that information
how they please (whether in the kernel or via userspace tools), but
having filesystem specific interfaces and implementations of the
same functionality is extremely wasteful. Let's do it once, and do
it right the first time. ;)
Cheers,
Dave.
--
Dave Chinner
[email protected]
> On Sun, Oct 14, 2012 at 03:19:37PM +0000, Arnd Bergmann wrote:
> > On Sunday 14 October 2012, Vyacheslav Dubeyko wrote:
> > > On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> > > > 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> > > Extended attributes are more flexible way, from my point of view. The xattr gives
> > > possibility to make hint to filesystem at any time and without any dependencies with
> > > application's functional opportunities. Documented way of using such extended attributes
> > > gives to user flexible way of manipulation of filesystem behavior (but I remember that
> > > you don't believe in an user :-)).
> > >
> > > So, I think that fadvise() and extended attributes can be complementary solutions.
> >
> > Right. Another option is to have ext4 style attributes, see
> > http://linux.die.net/man/1/chattr
>
> Xattrs are much prefered to more "ext4 style" flags because xattrs
> are filesystem independent. Indeed, some filesystems can't store any
> new "ext4 style" flags without a change of disk format or
> internally mapping them to an xattr. So really, xattrs are the best
> way forward for such hints.
>
> > Unlike extended attributes, there is a limited number of those,
> > and they can only be boolean flags, but that might be enough for
> > this particular use case.
>
> A boolean is not sufficient for access policy hints. An extensible
> xattr format is probably the best approach to take here, so that we
> can easily introduce new access policy hints as functionality is
> required. Indeed, an extensible xattr could start with just a
> hot/cold boolean, and grow from there....
>
> > The main reason I can see against extended attributes is that they are not stored
> > very efficiently in f2fs, unless a lot of work is put into coming up with a good
> > implementation. A single flags bit can trivially be added to the inode in
> > comparison (if it's not there already).
>
> That's a deficiency that should be corrected, then, because xattrs
> are very common these days.
IMO, most file systems including f2fs have some inefficiency to store
and retrieve xattrs, since they have to allocate an additional block.
The only distinct problem in f2fs is that there is a cleaning overhead.
So, that's the why xattr is not an efficient way in f2fs.
OTOH, I think xattr itself is for users, not for communicating
between file system and users.
Moreover, I'm not sure in the current android, but I saw ICS android
did not call any xattr operations, even if mount option was enabled.
>
> And given that stuff like access frequency tracking is being
> implemented at the VFS level, access policy hints should also be VFS
> functionality. A bad filesystem implementation should not dictate
> the interface for generically useful functionality....
>
> > > Anyway, hardcoding or saving in filesystem list of file extensions is a nasty way. It
> > > can be not safe or hardly understandable by users the way of reconfiguration filesystem
> > > by means of tunefs or debugfs with the purpose of file extensions addition in such
> > > "black-box" as TV or smartphones, from my point of view.
> >
> > It is only a performance hint though, so it is not a correctness issue the
> > file system gets it wrong. In order to do efficient garbage collection, a log
> > structured file system should take all the information it can get about the
> > expected life of data it writes. I agree that the list, even in the form of
> > mkfs time settings, is not a clean abstraction, but in the place of an Android
> > phone manufacturer I would still enable it if it promises a significant
> > performance advantage over not using it. I guess it would be nice if this
> > could be overridden in some form, e.g. using an ioctl on the file as ext4 does.
>
> An xattr on the root inode that holds a list like this is something
> that could be set at mkfs time, but then also updated easily by new
> software packages that are installed...
>
> > We should also take the kinds of access we have seen on a file into account.
>
> Yes, but it should be done at the VFS level, not in the filesystem
> itself. Integrated into the current hot inode/range tracking that is
> being worked on right now, I'd suggest.
>
> IOWs, these access policy issues are not unique to F2FS or it's use
> case. Anything to do with access hints, policy, tracking, file
> classification, etc that can influence data locality, reclaim,
> migration, etc need to be dealt with at the VFS, independently of a
> specific filesystem. Filesystems can make use of that information
> how they please (whether in the kernel or via userspace tools), but
> having filesystem specific interfaces and implementations of the
> same functionality is extremely wasteful. Let's do it once, and do
> it right the first time. ;)
I agree that VFS should support something, but before then, it needs
to do something by the file system first.
Because, we have to figure out what kind of information are really useful.
Thanks,
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
> On Monday 15 October 2012, Changman Lee wrote:
> > 2012년 10월 15일 월요일에 Arnd Bergmann<[email protected]>님이 작성:
> > > It is only a performance hint though, so it is not a correctness issue the
> > > file system gets it wrong. In order to do efficient garbage collection, a log
> > > structured file system should take all the information it can get about the
> > > expected life of data it writes. I agree that the list, even in the form of
> > > mkfs time settings, is not a clean abstraction, but in the place of an Android
> > > phone manufacturer I would still enable it if it promises a significant
> > > performance advantage over not using it. I guess it would be nice if this
> > > could be overridden in some form, e.g. using an ioctl on the file as ext4 does.
> > >
> > Right. This is related with HOT/COLD separation policy of f2fs. If we know
> > that data is COLD, we can manage gc effectively.
> > I think that ext lists are placed in sb is better like your advice because
> > it's difficult to fix user app. Although it's nasty way.
>
> Ok. I think you should adapt the terminology though. Right now, the optimization
> is to mark the data as COLD because we expect it to be written less often than
> other kinds of data. However, the hot/cold terms are usually only applied to
> data that we assume is going to be written soon or not based on how often
> the same data has been accessed in the past.
>
> Anything you detect from the file name is not really a hint on hot/cold
> files, but rather on the expected access pattern: These files are going
> to be written once, and will be read-only after that, they are probably
> multiple megabytes in size, and if you have a lot of them, they are likely
> to live for the same time.
>
> It may well be possible that we later decide to use the hint in a different
> way, e.g. to put these files into yet another separate log, aside from
> other hot or cold files.
>
> > > We should also take the kinds of access we have seen on a file into account.
> > > E.g. if someone opens a file O_RDWR and performs seek or pwrite on it, we can
> > > assume that it's not in the category of typical media files, and a file that
> > > gets written to disk linearly in multiple megabytes might belong into the
> > > category even if it is named otherwise.
> > >
> > This is more general but it's hard to adapt now.
>
> I think it's important to leave the option open for a future optimization.
> Right now, what we have to get agreement on is the on-disk format, because
> we absolutely don't want to make incompatible changes to that once f2fs
> has been merged into the kernel and is getting used on real systems.
>
> This is independent of how the code is implemented at the moment, and
> any tuning regarding how to group different kinds of data into the six
> logs is completely up to how things work out in practice. But you should
> definitely ensure that those changes don't require changing the format
> if we decide to use a different number of logs in the future, or to
> use the logs differently.
>
> The split between logs for nodes on the one hand and data on the other
> is something that can well be hardcoded, and it's ok to have a hard
> upper bound on the number of logs in the file system, possibly higher
> than 6.
>
Thank you for a lot of points to be addressed. :)
Maybe it's time to summarize them.
Please let me know what I misunderstood.
[In v2]
- Extension list
: Mkfs supports configuring extensions by user, and that information
will be stored in the superblock. In order to reduce the cleaning overhead,
f2fs supports an additional interface, ioctl, likewise ext4.
- The number of active logs
: No change will be done in on-disk layout (i.e., max 6 logs).
Instead, f2fs supports changing the number with a mount option.
Currently, I think 4, 5, and 6 would be enough.
- Section size
: Mkfs supports multiples of segments for a section, not power-of-two.
[Future optimization]
- Data separation
: file access pattern, and else?
> Arnd
On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> On Monday 15 October 2012, Dave Chinner wrote:
> > On Sun, Oct 14, 2012 at 03:19:37PM +0000, Arnd Bergmann wrote:
> > > On Sunday 14 October 2012, Vyacheslav Dubeyko wrote:
> > > > On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> > > > > 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> >
> > > The main reason I can see against extended attributes is that they are not stored
> > > very efficiently in f2fs, unless a lot of work is put into coming up with a good
> > > implementation. A single flags bit can trivially be added to the inode in
> > > comparison (if it's not there already).
> >
> > That's a deficiency that should be corrected, then, because xattrs
> > are very common these days.
>
> IMO, most file systems including f2fs have some inefficiency to store
> and retrieve xattrs, since they have to allocate an additional block.
> The only distinct problem in f2fs is that there is a cleaning overhead.
> So, that's the why xattr is not an efficient way in f2fs.
I would hope that there is a better way to encode extented attributes
if the current method is not efficient enough. Maybe Dave or someone
else who is experienced with this can make suggestions.
What is the "expected" size of the attributes normally? Does it
make sense to put attributes for multiple files into a single block?
> OTOH, I think xattr itself is for users, not for communicating
> between file system and users.
No, you are mistaken in that point, as Dave explained.
> Moreover, I'm not sure in the current android, but I saw ICS android
> did not call any xattr operations, even if mount option was enabled.
I realize that Android is currently your main target, but to get
the file system merged into Linux, it needs to fit in with the
overall strategy for file systems, which includes more than just
Android.
> > And given that stuff like access frequency tracking is being
> > implemented at the VFS level, access policy hints should also be VFS
> > functionality. A bad filesystem implementation should not dictate
> > the interface for generically useful functionality....
Agreed.
> > An xattr on the root inode that holds a list like this is something
> > that could be set at mkfs time, but then also updated easily by new
> > software packages that are installed...
Yes, good idea.
> > > We should also take the kinds of access we have seen on a file into account.
> >
> > Yes, but it should be done at the VFS level, not in the filesystem
> > itself. Integrated into the current hot inode/range tracking that is
> > being worked on right now, I'd suggest.
> >
> > IOWs, these access policy issues are not unique to F2FS or it's use
> > case. Anything to do with access hints, policy, tracking, file
> > classification, etc that can influence data locality, reclaim,
> > migration, etc need to be dealt with at the VFS, independently of a
> > specific filesystem. Filesystems can make use of that information
> > how they please (whether in the kernel or via userspace tools), but
> > having filesystem specific interfaces and implementations of the
> > same functionality is extremely wasteful. Let's do it once, and do
> > it right the first time. ;)
>
> I agree that VFS should support something, but before then, it needs
> to do something by the file system first.
> Because, we have to figure out what kind of information are really useful.
As mentioned before, such tuning can still be done after the file system
is merged, as long as the on-disk structure is flexible enough.
As you said yourself, imlpementing the hints by detecting access patterns
from the file system itself as I suggested would be a lot of work, and
if the VFS can give us that information for free in the future, we can
wait for that to happen (or help out on the implementation if necessary)
and then implement it based on those APIs.
Arnd
On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> Thank you for a lot of points to be addressed. :)
> Maybe it's time to summarize them.
> Please let me know what I misunderstood.
>
> [In v2]
> - Extension list
> : Mkfs supports configuring extensions by user, and that information
> will be stored in the superblock. In order to reduce the cleaning overhead,
> f2fs supports an additional interface, ioctl, likewise ext4.
That is what I suggested but actually Dave Chinner is the person that you
need to listen to rather than me in this regard. Using an extended attribute
in the root node would be more appropriate to configure this than an ioctl.
> - The number of active logs
> : No change will be done in on-disk layout (i.e., max 6 logs).
> Instead, f2fs supports changing the number with a mount option.
> Currently, I think 4, 5, and 6 would be enough.
Right, that would be the minimum that I would ask for. If it is relatively
easy to support more than six logs in the file format without actually
implementing them in the code, you might want to support up to 16, just
to be future-proof.
For the lower bound, being able to support as little as 2 logs for
cheap hardware would be nice, but 4 logs is the important one.
5 logs is probably not all that important, as long as you have the
choice between 4 and 6. If you implement three different ways, I
would prefer have the choice of 2/4/6 over 4/5/6 logs.
> - Section size
> : Mkfs supports multiples of segments for a section, not power-of-two.
Right.
> [Future optimization]
> - Data separation
> : file access pattern, and else?
: Investigate the option to make large files erase block indirect rather than
part of the normal logs
There is one more more point that I have not mentioned before, which is the
alignment of write requests. As far as I can tell, you try to group writes
as much as possible, but the alignment and the minimum size is still just
4 KB. I fear that this might not be good enough for a lot of cases when
the page sizes grow and there is no sufficient amount of nonvolatile
write cache in the device. I wonder whether there is something that can
be done to ensure we always write with a minimum alignment, and pad
out the data with zeroes if necessary in order to avoid getting into
garbage collection on devices that can't handle sub-page writes.
Arnd
2012-10-16 (화), 11:38 +0000, Arnd Bergmann:
> On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > On Monday 15 October 2012, Dave Chinner wrote:
> > > On Sun, Oct 14, 2012 at 03:19:37PM +0000, Arnd Bergmann wrote:
> > > > On Sunday 14 October 2012, Vyacheslav Dubeyko wrote:
> > > > > On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> > > > > > 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> > >
> > > > The main reason I can see against extended attributes is that they are not stored
> > > > very efficiently in f2fs, unless a lot of work is put into coming up with a good
> > > > implementation. A single flags bit can trivially be added to the inode in
> > > > comparison (if it's not there already).
> > >
> > > That's a deficiency that should be corrected, then, because xattrs
> > > are very common these days.
> >
> > IMO, most file systems including f2fs have some inefficiency to store
> > and retrieve xattrs, since they have to allocate an additional block.
> > The only distinct problem in f2fs is that there is a cleaning overhead.
> > So, that's the why xattr is not an efficient way in f2fs.
>
> I would hope that there is a better way to encode extented attributes
> if the current method is not efficient enough. Maybe Dave or someone
> else who is experienced with this can make suggestions.
>
> What is the "expected" size of the attributes normally? Does it
> make sense to put attributes for multiple files into a single block?
>
IMHO, this is an issue on global vs. local management approaches.
Yes, it is possible to manage xattrs globally, but my concern is how
to efficiently store, retrieve, and search xattrs without lock
contention, when users create and remove so many xattrs dynamically.
It may need to adopt an additional index structure like b+tree.
Well, the important thing that I focus is that xattrs need additional
blocks whatever they are compacted or not, resulting in cleaning
overhead in f2fs.
> > OTOH, I think xattr itself is for users, not for communicating
> > between file system and users.
>
> No, you are mistaken in that point, as Dave explained.
I meant the original use-cases of xattr like below.
"Typical uses can be storing the author of a document, the character
encoding of a plain-text document, a checksum, cryptographic hash or
digital signature."
from http://en.wikipedia.org/wiki/Extended_file_attributes
Would you please give me some existing use-cases for the
communicating purpose?
> > Moreover, I'm not sure in the current android, but I saw ICS android
> > did not call any xattr operations, even if mount option was enabled.
>
> I realize that Android is currently your main target, but to get
> the file system merged into Linux, it needs to fit in with the
> overall strategy for file systems, which includes more than just
> Android.
Definitely, yes.
I just wanted to say that xattr is not common on all the platforms yet.
>
> > > And given that stuff like access frequency tracking is being
> > > implemented at the VFS level, access policy hints should also be VFS
> > > functionality. A bad filesystem implementation should not dictate
> > > the interface for generically useful functionality....
>
> Agreed.
>
> > > An xattr on the root inode that holds a list like this is something
> > > that could be set at mkfs time, but then also updated easily by new
> > > software packages that are installed...
>
> Yes, good idea.
Likewise many file systems, f2fs also supports xattr as a configurable
Kconfig option.
If user disables the xattr feature, how can we do this?
>
> > > > We should also take the kinds of access we have seen on a file into account.
> > >
> > > Yes, but it should be done at the VFS level, not in the filesystem
> > > itself. Integrated into the current hot inode/range tracking that is
> > > being worked on right now, I'd suggest.
> > >
> > > IOWs, these access policy issues are not unique to F2FS or it's use
> > > case. Anything to do with access hints, policy, tracking, file
> > > classification, etc that can influence data locality, reclaim,
> > > migration, etc need to be dealt with at the VFS, independently of a
> > > specific filesystem. Filesystems can make use of that information
> > > how they please (whether in the kernel or via userspace tools), but
> > > having filesystem specific interfaces and implementations of the
> > > same functionality is extremely wasteful. Let's do it once, and do
> > > it right the first time. ;)
> >
> > I agree that VFS should support something, but before then, it needs
> > to do something by the file system first.
> > Because, we have to figure out what kind of information are really useful.
>
> As mentioned before, such tuning can still be done after the file system
> is merged, as long as the on-disk structure is flexible enough.
>
> As you said yourself, imlpementing the hints by detecting access patterns
> from the file system itself as I suggested would be a lot of work, and
> if the VFS can give us that information for free in the future, we can
> wait for that to happen (or help out on the implementation if necessary)
> and then implement it based on those APIs.
>
Ok, agreed.
> Arnd
--
Jaegeuk Kim
Samsung
On Tue, Oct 16, 2012 at 11:38:35AM +0000, Arnd Bergmann wrote:
> On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > On Monday 15 October 2012, Dave Chinner wrote:
> > > On Sun, Oct 14, 2012 at 03:19:37PM +0000, Arnd Bergmann wrote:
> > > > On Sunday 14 October 2012, Vyacheslav Dubeyko wrote:
> > > > > On Oct 14, 2012, at 11:09 AM, Jaegeuk Kim wrote:
> > > > > > 2012-10-14 (일), 02:21 +0400, Vyacheslav Dubeyko:
> > >
> > > > The main reason I can see against extended attributes is that they are not stored
> > > > very efficiently in f2fs, unless a lot of work is put into coming up with a good
> > > > implementation. A single flags bit can trivially be added to the inode in
> > > > comparison (if it's not there already).
> > >
> > > That's a deficiency that should be corrected, then, because xattrs
> > > are very common these days.
> >
> > IMO, most file systems including f2fs have some inefficiency to store
> > and retrieve xattrs, since they have to allocate an additional block.
> > The only distinct problem in f2fs is that there is a cleaning overhead.
> > So, that's the why xattr is not an efficient way in f2fs.
>
> I would hope that there is a better way to encode extented attributes
> if the current method is not efficient enough. Maybe Dave or someone
> else who is experienced with this can make suggestions.
>
> What is the "expected" size of the attributes normally?
Most attributes are small. Even "large" user attributes don't
generally get to be more than a couple of hundred bytes, though the
maximum size for a single xattr is 64K.
> Does it
> make sense to put attributes for multiple files into a single block?
There are two main ways of dealing with attributes. The first is a
tree-like structure to index and store unique xattrs, and have the
inode siimply keep pointers to the main xattr tree. This is great
for keeping space down when there are lots of identical xattrs, but
is a serialisation point for modification an modification can be
complex (i.e. shared entries in the tree need COW semantics.) This
is the approach ZFS takes, IIRC, and is the most space efficient way
of dealing with xattrs. It's not the most performance efficient way,
however, and the reference counting means frequent tree rewrites.
The second is the XFS/ext4 approach, where xattrs are stored in a
per-inode tree, with no sharing. The attribute tree holds the
attributes in it's leaves, and the tree grows and shrinks as you add
or remove xattrs. There are optimisations on top of this - e.g. for
XFS if the xattrs fit in the spare space in the inode, they are
packed into the inode ("shortform") and don't require an external
block. IIUC, there are patches to implement this optimisation for
ext4 floating around at the moment. This is a worthwhile
optimisation, because with a 512 byte inode size on XFS there is
enough spare space (roughly 380 bytes) for most systems to store all
their xattrs in the inode itself. XFS also has "remote attr storage"
for large xattrs (i.e. larger than a leaf block), where the tree
just keeps a pointer to an external extent that holds the xattr.
IIRC, fs2fs uses 4k inodes, so IMO per-inode xattr tress with
internal storage before spilling to an external block is probably
the best approach to take...
> > OTOH, I think xattr itself is for users, not for communicating
> > between file system and users.
>
> No, you are mistaken in that point, as Dave explained.
e.g. selinux, IMA, ACLs, capabilities, etc all communicate
information that the kernel uses for access control. That's why
xattrs have different namespaces like "system", "security" and
"user". Only user attributes are truly for user data
- the rest are for communicating information to the kernel....
A file usage policy xattr would definitely exist under the "system"
namespace - it's not a user xattr at all.
Cheers,
Dave.
--
Dave Chinner
[email protected]
2012-10-16 (화), 16:14 +0000, Arnd Bergmann:
> On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > Thank you for a lot of points to be addressed. :)
> > Maybe it's time to summarize them.
> > Please let me know what I misunderstood.
> >
> > [In v2]
> > - Extension list
> > : Mkfs supports configuring extensions by user, and that information
> > will be stored in the superblock. In order to reduce the cleaning overhead,
> > f2fs supports an additional interface, ioctl, likewise ext4.
>
> That is what I suggested but actually Dave Chinner is the person that you
> need to listen to rather than me in this regard. Using an extended attribute
> in the root node would be more appropriate to configure this than an ioctl.
>
> > - The number of active logs
> > : No change will be done in on-disk layout (i.e., max 6 logs).
> > Instead, f2fs supports changing the number with a mount option.
> > Currently, I think 4, 5, and 6 would be enough.
>
> Right, that would be the minimum that I would ask for. If it is relatively
> easy to support more than six logs in the file format without actually
> implementing them in the code, you might want to support up to 16, just
> to be future-proof.
Ok, got it.
>
> For the lower bound, being able to support as little as 2 logs for
> cheap hardware would be nice, but 4 logs is the important one.
>
> 5 logs is probably not all that important, as long as you have the
> choice between 4 and 6. If you implement three different ways, I
> would prefer have the choice of 2/4/6 over 4/5/6 logs.
Ok, I'll try, but in the case of 2 logs, it may need to change recovery
routines.
>
> > - Section size
> > : Mkfs supports multiples of segments for a section, not power-of-two.
>
> Right.
>
> > [Future optimization]
> > - Data separation
> > : file access pattern, and else?
>
> : Investigate the option to make large files erase block indirect rather than
> part of the normal logs
>
> There is one more more point that I have not mentioned before, which is the
> alignment of write requests. As far as I can tell, you try to group writes
> as much as possible, but the alignment and the minimum size is still just
> 4 KB.
Yes.
> I fear that this might not be good enough for a lot of cases when
> the page sizes grow and there is no sufficient amount of nonvolatile
> write cache in the device. I wonder whether there is something that can
> be done to ensure we always write with a minimum alignment, and pad
> out the data with zeroes if necessary in order to avoid getting into
> garbage collection on devices that can't handle sub-page writes.
You're very familiar with flash. :)
Yes, as the page size grows, the sub-page write issue is one of the
most critical problems.
I also thought this before, but I have not made a conclusion until now.
Because, I don't know how to deal with this in other companies, but,
I've seen that so many firmware developers in samsung have tried to
reduce this overhead by adapting many schemes.
I guess very cautiously that other companies also handle this well.
Therefore, I keep a question whether file system should care about
this perfectly or not.
Thanks,
>
> Arnd
--
Jaegeuk Kim
Samsung
> > > > > The main reason I can see against extended attributes is that they are not stored
> > > > > very efficiently in f2fs, unless a lot of work is put into coming up with a good
> > > > > implementation. A single flags bit can trivially be added to the inode in
> > > > > comparison (if it's not there already).
> > > >
> > > > That's a deficiency that should be corrected, then, because xattrs
> > > > are very common these days.
> > >
> > > IMO, most file systems including f2fs have some inefficiency to store
> > > and retrieve xattrs, since they have to allocate an additional block.
> > > The only distinct problem in f2fs is that there is a cleaning overhead.
> > > So, that's the why xattr is not an efficient way in f2fs.
> >
> > I would hope that there is a better way to encode extented attributes
> > if the current method is not efficient enough. Maybe Dave or someone
> > else who is experienced with this can make suggestions.
> >
> > What is the "expected" size of the attributes normally?
>
> Most attributes are small. Even "large" user attributes don't
> generally get to be more than a couple of hundred bytes, though the
> maximum size for a single xattr is 64K.
>
> > Does it
> > make sense to put attributes for multiple files into a single block?
>
> There are two main ways of dealing with attributes. The first is a
> tree-like structure to index and store unique xattrs, and have the
> inode siimply keep pointers to the main xattr tree. This is great
> for keeping space down when there are lots of identical xattrs, but
> is a serialisation point for modification an modification can be
> complex (i.e. shared entries in the tree need COW semantics.) This
> is the approach ZFS takes, IIRC, and is the most space efficient way
> of dealing with xattrs. It's not the most performance efficient way,
> however, and the reference counting means frequent tree rewrites.
>
> The second is the XFS/ext4 approach, where xattrs are stored in a
> per-inode tree, with no sharing. The attribute tree holds the
> attributes in it's leaves, and the tree grows and shrinks as you add
> or remove xattrs. There are optimisations on top of this - e.g. for
> XFS if the xattrs fit in the spare space in the inode, they are
> packed into the inode ("shortform") and don't require an external
> block. IIUC, there are patches to implement this optimisation for
> ext4 floating around at the moment. This is a worthwhile
> optimisation, because with a 512 byte inode size on XFS there is
> enough spare space (roughly 380 bytes) for most systems to store all
> their xattrs in the inode itself. XFS also has "remote attr storage"
> for large xattrs (i.e. larger than a leaf block), where the tree
> just keeps a pointer to an external extent that holds the xattr.
Thank you for great explanation. :)
>
> IIRC, fs2fs uses 4k inodes, so IMO per-inode xattr tress with
> internal storage before spilling to an external block is probably
> the best approach to take...
Yes, indeed this is the best approach to f2fs's xattr.
Apart from giving fs hints, it is worth enough to optimize later.
>
> > > OTOH, I think xattr itself is for users, not for communicating
> > > between file system and users.
> >
> > No, you are mistaken in that point, as Dave explained.
>
> e.g. selinux, IMA, ACLs, capabilities, etc all communicate
> information that the kernel uses for access control. That's why
> xattrs have different namespaces like "system", "security" and
> "user". Only user attributes are truly for user data
> - the rest are for communicating information to the kernel....
>
I agree that "system" is used by kernel.
How about the file system view?
Would you explain what file systems retrieve xattrs and use
them with their own purpose?
Sorry, I'm not familiar with xattrs in depth.
Unfortunately, "system" is not implemented in f2fs yet. :(
> A file usage policy xattr would definitely exist under the "system"
> namespace - it's not a user xattr at all.
>
> Cheers,
>
> Dave.
--
Jaegeuk Kim
Samsung
On Wed, Oct 17, 2012 at 07:30:21AM +0900, Jaegeuk Kim wrote:
> > > > OTOH, I think xattr itself is for users, not for communicating
> > > > between file system and users.
> > >
> > > No, you are mistaken in that point, as Dave explained.
> >
> > e.g. selinux, IMA, ACLs, capabilities, etc all communicate
> > information that the kernel uses for access control. That's why
> > xattrs have different namespaces like "system", "security" and
> > "user". Only user attributes are truly for user data
> > - the rest are for communicating information to the kernel....
> >
>
> I agree that "system" is used by kernel.
> How about the file system view?
Not sure what you mean - the filesystem woul dsimply read the xattrs
in the system namespace as it needs, just like the other subsystems
like selinux or IMA do.
> Would you explain what file systems retrieve xattrs and use
> them with their own purpose?
I think cachefs users a "CacheFiles.cache" namespace for storing
information it needs in xattrs. ecryptfs stores crypto metadata in
xattrs in the lower filesytem. NFSv4 servers store junction mount
information in xattrs.
So there are examples where filesystems use xattrs for special
information. However, in most cases filesystems don't need xattrs
for their own metadata primarily because that gets added to their
own on-disk formats. IThe above are all "overlay" style filesystems
that don't have their own on-disk formats, so need to use xattrs to
store their per-inode metadata.
The case of access hints and allocation policies are not somethign
that are native to any filesystem on-disk format. They are abstract
concepts that really only the software generating/using that
information knows about. Given we want the software that uses this
information to be in VFS, it is separate from every filesystem and
this is exactly the use case that system xattrs were intended for.
:)
> Sorry, I'm not familiar with xattrs in depth.
>
> Unfortunately, "system" is not implemented in f2fs yet. :(
If you've already implemented the user.* namespace, then it's
trivial to support the other namespaces - it's just prefixing the
xattrs with the appropriate string instead of "user"....
Cheers,
Dave.
--
Dave Chinner
[email protected]
> On Wed, Oct 17, 2012 at 07:30:21AM +0900, Jaegeuk Kim wrote:
> > > > > OTOH, I think xattr itself is for users, not for communicating
> > > > > between file system and users.
> > > >
> > > > No, you are mistaken in that point, as Dave explained.
> > >
> > > e.g. selinux, IMA, ACLs, capabilities, etc all communicate
> > > information that the kernel uses for access control. That's why
> > > xattrs have different namespaces like "system", "security" and
> > > "user". Only user attributes are truly for user data
> > > - the rest are for communicating information to the kernel....
> > >
> >
> > I agree that "system" is used by kernel.
> > How about the file system view?
>
> Not sure what you mean - the filesystem woul dsimply read the xattrs
> in the system namespace as it needs, just like the other subsystems
> like selinux or IMA do.
>
> > Would you explain what file systems retrieve xattrs and use
> > them with their own purpose?
>
> I think cachefs users a "CacheFiles.cache" namespace for storing
> information it needs in xattrs. ecryptfs stores crypto metadata in
> xattrs in the lower filesytem. NFSv4 servers store junction mount
> information in xattrs.
>
> So there are examples where filesystems use xattrs for special
> information. However, in most cases filesystems don't need xattrs
> for their own metadata primarily because that gets added to their
> own on-disk formats. IThe above are all "overlay" style filesystems
> that don't have their own on-disk formats, so need to use xattrs to
> store their per-inode metadata.
>
> The case of access hints and allocation policies are not somethign
> that are native to any filesystem on-disk format. They are abstract
> concepts that really only the software generating/using that
> information knows about. Given we want the software that uses this
> information to be in VFS, it is separate from every filesystem and
> this is exactly the use case that system xattrs were intended for.
> :)
I understand. Thank you very much. :)
>
> > Sorry, I'm not familiar with xattrs in depth.
> >
> > Unfortunately, "system" is not implemented in f2fs yet. :(
>
> If you've already implemented the user.* namespace, then it's
> trivial to support the other namespaces - it's just prefixing the
> xattrs with the appropriate string instead of "user"....
>
Ok, I'll do right now.
Thanks, again.
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
> 2012-10-16 (화), 16:14 +0000, Arnd Bergmann:
> > On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > > Thank you for a lot of points to be addressed. :)
> > > Maybe it's time to summarize them.
> > > Please let me know what I misunderstood.
> > >
> > > [In v2]
> > > - Extension list
> > > : Mkfs supports configuring extensions by user, and that information
> > > will be stored in the superblock. In order to reduce the cleaning overhead,
> > > f2fs supports an additional interface, ioctl, likewise ext4.
> >
> > That is what I suggested but actually Dave Chinner is the person that you
> > need to listen to rather than me in this regard. Using an extended attribute
> > in the root node would be more appropriate to configure this than an ioctl.
> >
> > > - The number of active logs
> > > : No change will be done in on-disk layout (i.e., max 6 logs).
> > > Instead, f2fs supports changing the number with a mount option.
> > > Currently, I think 4, 5, and 6 would be enough.
> >
> > Right, that would be the minimum that I would ask for. If it is relatively
> > easy to support more than six logs in the file format without actually
> > implementing them in the code, you might want to support up to 16, just
> > to be future-proof.
>
> Ok, got it.
>
> >
> > For the lower bound, being able to support as little as 2 logs for
> > cheap hardware would be nice, but 4 logs is the important one.
> >
> > 5 logs is probably not all that important, as long as you have the
> > choice between 4 and 6. If you implement three different ways, I
> > would prefer have the choice of 2/4/6 over 4/5/6 logs.
>
> Ok, I'll try, but in the case of 2 logs, it may need to change recovery
> routines.
>
> >
> > > - Section size
> > > : Mkfs supports multiples of segments for a section, not power-of-two.
> >
> > Right.
> >
> > > [Future optimization]
> > > - Data separation
> > > : file access pattern, and else?
> >
> > : Investigate the option to make large files erase block indirect rather than
> > part of the normal logs
> >
> > There is one more more point that I have not mentioned before, which is the
> > alignment of write requests. As far as I can tell, you try to group writes
> > as much as possible, but the alignment and the minimum size is still just
> > 4 KB.
>
> Yes.
>
> > I fear that this might not be good enough for a lot of cases when
> > the page sizes grow and there is no sufficient amount of nonvolatile
> > write cache in the device. I wonder whether there is something that can
> > be done to ensure we always write with a minimum alignment, and pad
> > out the data with zeroes if necessary in order to avoid getting into
> > garbage collection on devices that can't handle sub-page writes.
>
> You're very familiar with flash. :)
> Yes, as the page size grows, the sub-page write issue is one of the
> most critical problems.
> I also thought this before, but I have not made a conclusion until now.
> Because, I don't know how to deal with this in other companies, but,
> I've seen that so many firmware developers in samsung have tried to
> reduce this overhead by adapting many schemes.
> I guess very cautiously that other companies also handle this well.
> Therefore, I keep a question whether file system should care about
> this perfectly or not.
>
> Thanks,
>
> >
> > Arnd
>
As discussed with Dave, I propose the following items.
[In v2]
- Extension list
: Mkfs supports configuring extensions by user, and that information
will be stored in the superblock.
I'll add a mount option to enable/disable using the extension list.
Instead, f2fs supports xattr to give a hint to any files.
After supporting this by VFS, it'll be removed.
- The number of active logs
: For compatibility, on-disk layout supports max 16 logs.
Instead, f2fs supports configuring the number of active logs that
will be used by a mount option.
The option supports 2, 4, and 6 logs.
- Section size
: Mkfs supports multiples of segments for a section, not power-of-two.
[Future optimization]
- Data separation
: file access pattern
: Investigate the option to make large files erase block indirect rather than
part of the normal logs
: sub-page write avoidance
Thanks,
> --
> Jaegeuk Kim
> Samsung
On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> 2012-10-16 (화), 16:14 +0000, Arnd Bergmann:
> > On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > For the lower bound, being able to support as little as 2 logs for
> > cheap hardware would be nice, but 4 logs is the important one.
> >
> > 5 logs is probably not all that important, as long as you have the
> > choice between 4 and 6. If you implement three different ways, I
> > would prefer have the choice of 2/4/6 over 4/5/6 logs.
>
> Ok, I'll try, but in the case of 2 logs, it may need to change recovery
> routines.
Ok, I see. If it needs any changes that require a lot of extra code or
if it would make the common (six logs) case less efficient, then
you should probably not do it.
> > I fear that this might not be good enough for a lot of cases when
> > the page sizes grow and there is no sufficient amount of nonvolatile
> > write cache in the device. I wonder whether there is something that can
> > be done to ensure we always write with a minimum alignment, and pad
> > out the data with zeroes if necessary in order to avoid getting into
> > garbage collection on devices that can't handle sub-page writes.
>
> You're very familiar with flash. :)
> Yes, as the page size grows, the sub-page write issue is one of the
> most critical problems.
> I also thought this before, but I have not made a conclusion until now.
> Because, I don't know how to deal with this in other companies, but,
> I've seen that so many firmware developers in samsung have tried to
> reduce this overhead by adapting many schemes.
> I guess very cautiously that other companies also handle this well.
> Therefore, I keep a question whether file system should care about
> this perfectly or not.
My guess is that most devices would be able to handle this well enough
as long as the writes are only in the log areas, but some would fail
when there are cached sub-page writes by the time you update the metadata
in the beginning of the drive.
Besides the extreme case of getting into garbage collect when the device
runs out of nonvolatile cache to keep sub-pages, there is also the other
problem that it is always more efficient not to need the NV cache than
having to use it to do sub-page writes. This is especially true if the
NV cache is implemented as a log on a regular flash block. In those cases,
it would be better to pad the current write with zeroes to the next
page boundary and rely on garbage collection to do the compaction later.
As I mentioned before, my design avoided the problem by using larger
clusters to start with and then mitigating the space overhead from this
by allowing to put multiple inodes into a single cluster. The tradeoffs
from this are very different than what you have with a fixed 4KB block
size, and it's probably not worth redesigning f2fs to handle this on
such a global scale.
One thing that you can do though is pad each flash page with data from
garbage collection: There should basically always be data that needs
to be GC'd, and as soon as you have decided that you want to write
a block to a new location and the hardware requires that it writes
a block of data to pad the page, you might just as well send down that
block. In the opposite case where you have a full page worth of actual
data that needs to be written (e.g. for a sync()) and half a page
worth of data from garbage collection, you can decide not send the GC
data in order to stay inside on a page boundary.
Doing this systematically would allow using the eMMC-4.5 "large-unit"
context for all of the logs, which can be a significant performance
improvement, depending on the underlying implementation.
Arnd
On Wednesday 17 October 2012, Jaegeuk Kim wrote:
> As discussed with Dave, I propose the following items.
>
> [In v2]
> - Extension list
> : Mkfs supports configuring extensions by user, and that information
> will be stored in the superblock.
> I'll add a mount option to enable/disable using the extension list.
> Instead, f2fs supports xattr to give a hint to any files.
> After supporting this by VFS, it'll be removed.
> - The number of active logs
> : For compatibility, on-disk layout supports max 16 logs.
> Instead, f2fs supports configuring the number of active logs that
> will be used by a mount option.
> The option supports 2, 4, and 6 logs.
> - Section size
> : Mkfs supports multiples of segments for a section, not power-of-two.
>
> [Future optimization]
> - Data separation
> : file access pattern
> : Investigate the option to make large files erase block indirect rather than
> part of the normal logs
> : sub-page write avoidance
Ok, sounds good! I'll comment separately on the xattr optimization,
since I had some ideas on how to combine this with other optimizations.
Arnd
On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> >
> > > > An xattr on the root inode that holds a list like this is something
> > > > that could be set at mkfs time, but then also updated easily by new
> > > > software packages that are installed...
> >
> > Yes, good idea.
>
> Likewise many file systems, f2fs also supports xattr as a configurable
> Kconfig option.
> If user disables the xattr feature, how can we do this?
I can see three options here:
* make the extension list feature dependent on xattr, and treat all files
the same if it's disabled.
* put the list into the superblock instead.
* fall back on a hardcoded list of extensions when the extended attribute
is not present or the feature is disabled.
Arnd
On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > IIRC, fs2fs uses 4k inodes, so IMO per-inode xattr tress with
> > internal storage before spilling to an external block is probably
> > the best approach to take...
>
> Yes, indeed this is the best approach to f2fs's xattr.
> Apart from giving fs hints, it is worth enough to optimize later.
I've thought a bit more about how this could be represented efficiently
in 4KB nodes. This would require a significant change of the way you
represent inodes, but can improve a number of things at the same time.
The idea is to replace the fixed area in the inode that contains block
pointers with an extensible TLV (type/length/value) list that can contain
multiple variable-length fields, like this. All TLVs together with the
fixed-length inode data can fill a 4KB block.
The obvious types would be:
* Direct file contents if the file is less than a block
* List of block pointers, as before, minimum 1, maximum until the end
of the block
* List of indirect pointers, now also a variable length, similar to the
list of block pointers
* List of double-indirect block pointers
* direct xattr: zero-terminated attribute name followed by contents
* indirect xattr: zero-terminated attribute name followed by up to
16 block pointers to store a maximum of 64KB sized xattrs
This could be extended later to cover additional types, e.g. a list
of erase block pointers, triple-indirect blocks or extents.
As a variation of this, it would also be nice to turn around the order
in which the pointers are walked, to optimize for space and for growing
files, rather than for reading the beginning of a file. With this, you
can represent a 9 KB file using a list of two block pointers, and 1KB
of direct data, all in the inode. When the user adds another byte, you
only need to rewrite the inode. Similarly, a 5 MB file would have a
single indirect node (covering block pointers for 4 MB), plus 256
separate block pointers (covering the last megabyte), and a 5 GB file
can be represented using 1 double-indirect node and 256 indirect nodes,
and each of them can still be followed by direct "tail" data and
extended attributes.
Arnd
> On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > >
> > > > > An xattr on the root inode that holds a list like this is something
> > > > > that could be set at mkfs time, but then also updated easily by new
> > > > > software packages that are installed...
> > >
> > > Yes, good idea.
> >
> > Likewise many file systems, f2fs also supports xattr as a configurable
> > Kconfig option.
> > If user disables the xattr feature, how can we do this?
>
> I can see three options here:
>
> * make the extension list feature dependent on xattr, and treat all files
> the same if it's disabled.
>
> * put the list into the superblock instead.
>
> * fall back on a hardcoded list of extensions when the extended attribute
> is not present or the feature is disabled.
>
IMHO, we don't need to disable the extension list among the cases.
So, as I described before, I propose the following options.
* By default, mkfs stores an extension list in superblock, and f2fs simply
uses it.
* If users try to handle cold files by themselves, they can give a hint
via the xattr interface.
* Whenever they want not to use the default extension list, they can
easily disable it by a mount option.
> Arnd
On Wed, Oct 17, 2012 at 12:50:11PM +0000, Arnd Bergmann wrote:
> On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > > IIRC, fs2fs uses 4k inodes, so IMO per-inode xattr tress with
> > > internal storage before spilling to an external block is probably
> > > the best approach to take...
> >
> > Yes, indeed this is the best approach to f2fs's xattr.
> > Apart from giving fs hints, it is worth enough to optimize later.
>
> I've thought a bit more about how this could be represented efficiently
> in 4KB nodes. This would require a significant change of the way you
> represent inodes, but can improve a number of things at the same time.
>
> The idea is to replace the fixed area in the inode that contains block
> pointers with an extensible TLV (type/length/value) list that can contain
> multiple variable-length fields, like this.
You've just re-invented inode forks... ;)
> All TLVs together with the
> fixed-length inode data can fill a 4KB block.
>
> The obvious types would be:
>
> * Direct file contents if the file is less than a block
> * List of block pointers, as before, minimum 1, maximum until the end
> of the block
> * List of indirect pointers, now also a variable length, similar to the
> list of block pointers
> * List of double-indirect block pointers
> * direct xattr: zero-terminated attribute name followed by contents
> * indirect xattr: zero-terminated attribute name followed by up to
> 16 block pointers to store a maximum of 64KB sized xattrs
>
> This could be extended later to cover additional types, e.g. a list
> of erase block pointers, triple-indirect blocks or extents.
An inode fork doesn't care about the data in it - it's just an
independent block mapping index. i.e. inline, direct,
indirect, double indirect. The data in the fork is managed
externally to the format of the fork. e.g. XFS has two forks - one
for storing data (file data, directory contents, etc) and the other
for storing attributes.
The main issue with supporting an arbitrary number of forks is space
management of the inode literal area. e.g. one fork is in inline
format (e.g. direct file contents) and then we add an attribute.
The attribute won't fit inline, nor will an extent form fork header,
so the inline data fork has to be converted to extent format before
the xattr can be added. Now scale that problem up to an arbitrary
number of forks....
> As a variation of this, it would also be nice to turn around the order
> in which the pointers are walked, to optimize for space and for growing
> files, rather than for reading the beginning of a file. With this, you
> can represent a 9 KB file using a list of two block pointers, and 1KB
> of direct data, all in the inode. When the user adds another byte, you
> only need to rewrite the inode. Similarly, a 5 MB file would have a
> single indirect node (covering block pointers for 4 MB), plus 256
> separate block pointers (covering the last megabyte), and a 5 GB file
> can be represented using 1 double-indirect node and 256 indirect nodes,
> and each of them can still be followed by direct "tail" data and
> extended attributes.
I'm not sure that the resultant code complexity is worth saving an
extra block here and there.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wednesday 24 October 2012, Dave Chinner wrote:
> On Wed, Oct 17, 2012 at 12:50:11PM +0000, Arnd Bergmann wrote:
> > On Tuesday 16 October 2012, Jaegeuk Kim wrote:
> > > > IIRC, fs2fs uses 4k inodes, so IMO per-inode xattr tress with
> > > > internal storage before spilling to an external block is probably
> > > > the best approach to take...
> > >
> > > Yes, indeed this is the best approach to f2fs's xattr.
> > > Apart from giving fs hints, it is worth enough to optimize later.
> >
> > I've thought a bit more about how this could be represented efficiently
> > in 4KB nodes. This would require a significant change of the way you
> > represent inodes, but can improve a number of things at the same time.
> >
> > The idea is to replace the fixed area in the inode that contains block
> > pointers with an extensible TLV (type/length/value) list that can contain
> > multiple variable-length fields, like this.
>
> You've just re-invented inode forks... ;)
Ah, good to know the name for it. I didn't really expect that it was a
new idea.
> The main issue with supporting an arbitrary number of forks is space
> management of the inode literal area. e.g. one fork is in inline
> format (e.g. direct file contents) and then we add an attribute.
> The attribute won't fit inline, nor will an extent form fork header,
> so the inline data fork has to be converted to extent format before
> the xattr can be added. Now scale that problem up to an arbitrary
> number of forks....
Right. Obviously this is a solveable problem, but I agree that solving
it is nontrivial and requires some code complexity that would be nice
to avoid.
> > As a variation of this, it would also be nice to turn around the order
> > in which the pointers are walked, to optimize for space and for growing
> > files, rather than for reading the beginning of a file. With this, you
> > can represent a 9 KB file using a list of two block pointers, and 1KB
> > of direct data, all in the inode. When the user adds another byte, you
> > only need to rewrite the inode. Similarly, a 5 MB file would have a
> > single indirect node (covering block pointers for 4 MB), plus 256
> > separate block pointers (covering the last megabyte), and a 5 GB file
> > can be represented using 1 double-indirect node and 256 indirect nodes,
> > and each of them can still be followed by direct "tail" data and
> > extended attributes.
>
> I'm not sure that the resultant code complexity is worth saving an
> extra block here and there.
The space overhead may be noticeable for lots of small files but the
part that worries me more is the overhead for writing (and cleaning up)
data in multiple locations. Any write to file data or extended attributes
requires an update of the inode (mtime, ctime, size, ...) and one or
more other blocks (data, pointers, xattr). In order for the garbage
collection to work best, we want to split those writes into separate logs,
which later have to be cleaned up again. In particular for the inode
but also for the block pointers, we create a lot of garbage from
copy-on-write. Storing as much as possible in the inode itself therefore
saves us from writing the data multiple times rather than just the
actual update.
Arnd