From: Artem Blagodarenko <[email protected]>
This INCOMPAT_LARGEDIR feature allows larger directories to be created
in ldiskfs, both with directory sizes over 2GB and and a maximum htree
depth of 3 instead of the current limit of 2. These features are needed
in order to exceed the current limit of approximately 10M entries in a
single directory.
Signed-off-by: Yang Sheng <[email protected]>
Signed-off-by: Artem Blagodarenko <[email protected]>
---
fs/ext4/ext4.h | 23 ++++++++---
fs/ext4/inode.c | 4 +-
fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
3 files changed, 102 insertions(+), 43 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01d52b9..0bbbd9b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
- EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2125,6 +2126,16 @@ struct dir_private_info {
*/
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
+/* htree levels for ext4 */
+#define EXT4_HTREE_LEVEL_COMPAT 2
+#define EXT4_HTREE_LEVEL 3
+
+static inline int ext4_dir_htree_level(struct super_block *sb)
+{
+ return ext4_has_feature_largedir(sb) ?
+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+}
+
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
}
-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+static inline loff_t ext4_isize(struct super_block *sb,
+ struct ext4_inode *raw_inode)
{
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ if (ext4_has_feature_largedir(sb) ||
+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f622d4a..5787f3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- inode->i_size = ext4_isize(raw_inode);
+ inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (ei->i_disksize != ext4_isize(raw_inode)) {
+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6ad612c..3298fe3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
- return le32_to_cpu(entry->block) & 0x00ffffff;
+ return le32_to_cpu(entry->block) & 0x0fffffff;
}
static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
@@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
}
indirect = root->info.indirect_levels;
- if (indirect > 1) {
- ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
- root->info.indirect_levels);
+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
+ ext4_warning(dir->i_sb,
+ "Directory (ino: %lu) htree depth %#06x exceed"
+ "supported value", dir->i_ino,
+ ext4_dir_htree_level(dir->i_sb));
+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(dir->i_sb, "Enable large directory "
+ "feature to access it");
+ }
goto fail;
}
@@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
static void dx_release(struct dx_frame *frames)
{
+ struct dx_root_info *info;
+ int i;
+
if (frames[0].bh == NULL)
return;
- if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
- brelse(frames[1].bh);
- brelse(frames[0].bh);
+ info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ for (i = 0; i <= info->indirect_levels; i++) {
+ if (frames[i].bh == NULL)
+ break;
+ brelse(frames[i].bh);
+ frames[i].bh = NULL;
+ }
}
/*
@@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
{
struct dx_hash_info hinfo;
struct ext4_dir_entry_2 *de;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct inode *dir;
ext4_lblk_t block;
int count = 0;
@@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
const struct qstr *d_name = fname->usr_fname;
struct buffer_head *bh;
ext4_lblk_t block;
@@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
*/
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
{
struct buffer_head *bh2;
struct dx_root *root;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
struct ext4_dir_entry_tail *t;
@@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode)
{
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
struct buffer_head *bh;
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
+ int restart;
int err;
+again:
+ restart = 0;
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return PTR_ERR(frame);
@@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (err != -ENOSPC)
goto cleanup;
+ err = 0;
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) {
ext4_lblk_t newblock;
- unsigned icount = dx_get_count(entries);
- int levels = frame - frames;
+ int levels = frame - frames + 1;
+ unsigned int icount;
+ int add_level = 1;
struct dx_entry *entries2;
struct dx_node *node2;
struct buffer_head *bh2;
- if (levels && (dx_get_count(frames->entries) ==
- dx_get_limit(frames->entries))) {
- ext4_warning_inode(dir, "Directory index full!");
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+ add_level = 0;
+ break;
+ }
+ frame--; /* split higher index block */
+ at = frame->at;
+ entries = frame->entries;
+ restart = 1;
+ }
+ if (add_level && levels == ext4_dir_htree_level(sb)) {
+ ext4_warning(sb, "Directory (ino: %lu) index full, "
+ "reach max htree level :%d",
+ dir->i_ino, levels);
+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(sb, "Large directory feature is "
+ "not enabled on this "
+ "filesystem");
+ }
err = -ENOSPC;
goto cleanup;
}
+ icount = dx_get_count(entries);
bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) {
err = PTR_ERR(bh2);
@@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
goto journal_error;
- if (levels) {
+ if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle,
- frames[0].bh);
+ (frame - 1)->bh);
if (err)
goto journal_error;
@@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
- dx_insert_block(frames + 0, hash2, newblock);
- dxtrace(dx_show_index("node", frames[1].entries));
+ dx_insert_block((frame - 1), hash2, newblock);
+ dxtrace(dx_show_index("node", frame->entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
+ ext4_handle_dirty_metadata(handle, dir,
+ (frame - 1)->bh);
+ if (restart) {
+ ext4_handle_dirty_metadata(handle, dir,
+ frame->bh);
+ goto cleanup;
+ }
} else {
- dxtrace(printk(KERN_DEBUG
- "Creating second level index...\n"));
+ struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
- /* Add new access path frame */
- frame = frames + 1;
- frame->at = at = at - entries + entries2;
- frame->entries = entries = entries2;
- frame->bh = bh2;
- err = ext4_journal_get_write_access(handle,
- frame->bh);
- if (err)
- goto journal_error;
+ dxroot = (struct dx_root *)frames[0].bh->b_data;
+ dxroot->info.indirect_levels += 1;
+ dxtrace(printk(KERN_DEBUG
+ "Creating %d level index...\n",
+ info->indirect_levels));
+ ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ ext4_handle_dirty_metadata(handle, dir, bh2);
+ brelse(bh2);
+ restart = 1;
+ goto cleanup;
}
- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
if (err) {
ext4_std_error(inode->i_sb, err);
goto cleanup;
@@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
cleanup:
brelse(bh);
dx_release(frames);
+ /* @restart is true means htree-path has been changed, we need to
+ * repeat dx_probe() to find out valid htree-path
+ */
+ if (restart && err == 0)
+ goto again;
return err;
}
@@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
--
1.8.3.1
On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <[email protected]> wrote:
>
> From: Artem Blagodarenko <[email protected]>
>
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.
Artem,
many thanks for updating and submitting this, and creating the e2fsck patches.
Ted,
can you please add the original author for the largedir ext4 patch when
landing this patch: Liang Zhen <[email protected]>
and these tags which contain more background information on this feature:
Lustre-change: https://review.hpdd.intel.com/375
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
> Signed-off-by: Yang Sheng <[email protected]>
> Signed-off-by: Artem Blagodarenko <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
> ---
> fs/ext4/ext4.h | 23 ++++++++---
> fs/ext4/inode.c | 4 +-
> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
> 3 files changed, 102 insertions(+), 43 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 01d52b9..0bbbd9b 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> EXT4_FEATURE_INCOMPAT_MMP | \
> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
> EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
> + EXT4_FEATURE_INCOMPAT_LARGEDIR)
> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
> @@ -2125,6 +2126,16 @@ struct dir_private_info {
> */
> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
>
> +/* htree levels for ext4 */
> +#define EXT4_HTREE_LEVEL_COMPAT 2
> +#define EXT4_HTREE_LEVEL 3
> +
> +static inline int ext4_dir_htree_level(struct super_block *sb)
> +{
> + return ext4_has_feature_largedir(sb) ?
> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
> +}
> +
> /*
> * Timeout and state flag for lazy initialization inode thread.
> */
> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
> }
>
> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
> +static inline loff_t ext4_isize(struct super_block *sb,
> + struct ext4_inode *raw_inode)
> {
> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> + if (ext4_has_feature_largedir(sb) ||
> + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
> le32_to_cpu(raw_inode->i_size_lo);
> - else
> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> +
> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> }
>
> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f622d4a..5787f3d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> if (ext4_has_feature_64bit(sb))
> ei->i_file_acl |=
> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
> - inode->i_size = ext4_isize(raw_inode);
> + inode->i_size = ext4_isize(sb, raw_inode);
> if ((size = i_size_read(inode)) < 0) {
> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
> ret = -EFSCORRUPTED;
> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
> raw_inode->i_file_acl_high =
> cpu_to_le16(ei->i_file_acl >> 32);
> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
> - if (ei->i_disksize != ext4_isize(raw_inode)) {
> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
> ext4_isize_set(raw_inode, ei->i_disksize);
> need_datasync = 1;
> }
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6ad612c..3298fe3 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>
> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
> {
> - return le32_to_cpu(entry->block) & 0x00ffffff;
> + return le32_to_cpu(entry->block) & 0x0fffffff;
> }
It wouldn't be terrible to add a comment to this function explaining why
the block numbers are masked off, which is to allow the high bits of the
logical block number to hold a "fullness" fraction so that the kernel
could start shrinking directories if many files are deleted. That said,
this isn't a shortcoming of this patch, but a suggestion if the patch is
being resubmitted for some other reason.
> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
> u32 hash;
>
> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
> frame->bh = ext4_read_dirblock(dir, 0, INDEX);
> if (IS_ERR(frame->bh))
> return (struct dx_frame *) frame->bh;
> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> }
>
> indirect = root->info.indirect_levels;
> - if (indirect > 1) {
> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
> - root->info.indirect_levels);
> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
> + ext4_warning(dir->i_sb,
> + "Directory (ino: %lu) htree depth %#06x exceed"
> + "supported value", dir->i_ino,
> + ext4_dir_htree_level(dir->i_sb));
> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
> + ext4_warning(dir->i_sb, "Enable large directory "
> + "feature to access it");
> + }
> goto fail;
> }
>
> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>
> static void dx_release(struct dx_frame *frames)
> {
> + struct dx_root_info *info;
> + int i;
> +
> if (frames[0].bh == NULL)
> return;
>
> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
> - brelse(frames[1].bh);
> - brelse(frames[0].bh);
> + info = &((struct dx_root *)frames[0].bh->b_data)->info;
> + for (i = 0; i <= info->indirect_levels; i++) {
> + if (frames[i].bh == NULL)
> + break;
> + brelse(frames[i].bh);
> + frames[i].bh = NULL;
> + }
> }
>
> /*
> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
> {
> struct dx_hash_info hinfo;
> struct ext4_dir_entry_2 *de;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct inode *dir;
> ext4_lblk_t block;
> int count = 0;
> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
> struct ext4_dir_entry_2 **res_dir)
> {
> struct super_block * sb = dir->i_sb;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> const struct qstr *d_name = fname->usr_fname;
> struct buffer_head *bh;
> ext4_lblk_t block;
> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
> */
> dir->i_mtime = dir->i_ctime = current_time(dir);
> ext4_update_dx_flag(dir);
> - dir->i_version++;
> + inode_inc_iversion(dir);
> ext4_mark_inode_dirty(handle, dir);
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> err = ext4_handle_dirty_dirent_node(handle, dir, bh);
> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
> {
> struct buffer_head *bh2;
> struct dx_root *root;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct dx_entry *entries;
> struct ext4_dir_entry_2 *de, *de2;
> struct ext4_dir_entry_tail *t;
> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> struct inode *dir, struct inode *inode)
> {
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct dx_entry *entries, *at;
> struct buffer_head *bh;
> struct super_block *sb = dir->i_sb;
> struct ext4_dir_entry_2 *de;
> + int restart;
> int err;
>
> +again:
> + restart = 0;
> frame = dx_probe(fname, dir, NULL, frames);
> if (IS_ERR(frame))
> return PTR_ERR(frame);
> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> if (err != -ENOSPC)
> goto cleanup;
>
> + err = 0;
> /* Block full, should compress but for now just split */
> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
> dx_get_count(entries), dx_get_limit(entries)));
> /* Need to split index? */
> if (dx_get_count(entries) == dx_get_limit(entries)) {
> ext4_lblk_t newblock;
> - unsigned icount = dx_get_count(entries);
> - int levels = frame - frames;
> + int levels = frame - frames + 1;
> + unsigned int icount;
> + int add_level = 1;
> struct dx_entry *entries2;
> struct dx_node *node2;
> struct buffer_head *bh2;
>
> - if (levels && (dx_get_count(frames->entries) ==
> - dx_get_limit(frames->entries))) {
> - ext4_warning_inode(dir, "Directory index full!");
> + while (frame > frames) {
> + if (dx_get_count((frame - 1)->entries) <
> + dx_get_limit((frame - 1)->entries)) {
> + add_level = 0;
> + break;
> + }
> + frame--; /* split higher index block */
> + at = frame->at;
> + entries = frame->entries;
> + restart = 1;
> + }
> + if (add_level && levels == ext4_dir_htree_level(sb)) {
> + ext4_warning(sb, "Directory (ino: %lu) index full, "
> + "reach max htree level :%d",
> + dir->i_ino, levels);
> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
> + ext4_warning(sb, "Large directory feature is "
> + "not enabled on this "
> + "filesystem");
> + }
> err = -ENOSPC;
> goto cleanup;
> }
> + icount = dx_get_count(entries);
> bh2 = ext4_append(handle, dir, &newblock);
> if (IS_ERR(bh2)) {
> err = PTR_ERR(bh2);
> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> err = ext4_journal_get_write_access(handle, frame->bh);
> if (err)
> goto journal_error;
> - if (levels) {
> + if (!add_level) {
> unsigned icount1 = icount/2, icount2 = icount - icount1;
> unsigned hash2 = dx_get_hash(entries + icount1);
> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>
> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
> err = ext4_journal_get_write_access(handle,
> - frames[0].bh);
> + (frame - 1)->bh);
> if (err)
> goto journal_error;
>
> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> frame->entries = entries = entries2;
> swap(frame->bh, bh2);
> }
> - dx_insert_block(frames + 0, hash2, newblock);
> - dxtrace(dx_show_index("node", frames[1].entries));
> + dx_insert_block((frame - 1), hash2, newblock);
> + dxtrace(dx_show_index("node", frame->entries));
> dxtrace(dx_show_index("node",
> ((struct dx_node *) bh2->b_data)->entries));
> err = ext4_handle_dirty_dx_node(handle, dir, bh2);
> if (err)
> goto journal_error;
> brelse (bh2);
> + ext4_handle_dirty_metadata(handle, dir,
> + (frame - 1)->bh);
> + if (restart) {
> + ext4_handle_dirty_metadata(handle, dir,
> + frame->bh);
> + goto cleanup;
> + }
> } else {
> - dxtrace(printk(KERN_DEBUG
> - "Creating second level index...\n"));
> + struct dx_root *dxroot;
> memcpy((char *) entries2, (char *) entries,
> icount * sizeof(struct dx_entry));
> dx_set_limit(entries2, dx_node_limit(dir));
> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> /* Set up root */
> dx_set_count(entries, 1);
> dx_set_block(entries + 0, newblock);
> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
> -
> - /* Add new access path frame */
> - frame = frames + 1;
> - frame->at = at = at - entries + entries2;
> - frame->entries = entries = entries2;
> - frame->bh = bh2;
> - err = ext4_journal_get_write_access(handle,
> - frame->bh);
> - if (err)
> - goto journal_error;
> + dxroot = (struct dx_root *)frames[0].bh->b_data;
> + dxroot->info.indirect_levels += 1;
> + dxtrace(printk(KERN_DEBUG
> + "Creating %d level index...\n",
> + info->indirect_levels));
> + ext4_handle_dirty_metadata(handle, dir, frame->bh);
> + ext4_handle_dirty_metadata(handle, dir, bh2);
> + brelse(bh2);
> + restart = 1;
> + goto cleanup;
> }
> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
> if (err) {
> ext4_std_error(inode->i_sb, err);
> goto cleanup;
> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> cleanup:
> brelse(bh);
> dx_release(frames);
> + /* @restart is true means htree-path has been changed, we need to
> + * repeat dx_probe() to find out valid htree-path
> + */
> + if (restart && err == 0)
> + goto again;
> return err;
> }
>
> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
> blocksize);
> else
> de->inode = 0;
> - dir->i_version++;
> + inode_inc_iversion(dir);
> return 0;
> }
> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> --
> 1.8.3.1
>
Cheers, Andreas
Andreas,
I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically,
a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ?
> 17 марта 2017 г., в 0:44, Andreas Dilger <[email protected]> написал(а):
>
> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <[email protected]> wrote:
>>
>> From: Artem Blagodarenko <[email protected]>
>>
>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>> depth of 3 instead of the current limit of 2. These features are needed
>> in order to exceed the current limit of approximately 10M entries in a
>> single directory.
>
> Artem,
> many thanks for updating and submitting this, and creating the e2fsck patches.
>
> Ted,
> can you please add the original author for the largedir ext4 patch when
> landing this patch: Liang Zhen <[email protected]>
>
> and these tags which contain more background information on this feature:
>
> Lustre-change: https://review.hpdd.intel.com/375
> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
>
>> Signed-off-by: Yang Sheng <[email protected]>
>> Signed-off-by: Artem Blagodarenko <[email protected]>
>
> Reviewed-by: Andreas Dilger <[email protected]>
>
>> ---
>> fs/ext4/ext4.h | 23 ++++++++---
>> fs/ext4/inode.c | 4 +-
>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
>> 3 files changed, 102 insertions(+), 43 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 01d52b9..0bbbd9b 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>> EXT4_FEATURE_INCOMPAT_MMP | \
>> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>> EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>> - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
>> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
>> + EXT4_FEATURE_INCOMPAT_LARGEDIR)
>> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
>> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>> */
>> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
>>
>> +/* htree levels for ext4 */
>> +#define EXT4_HTREE_LEVEL_COMPAT 2
>> +#define EXT4_HTREE_LEVEL 3
>> +
>> +static inline int ext4_dir_htree_level(struct super_block *sb)
>> +{
>> + return ext4_has_feature_largedir(sb) ?
>> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
>> +}
>> +
>> /*
>> * Timeout and state flag for lazy initialization inode thread.
>> */
>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
>> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
>> }
>>
>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
>> +static inline loff_t ext4_isize(struct super_block *sb,
>> + struct ext4_inode *raw_inode)
>> {
>> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>> + if (ext4_has_feature_largedir(sb) ||
>> + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
>> le32_to_cpu(raw_inode->i_size_lo);
>> - else
>> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>> +
>> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>> }
>>
>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index f622d4a..5787f3d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> if (ext4_has_feature_64bit(sb))
>> ei->i_file_acl |=
>> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
>> - inode->i_size = ext4_isize(raw_inode);
>> + inode->i_size = ext4_isize(sb, raw_inode);
>> if ((size = i_size_read(inode)) < 0) {
>> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
>> ret = -EFSCORRUPTED;
>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
>> raw_inode->i_file_acl_high =
>> cpu_to_le16(ei->i_file_acl >> 32);
>> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
>> - if (ei->i_disksize != ext4_isize(raw_inode)) {
>> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
>> ext4_isize_set(raw_inode, ei->i_disksize);
>> need_datasync = 1;
>> }
>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>> index 6ad612c..3298fe3 100644
>> --- a/fs/ext4/namei.c
>> +++ b/fs/ext4/namei.c
>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>>
>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
>> {
>> - return le32_to_cpu(entry->block) & 0x00ffffff;
>> + return le32_to_cpu(entry->block) & 0x0fffffff;
>> }
>
> It wouldn't be terrible to add a comment to this function explaining why
> the block numbers are masked off, which is to allow the high bits of the
> logical block number to hold a "fullness" fraction so that the kernel
> could start shrinking directories if many files are deleted. That said,
> this isn't a shortcoming of this patch, but a suggestion if the patch is
> being resubmitted for some other reason.
>
>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
>> u32 hash;
>>
>> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>> frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>> if (IS_ERR(frame->bh))
>> return (struct dx_frame *) frame->bh;
>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> }
>>
>> indirect = root->info.indirect_levels;
>> - if (indirect > 1) {
>> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
>> - root->info.indirect_levels);
>> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
>> + ext4_warning(dir->i_sb,
>> + "Directory (ino: %lu) htree depth %#06x exceed"
>> + "supported value", dir->i_ino,
>> + ext4_dir_htree_level(dir->i_sb));
>> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
>> + ext4_warning(dir->i_sb, "Enable large directory "
>> + "feature to access it");
>> + }
>> goto fail;
>> }
>>
>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>
>> static void dx_release(struct dx_frame *frames)
>> {
>> + struct dx_root_info *info;
>> + int i;
>> +
>> if (frames[0].bh == NULL)
>> return;
>>
>> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
>> - brelse(frames[1].bh);
>> - brelse(frames[0].bh);
>> + info = &((struct dx_root *)frames[0].bh->b_data)->info;
>> + for (i = 0; i <= info->indirect_levels; i++) {
>> + if (frames[i].bh == NULL)
>> + break;
>> + brelse(frames[i].bh);
>> + frames[i].bh = NULL;
>> + }
>> }
>>
>> /*
>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
>> {
>> struct dx_hash_info hinfo;
>> struct ext4_dir_entry_2 *de;
>> - struct dx_frame frames[2], *frame;
>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> struct inode *dir;
>> ext4_lblk_t block;
>> int count = 0;
>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
>> struct ext4_dir_entry_2 **res_dir)
>> {
>> struct super_block * sb = dir->i_sb;
>> - struct dx_frame frames[2], *frame;
>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> const struct qstr *d_name = fname->usr_fname;
>> struct buffer_head *bh;
>> ext4_lblk_t block;
>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>> */
>> dir->i_mtime = dir->i_ctime = current_time(dir);
>> ext4_update_dx_flag(dir);
>> - dir->i_version++;
>> + inode_inc_iversion(dir);
>> ext4_mark_inode_dirty(handle, dir);
>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>> err = ext4_handle_dirty_dirent_node(handle, dir, bh);
>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>> {
>> struct buffer_head *bh2;
>> struct dx_root *root;
>> - struct dx_frame frames[2], *frame;
>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> struct dx_entry *entries;
>> struct ext4_dir_entry_2 *de, *de2;
>> struct ext4_dir_entry_tail *t;
>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> struct inode *dir, struct inode *inode)
>> {
>> - struct dx_frame frames[2], *frame;
>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> struct dx_entry *entries, *at;
>> struct buffer_head *bh;
>> struct super_block *sb = dir->i_sb;
>> struct ext4_dir_entry_2 *de;
>> + int restart;
>> int err;
>>
>> +again:
>> + restart = 0;
>> frame = dx_probe(fname, dir, NULL, frames);
>> if (IS_ERR(frame))
>> return PTR_ERR(frame);
>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> if (err != -ENOSPC)
>> goto cleanup;
>>
>> + err = 0;
>> /* Block full, should compress but for now just split */
>> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
>> dx_get_count(entries), dx_get_limit(entries)));
>> /* Need to split index? */
>> if (dx_get_count(entries) == dx_get_limit(entries)) {
>> ext4_lblk_t newblock;
>> - unsigned icount = dx_get_count(entries);
>> - int levels = frame - frames;
>> + int levels = frame - frames + 1;
>> + unsigned int icount;
>> + int add_level = 1;
>> struct dx_entry *entries2;
>> struct dx_node *node2;
>> struct buffer_head *bh2;
>>
>> - if (levels && (dx_get_count(frames->entries) ==
>> - dx_get_limit(frames->entries))) {
>> - ext4_warning_inode(dir, "Directory index full!");
>> + while (frame > frames) {
>> + if (dx_get_count((frame - 1)->entries) <
>> + dx_get_limit((frame - 1)->entries)) {
>> + add_level = 0;
>> + break;
>> + }
>> + frame--; /* split higher index block */
>> + at = frame->at;
>> + entries = frame->entries;
>> + restart = 1;
>> + }
>> + if (add_level && levels == ext4_dir_htree_level(sb)) {
>> + ext4_warning(sb, "Directory (ino: %lu) index full, "
>> + "reach max htree level :%d",
>> + dir->i_ino, levels);
>> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
>> + ext4_warning(sb, "Large directory feature is "
>> + "not enabled on this "
>> + "filesystem");
>> + }
>> err = -ENOSPC;
>> goto cleanup;
>> }
>> + icount = dx_get_count(entries);
>> bh2 = ext4_append(handle, dir, &newblock);
>> if (IS_ERR(bh2)) {
>> err = PTR_ERR(bh2);
>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> err = ext4_journal_get_write_access(handle, frame->bh);
>> if (err)
>> goto journal_error;
>> - if (levels) {
>> + if (!add_level) {
>> unsigned icount1 = icount/2, icount2 = icount - icount1;
>> unsigned hash2 = dx_get_hash(entries + icount1);
>> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>
>> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
>> err = ext4_journal_get_write_access(handle,
>> - frames[0].bh);
>> + (frame - 1)->bh);
>> if (err)
>> goto journal_error;
>>
>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> frame->entries = entries = entries2;
>> swap(frame->bh, bh2);
>> }
>> - dx_insert_block(frames + 0, hash2, newblock);
>> - dxtrace(dx_show_index("node", frames[1].entries));
>> + dx_insert_block((frame - 1), hash2, newblock);
>> + dxtrace(dx_show_index("node", frame->entries));
>> dxtrace(dx_show_index("node",
>> ((struct dx_node *) bh2->b_data)->entries));
>> err = ext4_handle_dirty_dx_node(handle, dir, bh2);
>> if (err)
>> goto journal_error;
>> brelse (bh2);
>> + ext4_handle_dirty_metadata(handle, dir,
>> + (frame - 1)->bh);
>> + if (restart) {
>> + ext4_handle_dirty_metadata(handle, dir,
>> + frame->bh);
>> + goto cleanup;
>> + }
>> } else {
>> - dxtrace(printk(KERN_DEBUG
>> - "Creating second level index...\n"));
>> + struct dx_root *dxroot;
>> memcpy((char *) entries2, (char *) entries,
>> icount * sizeof(struct dx_entry));
>> dx_set_limit(entries2, dx_node_limit(dir));
>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> /* Set up root */
>> dx_set_count(entries, 1);
>> dx_set_block(entries + 0, newblock);
>> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
>> -
>> - /* Add new access path frame */
>> - frame = frames + 1;
>> - frame->at = at = at - entries + entries2;
>> - frame->entries = entries = entries2;
>> - frame->bh = bh2;
>> - err = ext4_journal_get_write_access(handle,
>> - frame->bh);
>> - if (err)
>> - goto journal_error;
>> + dxroot = (struct dx_root *)frames[0].bh->b_data;
>> + dxroot->info.indirect_levels += 1;
>> + dxtrace(printk(KERN_DEBUG
>> + "Creating %d level index...\n",
>> + info->indirect_levels));
>> + ext4_handle_dirty_metadata(handle, dir, frame->bh);
>> + ext4_handle_dirty_metadata(handle, dir, bh2);
>> + brelse(bh2);
>> + restart = 1;
>> + goto cleanup;
>> }
>> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
>> if (err) {
>> ext4_std_error(inode->i_sb, err);
>> goto cleanup;
>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> cleanup:
>> brelse(bh);
>> dx_release(frames);
>> + /* @restart is true means htree-path has been changed, we need to
>> + * repeat dx_probe() to find out valid htree-path
>> + */
>> + if (restart && err == 0)
>> + goto again;
>> return err;
>> }
>>
>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
>> blocksize);
>> else
>> de->inode = 0;
>> - dir->i_version++;
>> + inode_inc_iversion(dir);
>> return 0;
>> }
>> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>> --
>> 1.8.3.1
>>
>
>
> Cheers, Andreas
>
>
>
>
>
On Mar 17, 2017, at 12:15 AM, Alexey Lyashkov <[email protected]> wrote:
>
> Andreas,
>
> I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically,
> a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ?
Hi Alexey,
I'm not against further improvements to large directory handling (e.g.
directory shrinking, implement pdirops through VFS, etc.) for ext4.
That said, anything that changes the disk format should use a different
feature flag than EXT4_FEATURE_INCOMPAT_LARGEDIR, since we have been
using this flag in the Lustre code for several years already.
I think for very large directories that directory shrinking can be quite
useful, and could actually be implemented without a format change since
htree always masks off the top byte ("fullness ratio") of the logical block
number for this reason, and removing leaf blocks does not change the other
parts of the htree on-disk format. This could still be done (in a slightly
less efficient manner) even without "fullness ratio" (i.e. if fullness == 0,
unset from older versions of ext4/e2fsck). This would also help performance
noticeably if a directory has many files deleted, as the htree index will
shrink, and operations like readdir() will not have to skip empty blocks.
IMHO, I think the 3-level htree maximum limits are as far as we need to
go for ext4. With 4KB blocks this gives us a maximum directory size
of about 5B entries, which is more files than can fit in the filesystem,
Even for 1KB blocks the maximum directory size is about 8M entries, which
is fine since 1KB blocksize is mostly only used for small filesystems, yet
is still as good as what a 4KB filesystem can do today.
Cheers, Andreas
>> 17 марта 2017 г., в 0:44, Andreas Dilger <[email protected]> написал(а):
>>
>> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <[email protected]> wrote:
>>>
>>> From: Artem Blagodarenko <[email protected]>
>>>
>>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>>> depth of 3 instead of the current limit of 2. These features are needed
>>> in order to exceed the current limit of approximately 10M entries in a
>>> single directory.
>>
>> Artem,
>> many thanks for updating and submitting this, and creating the e2fsck patches.
>>
>> Ted,
>> can you please add the original author for the largedir ext4 patch when
>> landing this patch: Liang Zhen <[email protected]>
>>
>> and these tags which contain more background information on this feature:
>>
>> Lustre-change: https://review.hpdd.intel.com/375
>> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
>>
>>> Signed-off-by: Yang Sheng <[email protected]>
>>> Signed-off-by: Artem Blagodarenko <[email protected]>
>>
>> Reviewed-by: Andreas Dilger <[email protected]>
>>
>>> ---
>>> fs/ext4/ext4.h | 23 ++++++++---
>>> fs/ext4/inode.c | 4 +-
>>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
>>> 3 files changed, 102 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 01d52b9..0bbbd9b 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>> EXT4_FEATURE_INCOMPAT_MMP | \
>>> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>>> EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>>> - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
>>> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
>>> + EXT4_FEATURE_INCOMPAT_LARGEDIR)
>>> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>>> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
>>> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>>> */
>>> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
>>>
>>> +/* htree levels for ext4 */
>>> +#define EXT4_HTREE_LEVEL_COMPAT 2
>>> +#define EXT4_HTREE_LEVEL 3
>>> +
>>> +static inline int ext4_dir_htree_level(struct super_block *sb)
>>> +{
>>> + return ext4_has_feature_largedir(sb) ?
>>> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
>>> +}
>>> +
>>> /*
>>> * Timeout and state flag for lazy initialization inode thread.
>>> */
>>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
>>> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
>>> }
>>>
>>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
>>> +static inline loff_t ext4_isize(struct super_block *sb,
>>> + struct ext4_inode *raw_inode)
>>> {
>>> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>> + if (ext4_has_feature_largedir(sb) ||
>>> + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
>>> le32_to_cpu(raw_inode->i_size_lo);
>>> - else
>>> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>> +
>>> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>> }
>>>
>>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index f622d4a..5787f3d 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>> if (ext4_has_feature_64bit(sb))
>>> ei->i_file_acl |=
>>> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
>>> - inode->i_size = ext4_isize(raw_inode);
>>> + inode->i_size = ext4_isize(sb, raw_inode);
>>> if ((size = i_size_read(inode)) < 0) {
>>> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
>>> ret = -EFSCORRUPTED;
>>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>> raw_inode->i_file_acl_high =
>>> cpu_to_le16(ei->i_file_acl >> 32);
>>> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
>>> - if (ei->i_disksize != ext4_isize(raw_inode)) {
>>> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
>>> ext4_isize_set(raw_inode, ei->i_disksize);
>>> need_datasync = 1;
>>> }
>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>>> index 6ad612c..3298fe3 100644
>>> --- a/fs/ext4/namei.c
>>> +++ b/fs/ext4/namei.c
>>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>>>
>>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
>>> {
>>> - return le32_to_cpu(entry->block) & 0x00ffffff;
>>> + return le32_to_cpu(entry->block) & 0x0fffffff;
>>> }
>>
>> It wouldn't be terrible to add a comment to this function explaining why
>> the block numbers are masked off, which is to allow the high bits of the
>> logical block number to hold a "fullness" fraction so that the kernel
>> could start shrinking directories if many files are deleted. That said,
>> this isn't a shortcoming of this patch, but a suggestion if the patch is
>> being resubmitted for some other reason.
>>
>>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
>>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
>>> u32 hash;
>>>
>>> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>>> frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>>> if (IS_ERR(frame->bh))
>>> return (struct dx_frame *) frame->bh;
>>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>> }
>>>
>>> indirect = root->info.indirect_levels;
>>> - if (indirect > 1) {
>>> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
>>> - root->info.indirect_levels);
>>> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
>>> + ext4_warning(dir->i_sb,
>>> + "Directory (ino: %lu) htree depth %#06x exceed"
>>> + "supported value", dir->i_ino,
>>> + ext4_dir_htree_level(dir->i_sb));
>>> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
>>> + ext4_warning(dir->i_sb, "Enable large directory "
>>> + "feature to access it");
>>> + }
>>> goto fail;
>>> }
>>>
>>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>
>>> static void dx_release(struct dx_frame *frames)
>>> {
>>> + struct dx_root_info *info;
>>> + int i;
>>> +
>>> if (frames[0].bh == NULL)
>>> return;
>>>
>>> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
>>> - brelse(frames[1].bh);
>>> - brelse(frames[0].bh);
>>> + info = &((struct dx_root *)frames[0].bh->b_data)->info;
>>> + for (i = 0; i <= info->indirect_levels; i++) {
>>> + if (frames[i].bh == NULL)
>>> + break;
>>> + brelse(frames[i].bh);
>>> + frames[i].bh = NULL;
>>> + }
>>> }
>>>
>>> /*
>>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
>>> {
>>> struct dx_hash_info hinfo;
>>> struct ext4_dir_entry_2 *de;
>>> - struct dx_frame frames[2], *frame;
>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> struct inode *dir;
>>> ext4_lblk_t block;
>>> int count = 0;
>>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
>>> struct ext4_dir_entry_2 **res_dir)
>>> {
>>> struct super_block * sb = dir->i_sb;
>>> - struct dx_frame frames[2], *frame;
>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> const struct qstr *d_name = fname->usr_fname;
>>> struct buffer_head *bh;
>>> ext4_lblk_t block;
>>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>>> */
>>> dir->i_mtime = dir->i_ctime = current_time(dir);
>>> ext4_update_dx_flag(dir);
>>> - dir->i_version++;
>>> + inode_inc_iversion(dir);
>>> ext4_mark_inode_dirty(handle, dir);
>>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>>> err = ext4_handle_dirty_dirent_node(handle, dir, bh);
>>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>>> {
>>> struct buffer_head *bh2;
>>> struct dx_root *root;
>>> - struct dx_frame frames[2], *frame;
>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> struct dx_entry *entries;
>>> struct ext4_dir_entry_2 *de, *de2;
>>> struct ext4_dir_entry_tail *t;
>>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> struct inode *dir, struct inode *inode)
>>> {
>>> - struct dx_frame frames[2], *frame;
>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> struct dx_entry *entries, *at;
>>> struct buffer_head *bh;
>>> struct super_block *sb = dir->i_sb;
>>> struct ext4_dir_entry_2 *de;
>>> + int restart;
>>> int err;
>>>
>>> +again:
>>> + restart = 0;
>>> frame = dx_probe(fname, dir, NULL, frames);
>>> if (IS_ERR(frame))
>>> return PTR_ERR(frame);
>>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> if (err != -ENOSPC)
>>> goto cleanup;
>>>
>>> + err = 0;
>>> /* Block full, should compress but for now just split */
>>> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
>>> dx_get_count(entries), dx_get_limit(entries)));
>>> /* Need to split index? */
>>> if (dx_get_count(entries) == dx_get_limit(entries)) {
>>> ext4_lblk_t newblock;
>>> - unsigned icount = dx_get_count(entries);
>>> - int levels = frame - frames;
>>> + int levels = frame - frames + 1;
>>> + unsigned int icount;
>>> + int add_level = 1;
>>> struct dx_entry *entries2;
>>> struct dx_node *node2;
>>> struct buffer_head *bh2;
>>>
>>> - if (levels && (dx_get_count(frames->entries) ==
>>> - dx_get_limit(frames->entries))) {
>>> - ext4_warning_inode(dir, "Directory index full!");
>>> + while (frame > frames) {
>>> + if (dx_get_count((frame - 1)->entries) <
>>> + dx_get_limit((frame - 1)->entries)) {
>>> + add_level = 0;
>>> + break;
>>> + }
>>> + frame--; /* split higher index block */
>>> + at = frame->at;
>>> + entries = frame->entries;
>>> + restart = 1;
>>> + }
>>> + if (add_level && levels == ext4_dir_htree_level(sb)) {
>>> + ext4_warning(sb, "Directory (ino: %lu) index full, "
>>> + "reach max htree level :%d",
>>> + dir->i_ino, levels);
>>> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
>>> + ext4_warning(sb, "Large directory feature is "
>>> + "not enabled on this "
>>> + "filesystem");
>>> + }
>>> err = -ENOSPC;
>>> goto cleanup;
>>> }
>>> + icount = dx_get_count(entries);
>>> bh2 = ext4_append(handle, dir, &newblock);
>>> if (IS_ERR(bh2)) {
>>> err = PTR_ERR(bh2);
>>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> err = ext4_journal_get_write_access(handle, frame->bh);
>>> if (err)
>>> goto journal_error;
>>> - if (levels) {
>>> + if (!add_level) {
>>> unsigned icount1 = icount/2, icount2 = icount - icount1;
>>> unsigned hash2 = dx_get_hash(entries + icount1);
>>> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
>>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>
>>> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
>>> err = ext4_journal_get_write_access(handle,
>>> - frames[0].bh);
>>> + (frame - 1)->bh);
>>> if (err)
>>> goto journal_error;
>>>
>>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> frame->entries = entries = entries2;
>>> swap(frame->bh, bh2);
>>> }
>>> - dx_insert_block(frames + 0, hash2, newblock);
>>> - dxtrace(dx_show_index("node", frames[1].entries));
>>> + dx_insert_block((frame - 1), hash2, newblock);
>>> + dxtrace(dx_show_index("node", frame->entries));
>>> dxtrace(dx_show_index("node",
>>> ((struct dx_node *) bh2->b_data)->entries));
>>> err = ext4_handle_dirty_dx_node(handle, dir, bh2);
>>> if (err)
>>> goto journal_error;
>>> brelse (bh2);
>>> + ext4_handle_dirty_metadata(handle, dir,
>>> + (frame - 1)->bh);
>>> + if (restart) {
>>> + ext4_handle_dirty_metadata(handle, dir,
>>> + frame->bh);
>>> + goto cleanup;
>>> + }
>>> } else {
>>> - dxtrace(printk(KERN_DEBUG
>>> - "Creating second level index...\n"));
>>> + struct dx_root *dxroot;
>>> memcpy((char *) entries2, (char *) entries,
>>> icount * sizeof(struct dx_entry));
>>> dx_set_limit(entries2, dx_node_limit(dir));
>>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> /* Set up root */
>>> dx_set_count(entries, 1);
>>> dx_set_block(entries + 0, newblock);
>>> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
>>> -
>>> - /* Add new access path frame */
>>> - frame = frames + 1;
>>> - frame->at = at = at - entries + entries2;
>>> - frame->entries = entries = entries2;
>>> - frame->bh = bh2;
>>> - err = ext4_journal_get_write_access(handle,
>>> - frame->bh);
>>> - if (err)
>>> - goto journal_error;
>>> + dxroot = (struct dx_root *)frames[0].bh->b_data;
>>> + dxroot->info.indirect_levels += 1;
>>> + dxtrace(printk(KERN_DEBUG
>>> + "Creating %d level index...\n",
>>> + info->indirect_levels));
>>> + ext4_handle_dirty_metadata(handle, dir, frame->bh);
>>> + ext4_handle_dirty_metadata(handle, dir, bh2);
>>> + brelse(bh2);
>>> + restart = 1;
>>> + goto cleanup;
>>> }
>>> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
>>> if (err) {
>>> ext4_std_error(inode->i_sb, err);
>>> goto cleanup;
>>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> cleanup:
>>> brelse(bh);
>>> dx_release(frames);
>>> + /* @restart is true means htree-path has been changed, we need to
>>> + * repeat dx_probe() to find out valid htree-path
>>> + */
>>> + if (restart && err == 0)
>>> + goto again;
>>> return err;
>>> }
>>>
>>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
>>> blocksize);
>>> else
>>> de->inode = 0;
>>> - dir->i_version++;
>>> + inode_inc_iversion(dir);
>>> return 0;
>>> }
>>> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>>> --
>>> 1.8.3.1
>>>
>>
>>
>> Cheers, Andreas
>>
>>
>>
>>
>>
>
Cheers, Andreas
Andreas,
it not about a feature flag. It’s about a situation in whole.
Yes, we may increase a directory size, but it open a number a large problems.
1) readdir. It tries to read all entries in memory before send to the user. currently it may eats 20*10^6 * 256 so several gigs, so increasing it size may produce a problems for a system.
2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.
Other improvements you point already.
> 17 марта 2017 г., в 23:51, Andreas Dilger <[email protected]> написал(а):
>
> On Mar 17, 2017, at 12:15 AM, Alexey Lyashkov <[email protected]> wrote:
>>
>> Andreas,
>>
>> I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically,
>> a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ?
>
> Hi Alexey,
> I'm not against further improvements to large directory handling (e.g.
> directory shrinking, implement pdirops through VFS, etc.) for ext4.
> That said, anything that changes the disk format should use a different
> feature flag than EXT4_FEATURE_INCOMPAT_LARGEDIR, since we have been
> using this flag in the Lustre code for several years already.
>
> I think for very large directories that directory shrinking can be quite
> useful, and could actually be implemented without a format change since
> htree always masks off the top byte ("fullness ratio") of the logical block
> number for this reason, and removing leaf blocks does not change the other
> parts of the htree on-disk format. This could still be done (in a slightly
> less efficient manner) even without "fullness ratio" (i.e. if fullness == 0,
> unset from older versions of ext4/e2fsck). This would also help performance
> noticeably if a directory has many files deleted, as the htree index will
> shrink, and operations like readdir() will not have to skip empty blocks.
>
> IMHO, I think the 3-level htree maximum limits are as far as we need to
> go for ext4. With 4KB blocks this gives us a maximum directory size
> of about 5B entries, which is more files than can fit in the filesystem,
> Even for 1KB blocks the maximum directory size is about 8M entries, which
> is fine since 1KB blocksize is mostly only used for small filesystems, yet
> is still as good as what a 4KB filesystem can do today.
>
> Cheers, Andreas
>
>>> 17 марта 2017 г., в 0:44, Andreas Dilger <[email protected]> написал(а):
>>>
>>> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <[email protected]> wrote:
>>>>
>>>> From: Artem Blagodarenko <[email protected]>
>>>>
>>>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>>>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>>>> depth of 3 instead of the current limit of 2. These features are needed
>>>> in order to exceed the current limit of approximately 10M entries in a
>>>> single directory.
>>>
>>> Artem,
>>> many thanks for updating and submitting this, and creating the e2fsck patches.
>>>
>>> Ted,
>>> can you please add the original author for the largedir ext4 patch when
>>> landing this patch: Liang Zhen <[email protected]>
>>>
>>> and these tags which contain more background information on this feature:
>>>
>>> Lustre-change: https://review.hpdd.intel.com/375
>>> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
>>>
>>>> Signed-off-by: Yang Sheng <[email protected]>
>>>> Signed-off-by: Artem Blagodarenko <[email protected]>
>>>
>>> Reviewed-by: Andreas Dilger <[email protected]>
>>>
>>>> ---
>>>> fs/ext4/ext4.h | 23 ++++++++---
>>>> fs/ext4/inode.c | 4 +-
>>>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
>>>> 3 files changed, 102 insertions(+), 43 deletions(-)
>>>>
>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>> index 01d52b9..0bbbd9b 100644
>>>> --- a/fs/ext4/ext4.h
>>>> +++ b/fs/ext4/ext4.h
>>>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>>> EXT4_FEATURE_INCOMPAT_MMP | \
>>>> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>>>> EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>>>> - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
>>>> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
>>>> + EXT4_FEATURE_INCOMPAT_LARGEDIR)
>>>> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>>>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>>>> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
>>>> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>>>> */
>>>> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
>>>>
>>>> +/* htree levels for ext4 */
>>>> +#define EXT4_HTREE_LEVEL_COMPAT 2
>>>> +#define EXT4_HTREE_LEVEL 3
>>>> +
>>>> +static inline int ext4_dir_htree_level(struct super_block *sb)
>>>> +{
>>>> + return ext4_has_feature_largedir(sb) ?
>>>> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
>>>> +}
>>>> +
>>>> /*
>>>> * Timeout and state flag for lazy initialization inode thread.
>>>> */
>>>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
>>>> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
>>>> }
>>>>
>>>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
>>>> +static inline loff_t ext4_isize(struct super_block *sb,
>>>> + struct ext4_inode *raw_inode)
>>>> {
>>>> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>>> + if (ext4_has_feature_largedir(sb) ||
>>>> + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>>> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
>>>> le32_to_cpu(raw_inode->i_size_lo);
>>>> - else
>>>> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>>> +
>>>> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>>> }
>>>>
>>>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index f622d4a..5787f3d 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>>> if (ext4_has_feature_64bit(sb))
>>>> ei->i_file_acl |=
>>>> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
>>>> - inode->i_size = ext4_isize(raw_inode);
>>>> + inode->i_size = ext4_isize(sb, raw_inode);
>>>> if ((size = i_size_read(inode)) < 0) {
>>>> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
>>>> ret = -EFSCORRUPTED;
>>>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>>> raw_inode->i_file_acl_high =
>>>> cpu_to_le16(ei->i_file_acl >> 32);
>>>> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
>>>> - if (ei->i_disksize != ext4_isize(raw_inode)) {
>>>> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
>>>> ext4_isize_set(raw_inode, ei->i_disksize);
>>>> need_datasync = 1;
>>>> }
>>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>>>> index 6ad612c..3298fe3 100644
>>>> --- a/fs/ext4/namei.c
>>>> +++ b/fs/ext4/namei.c
>>>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>>>>
>>>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
>>>> {
>>>> - return le32_to_cpu(entry->block) & 0x00ffffff;
>>>> + return le32_to_cpu(entry->block) & 0x0fffffff;
>>>> }
>>>
>>> It wouldn't be terrible to add a comment to this function explaining why
>>> the block numbers are masked off, which is to allow the high bits of the
>>> logical block number to hold a "fullness" fraction so that the kernel
>>> could start shrinking directories if many files are deleted. That said,
>>> this isn't a shortcoming of this patch, but a suggestion if the patch is
>>> being resubmitted for some other reason.
>>>
>>>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
>>>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
>>>> u32 hash;
>>>>
>>>> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>>>> frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>>>> if (IS_ERR(frame->bh))
>>>> return (struct dx_frame *) frame->bh;
>>>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>> }
>>>>
>>>> indirect = root->info.indirect_levels;
>>>> - if (indirect > 1) {
>>>> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
>>>> - root->info.indirect_levels);
>>>> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
>>>> + ext4_warning(dir->i_sb,
>>>> + "Directory (ino: %lu) htree depth %#06x exceed"
>>>> + "supported value", dir->i_ino,
>>>> + ext4_dir_htree_level(dir->i_sb));
>>>> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
>>>> + ext4_warning(dir->i_sb, "Enable large directory "
>>>> + "feature to access it");
>>>> + }
>>>> goto fail;
>>>> }
>>>>
>>>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>>
>>>> static void dx_release(struct dx_frame *frames)
>>>> {
>>>> + struct dx_root_info *info;
>>>> + int i;
>>>> +
>>>> if (frames[0].bh == NULL)
>>>> return;
>>>>
>>>> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
>>>> - brelse(frames[1].bh);
>>>> - brelse(frames[0].bh);
>>>> + info = &((struct dx_root *)frames[0].bh->b_data)->info;
>>>> + for (i = 0; i <= info->indirect_levels; i++) {
>>>> + if (frames[i].bh == NULL)
>>>> + break;
>>>> + brelse(frames[i].bh);
>>>> + frames[i].bh = NULL;
>>>> + }
>>>> }
>>>>
>>>> /*
>>>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
>>>> {
>>>> struct dx_hash_info hinfo;
>>>> struct ext4_dir_entry_2 *de;
>>>> - struct dx_frame frames[2], *frame;
>>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> struct inode *dir;
>>>> ext4_lblk_t block;
>>>> int count = 0;
>>>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
>>>> struct ext4_dir_entry_2 **res_dir)
>>>> {
>>>> struct super_block * sb = dir->i_sb;
>>>> - struct dx_frame frames[2], *frame;
>>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> const struct qstr *d_name = fname->usr_fname;
>>>> struct buffer_head *bh;
>>>> ext4_lblk_t block;
>>>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>>>> */
>>>> dir->i_mtime = dir->i_ctime = current_time(dir);
>>>> ext4_update_dx_flag(dir);
>>>> - dir->i_version++;
>>>> + inode_inc_iversion(dir);
>>>> ext4_mark_inode_dirty(handle, dir);
>>>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>>>> err = ext4_handle_dirty_dirent_node(handle, dir, bh);
>>>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>>>> {
>>>> struct buffer_head *bh2;
>>>> struct dx_root *root;
>>>> - struct dx_frame frames[2], *frame;
>>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> struct dx_entry *entries;
>>>> struct ext4_dir_entry_2 *de, *de2;
>>>> struct ext4_dir_entry_tail *t;
>>>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>>>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> struct inode *dir, struct inode *inode)
>>>> {
>>>> - struct dx_frame frames[2], *frame;
>>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> struct dx_entry *entries, *at;
>>>> struct buffer_head *bh;
>>>> struct super_block *sb = dir->i_sb;
>>>> struct ext4_dir_entry_2 *de;
>>>> + int restart;
>>>> int err;
>>>>
>>>> +again:
>>>> + restart = 0;
>>>> frame = dx_probe(fname, dir, NULL, frames);
>>>> if (IS_ERR(frame))
>>>> return PTR_ERR(frame);
>>>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> if (err != -ENOSPC)
>>>> goto cleanup;
>>>>
>>>> + err = 0;
>>>> /* Block full, should compress but for now just split */
>>>> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
>>>> dx_get_count(entries), dx_get_limit(entries)));
>>>> /* Need to split index? */
>>>> if (dx_get_count(entries) == dx_get_limit(entries)) {
>>>> ext4_lblk_t newblock;
>>>> - unsigned icount = dx_get_count(entries);
>>>> - int levels = frame - frames;
>>>> + int levels = frame - frames + 1;
>>>> + unsigned int icount;
>>>> + int add_level = 1;
>>>> struct dx_entry *entries2;
>>>> struct dx_node *node2;
>>>> struct buffer_head *bh2;
>>>>
>>>> - if (levels && (dx_get_count(frames->entries) ==
>>>> - dx_get_limit(frames->entries))) {
>>>> - ext4_warning_inode(dir, "Directory index full!");
>>>> + while (frame > frames) {
>>>> + if (dx_get_count((frame - 1)->entries) <
>>>> + dx_get_limit((frame - 1)->entries)) {
>>>> + add_level = 0;
>>>> + break;
>>>> + }
>>>> + frame--; /* split higher index block */
>>>> + at = frame->at;
>>>> + entries = frame->entries;
>>>> + restart = 1;
>>>> + }
>>>> + if (add_level && levels == ext4_dir_htree_level(sb)) {
>>>> + ext4_warning(sb, "Directory (ino: %lu) index full, "
>>>> + "reach max htree level :%d",
>>>> + dir->i_ino, levels);
>>>> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
>>>> + ext4_warning(sb, "Large directory feature is "
>>>> + "not enabled on this "
>>>> + "filesystem");
>>>> + }
>>>> err = -ENOSPC;
>>>> goto cleanup;
>>>> }
>>>> + icount = dx_get_count(entries);
>>>> bh2 = ext4_append(handle, dir, &newblock);
>>>> if (IS_ERR(bh2)) {
>>>> err = PTR_ERR(bh2);
>>>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> err = ext4_journal_get_write_access(handle, frame->bh);
>>>> if (err)
>>>> goto journal_error;
>>>> - if (levels) {
>>>> + if (!add_level) {
>>>> unsigned icount1 = icount/2, icount2 = icount - icount1;
>>>> unsigned hash2 = dx_get_hash(entries + icount1);
>>>> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
>>>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>>
>>>> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
>>>> err = ext4_journal_get_write_access(handle,
>>>> - frames[0].bh);
>>>> + (frame - 1)->bh);
>>>> if (err)
>>>> goto journal_error;
>>>>
>>>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> frame->entries = entries = entries2;
>>>> swap(frame->bh, bh2);
>>>> }
>>>> - dx_insert_block(frames + 0, hash2, newblock);
>>>> - dxtrace(dx_show_index("node", frames[1].entries));
>>>> + dx_insert_block((frame - 1), hash2, newblock);
>>>> + dxtrace(dx_show_index("node", frame->entries));
>>>> dxtrace(dx_show_index("node",
>>>> ((struct dx_node *) bh2->b_data)->entries));
>>>> err = ext4_handle_dirty_dx_node(handle, dir, bh2);
>>>> if (err)
>>>> goto journal_error;
>>>> brelse (bh2);
>>>> + ext4_handle_dirty_metadata(handle, dir,
>>>> + (frame - 1)->bh);
>>>> + if (restart) {
>>>> + ext4_handle_dirty_metadata(handle, dir,
>>>> + frame->bh);
>>>> + goto cleanup;
>>>> + }
>>>> } else {
>>>> - dxtrace(printk(KERN_DEBUG
>>>> - "Creating second level index...\n"));
>>>> + struct dx_root *dxroot;
>>>> memcpy((char *) entries2, (char *) entries,
>>>> icount * sizeof(struct dx_entry));
>>>> dx_set_limit(entries2, dx_node_limit(dir));
>>>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> /* Set up root */
>>>> dx_set_count(entries, 1);
>>>> dx_set_block(entries + 0, newblock);
>>>> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
>>>> -
>>>> - /* Add new access path frame */
>>>> - frame = frames + 1;
>>>> - frame->at = at = at - entries + entries2;
>>>> - frame->entries = entries = entries2;
>>>> - frame->bh = bh2;
>>>> - err = ext4_journal_get_write_access(handle,
>>>> - frame->bh);
>>>> - if (err)
>>>> - goto journal_error;
>>>> + dxroot = (struct dx_root *)frames[0].bh->b_data;
>>>> + dxroot->info.indirect_levels += 1;
>>>> + dxtrace(printk(KERN_DEBUG
>>>> + "Creating %d level index...\n",
>>>> + info->indirect_levels));
>>>> + ext4_handle_dirty_metadata(handle, dir, frame->bh);
>>>> + ext4_handle_dirty_metadata(handle, dir, bh2);
>>>> + brelse(bh2);
>>>> + restart = 1;
>>>> + goto cleanup;
>>>> }
>>>> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
>>>> if (err) {
>>>> ext4_std_error(inode->i_sb, err);
>>>> goto cleanup;
>>>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> cleanup:
>>>> brelse(bh);
>>>> dx_release(frames);
>>>> + /* @restart is true means htree-path has been changed, we need to
>>>> + * repeat dx_probe() to find out valid htree-path
>>>> + */
>>>> + if (restart && err == 0)
>>>> + goto again;
>>>> return err;
>>>> }
>>>>
>>>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
>>>> blocksize);
>>>> else
>>>> de->inode = 0;
>>>> - dir->i_version++;
>>>> + inode_inc_iversion(dir);
>>>> return 0;
>>>> }
>>>> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>>>> --
>>>> 1.8.3.1
>>>>
>>>
>>>
>>> Cheers, Andreas
>>>
>>>
>>>
>>>
>>>
>>
>
>
> Cheers, Andreas
>
>
>
>
>
On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote:
> Andreas,
>
> it not about a feature flag. It’s about a situation in whole.
> Yes, we may increase a directory size, but it open a number a large problems.
> 1) readdir. It tries to read all entries in memory before send to
> the user. currently it may eats 20*10^6 * 256 so several gigs, so
> increasing it size may produce a problems for a system.
That's not true. We normally only read in one block a time. If there
is a hash collision, then we may need to insert into the rbtree in a
subsequent block's worth of dentries to make sure we have all of the
directory entries corresponding to a particular hash value. I think
you misunderstood the code.
> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
But there are also only 32k blocks in a group descriptor, and we try
to keep the blocks allocated close to the inode. So if you are using
a huge directory, and you are using a storage device with a
significant seek penalty, then yes, no matter what as the directory
grows, the time to iterate over all of the files does grow. But there
is more to life than microbenchmarks which creates huge numbers of zero
length files! If we assume that the files are going to need to
contain _data_, and the data blocks should be close to the inodes,
then there are going to be some performance impacts no matter what.
> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.
I can imagine a new feature flag which defines the use a 64-bit inode
number, but that's more for people who are creating a file system that
takes advantage of 64-bit block numbers, and they are intending on
using all of that space to store small (< 4k or < 8k) files.
And it's also true that there are huge advantges to using a
multi-level directory hierarchy --- e.g.:
.git/objects/03/08e42105258d4e53ffeb81ffb2a4b2480bb8b8
or even
.git/objects/03/08/e42105258d4e53ffeb81ffb2a4b2480bb8b8
instead of:
.git/objects/0308e42105258d4e53ffeb81ffb2a4b2480bb8b8
but that's an application level question. If for some reason some
silly application programmer wants to have a single gargantuan
directory, if the patches to support it are fairly simple, even if
someone is going to give us patches to do something more general,
burning an extra feature flag doesn't seem like the most terrible
thing in the world.
As for the other optimizations --- things like allowing parallel
directory modifications, or being able to shrink empty directory
blocks or shorten the tree are all improvements we can make without
impacting the on-disk format. So they aren't an argument for halting
the submission of the new on-disk format, no?
- Ted
> 18 марта 2017 г., в 19:29, Theodore Ts'o <[email protected]> написал(а):
>
> On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote:
>> Andreas,
>>
>> it not about a feature flag. It’s about a situation in whole.
>> Yes, we may increase a directory size, but it open a number a large problems.
>
>> 1) readdir. It tries to read all entries in memory before send to
>> the user. currently it may eats 20*10^6 * 256 so several gigs, so
>> increasing it size may produce a problems for a system.
>
> That's not true. We normally only read in one block a time. If there
> is a hash collision, then we may need to insert into the rbtree in a
> subsequent block's worth of dentries to make sure we have all of the
> directory entries corresponding to a particular hash value. I think
> you misunderstood the code.
As i see it not about hash collisions, but about merging a several blocks into same hash range on up level hash entry.
so if we have a large hash range originally assigned to the single block, all that range will read at memory at single step.
With «aged» directory when hash blocks used already - it’s easy to hit.
>
>> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
>
> But there are also only 32k blocks in a group descriptor, and we try
> to keep the blocks allocated close to the inode.
with bigalloc feature it’s not a 32k blocks. but 32k clusters with 1M cluster size(as example), it very large space.
> So if you are using
> a huge directory, and you are using a storage device with a
> significant seek penalty, then yes, no matter what as the directory
> grows, the time to iterate over all of the files does grow. But there
> is more to life than microbenchmarks which creates huge numbers of zero
> length files! If we assume that the files are going to need to
> contain _data_, and the data blocks should be close to the inodes,
> then there are going to be some performance impacts no matter what.
>
Yes, i expect to have some seek penalty. But may testing say it too huge now.
directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3.
Same testing with hard links same create rate dropped slightly.
>> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.
>
> I can imagine a new feature flag which defines the use a 64-bit inode
> number, but that's more for people who are creating a file system that
> takes advantage of 64-bit block numbers, and they are intending on
> using all of that space to store small (< 4k or < 8k) files.
>
> And it's also true that there are huge advantges to using a
> multi-level directory hierarchy --- e.g.:
>
> .git/objects/03/08e42105258d4e53ffeb81ffb2a4b2480bb8b8
>
> or even
>
> .git/objects/03/08/e42105258d4e53ffeb81ffb2a4b2480bb8b8
>
> instead of:
>
> .git/objects/0308e42105258d4e53ffeb81ffb2a4b2480bb8b8
>
> but that's an application level question. If for some reason some
> silly application programmer wants to have a single gargantuan
> directory, if the patches to support it are fairly simple, even if
> someone is going to give us patches to do something more general,
> burning an extra feature flag doesn't seem like the most terrible
> thing in the world.
From other side - application don’t expect to have very slow directory and have access with some constant or near or it speed.
>
> As for the other optimizations --- things like allowing parallel
> directory modifications, or being able to shrink empty directory
> blocks or shorten the tree are all improvements we can make without
> impacting the on-disk format. So they aren't an argument for halting
> the submission of the new on-disk format, no?
>
It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.
Alexey
On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote:
> >
> > That's not true. We normally only read in one block a time. If there
> > is a hash collision, then we may need to insert into the rbtree in a
> > subsequent block's worth of dentries to make sure we have all of the
> > directory entries corresponding to a particular hash value. I think
> > you misunderstood the code.
>
> As i see it not about hash collisions, but about merging a several
> blocks into same hash range on up level hash entry. so if we have a
> large hash range originally assigned to the single block, all that
> range will read at memory at single step. With «aged» directory
> when hash blocks used already - it’s easy to hit.
If you look at ext4_htree_fill_tree(), we are only iterating over the
leaf blocks. We are using a 31-bit hash, where the low-order bit is
one if there has been a collision. In that case, we need to read the
next block to make sure all of the directory entries which have the
same 31-bit hash are in the rbtree.
So *even* if the directory is so badly fragmeneted that there is only
one directory entry per block, we will eventually find a block where
the last (only) directory entry has a different directory entry from
the current hash value, at which point we can stop.
And I'll note that this is a problem that can be solved without
changing the on-disk format, but simply adding code so we can merge
adjacent directory entry blocks. The main reason why it hasn't been
done is (a) no one has sent me patches, and (b) I haven't seen
workloads where this has been anything other than a theoretical /
academic concern.
You seem very passionate about this. Is this a problem you've
personally seen? If so, can you give me more details about your use
case, and how you've been running into this issue? Instead of just
arguing about it from a largely theoretical perspective?
> Yes, i expect to have some seek penalty. But may testing say it too huge now.
> directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3.
> Same testing with hard links same create rate dropped slightly.
So this sounds like it's all about the seek penalty of the _data_
blocks. If you use hard links the creation rate only dropped a
little, am I understanding you corretly? (Sorry, your English is a
little fracturered so I'm having trouble parsing the meaning out of
your sentences.)
So what do you think the creation rate _should_ be? And where do you
think the time is going to, if it's not the fact that we have to place
the data blocks further and further from the directory? And more
importantly, what's your proposal for how to "fix" this?
> > As for the other optimizations --- things like allowing parallel
> > directory modifications, or being able to shrink empty directory
> > blocks or shorten the tree are all improvements we can make without
> > impacting the on-disk format. So they aren't an argument for halting
> > the submission of the new on-disk format, no?
> >
> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.
But there are cases where today, the workload would simply fail with
ENOSPC when the directory couldn't grow any farther. So in those
cases _maybe_ there is something we could do differently that might
make things faster, but you have yet to convince me that the
fundamental fault is one that can only be cured by an on-disk format
change. (And if you believe this is to be true, please enlighten us
on how we can make the on-disk format better!)
Cheers,
- Ted
sorry for english..
> 19 марта 2017 г., в 3:39, Theodore Ts'o <[email protected]> написал(а):
>
> On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote:
>>>
>>> That's not true. We normally only read in one block a time. If there
>>> is a hash collision, then we may need to insert into the rbtree in a
>>> subsequent block's worth of dentries to make sure we have all of the
>>> directory entries corresponding to a particular hash value. I think
>>> you misunderstood the code.
>>
>> As i see it not about hash collisions, but about merging a several
>> blocks into same hash range on up level hash entry. so if we have a
>> large hash range originally assigned to the single block, all that
>> range will read at memory at single step. With «aged» directory
>> when hash blocks used already - it’s easy to hit.
>
> If you look at ext4_htree_fill_tree(), we are only iterating over the
> leaf blocks. We are using a 31-bit hash, where the low-order bit is
> one if there has been a collision. In that case, we need to read the
> next block to make sure all of the directory entries which have the
> same 31-bit hash are in the rbtree.
looks we say about same but with different words.
based on dx code, up level hash block have a records - {hash1, block1} {hash2, block1}…
so any records with hash range {hash1, hash2} will live on block1.
right ?
so question how much {hash1, hash2} distance may be, looks you name it as "hash collision".
> You seem very passionate about this. Is this a problem you've
> personally seen? If so, can you give me more details about your use
> case, and how you've been running into this issue? Instead of just
> arguing about it from a largely theoretical perspective?
>
It problem was seen with Lustre MDT code after large number create/unlinks.
but it seen only few times.
Other hits isn’t conformed.
>> Yes, i expect to have some seek penalty. But may testing say it too huge now.
>> directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3.
>> Same testing with hard links same create rate dropped slightly.
>
> So this sounds like it's all about the seek penalty of the _data_
> blocks.
no.
> If you use hard links the creation rate only dropped a
> little, am I understanding you corretly?
yes and no. hard link create rate dropped a little, but open()+close tests dropped a large.
No writes, no data blocks, just inode allocation.
> (Sorry, your English is a
> little fracturered so I'm having trouble parsing the meaning out of
> your sentences.)
it’s my bad :(
>
> So what do you think the creation rate _should_ be? And where do you
> think the time is going to, if it's not the fact that we have to place
> the data blocks further and further from the directory? And more
> importantly, what's your proposal for how to "fix" this?
>
>>> As for the other optimizations --- things like allowing parallel
>>> directory modifications, or being able to shrink empty directory
>>> blocks or shorten the tree are all improvements we can make without
>>> impacting the on-disk format. So they aren't an argument for halting
>>> the submission of the new on-disk format, no?
>>>
>> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.
>
> But there are cases where today, the workload would simply fail with
> ENOSPC when the directory couldn't grow any farther. So in those
> cases _maybe_ there is something we could do differently that might
> make things faster, but you have yet to convince me that the
> fundamental fault is one that can only be cured by an on-disk format
> change. (And if you believe this is to be true, please enlighten us
> on how we can make the on-disk format better!)
>
> Cheers,
>
> - Ted
On Mar 18, 2017, at 10:29 AM, Theodore Ts'o <[email protected]> wrote:
>
> On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote:
>> Andreas,
>>
>> it not about a feature flag. It’s about a situation in whole.
>> Yes, we may increase a directory size, but it open a number a large problems.
>
>> 1) readdir. It tries to read all entries in memory before send to
>> the user. currently it may eats 20*10^6 * 256 so several gigs, so
>> increasing it size may produce a problems for a system.
>
> That's not true. We normally only read in one block a time. If there
> is a hash collision, then we may need to insert into the rbtree in a
> subsequent block's worth of dentries to make sure we have all of the
> directory entries corresponding to a particular hash value. I think
> you misunderstood the code.
>
>> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
>
> But there are also only 32k blocks in a group descriptor, and we try
> to keep the blocks allocated close to the inode. So if you are using
> a huge directory, and you are using a storage device with a
> significant seek penalty, then yes, no matter what as the directory
> grows, the time to iterate over all of the files does grow. But there
> is more to life than microbenchmarks which creates huge numbers of zero
> length files! If we assume that the files are going to need to
> contain _data_, and the data blocks should be close to the inodes,
> then there are going to be some performance impacts no matter what.
Actually, on a Lustre MDT there _are_ only zero-length files, since all
of the data is stored in another filesystem. Fortunately, the parent
directory stores the last group successfully used for allocation
(i_alloc_group) so that new inode allocation doesn't have to scan the
whole filesystem each time from the parent's group.
>> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.
>
> I can imagine a new feature flag which defines the use a 64-bit inode
> number, but that's more for people who are creating a file system that
> takes advantage of 64-bit block numbers, and they are intending on
> using all of that space to store small (< 4k or < 8k) files.
The 4-billion inode limit is somewhat independent of large directories.
That said, the DIRDATA feature that is used for Lustre is also designed
to allow storing the high 32 bits of the inode number in the directory.
This would allow compatible upgrade of a directory to storing both
32-bit and 64-bit inode numbers without the need for wholescale conversion
of directories, or having space for 64-bit inode numbers even if most
of the inodes are only 32-bit values.
Cheers, Andreas
On Mar 18, 2017, at 6:39 PM, Theodore Ts'o <[email protected]> wrote:
>
> On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote:
>>>> 1) readdir. It tries to read all entries in memory before send to
>>>> the user. currently it may eats 20*10^6 * 256 so several gigs, so
>>>> increasing it size may produce a problems for a system.
>>>
>>> That's not true. We normally only read in one block a time. If there
>>> is a hash collision, then we may need to insert into the rbtree in a
>>> subsequent block's worth of dentries to make sure we have all of the
>>> directory entries corresponding to a particular hash value. I think
>>> you misunderstood the code.
>>
>> As i see it not about hash collisions, but about merging a several
>> blocks into same hash range on up level hash entry. so if we have a
>> large hash range originally assigned to the single block, all that
>> range will read at memory at single step. With «aged» directory
>> when hash blocks used already - it’s easy to hit.
>
> If you look at ext4_htree_fill_tree(), we are only iterating over the
> leaf blocks. We are using a 31-bit hash, where the low-order bit is
> one if there has been a collision. In that case, we need to read the
> next block to make sure all of the directory entries which have the
> same 31-bit hash are in the rbtree.
I guess the main concern here is that insertion into an htree directory
is essentially random based on the hash, so for very large directories
the whole directory (htree and leaf blocks) needs to fit into RAM or the
performance will suffer since every leaf block will need to be read from
disk for every create/lookup/delete.
That said, the maximum size of a 2-level htree directory is about 1GB,
and servers have 128 or 256GB of RAM these days, so I don't think this
will be a huge problem. We also have a tunable that can limit the
maximum directory size if this becomes a problem.
> And I'll note that this is a problem that can be solved without
> changing the on-disk format, but simply adding code so we can merge
> adjacent directory entry blocks. The main reason why it hasn't been
> done is (a) no one has sent me patches, and (b) I haven't seen
> workloads where this has been anything other than a theoretical /
> academic concern.
>
> You seem very passionate about this. Is this a problem you've
> personally seen? If so, can you give me more details about your use
> case, and how you've been running into this issue? Instead of just
> arguing about it from a largely theoretical perspective?
We have seen large directories at the htree limit unable to add new
entries because the htree/leaf blocks become fragmented from repeated
create/delete cycles. I agree that handling directory shrinking
would probably solve that problem, since the htree and leaf blocks
would be compacted during deletion and then the htree would be able
to split the leaf blocks in the right location for the new entries.
Cheers, Andreas
>> Yes, i expect to have some seek penalty. But may testing say it too huge
>> now. directory creation rate started with 80k create/s have dropped
>> to the 20k-30k create/s with hash tree extend to the level 3. Same
>> testing with hard links same create rate dropped slightly.
>
> So this sounds like it's all about the seek penalty of the _data_
> blocks. If you use hard links the creation rate only dropped a
> little, am I understanding you correctly? (Sorry, your English is a
> little fracturered so I'm having trouble parsing the meaning out of
> your sentences.)
>
> So what do you think the creation rate _should_ be? And where do you
> think the time is going to, if it's not the fact that we have to place
> the data blocks further and further from the directory? And more
> importantly, what's your proposal for how to "fix" this?
>
>>> As for the other optimizations --- things like allowing parallel
>>> directory modifications, or being able to shrink empty directory
>>> blocks or shorten the tree are all improvements we can make without
>>> impacting the on-disk format. So they aren't an argument for halting
>>> the submission of the new on-disk format, no?
>>>
>> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.
>
> But there are cases where today, the workload would simply fail with
> ENOSPC when the directory couldn't grow any farther. So in those
> cases _maybe_ there is something we could do differently that might
> make things faster, but you have yet to convince me that the
> fundamental fault is one that can only be cured by an on-disk format
> change. (And if you believe this is to be true, please enlighten us
> on how we can make the on-disk format better!)
>
> Cheers,
>
> - Ted
Cheers, Andreas
On Sat, Mar 18, 2017 at 11:38:38PM -0600, Andreas Dilger wrote:
>
> Actually, on a Lustre MDT there _are_ only zero-length files, since all
> of the data is stored in another filesystem. Fortunately, the parent
> directory stores the last group successfully used for allocation
> (i_alloc_group) so that new inode allocation doesn't have to scan the
> whole filesystem each time from the parent's group.
So I'm going to ask a stupid question. If Lustre is using only
zero-length files, and so you're storing all of the data in the
directory entries --- why didn't you use some kind of userspace store,
such as Mysql or MongoDB? Is it because the Lustre metadata server is
all in kernel space, and using ext4 file system was the most
expeditious way of moving forward?
I'd be gratified... surprised, but gratified... if the answer was that
ext4 used in this fashion was faster than MongoDB, but to be honest
that would be very surprising indeed. Most of the cluster file
systems... e.g., GFS, hadoopfs, et.al, tend to use a purpose-built
key-value store (for example GFS uses bigtable) to store the cluster
metadata.
> The 4-billion inode limit is somewhat independent of large directories.
> That said, the DIRDATA feature that is used for Lustre is also designed
> to allow storing the high 32 bits of the inode number in the directory.
> This would allow compatible upgrade of a directory to storing both
> 32-bit and 64-bit inode numbers without the need for wholescale conversion
> of directories, or having space for 64-bit inode numbers even if most
> of the inodes are only 32-bit values.
Ah, I didn't realize that DIRDATA was still used by Lustre. Is the
reason why you haven't retried merging (I think the last time was
~2009) because it's only used by one or two machines (the MDS's) in a
Lustre cluster?
I brought up the 32-bit inode limit because Alexey was using this as
an argument not to move ahead with merging the largedir feature. Now
that I understand his concerns are also based around Lustre, and the
fact that we are inserting into the hash tree effectively randomly,
that *is* a soluble problem for Lustre, if it has control over the
directory names which are being stored in the MDS file. For example,
if you are storing in this gargantuan MDS directory file names which
are composed of the 128-bit Lustre FileID, we could define a new hash
type which, if the filename fits the format of the Lustre FID, parses
the filename and uses the low the 32-bit object ID concatenated with
the low-32 bits of the sequence id (which is used to name the target).
If you did this, then we would limit the number of htree blocks that
you would need to keep in memory at any one time. I think the only
real problem here is that with only a 32-bit object id namespace, you
must eventually need to reuse object ID's, at which point you could no
longer be allocating them sequentially. But if you were willing to
use some parts of the 64-bit sequence number space, perhaps this could
be finessed.
I'd probably add this as a new, Lustre-specific hash alongside some
other proposed new htree hash types that have been propsoed over the
years, but this would allow MDS inserts (assuming that each target is
inserting new objects using a sequentially increasing object ID) to be
done in a way where they aren't splatter themselves all over the
htree.
What do you think?
On Sun, Mar 19, 2017 at 12:13:00AM -0600, Andreas Dilger wrote:
>
> We have seen large directories at the htree limit unable to add new
> entries because the htree/leaf blocks become fragmented from repeated
> create/delete cycles. I agree that handling directory shrinking
> would probably solve that problem, since the htree and leaf blocks
> would be compacted during deletion and then the htree would be able
> to split the leaf blocks in the right location for the new entries.
Right, and the one thing that makes directory shrinking hard is what
to do with the now-unused block. I've been thinking about this, and
it *is* possible to do this without having to change the on disk
format.
What we could do is make a copy of the last block in the directory and
write it on top of the now-empty (and now-unlinked) directory block.
We then find where to find the parent pointer for that block by
looking at the first hash value stored in the block if it is an index
block, or hash the first directory entry if it is a leaf block, and
then walk the directory htree to find the block which needs to be
patched to point at the new copy of that directory block, and then
truncate the directory to remove that last 4k block.
It's actually not that bad; it would require taking a full mutex on
the whole directory tree, but it could be done in workqueue since it's
a cleanup operation so we don't have to slowdown the unlink or rmdir
operation.
If someone would like to code this up, patches would be gratefully
accepted. :-)
Cheers,
- Ted
On Mar 19, 2017, at 9:34 AM, Theodore Ts'o <[email protected]> wrote:
>
> On Sat, Mar 18, 2017 at 11:38:38PM -0600, Andreas Dilger wrote:
>>
>> Actually, on a Lustre MDT there _are_ only zero-length files, since all
>> of the data is stored in another filesystem. Fortunately, the parent
>> directory stores the last group successfully used for allocation
>> (i_alloc_group) so that new inode allocation doesn't have to scan the
>> whole filesystem each time from the parent's group.
>
> So I'm going to ask a stupid question. If Lustre is using only
> zero-length files, and so you're storing all of the data in the
> directory entries --- why didn't you use some kind of userspace store,
> such as Mysql or MongoDB? Is it because the Lustre metadata server is
> all in kernel space, and using ext4 file system was the most
> expeditious way of moving forward?
Right - the Lustre servers are implemented in the kernel, to avoid
user-kernel data transfers from the network, and to avoid creating a new
disk filesystem while still allowing proper transactions.
> I'd be gratified... surprised, but gratified... if the answer was that
> ext4 used in this fashion was faster than MongoDB, but to be honest
> that would be very surprising indeed. Most of the cluster file
> systems... e.g., GFS, hadoopfs, et.al, tend to use a purpose-built
> key-value store (for example GFS uses bigtable) to store the cluster
> metadata.
>
>> The 4-billion inode limit is somewhat independent of large directories.
>> That said, the DIRDATA feature that is used for Lustre is also designed
>> to allow storing the high 32 bits of the inode number in the directory.
>> This would allow compatible upgrade of a directory to storing both
>> 32-bit and 64-bit inode numbers without the need for wholescale conversion
>> of directories, or having space for 64-bit inode numbers even if most
>> of the inodes are only 32-bit values.
>
> Ah, I didn't realize that DIRDATA was still used by Lustre. Is the
> reason why you haven't retried merging (I think the last time was
> ~2009) because it's only used by one or two machines (the MDS's) in a
> Lustre cluster?
Mostly because there hasn't been any interest for it whenever I proposed
merging it in the past. If there is some renewed interest in merging it
I could look into it...
> I brought up the 32-bit inode limit because Alexey was using this as
> an argument not to move ahead with merging the largedir feature. Now
> that I understand his concerns are also based around Lustre, and the
> fact that we are inserting into the hash tree effectively randomly,
> that *is* a soluble problem for Lustre, if it has control over the
> directory names which are being stored in the MDS file. For example,
> if you are storing in this gargantuan MDS directory file names which
> are composed of the 128-bit Lustre FileID, we could define a new hash
> type which, if the filename fits the format of the Lustre FID, parses
> the filename and uses the low the 32-bit object ID concatenated with
> the low-32 bits of the sequence id (which is used to name the target).
No, the directory tree for the Lustre MDS is just a regular directory
tree (under "ROOT/" so we can have other files outside the visible
namespace) with regular filenames as with local ext4. The one difference
is that there are also 128-bit FIDs stored in the dirents to allow readdir
to work efficiently, but the majority of the other Lustre attributes
are stored in xattrs on the inode.
Cheers, Andreas
> On Sun, Mar 19, 2017 at 12:13:00AM -0600, Andreas Dilger wrote:
>>
>> We have seen large directories at the htree limit unable to add new
>> entries because the htree/leaf blocks become fragmented from repeated
>> create/delete cycles. I agree that handling directory shrinking
>> would probably solve that problem, since the htree and leaf blocks
>> would be compacted during deletion and then the htree would be able
>> to split the leaf blocks in the right location for the new entries.
>
> Right, and the one thing that makes directory shrinking hard is what
> to do with the now-unused block. I've been thinking about this, and
> it *is* possible to do this without having to change the on disk
> format.
>
> What we could do is make a copy of the last block in the directory and
> write it on top of the now-empty (and now-unlinked) directory block.
> We then find where to find the parent pointer for that block by
> looking at the first hash value stored in the block if it is an index
> block, or hash the first directory entry if it is a leaf block, and
> then walk the directory htree to find the block which needs to be
> patched to point at the new copy of that directory block, and then
> truncate the directory to remove that last 4k block.
>
> It's actually not that bad; it would require taking a full mutex on
> the whole directory tree, but it could be done in workqueue since it's
> a cleanup operation so we don't have to slowdown the unlink or rmdir
> operation.
>
> If someone would like to code this up, patches would be gratefully
> accepted. :-)
>
> Cheers,
>
> - Ted
Cheers, Andreas
>
>> I brought up the 32-bit inode limit because Alexey was using this as
>> an argument not to move ahead with merging the largedir feature. Now
>> that I understand his concerns are also based around Lustre, and the
>> fact that we are inserting into the hash tree effectively randomly,
>> that *is* a soluble problem for Lustre, if it has control over the
>> directory names which are being stored in the MDS file. For example,
>> if you are storing in this gargantuan MDS directory file names which
>> are composed of the 128-bit Lustre FileID, we could define a new hash
>> type which, if the filename fits the format of the Lustre FID, parses
>> the filename and uses the low the 32-bit object ID concatenated with
>> the low-32 bits of the sequence id (which is used to name the target).
>
> No, the directory tree for the Lustre MDS is just a regular directory
> tree (under "ROOT/" so we can have other files outside the visible
> namespace) with regular filenames as with local ext4. The one difference
> is that there are also 128-bit FIDs stored in the dirents to allow readdir
> to work efficiently, but the majority of the other Lustre attributes
> are stored in xattrs on the inode.
To make picture clean. OST side is regular FS with 32 directories where a stripe objects is live.
With current 4G inodes limit each directory will filled with up 100k regular files.
Files allocated in batch, up to 20k files per batch. Allocated object used on MDT side to make mapping between metadata objects and data for such file.
I worry about it part, not about MDT. these directories have a large number creations/unlinks and performance degradation started after 3M-5M creations/unlinks.
With Large dir feature i think this performance problems may deeper.
Alexey
On Sun, Mar 19, 2017 at 07:54:40PM -0400, Andreas Dilger wrote:
>
> No, the directory tree for the Lustre MDS is just a regular directory
> tree (under "ROOT/" so we can have other files outside the visible
> namespace) with regular filenames as with local ext4. The one difference
> is that there are also 128-bit FIDs stored in the dirents to allow readdir
> to work efficiently, but the majority of the other Lustre attributes
> are stored in xattrs on the inode.
OK, so let's summarize.
1. This is only going to be an issue for Lustre users that are
creating a truly insanely large directories, and who aren't willing to
use a multi-level directories (e.g., users/t/y/tytso) for whatever reason.
2. Currently the proposal is to upstream largedir, and not
necessarily the other file system features that are Lustre MDS
specific.
3. I can therefore assume that Artem is interested in getting
largedir upstream for use cases and users that go beyond Lustre ---
and these users will probably be using non-zero length inodes, in
which case my observations about the fact that the slow down caused by
the fact that you have to spread out the inodes to place them close to
the data blocks will be applicable.
4. Alexey's concerns, which seem to be based around Lustre users for
which (1) are true, could potentially be addressed by further,
additional file system changes, which could either continue to be
Lustre MDS specific and not upstreamed, or could be upstreamed at some
future point --- but which are fairly orthogonal to this discussion.
Does that seem fair?
- Ted
P.S. I could imagine some changes that involve using 64-bit inode
numbers where the low log2(inode_size) bits are used for the location
of the inode in the block, and the rest of the inode number is used to
identify the block number where the inode can be found --- and
abandoning the use of an "inode table" completely. The inode
allocation bitmap block could be used instead to tell us which blocks
in the block group contain inodes for e2fsck pass 1 scanning. Things
get a bit more complicated in e2fsck if it turns out that bitmap block
is corrupt, but that's a subject for another day, and I suspct it's
something that would only make sense if the Lustre community is
willing to put in the investment to work on it.
On Mon, Mar 20, 2017 at 02:34:31PM +0300, Alexey Lyashkov wrote:
>
> To make picture clean. OST side is regular FS with 32 directories where a stripe objects is live.
> With current 4G inodes limit each directory will filled with up 100k regular files.
> Files allocated in batch, up to 20k files per batch. Allocated object used on MDT side to make mapping between metadata objects and data for such file.
> I worry about it part, not about MDT. these directories have a large number creations/unlinks and performance degradation started after 3M-5M creations/unlinks.
> With Large dir feature i think this performance problems may deeper.
This makes no sense. On the Object Store side, if the maximum
directory size is 100k regular files, you're never going to run into
two-levle htree limit, so the presense (or absense) of largedir is
largely irrelevant. So how can it make the performance problem worse?
- Ted
> On Mar 20, 2017, at 10:20 AM, Theodore Ts'o <[email protected]> wrote:
>
> On Mon, Mar 20, 2017 at 02:34:31PM +0300, Alexey Lyashkov wrote:
>>
>> To make picture clean. OST side is regular FS with 32 directories where a stripe objects is live. With current 4G inodes limit each directory will
>> filled with up 100k regular files. Files allocated in batch, up to 20k files per batch. Allocated object used on MDT side to make mapping between metadata objects and data for such file.
>>
>> I worry about it part, not about MDT. these directories have a large number creations/unlinks and performance degradation started after 3M-5M creations/unlinks.
>> With Large dir feature i think this performance problems may deeper.
>
> This makes no sense. On the Object Store side, if the maximum
> directory size is 100k regular files, you're never going to run into
> two-levle htree limit, so the presense (or absense) of largedir is
> largely irrelevant. So how can it make the performance problem worse?
I believe Alexey meant to write "100M" regular files (theoretically up to
32 dirs x 128M files to reach the ext4 4B inode limit).
I was thinking of the largedir code from the MDS side, where we have no
option but to have a single large directory that reflects what the user
created. For Alexey's issue, the OST directories are instead just storing
a large sequentially-incremented integer as the filename in the directory.
This wasn't typically an issue in the past because the average Lustre
file size is large (e.g. 1-32MB) and in < 32TB OST filesystem this would
only be 1-32M files spread across 32 directories. With the 400TB ext4
filesystems that Seagate is using this becomes an actual problem.
As Ted suggested earlier, there is a possibility to use a different hash
function (TEA hash?) that is better suited to be used on OSTs compared
to MDTs since the OST object filenames are mostly sequential integers.
The other option is to (also?) fix this at the Lustre OST level, to use
more directories in the object "O/d*" hierarchy, since that is not exposed
to userspace, and there is no specific reason that we need to stick with
32 subdirectories here. 32 subdirs was a fine option for OSTs in the
past, but there is actually the capability to use a different number of
subdirs on the OST (see struct lr_server_data.lsd_subdir_count), but we've
just never used it and it needs some fixing (e.g. if the last_rcvd file is
lost and needs to be recreated it should check the actual number of d*
subdirs instead of just assuming it is OBJ_SUBDIR_COUNT).
Also, instead of just increasing this to 128 subdirs (or whatever), we
should consider to change to creating new object subdirs every 128M objects
allocated (or similar), so that on huge OSTs there are automatically enough
subdirs allocated, and old subdirectory trees do not need to be kept in RAM
if they are not currently being used. That would be more memory efficient
than having many subdirs that are all in use at the same time. This would
need to have ext4 directory shrinking so that the old directories shrink
when they become empty, and could be removed completely once empty.
We have something similar to this "new object subdirectory" functionality
already with the OST FID sequence allocation that we use for DNE today,
since each sequence gets a different subdirectory tree on the OST. What
would need to be changed is that the MDS needs to use OST FID SEQs even
when DNE isn't used (that should work for all in-use Lustre versions).
Secondly, the objects allocated per sequence (LUSTRE_DATA_SEQ_MAX_WIDTH)
needs to be reduced from the current 4B-1 to 128M or similar.
Cheers, Andreas
On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote:
> From: Artem Blagodarenko <[email protected]>
>
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.
>
> Signed-off-by: Yang Sheng <[email protected]>
> Signed-off-by: Artem Blagodarenko <[email protected]>
Thanks, applied.
- Ted
On Sat, Apr 29, 2017 at 08:59:53PM -0400, Theodore Ts'o wrote:
> On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote:
> > From: Artem Blagodarenko <[email protected]>
> >
> > This INCOMPAT_LARGEDIR feature allows larger directories to be created
> > in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> > depth of 3 instead of the current limit of 2. These features are needed
> > in order to exceed the current limit of approximately 10M entries in a
> > single directory.
> >
> > Signed-off-by: Yang Sheng <[email protected]>
> > Signed-off-by: Artem Blagodarenko <[email protected]>
>
> Thanks, applied.
>
> - Ted
FYI, this patch is causing a problem when creating many files in a directory
(without the largedir feature enabled). I haven't looked into it but maybe it
will ring a bell for someone.
seq -f "abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch
[ 42.726480] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.729472] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.731689] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.734303] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.736383] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.739133] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.741307] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.743963] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[ 42.745998] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
...
On May 1, 2017, at 12:58 PM, Eric Biggers <[email protected]> wrote:
>
> On Sat, Apr 29, 2017 at 08:59:53PM -0400, Theodore Ts'o wrote:
>> On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote:
>>> From: Artem Blagodarenko <[email protected]>
>>>
>>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>>> depth of 3 instead of the current limit of 2. These features are needed
>>> in order to exceed the current limit of approximately 10M entries in a
>>> single directory.
>>>
>>> Signed-off-by: Yang Sheng <[email protected]>
>>> Signed-off-by: Artem Blagodarenko <[email protected]>
>>
>> Thanks, applied.
>>
>> - Ted
>
> FYI, this patch is causing a problem when creating many files in a directory
> (without the largedir feature enabled). I haven't looked into it but maybe it
> will ring a bell for someone.
Hmm, is this also a problem without the patch, when creating large numbers
of entries in a directory?
It looks like the directory index is clobbering the directory index block
checksum, which is stored in struct ext2_dx_tail at the end of each index
block. One possibility is that the patch is miscalculating the maximum
number of entries in the index blocks when the metadata_csum feature is
enabled? I don't think that feature is enabled by default with e2fsprogs
yet, so it is entirely possible that these two features aren't quite playing
nice together in the sandbox.
That said, I looked through the patch again with this in mind, and I don't
see anything related to the dx_limit. The dx_node_limit() function correctly
takes the metadata_csum feature into account when calculating the limit of
the htree entries in interior index blocks, so it isn't clear where this
checksum error is coming from.
It might be useful to print out the checksum values, just in case e.g. this
is being checked on a block that was just zeroed out?
Alternately, it might be that we are not initializing the checksum properly
in the first place? Looking at it from this angle, I see what could be a
problem. Can you try out the following (untested) patch (also attached in
case of mangling)? I'm not totally sure it is correct, since the change from
ext4_handle_dirty_metadata() to ext4_handle_dirty_dx_block() could potentially
be ext4_handle_dirty_dirent_block(), but I _think_ the current change is right.
There is also a separate question of whether we need to dirty ALL of the buffers
in this code, compared to the pre-patch changes which had fewer calls to dirty
buffers, but at least this patch should fix the checksum errors.
Cheers, Andreas
----
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bd189c04..f0e8a6c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2309,21 +2309,23 @@ static int ext4_dx_add_entry(handle_t *handle,
dx_insert_block((frame - 1), hash2, newblock);
dxtrace(dx_show_index("node", frame->entries));
dxtrace(dx_show_index("node",
- ((struct dx_node *) bh2->b_data)->entries));
+ ((struct dx_node *)bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
- brelse (bh2);
- ext4_handle_dirty_metadata(handle, dir,
- (frame - 1)->bh);
+ brelse(bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
if (restart) {
- ext4_handle_dirty_metadata(handle, dir,
- frame->bh);
- goto cleanup;
+ err = ext4_handle_dirty_dirty_dx_node(handle,
+ dir, frame->bh);
+ goto journal_error;
}
- } else {
+ } else /* add_level */ {
struct dx_root *dxroot;
- memcpy((char *) entries2, (char *) entries,
+ memcpy((char *)entries2, (char *)entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2335,15 +2337,13 @@ static int ext4_dx_add_entry(handle_t *handle,
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
info->indirect_levels));
- ext4_handle_dirty_metadata(handle, dir, frame->bh);
- ext4_handle_dirty_metadata(handle, dir, bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+ if (err)
+ goto journal_error;
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
brelse(bh2);
restart = 1;
- goto cleanup;
- }
- if (err) {
- ext4_std_error(inode->i_sb, err);
- goto cleanup;
+ goto journal_error;
}
}
de = do_split(handle, dir, &bh, frame, &fname->hinfo);
@@ -2355,7 +2355,7 @@ static int ext4_dx_add_entry(handle_t *handle,
goto cleanup;
journal_error:
- ext4_std_error(dir->i_sb, err);
+ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
dx_release(frames);
> seq -f "abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch
>
> [ 42.726480] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.729472] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.731689] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.734303] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.736383] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.739133] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.741307] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.743963] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> [ 42.745998] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
> ...
Cheers, Andreas
On Mon, May 01, 2017 at 05:39:41PM -0600, Andreas Dilger wrote:
> > FYI, this patch is causing a problem when creating many files in a directory
> > (without the largedir feature enabled). I haven't looked into it but maybe it
> > will ring a bell for someone.
>
> Hmm, is this also a problem without the patch, when creating large numbers
> of entries in a directory?
No, it works fine without the patch. Turnkey reproduction for
kvm-xfstests which works fine without the largedir patch, but which
fails with the largedir patch:
mke2fs -t ext4 -Fq /dev/vdc
mount /vdc
mkdir /vdc/a
cd /vdc/a
seq -f "abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch
umount /vdc
e2fsck -fy /dev/vdc
The output of e2fsck:
e2fsck 1.43.5-WIP-ed1e950f (26-Apr-2017)
Pass 1: Checking inodes, blocks, and sizes
Inode 131073, i_size is 2080768, should be 2084864. Fix? yes
Pass 2: Checking directory structure
Problem in HTREE directory inode 131073: root node fails checksum.
Clear HTree index? yes
Pass 3: Checking directory connectivity
Pass 3A: Optimizing directories
Pass 4: Checking reference counts
Pass 5: Checking group summary information
/dev/vdc: ***** FILE SYSTEM WAS MODIFIED *****
/dev/vdc: 30705/327680 files (0.0% non-contiguous), 42515/1310720 blocks
It looks the large_dir patch is responsible for a test regression ---
it is causing generic/339 to fail.
I'm going to drop the large dir patch for the 4.12 merge window.
Eric, thanks for noticing and pointing this out!
- Ted