2017-05-06 08:43:37

by Artem Blagodarenko

[permalink] [raw]
Subject: [PATCH v2] Add largedir feature

From: Artem Blagodarenko <[email protected]>

This INCOMPAT_LARGEDIR feature allows larger directories to be created
in ldiskfs, both with directory sizes over 2GB and a maximum htree
depth of 3 instead of the current limit of 2. These features are needed
in order to exceed the current limit of approximately 10M entries in a
single directory.

This is the second version of the patch set. Changes since v1:
* ext4_handle_dirty_dx_node is used instead of
ext4_handle_dirty_metadata so the checksum is recalculated.

Signed-off-by: Yang Sheng <[email protected]>
Signed-off-by: Artem Blagodarenko <[email protected]>
---
fs/ext4/ext4.h | 23 ++++++++---
fs/ext4/inode.c | 4 +-
fs/ext4/namei.c | 124 ++++++++++++++++++++++++++++++++++++++------------------
3 files changed, 105 insertions(+), 46 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01d52b9..0bbbd9b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
- EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2125,6 +2126,16 @@ struct dir_private_info {
*/
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))

+/* htree levels for ext4 */
+#define EXT4_HTREE_LEVEL_COMPAT 2
+#define EXT4_HTREE_LEVEL 3
+
+static inline int ext4_dir_htree_level(struct super_block *sb)
+{
+ return ext4_has_feature_largedir(sb) ?
+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+}
+
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
}

-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+static inline loff_t ext4_isize(struct super_block *sb,
+ struct ext4_inode *raw_inode)
{
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ if (ext4_has_feature_largedir(sb) ||
+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}

static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f622d4a..5787f3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- inode->i_size = ext4_isize(raw_inode);
+ inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (ei->i_disksize != ext4_isize(raw_inode)) {
+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6ad612c..c02eead 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
- return le32_to_cpu(entry->block) & 0x00ffffff;
+ return le32_to_cpu(entry->block) & 0x0fffffff;
}

static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;

+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
@@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
}

indirect = root->info.indirect_levels;
- if (indirect > 1) {
- ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
- root->info.indirect_levels);
+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
+ ext4_warning(dir->i_sb,
+ "Directory (ino: %lu) htree depth %#06x exceed"
+ "supported value", dir->i_ino,
+ ext4_dir_htree_level(dir->i_sb));
+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(dir->i_sb, "Enable large directory "
+ "feature to access it");
+ }
goto fail;
}

@@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,

static void dx_release(struct dx_frame *frames)
{
+ struct dx_root_info *info;
+ int i;
+
if (frames[0].bh == NULL)
return;

- if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
- brelse(frames[1].bh);
- brelse(frames[0].bh);
+ info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ for (i = 0; i <= info->indirect_levels; i++) {
+ if (frames[i].bh == NULL)
+ break;
+ brelse(frames[i].bh);
+ frames[i].bh = NULL;
+ }
}

/*
@@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
{
struct dx_hash_info hinfo;
struct ext4_dir_entry_2 *de;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct inode *dir;
ext4_lblk_t block;
int count = 0;
@@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
const struct qstr *d_name = fname->usr_fname;
struct buffer_head *bh;
ext4_lblk_t block;
@@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
*/
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
{
struct buffer_head *bh2;
struct dx_root *root;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
struct ext4_dir_entry_tail *t;
@@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode)
{
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
struct buffer_head *bh;
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
+ int restart;
int err;

+again:
+ restart = 0;
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return PTR_ERR(frame);
@@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (err != -ENOSPC)
goto cleanup;

+ err = 0;
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) {
ext4_lblk_t newblock;
- unsigned icount = dx_get_count(entries);
- int levels = frame - frames;
+ int levels = frame - frames + 1;
+ unsigned int icount;
+ int add_level = 1;
struct dx_entry *entries2;
struct dx_node *node2;
struct buffer_head *bh2;

- if (levels && (dx_get_count(frames->entries) ==
- dx_get_limit(frames->entries))) {
- ext4_warning_inode(dir, "Directory index full!");
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+ add_level = 0;
+ break;
+ }
+ frame--; /* split higher index block */
+ at = frame->at;
+ entries = frame->entries;
+ restart = 1;
+ }
+ if (add_level && levels == ext4_dir_htree_level(sb)) {
+ ext4_warning(sb, "Directory (ino: %lu) index full, "
+ "reach max htree level :%d",
+ dir->i_ino, levels);
+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(sb, "Large directory feature is "
+ "not enabled on this "
+ "filesystem");
+ }
err = -ENOSPC;
goto cleanup;
}
+ icount = dx_get_count(entries);
bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) {
err = PTR_ERR(bh2);
@@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
goto journal_error;
- if (levels) {
+ if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,

BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle,
- frames[0].bh);
+ (frame - 1)->bh);
if (err)
goto journal_error;

@@ -2269,17 +2306,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
- dx_insert_block(frames + 0, hash2, newblock);
- dxtrace(dx_show_index("node", frames[1].entries));
+ dx_insert_block((frame - 1), hash2, newblock);
+ dxtrace(dx_show_index("node", frame->entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
+ ext4_handle_dirty_dx_node(handle, dir,
+ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+ if (restart) {
+ ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ goto journal_error;
+ }
} else {
- dxtrace(printk(KERN_DEBUG
- "Creating second level index...\n"));
+ struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2287,22 +2332,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
- /* Add new access path frame */
- frame = frames + 1;
- frame->at = at = at - entries + entries2;
- frame->entries = entries = entries2;
- frame->bh = bh2;
- err = ext4_journal_get_write_access(handle,
- frame->bh);
+ dxroot = (struct dx_root *)frames[0].bh->b_data;
+ dxroot->info.indirect_levels += 1;
+ dxtrace(printk(KERN_DEBUG
+ "Creating %d level index...\n",
+ info->indirect_levels));
+ ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
- }
- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
- if (err) {
- ext4_std_error(inode->i_sb, err);
- goto cleanup;
+ ext4_handle_dirty_dx_node(handle, dir, bh2);
+ brelse(bh2);
+ restart = 1;
+ goto journal_error;
}
}
de = do_split(handle, dir, &bh, frame, &fname->hinfo);
@@ -2314,10 +2355,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
goto cleanup;

journal_error:
- ext4_std_error(dir->i_sb, err);
+ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
dx_release(frames);
+ /* @restart is true means htree-path has been changed, we need to
+ * repeat dx_probe() to find out valid htree-path
+ */
+ if (restart && err == 0)
+ goto again;
return err;
}

@@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
--
1.8.3.1


2017-05-06 17:34:53

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH v2] Add largedir feature

On May 6, 2017, at 2:43 AM, Artem Blagodarenko <[email protected]> wrote:
>
> From: Artem Blagodarenko <[email protected]>
>
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.
>
> This is second version of the patch set. Changes since v1:
> * ext4_handle_dirty_dx_node is used instead of
> ext4_handle_dirty_metadata so checksum is reculculated.

Looks like the error checking is missing. Comments inline.

> Signed-off-by: Yang Sheng <[email protected]>
> Signed-off-by: Artem Blagodarenko <[email protected]>
> ---
> fs/ext4/ext4.h | 23 ++++++++---
> fs/ext4/inode.c | 4 +-
> fs/ext4/namei.c | 124 ++++++++++++++++++++++++++++++++++++++------------------
> 3 files changed, 105 insertions(+), 46 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 01d52b9..0bbbd9b 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> EXT4_FEATURE_INCOMPAT_MMP | \
> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
> EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
> + EXT4_FEATURE_INCOMPAT_LARGEDIR)
> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
> @@ -2125,6 +2126,16 @@ struct dir_private_info {
> */
> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
>
> +/* htree levels for ext4 */
> +#define EXT4_HTREE_LEVEL_COMPAT 2
> +#define EXT4_HTREE_LEVEL 3
> +
> +static inline int ext4_dir_htree_level(struct super_block *sb)
> +{
> + return ext4_has_feature_largedir(sb) ?
> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
> +}
> +
> /*
> * Timeout and state flag for lazy initialization inode thread.
> */
> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
> }
>
> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
> +static inline loff_t ext4_isize(struct super_block *sb,
> + struct ext4_inode *raw_inode)
> {
> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> + if (ext4_has_feature_largedir(sb) ||
> + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
> le32_to_cpu(raw_inode->i_size_lo);
> - else
> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> +
> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> }
>
> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f622d4a..5787f3d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> if (ext4_has_feature_64bit(sb))
> ei->i_file_acl |=
> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
> - inode->i_size = ext4_isize(raw_inode);
> + inode->i_size = ext4_isize(sb, raw_inode);
> if ((size = i_size_read(inode)) < 0) {
> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
> ret = -EFSCORRUPTED;
> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
> raw_inode->i_file_acl_high =
> cpu_to_le16(ei->i_file_acl >> 32);
> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
> - if (ei->i_disksize != ext4_isize(raw_inode)) {
> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
> ext4_isize_set(raw_inode, ei->i_disksize);
> need_datasync = 1;
> }
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6ad612c..c02eead 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>
> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
> {
> - return le32_to_cpu(entry->block) & 0x00ffffff;
> + return le32_to_cpu(entry->block) & 0x0fffffff;
> }
>
> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
> u32 hash;
>
> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
> frame->bh = ext4_read_dirblock(dir, 0, INDEX);
> if (IS_ERR(frame->bh))
> return (struct dx_frame *) frame->bh;
> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> }
>
> indirect = root->info.indirect_levels;
> - if (indirect > 1) {
> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
> - root->info.indirect_levels);
> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
> + ext4_warning(dir->i_sb,
> + "Directory (ino: %lu) htree depth %#06x exceed"
> + "supported value", dir->i_ino,
> + ext4_dir_htree_level(dir->i_sb));
> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
> + ext4_warning(dir->i_sb, "Enable large directory "
> + "feature to access it");
> + }
> goto fail;
> }
>
> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>
> static void dx_release(struct dx_frame *frames)
> {
> + struct dx_root_info *info;
> + int i;
> +
> if (frames[0].bh == NULL)
> return;
>
> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
> - brelse(frames[1].bh);
> - brelse(frames[0].bh);
> + info = &((struct dx_root *)frames[0].bh->b_data)->info;
> + for (i = 0; i <= info->indirect_levels; i++) {
> + if (frames[i].bh == NULL)
> + break;
> + brelse(frames[i].bh);
> + frames[i].bh = NULL;
> + }
> }
>
> /*
> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
> {
> struct dx_hash_info hinfo;
> struct ext4_dir_entry_2 *de;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct inode *dir;
> ext4_lblk_t block;
> int count = 0;
> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
> struct ext4_dir_entry_2 **res_dir)
> {
> struct super_block * sb = dir->i_sb;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> const struct qstr *d_name = fname->usr_fname;
> struct buffer_head *bh;
> ext4_lblk_t block;
> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
> */
> dir->i_mtime = dir->i_ctime = current_time(dir);
> ext4_update_dx_flag(dir);
> - dir->i_version++;
> + inode_inc_iversion(dir);
> ext4_mark_inode_dirty(handle, dir);
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> err = ext4_handle_dirty_dirent_node(handle, dir, bh);
> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
> {
> struct buffer_head *bh2;
> struct dx_root *root;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct dx_entry *entries;
> struct ext4_dir_entry_2 *de, *de2;
> struct ext4_dir_entry_tail *t;
> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> struct inode *dir, struct inode *inode)
> {
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct dx_entry *entries, *at;
> struct buffer_head *bh;
> struct super_block *sb = dir->i_sb;
> struct ext4_dir_entry_2 *de;
> + int restart;
> int err;
>
> +again:
> + restart = 0;
> frame = dx_probe(fname, dir, NULL, frames);
> if (IS_ERR(frame))
> return PTR_ERR(frame);
> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> if (err != -ENOSPC)
> goto cleanup;
>
> + err = 0;
> /* Block full, should compress but for now just split */
> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
> dx_get_count(entries), dx_get_limit(entries)));
> /* Need to split index? */
> if (dx_get_count(entries) == dx_get_limit(entries)) {
> ext4_lblk_t newblock;
> - unsigned icount = dx_get_count(entries);
> - int levels = frame - frames;
> + int levels = frame - frames + 1;
> + unsigned int icount;
> + int add_level = 1;
> struct dx_entry *entries2;
> struct dx_node *node2;
> struct buffer_head *bh2;
>
> - if (levels && (dx_get_count(frames->entries) ==
> - dx_get_limit(frames->entries))) {
> - ext4_warning_inode(dir, "Directory index full!");
> + while (frame > frames) {
> + if (dx_get_count((frame - 1)->entries) <
> + dx_get_limit((frame - 1)->entries)) {
> + add_level = 0;
> + break;
> + }
> + frame--; /* split higher index block */
> + at = frame->at;
> + entries = frame->entries;
> + restart = 1;
> + }
> + if (add_level && levels == ext4_dir_htree_level(sb)) {
> + ext4_warning(sb, "Directory (ino: %lu) index full, "
> + "reach max htree level :%d",
> + dir->i_ino, levels);
> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
> + ext4_warning(sb, "Large directory feature is "
> + "not enabled on this "
> + "filesystem");
> + }
> err = -ENOSPC;
> goto cleanup;
> }
> + icount = dx_get_count(entries);
> bh2 = ext4_append(handle, dir, &newblock);
> if (IS_ERR(bh2)) {
> err = PTR_ERR(bh2);
> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> err = ext4_journal_get_write_access(handle, frame->bh);
> if (err)
> goto journal_error;
> - if (levels) {
> + if (!add_level) {
> unsigned icount1 = icount/2, icount2 = icount - icount1;
> unsigned hash2 = dx_get_hash(entries + icount1);
> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>
> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
> err = ext4_journal_get_write_access(handle,
> - frames[0].bh);
> + (frame - 1)->bh);
> if (err)
> goto journal_error;
>
> @@ -2269,17 +2306,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> frame->entries = entries = entries2;
> swap(frame->bh, bh2);
> }
> - dx_insert_block(frames + 0, hash2, newblock);
> - dxtrace(dx_show_index("node", frames[1].entries));
> + dx_insert_block((frame - 1), hash2, newblock);
> + dxtrace(dx_show_index("node", frame->entries));
> dxtrace(dx_show_index("node",
> ((struct dx_node *) bh2->b_data)->entries));
> err = ext4_handle_dirty_dx_node(handle, dir, bh2);
> if (err)
> goto journal_error;
> brelse (bh2);
> + ext4_handle_dirty_dx_node(handle, dir,
> + (frame - 1)->bh);

(defect) missing "err" assignment:

err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);

> + if (err)
> + goto journal_error;
> + if (restart) {
> + ext4_handle_dirty_dx_node(handle, dir,
> + frame->bh);

(defect) missing "err" assignment:

err = ext4_handle_dirty_dx_node(handle, dir,
frame->bh);

> + goto journal_error;
> + }
> } else {
> - dxtrace(printk(KERN_DEBUG
> - "Creating second level index...\n"));
> + struct dx_root *dxroot;
> memcpy((char *) entries2, (char *) entries,
> icount * sizeof(struct dx_entry));
> dx_set_limit(entries2, dx_node_limit(dir));
> @@ -2287,22 +2332,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> /* Set up root */
> dx_set_count(entries, 1);
> dx_set_block(entries + 0, newblock);
> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
> -
> - /* Add new access path frame */
> - frame = frames + 1;
> - frame->at = at = at - entries + entries2;
> - frame->entries = entries = entries2;
> - frame->bh = bh2;
> - err = ext4_journal_get_write_access(handle,
> - frame->bh);
> + dxroot = (struct dx_root *)frames[0].bh->b_data;
> + dxroot->info.indirect_levels += 1;
> + dxtrace(printk(KERN_DEBUG
> + "Creating %d level index...\n",
> + info->indirect_levels));
> + ext4_handle_dirty_dx_node(handle, dir, frame->bh);

(defect) missing "err" assignment here too

> if (err)
> goto journal_error;
> - }
> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
> - if (err) {
> - ext4_std_error(inode->i_sb, err);
> - goto cleanup;
> + ext4_handle_dirty_dx_node(handle, dir, bh2);

(defect) missing "err" assignment and check

> + brelse(bh2);
> + restart = 1;
> + goto journal_error;
> }
> }
> de = do_split(handle, dir, &bh, frame, &fname->hinfo);
> @@ -2314,10 +2355,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> goto cleanup;
>
> journal_error:
> - ext4_std_error(dir->i_sb, err);
> + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
> cleanup:
> brelse(bh);
> dx_release(frames);
> + /* @restart is true means htree-path has been changed, we need to
> + * repeat dx_probe() to find out valid htree-path
> + */
> + if (restart && err == 0)
> + goto again;
> return err;
> }
>
> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
> blocksize);
> else
> de->inode = 0;
> - dir->i_version++;
> + inode_inc_iversion(dir);
> return 0;
> }
> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> --
> 1.8.3.1
>


Cheers, Andreas






Attachments:
signature.asc (195.00 B)
Message signed with OpenPGP

2017-05-06 18:19:41

by Artem Blagodarenko

[permalink] [raw]
Subject: [PATCH v3] Add largedir feature

From: Artem Blagodarenko <[email protected]>

This INCOMPAT_LARGEDIR feature allows larger directories to be created
in ldiskfs, both with directory sizes over 2GB and a maximum htree
depth of 3 instead of the current limit of 2. These features are needed
in order to exceed the current limit of approximately 10M entries in a
single directory.

Signed-off-by: Yang Sheng <[email protected]>
Signed-off-by: Artem Blagodarenko <[email protected]>
---
fs/ext4/ext4.h | 23 ++++++++---
fs/ext4/inode.c | 4 +-
fs/ext4/namei.c | 124 ++++++++++++++++++++++++++++++++++++++------------------
3 files changed, 105 insertions(+), 46 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01d52b9..0bbbd9b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
- EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2125,6 +2126,16 @@ struct dir_private_info {
*/
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))

+/* htree levels for ext4 */
+#define EXT4_HTREE_LEVEL_COMPAT 2
+#define EXT4_HTREE_LEVEL 3
+
+static inline int ext4_dir_htree_level(struct super_block *sb)
+{
+ return ext4_has_feature_largedir(sb) ?
+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+}
+
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
}

-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+static inline loff_t ext4_isize(struct super_block *sb,
+ struct ext4_inode *raw_inode)
{
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ if (ext4_has_feature_largedir(sb) ||
+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}

static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f622d4a..5787f3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- inode->i_size = ext4_isize(raw_inode);
+ inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (ei->i_disksize != ext4_isize(raw_inode)) {
+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6ad612c..fd38b4a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
- return le32_to_cpu(entry->block) & 0x00ffffff;
+ return le32_to_cpu(entry->block) & 0x0fffffff;
}

static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;

+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
@@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
}

indirect = root->info.indirect_levels;
- if (indirect > 1) {
- ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
- root->info.indirect_levels);
+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
+ ext4_warning(dir->i_sb,
+ "Directory (ino: %lu) htree depth %#06x exceed"
+ "supported value", dir->i_ino,
+ ext4_dir_htree_level(dir->i_sb));
+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(dir->i_sb, "Enable large directory "
+ "feature to access it");
+ }
goto fail;
}

@@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,

static void dx_release(struct dx_frame *frames)
{
+ struct dx_root_info *info;
+ int i;
+
if (frames[0].bh == NULL)
return;

- if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
- brelse(frames[1].bh);
- brelse(frames[0].bh);
+ info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ for (i = 0; i <= info->indirect_levels; i++) {
+ if (frames[i].bh == NULL)
+ break;
+ brelse(frames[i].bh);
+ frames[i].bh = NULL;
+ }
}

/*
@@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
{
struct dx_hash_info hinfo;
struct ext4_dir_entry_2 *de;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct inode *dir;
ext4_lblk_t block;
int count = 0;
@@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
const struct qstr *d_name = fname->usr_fname;
struct buffer_head *bh;
ext4_lblk_t block;
@@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
*/
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
{
struct buffer_head *bh2;
struct dx_root *root;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
struct ext4_dir_entry_tail *t;
@@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode)
{
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
struct buffer_head *bh;
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
+ int restart;
int err;

+again:
+ restart = 0;
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return PTR_ERR(frame);
@@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (err != -ENOSPC)
goto cleanup;

+ err = 0;
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) {
ext4_lblk_t newblock;
- unsigned icount = dx_get_count(entries);
- int levels = frame - frames;
+ int levels = frame - frames + 1;
+ unsigned int icount;
+ int add_level = 1;
struct dx_entry *entries2;
struct dx_node *node2;
struct buffer_head *bh2;

- if (levels && (dx_get_count(frames->entries) ==
- dx_get_limit(frames->entries))) {
- ext4_warning_inode(dir, "Directory index full!");
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+ add_level = 0;
+ break;
+ }
+ frame--; /* split higher index block */
+ at = frame->at;
+ entries = frame->entries;
+ restart = 1;
+ }
+ if (add_level && levels == ext4_dir_htree_level(sb)) {
+ ext4_warning(sb, "Directory (ino: %lu) index full, "
+ "reach max htree level :%d",
+ dir->i_ino, levels);
+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(sb, "Large directory feature is "
+ "not enabled on this "
+ "filesystem");
+ }
err = -ENOSPC;
goto cleanup;
}
+ icount = dx_get_count(entries);
bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) {
err = PTR_ERR(bh2);
@@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
goto journal_error;
- if (levels) {
+ if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,

BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle,
- frames[0].bh);
+ (frame - 1)->bh);
if (err)
goto journal_error;

@@ -2269,17 +2306,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
- dx_insert_block(frames + 0, hash2, newblock);
- dxtrace(dx_show_index("node", frames[1].entries));
+ dx_insert_block((frame - 1), hash2, newblock);
+ dxtrace(dx_show_index("node", frame->entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+ if (restart) {
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ goto journal_error;
+ }
} else {
- dxtrace(printk(KERN_DEBUG
- "Creating second level index...\n"));
+ struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2287,22 +2332,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
- /* Add new access path frame */
- frame = frames + 1;
- frame->at = at = at - entries + entries2;
- frame->entries = entries = entries2;
- frame->bh = bh2;
- err = ext4_journal_get_write_access(handle,
- frame->bh);
+ dxroot = (struct dx_root *)frames[0].bh->b_data;
+ dxroot->info.indirect_levels += 1;
+ dxtrace(printk(KERN_DEBUG
+ "Creating %d level index...\n",
+ info->indirect_levels));
+ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
- }
- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
- if (err) {
- ext4_std_error(inode->i_sb, err);
- goto cleanup;
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+ brelse(bh2);
+ restart = 1;
+ goto journal_error;
}
}
de = do_split(handle, dir, &bh, frame, &fname->hinfo);
@@ -2314,10 +2355,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
goto cleanup;

journal_error:
- ext4_std_error(dir->i_sb, err);
+ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
dx_release(frames);
+ /* @restart is true means htree-path has been changed, we need to
+ * repeat dx_probe() to find out valid htree-path
+ */
+ if (restart && err == 0)
+ goto again;
return err;
}

@@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
--
1.8.3.1

2017-05-08 23:17:12

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH v3] Add largedir feature

On May 6, 2017, at 12:19 PM, Artem Blagodarenko <[email protected]> wrote:
>
> From: Artem Blagodarenko <[email protected]>
>
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.

Please also credit the original author. Yang Sheng has been maintaining this
patch and is listed as the author of the patches for other kernels.

Signed-off-by: Liang Zhen <[email protected]>

> Signed-off-by: Yang Sheng <[email protected]>
> Signed-off-by: Artem Blagodarenko <[email protected]>

Reviewed-by: Andreas Dilger <[email protected]>

> ---
> fs/ext4/ext4.h | 23 ++++++++---
> fs/ext4/inode.c | 4 +-
> fs/ext4/namei.c | 124 ++++++++++++++++++++++++++++++++++++++------------------
> 3 files changed, 105 insertions(+), 46 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 01d52b9..0bbbd9b 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> EXT4_FEATURE_INCOMPAT_MMP | \
> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
> EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
> + EXT4_FEATURE_INCOMPAT_LARGEDIR)
> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
> @@ -2125,6 +2126,16 @@ struct dir_private_info {
> */
> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
>
> +/* htree levels for ext4 */
> +#define EXT4_HTREE_LEVEL_COMPAT 2
> +#define EXT4_HTREE_LEVEL 3
> +
> +static inline int ext4_dir_htree_level(struct super_block *sb)
> +{
> + return ext4_has_feature_largedir(sb) ?
> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
> +}
> +
> /*
> * Timeout and state flag for lazy initialization inode thread.
> */
> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
> }
>
> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
> +static inline loff_t ext4_isize(struct super_block *sb,
> + struct ext4_inode *raw_inode)
> {
> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> + if (ext4_has_feature_largedir(sb) ||
> + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
> le32_to_cpu(raw_inode->i_size_lo);
> - else
> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> +
> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> }
>
> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f622d4a..5787f3d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> if (ext4_has_feature_64bit(sb))
> ei->i_file_acl |=
> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
> - inode->i_size = ext4_isize(raw_inode);
> + inode->i_size = ext4_isize(sb, raw_inode);
> if ((size = i_size_read(inode)) < 0) {
> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
> ret = -EFSCORRUPTED;
> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
> raw_inode->i_file_acl_high =
> cpu_to_le16(ei->i_file_acl >> 32);
> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
> - if (ei->i_disksize != ext4_isize(raw_inode)) {
> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
> ext4_isize_set(raw_inode, ei->i_disksize);
> need_datasync = 1;
> }
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6ad612c..fd38b4a 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>
> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
> {
> - return le32_to_cpu(entry->block) & 0x00ffffff;
> + return le32_to_cpu(entry->block) & 0x0fffffff;
> }
>
> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
> u32 hash;
>
> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
> frame->bh = ext4_read_dirblock(dir, 0, INDEX);
> if (IS_ERR(frame->bh))
> return (struct dx_frame *) frame->bh;
> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> }
>
> indirect = root->info.indirect_levels;
> - if (indirect > 1) {
> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
> - root->info.indirect_levels);
> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
> + ext4_warning(dir->i_sb,
> + "Directory (ino: %lu) htree depth %#06x exceed"
> + "supported value", dir->i_ino,
> + ext4_dir_htree_level(dir->i_sb));
> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
> + ext4_warning(dir->i_sb, "Enable large directory "
> + "feature to access it");
> + }
> goto fail;
> }
>
> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>
> static void dx_release(struct dx_frame *frames)
> {
> + struct dx_root_info *info;
> + int i;
> +
> if (frames[0].bh == NULL)
> return;
>
> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
> - brelse(frames[1].bh);
> - brelse(frames[0].bh);
> + info = &((struct dx_root *)frames[0].bh->b_data)->info;
> + for (i = 0; i <= info->indirect_levels; i++) {
> + if (frames[i].bh == NULL)
> + break;
> + brelse(frames[i].bh);
> + frames[i].bh = NULL;
> + }
> }
>
> /*
> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
> {
> struct dx_hash_info hinfo;
> struct ext4_dir_entry_2 *de;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct inode *dir;
> ext4_lblk_t block;
> int count = 0;
> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
> struct ext4_dir_entry_2 **res_dir)
> {
> struct super_block * sb = dir->i_sb;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> const struct qstr *d_name = fname->usr_fname;
> struct buffer_head *bh;
> ext4_lblk_t block;
> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
> */
> dir->i_mtime = dir->i_ctime = current_time(dir);
> ext4_update_dx_flag(dir);
> - dir->i_version++;
> + inode_inc_iversion(dir);
> ext4_mark_inode_dirty(handle, dir);
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> err = ext4_handle_dirty_dirent_node(handle, dir, bh);
> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
> {
> struct buffer_head *bh2;
> struct dx_root *root;
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct dx_entry *entries;
> struct ext4_dir_entry_2 *de, *de2;
> struct ext4_dir_entry_tail *t;
> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> struct inode *dir, struct inode *inode)
> {
> - struct dx_frame frames[2], *frame;
> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> struct dx_entry *entries, *at;
> struct buffer_head *bh;
> struct super_block *sb = dir->i_sb;
> struct ext4_dir_entry_2 *de;
> + int restart;
> int err;
>
> +again:
> + restart = 0;
> frame = dx_probe(fname, dir, NULL, frames);
> if (IS_ERR(frame))
> return PTR_ERR(frame);
> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> if (err != -ENOSPC)
> goto cleanup;
>
> + err = 0;
> /* Block full, should compress but for now just split */
> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
> dx_get_count(entries), dx_get_limit(entries)));
> /* Need to split index? */
> if (dx_get_count(entries) == dx_get_limit(entries)) {
> ext4_lblk_t newblock;
> - unsigned icount = dx_get_count(entries);
> - int levels = frame - frames;
> + int levels = frame - frames + 1;
> + unsigned int icount;
> + int add_level = 1;
> struct dx_entry *entries2;
> struct dx_node *node2;
> struct buffer_head *bh2;
>
> - if (levels && (dx_get_count(frames->entries) ==
> - dx_get_limit(frames->entries))) {
> - ext4_warning_inode(dir, "Directory index full!");
> + while (frame > frames) {
> + if (dx_get_count((frame - 1)->entries) <
> + dx_get_limit((frame - 1)->entries)) {
> + add_level = 0;
> + break;
> + }
> + frame--; /* split higher index block */
> + at = frame->at;
> + entries = frame->entries;
> + restart = 1;
> + }
> + if (add_level && levels == ext4_dir_htree_level(sb)) {
> + ext4_warning(sb, "Directory (ino: %lu) index full, "
> + "reach max htree level :%d",
> + dir->i_ino, levels);
> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
> + ext4_warning(sb, "Large directory feature is "
> + "not enabled on this "
> + "filesystem");
> + }
> err = -ENOSPC;
> goto cleanup;
> }
> + icount = dx_get_count(entries);
> bh2 = ext4_append(handle, dir, &newblock);
> if (IS_ERR(bh2)) {
> err = PTR_ERR(bh2);
> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> err = ext4_journal_get_write_access(handle, frame->bh);
> if (err)
> goto journal_error;
> - if (levels) {
> + if (!add_level) {
> unsigned icount1 = icount/2, icount2 = icount - icount1;
> unsigned hash2 = dx_get_hash(entries + icount1);
> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>
> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
> err = ext4_journal_get_write_access(handle,
> - frames[0].bh);
> + (frame - 1)->bh);
> if (err)
> goto journal_error;
>
> @@ -2269,17 +2306,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> frame->entries = entries = entries2;
> swap(frame->bh, bh2);
> }
> - dx_insert_block(frames + 0, hash2, newblock);
> - dxtrace(dx_show_index("node", frames[1].entries));
> + dx_insert_block((frame - 1), hash2, newblock);
> + dxtrace(dx_show_index("node", frame->entries));
> dxtrace(dx_show_index("node",
> ((struct dx_node *) bh2->b_data)->entries));
> err = ext4_handle_dirty_dx_node(handle, dir, bh2);
> if (err)
> goto journal_error;
> brelse (bh2);
> + err = ext4_handle_dirty_dx_node(handle, dir,
> + (frame - 1)->bh);
> + if (err)
> + goto journal_error;
> + if (restart) {
> + err = ext4_handle_dirty_dx_node(handle, dir,
> + frame->bh);
> + goto journal_error;
> + }
> } else {
> - dxtrace(printk(KERN_DEBUG
> - "Creating second level index...\n"));
> + struct dx_root *dxroot;
> memcpy((char *) entries2, (char *) entries,
> icount * sizeof(struct dx_entry));
> dx_set_limit(entries2, dx_node_limit(dir));
> @@ -2287,22 +2332,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> /* Set up root */
> dx_set_count(entries, 1);
> dx_set_block(entries + 0, newblock);
> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
> -
> - /* Add new access path frame */
> - frame = frames + 1;
> - frame->at = at = at - entries + entries2;
> - frame->entries = entries = entries2;
> - frame->bh = bh2;
> - err = ext4_journal_get_write_access(handle,
> - frame->bh);
> + dxroot = (struct dx_root *)frames[0].bh->b_data;
> + dxroot->info.indirect_levels += 1;
> + dxtrace(printk(KERN_DEBUG
> + "Creating %d level index...\n",
> + info->indirect_levels));
> + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
> if (err)
> goto journal_error;
> - }
> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
> - if (err) {
> - ext4_std_error(inode->i_sb, err);
> - goto cleanup;
> + err = ext4_handle_dirty_dx_node(handle, dir, bh2);
> + brelse(bh2);
> + restart = 1;
> + goto journal_error;
> }
> }
> de = do_split(handle, dir, &bh, frame, &fname->hinfo);
> @@ -2314,10 +2355,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> goto cleanup;
>
> journal_error:
> - ext4_std_error(dir->i_sb, err);
> + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
> cleanup:
> brelse(bh);
> dx_release(frames);
> + /* @restart is true means htree-path has been changed, we need to
> + * repeat dx_probe() to find out valid htree-path
> + */
> + if (restart && err == 0)
> + goto again;
> return err;
> }
>
> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
> blocksize);
> else
> de->inode = 0;
> - dir->i_version++;
> + inode_inc_iversion(dir);
> return 0;
> }
> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> --
> 1.8.3.1
>


Cheers, Andreas






Attachments:
signature.asc (195.00 B)
Message signed with OpenPGP

2017-06-22 01:11:57

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH v3] Add largedir feature

On Mon, May 08, 2017 at 05:17:05PM -0600, Andreas Dilger wrote:
> On May 6, 2017, at 12:19 PM, Artem Blagodarenko <[email protected]> wrote:
> >
> > From: Artem Blagodarenko <[email protected]>
> >
> > This INCOMPAT_LARGEDIR feature allows larger directories to be created
> > in ldiskfs, both with directory sizes over 2GB and a maximum htree
> > depth of 3 instead of the current limit of 2. These features are needed
> > in order to exceed the current limit of approximately 10M entries in a
> > single directory.
>
> Please also credit the original author. Yang Sheng has been maintaining this
> patch and is listed as the author of the patches for other kernels.
>
> Signed-off-by: Liang Zhen <[email protected]>
>
> > Signed-off-by: Yang Sheng <[email protected]>
> > Signed-off-by: Artem Blagodarenko <[email protected]>
>
> Reviewed-by: Andreas Dilger <[email protected]>

Applied, thanks.

- Ted

2017-06-24 05:44:04

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH v3] Add largedir feature

It looks like there are at least two xfstests failures if the largedir
feature is enabled. generic/021 fails, but I think this may be a test
issue because it seems to be mke2fs complaining about not having
enough space to create the file system.

generic/027, which is an ENOSPC hitter, is failing with largedir
enabled, but apparently it hasn't been failing w/o this feature being
enabled. The test description is "Run 8 processes writing 1k files to
seperate files in seperate dirs to hit ENOSPC on small fs with little
free space. Loop for 100 iterations."

This was using the following configuration:

export EXT_MKFS_OPTIONS="-I 2048 -O ^64bit,mmp,uninit_bg,^extents,large_dir,dir_nlink,quota,huge_file,flex_bg -E lazy_journal_init"

- Ted

2017-07-02 23:30:58

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH v3] Add largedir feature

Some more information about the failure that I'm seeing.

It reproduces *extremely* reliably using:

gce-xfstests -c lustre_mds generic/027

I'm testing on the ext4 dev branch, and it only shows up with the
largedir setup. The test in question is creating lots of 1k files in
separate directories to hit ENOSPC. So I'm guessing it's some kind of
problem in the error handling path.

From looking at the console logs it looks like things are coming to a
dead halt due to a blocked wait_on_buffer() in jbd2_write_superblock()
in the commit thread. Everything else ends up waiting for the commit
to finish, and it's all she wrote.

The generic/027 test passes on the 4k and 1k configuration. It also
passes when run under kvm-xfstests with the same parameters, so it's
likely there is some kind of timing component as well.

I started doing some more digging, and it looks like it has nothing to
do with largedir. Instead it seems to be something weird with
lazy_itable initialization. This works fine:

/sbin/mkfs.ext4 -F -b 4096 /dev/mapper/xt-vdc 65536
mount /dev/mapper/xt-vdc /xt-vdc
sleep 1 ; df ; sleep 1
umount /xt-vdc

Replace the first mkfs command with:

/sbin/mkfs.ext4 -F -I 2048 -b 4096 /dev/mapper/xt-vdc 65536

and the system locks up in the same way as generic/027 when run using
the lustre_mds configuration.

Replace the first mkfs with:

/sbin/mkfs.ext4 -F -I 2048 -b 4096 -E lazy_itable_init=0 /dev/mapper/xt-vdc 65536

there are no problems. So, it looks like it's some combination of
using a 2048 inode size and lazy itable initialization.

I haven't figured out if this is a recent regression, or whether this
is something that we're only seeing recently. It also seems to be
related to some SCSI tag aborts that we aren't seeing elsewhere, so it
may have to do with how we are issuing discards. Whether this is a
GCE issue or something which doesn't show up because the KVM I am
using handles discards differently is another unknown issue. But I thought
I would at least ease your mind that this doesn't seem to be
specifically a largedir issue.

Cheers,

- Ted


Attachments:
(No filename) (2.06 kB)
console.out.gz (20.03 kB)
Download all attachments

2017-07-03 16:04:34

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH v3] Add largedir feature

On Sun, Jul 02, 2017 at 07:30:56PM -0400, Theodore Ts'o wrote:
>
> I haven't figured out if this is a recent regression, or whether this
> is something that we're only seeing recently. It also seems to be
> related to some SCSI tag aborts that we aren't seeing elsewhere, so it
> may have to do with how we are issuing discards. Whether this is a
> GCE issue or something which doesn't show up because the KVM I am
> using handles discards differently is another unknown issue. But I thought
> I would at least ease your mind that this doesn't seem to be
> specifically a largedir issue.

... It now appears that the ext4/021 failure is caused by a GCE PD
bug, and it was unmasked by using 2048 byte inodes. I've worked
around it for now by using mke2fs -E lazy_itable_init=0. (The bug
seems to be triggered by the call to sb_issue_zeroout in the lazy
inode table initialization, and doesn't show up with the standard 256
byte inodes.)

The next failure I'm running into can be replicated on kvm-xfstests as
well as gce-xfstests, but it seems to be an xattr related failure,
with a handle not getting started with enough credits. I need to look
at that one a bit closer, since it's not clear it's a large_dir
related one. It's only triggering on the lustre_mds configuration,
though. It runs clean on the standard ext4 4k configuration, which is
curious because it appears that the largedir code is implicated.

- Ted

generic/070 [10:18:14][ 63.464178] run fstests generic/070 at 2017-07-03 10:18:14
[ 64.279344] ------------[ cut here ]------------
[ 64.280358] WARNING: CPU: 1 PID: 3122 at /usr/projects/linux/ext4/fs/ext4/ext4_jbd2.c:277 __ext4_handle_dirty_metadata+0x173/0x27b
[ 64.282634] CPU: 1 PID: 3122 Comm: fsstress Tainted: G L 4.12.0-rc2-ext4-00042-g037ee4110538 #450
[ 64.284483] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 64.285871] task: ffff88005e552780 task.stack: ffff8800687d0000
[ 64.286868] RIP: 0010:__ext4_handle_dirty_metadata+0x173/0x27b
[ 64.287950] RSP: 0018:ffff8800687d76d8 EFLAGS: 00010286
[ 64.288921] RAX: ffff88006c02a340 RBX: ffff88003a146f40 RCX: ffffffff813e5e4f
[ 64.290085] RDX: 1ffff10007428deb RSI: dffffc0000000000 RDI: ffff88006c02a340
[ 64.291393] RBP: ffff8800687d7720 R08: ffff88005fff71f8 R09: ffffed000fff9608
[ 64.292627] R10: 0000000000000000 R11: ffff88007ffcb043 R12: ffff88005fff71f8
[ 64.293587] R13: 00000000ffffffe4 R14: ffff880064cb3750 R15: 00000000000007e7
[ 64.294449] FS: 00007f642d4b3700(0000) GS:ffff88006d400000(0000) knlGS:0000000000000000
[ 64.295845] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 64.296851] CR2: 00007f642d4b0000 CR3: 0000000068cb6000 CR4: 00000000000006e0
[ 64.298117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 64.298962] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 64.299879] Call Trace:
[ 64.300206] ext4_xattr_block_set+0x1034/0x12bf
[ 64.300780] ? ext4_xattr_inode_array_free+0x51/0x51
[ 64.301463] ? do_get_write_access+0x5bb/0x685
[ 64.302040] ? jbd2_journal_put_journal_head+0x1e7/0x202
[ 64.302629] ? ext4_xattr_check_entries+0x67/0xf7
[ 64.303159] ? memcmp+0x2e/0x4e
[ 64.303468] ? ext4_xattr_ibody_set+0x5b/0x108
[ 64.303893] ext4_xattr_set_handle+0x45e/0x7d6
[ 64.304319] ? check_noncircular+0x31/0x31
[ 64.304773] ? ext4_xattr_block_set+0x12bf/0x12bf
[ 64.305331] ? __lock_is_held+0x33/0x94
[ 64.305749] ? __ext4_journal_start_sb+0x136/0x1c0
[ 64.306252] ext4_xattr_set+0x156/0x1ce
[ 64.306620] ? ext4_xattr_set_handle+0x7d6/0x7d6
[ 64.307077] ? check_noncircular+0x31/0x31
[ 64.307467] ? kvm_clock_read+0x1e/0x20
[ 64.307910] ? mark_lock+0xba/0x75b
[ 64.308304] ? find_held_lock+0x80/0x91
[ 64.308622] ext4_xattr_user_set+0x72/0x7c
[ 64.308959] __vfs_setxattr+0x7c/0x8c
[ 64.309314] __vfs_setxattr_noperm+0x9a/0x1f3
[ 64.309782] vfs_setxattr+0x8d/0xa9
[ 64.310246] setxattr+0x18d/0x1cb
[ 64.310641] ? vfs_setxattr+0xa9/0xa9
[ 64.311193] ? __lock_is_held+0x33/0x94
[ 64.311654] ? rcu_read_lock_sched_held+0x4c/0x53
[ 64.312148] ? rcu_sync_lockdep_assert+0x41/0x67
[ 64.312614] ? __mnt_is_readonly+0x34/0x41
[ 64.313032] ? __mnt_want_write+0x83/0x8e
[ 64.313378] path_setxattr+0xda/0x12f
[ 64.313586] ? setxattr+0x1cb/0x1cb
[ 64.313790] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 64.314049] SyS_lsetxattr+0x11/0x15
[ 64.314271] entry_SYSCALL_64_fastpath+0x1f/0xbe
[ 64.314604] RIP: 0033:0x7f642cdb65b9
[ 64.314963] RSP: 002b:00007ffc9d620a58 EFLAGS: 00000246 ORIG_RAX: 00000000000000bd
[ 64.315693] RAX: ffffffffffffffda RBX: 0000000000000046 RCX: 00007f642cdb65b9
[ 64.316322] RDX: 00007f6428000ab0 RSI: 00007ffc9d620a90 RDI: 00007f64280008c0
[ 64.316948] RBP: ffff8800687d7f98 R08: 0000000000000000 R09: 00007ffc9d620d40
[ 64.317575] R10: 00000000000007d0 R11: 0000000000000246 R12: 0000000000052000
[ 64.318260] R13: 0000000000000003 R14: 000000000004a000 R15: 000000000000005f
[ 64.318847] Code: ef ff 48 8b 45 c8 48 8b 00 48 89 c7 48 89 45 c8 e8 cd 22 ef ff 48 8b 45 c8 f6 00 02 0f 85 ff 00 00 00 45 85 ed 0f 84 ef fe ff ff <0f> ff 48 8b 7d d0 45 89 e8 48 89 d9 44 89 fe 48 c7 c2 20 37 11
[ 64.320670] ---[ end trace ab1bc60121ac1b7e ]---
[ 64.321081] EXT4-fs: ext4_xattr_block_set:2023: aborting transaction: error 28 in __ext4_handle_dirty_metadata
[ 64.321893] EXT4-fs error (device vdd): ext4_xattr_block_set:2023: inode #131076: block 589906: comm fsstress: journal_dirty_metadata failed: handle type 10 started at line 2411, credits 5/0, errcode -28
[ 64.326370] EXT4-fs error (device vdd) in ext4_xattr_set:2419: error 28

2017-07-04 17:56:20

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH v3] Add largedir feature

On Sat, May 06, 2017 at 02:19:21PM -0400, Artem Blagodarenko wrote:
> From: Artem Blagodarenko <[email protected]>
>
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.
>
> Signed-off-by: Yang Sheng <[email protected]>
> Signed-off-by: Artem Blagodarenko <[email protected]>

I'm going to fold the following minor fix-up to this patch before I
send it to Linus.

- Ted

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f97611171023..5e61e464d71c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -77,7 +77,14 @@

#define EXT4_RESERVE_TRANS_BLOCKS 12U

-#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
+/*
+ * Number of credits needed if we need to insert an entry into a
+ * directory. For each new index block, we need 4 blocks (old index
+ * block, new index block, bitmap block, bg summary). For normal
+ * htree directories there are 2 levels; if the largedir feature
+ * enabled it's 3 levels.
+ */
+#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U

#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was