From: Bernd Schubert <[email protected]>
Compiling ext4/namei.c with -DDX_DEBUG brought up an error and several
warning messages.
Signed-off-by: Bernd Schubert <[email protected]>
---
fs/ext4/namei.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b754b77..6f32da4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -288,8 +288,8 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
char *name = de->name;
while (len--) printk("%c", *name++);
ext4fs_dirhash(de->name, de->name_len, &h);
- printk(":%x.%u ", h.hash,
- ((char *) de - base));
+ printk(":%x.%ld ", h.hash,
+ (long) ((char *) de - base));
}
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
@@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
*err = -ENOENT;
errout:
- dxtrace(printk(KERN_DEBUG "%s not found\n", name));
+ dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
dx_release (frames);
return NULL;
}
While creating files in large directories we noticed an endless number
of 4K reads, and those reads drastically reduced the file creation rate
reported by bonnie. Where we would expect about 2000 creates/s, we
only got about 25 creates/s. Running the benchmarks for a long time
improved the numbers, but never above 200 creates/s.
It turned out those reads were directory index block reads, and the bh
cache apparently never held all dx blocks at once. Given the high number
of directories we have (8192) and the number of files required to trigger
the issue (16 million), the cached dx blocks were most probably evicted
in favour of other, less important blocks.
The patch below implements a read-ahead for *all* dx blocks of a directory
if a single dx block is missing in the cache. That also helps the LRU
to cache important dx blocks.
Unfortunately, it also has a performance trade-off on the first access to
a directory, even though the READA flag is already set.
Therefore, at least for now, this option is disabled by default, but it may
be enabled using 'mount -o dx_read_ahead' or 'mount -odx_read_ahead=1'.
Signed-off-by: Bernd Schubert <[email protected]>
---
Documentation/filesystems/ext4.txt | 6 ++++
fs/ext4/ext4.h | 3 ++
fs/ext4/inode.c | 28 ++++++++++++++++++
fs/ext4/namei.c | 56 +++++++++++++++++++++++++++++++++---
fs/ext4/super.c | 17 +++++++++++
5 files changed, 106 insertions(+), 4 deletions(-)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 3ae9bc9..fad70ea 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -404,6 +404,12 @@ dioread_nolock locking. If the dioread_nolock option is specified
i_version Enable 64-bit inode version support. This option is
off by default.
+dx_read_ahead Enables read-ahead of directory index blocks.
+ This option should be enabled if the filesystem has several
+ directories with a high number of files. The disadvantage
+ is that additional reads are issued on the first access to
+ a directory, which might slow down other operations.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1921392..997323a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -916,6 +916,8 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
+#define EXT4_MOUNT2_DX_READ_AHEAD 0x00002 /* Read ahead directory index blocks */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1802,6 +1804,7 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
+int ext4_bread_ra(struct inode *inode, ext4_lblk_t block);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a5763e3..938fb6c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1490,6 +1490,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
return bh;
}
+/*
+ * Synchronous read of blocks
+ */
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *err)
{
@@ -1500,6 +1503,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return bh;
if (buffer_uptodate(bh))
return bh;
+
ll_rw_block(READ_META, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
@@ -1509,6 +1513,30 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
+/*
+ * Read-ahead blocks
+ */
+int ext4_bread_ra(struct inode *inode, ext4_lblk_t block)
+{
+ struct buffer_head *bh;
+ int err;
+
+ bh = ext4_getblk(NULL, inode, block, 0, &err);
+ if (!bh)
+ return -1;
+
+ if (buffer_uptodate(bh)) {
+ brelse(bh);
+ return 0;
+ }
+
+ ll_rw_block(READA, 1, &bh);
+
+ brelse(bh);
+ return 0;
+}
+
+
static int walk_page_buffers(handle_t *handle,
struct buffer_head *head,
unsigned from,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6f32da4..78290f0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -334,6 +334,35 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
#endif /* DX_DEBUG */
/*
+ * Read ahead directory index blocks
+ */
+static void dx_ra_blocks(struct inode *dir, struct dx_entry * entries)
+{
+ int i, err = 0;
+ unsigned num_entries = dx_get_count(entries);
+
+ if (num_entries < 2 || num_entries > dx_get_limit(entries)) {
+ dxtrace(printk("dx read-ahead: invalid number of entries\n"));
+ return;
+ }
+
+ dxtrace(printk("dx read-ahead: %d entries in dir-ino %lu \n",
+ num_entries, dir->i_ino));
+
+ i = 1; /* skip first entry, it was already read in by the caller */
+ do {
+ struct dx_entry *entry;
+ ext4_lblk_t block;
+
+ entry = entries + i;
+
+ block = dx_get_block(entry);
+ err = ext4_bread_ra(dir, dx_get_block(entry));
+ i++;
+ } while (i < num_entries && !err);
+}
+
+/*
* Probe for a directory leaf block to search.
*
* dx_probe can return ERR_BAD_DX_DIR, which means there was a format
@@ -347,11 +376,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
{
unsigned count, indirect;
- struct dx_entry *at, *entries, *p, *q, *m;
+ struct dx_entry *at, *entries, *ra_entries, *p, *q, *m;
struct dx_root *root;
struct buffer_head *bh;
struct dx_frame *frame = frame_in;
u32 hash;
+ bool did_ra = false;
frame->bh = NULL;
if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
@@ -390,7 +420,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
goto fail;
}
- entries = (struct dx_entry *) (((char *)&root->info) +
+ ra_entries = entries = (struct dx_entry *) (((char *)&root->info) +
root->info.info_length);
if (dx_get_limit(entries) != dx_root_limit(dir,
@@ -446,9 +476,27 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->bh = bh;
frame->entries = entries;
frame->at = at;
- if (!indirect--) return frame;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+
+ if (!did_ra && test_opt2(dir->i_sb, DX_READ_AHEAD)) {
+ /* read-ahead of dx blocks */
+ struct buffer_head *test_bh;
+ ext4_lblk_t block = dx_get_block(at);
+
+ test_bh = ext4_getblk(NULL, dir, block, 0, err);
+ if (test_bh && !buffer_uptodate(test_bh)) {
+ dx_ra_blocks(dir, ra_entries);
+ did_ra = true;
+ }
+ brelse(test_bh);
+ }
+
+ if (!indirect--)
+ return frame;
+
+ bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err);
+ if (!bh)
goto fail2;
+
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cc5c157..9dd7c05 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1119,6 +1119,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_printf(seq, ",init_inode_table=%u",
(unsigned) sbi->s_li_wait_mult);
+ if (test_opt2(sb, DX_READ_AHEAD))
+ seq_puts(seq, ",dx_read_ahead");
+
ext4_show_quota_options(seq, sb);
return 0;
@@ -1294,6 +1297,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard,
Opt_init_inode_table, Opt_noinit_inode_table,
+ Opt_dx_read_ahead,
};
static const match_table_t tokens = {
@@ -1369,6 +1373,8 @@ static const match_table_t tokens = {
{Opt_init_inode_table, "init_itable=%u"},
{Opt_init_inode_table, "init_itable"},
{Opt_noinit_inode_table, "noinit_itable"},
+ {Opt_dx_read_ahead, "dx_read_ahead=%u"},
+ {Opt_dx_read_ahead, "dx_read_ahead"},
{Opt_err, NULL},
};
@@ -1859,6 +1865,17 @@ set_qf_format:
case Opt_noinit_inode_table:
clear_opt(sb, INIT_INODE_TABLE);
break;
+ case Opt_dx_read_ahead:
+ if (args[0].from) {
+ if (match_int(&args[0], &option))
+ return 0;
+ } else
+ option = 1; /* No argument, default to 1 */
+ if (option)
+ set_opt2(sb, DX_READ_AHEAD);
+ else
+ clear_opt2(sb, DX_READ_AHEAD);
+ break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
On 2011-06-18 00:00, Bernd Schubert wrote:
> From: Bernd Schubert <[email protected]>
>
> Compilation of ext4/namei.c brought up an error and warning messages
> when compiled with -DDX_DEBUG
>
>
> Signed-off-by: Bernd Schubert <[email protected]>
> ---
> fs/ext4/namei.c | 6 +++---
> 1 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index b754b77..6f32da4 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -288,8 +288,8 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
> char *name = de->name;
> while (len--) printk("%c", *name++);
> ext4fs_dirhash(de->name, de->name_len, &h);
> - printk(":%x.%u ", h.hash,
> - ((char *) de - base));
> + printk(":%x.%ld ", h.hash,
> + (long) ((char *) de - base));
> }
How about using %p in printk, like:
> + printk(":%x.%p ", h.hash,
> + ((char *) de - base));
> space += EXT4_DIR_REC_LEN(de->name_len);
> names++;
> @@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
>
> *err = -ENOENT;
> errout:
> - dxtrace(printk(KERN_DEBUG "%s not found\n", name));
> + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
Nice catch :-)
[snip]
On 2011-06-18 00:01, Bernd Schubert wrote:
> While creating files in large directories we noticed an endless number
> of 4K reads. And those reads very much reduced file creation numbers
> as shown by bonnie. While we would expect about 2000 creates/s, we
> only got about 25 creates/s. Running the benchmarks for a long time
> improved the numbers, but not above 200 creates/s.
> It turned out those reads came from directory index block reads
> and probably the bh cache never cached all dx blocks. Given by
> the high number of directories we have (8192) and number of files required
> to trigger the issue (16 million), rather probably bh cached dx blocks
> got lost in favour of other less important blocks.
> The patch below implements a read-ahead for *all* dx blocks of a directory
> if a single dx block is missing in the cache. That also helps the LRU
> to cache important dx blocks.
>
> Unfortunately, it also has a performance trade-off for the first access to
> a directory, although the READA flag is set already.
> Therefore at least for now, this option is disabled by default, but may
> be enabled using 'mount -o dx_read_ahead' or 'mount -odx_read_ahead=1'
>
> Signed-off-by: Bernd Schubert <[email protected]>
> ---
A question: are there any performance numbers for dx dir read-ahead?
My concern is that if the buffer cache replacement behavior is not ideal and
replaces a dx block with other (maybe) hotter blocks, dx dir readahead will
introduce more I/Os. In that case, we should focus on exploring why the dx
block is evicted from the buffer cache rather than on using dx readahead.
[snip]
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6f32da4..78290f0 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -334,6 +334,35 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> #endif /* DX_DEBUG */
>
> /*
> + * Read ahead directory index blocks
> + */
> +static void dx_ra_blocks(struct inode *dir, struct dx_entry * entries)
> +{
> + int i, err = 0;
> + unsigned num_entries = dx_get_count(entries);
> +
> + if (num_entries < 2 || num_entries > dx_get_limit(entries)) {
> + dxtrace(printk("dx read-ahead: invalid number of entries\n"));
> + return;
> + }
> +
> + dxtrace(printk("dx read-ahead: %d entries in dir-ino %lu \n",
> + num_entries, dir->i_ino));
> +
> + i = 1; /* skip first entry, it was already read in by the caller */
> + do {
> + struct dx_entry *entry;
> + ext4_lblk_t block;
> +
> + entry = entries + i;
> +
> + block = dx_get_block(entry);
> + err = ext4_bread_ra(dir, dx_get_block(entry));
> + i++;
> + } while (i < num_entries && !err);
> +}
> +
I see synchronous reading here (CMIIW); this is a performance killer. Asynchronous background read-ahead would be better.
[snip]
Thanks.
Coly
On 2011-06-17, at 12:44 PM, Coly Li wrote:
> On 2011年06月18日 00:01, Bernd Schubert Wrote:
>> While creating files in large directories we noticed an endless number
>> of 4K reads. And those reads very much reduced file creation numbers
>> as shown by bonnie. While we would expect about 2000 creates/s, we
>> only got about 25 creates/s. Running the benchmarks for a long time
>> improved the numbers, but not above 200 creates/s.
>> It turned out those reads came from directory index block reads
>> and probably the bh cache never cached all dx blocks. Given by
>> the high number of directories we have (8192) and number of files required
>> to trigger the issue (16 million), rather probably bh cached dx blocks
>> got lost in favour of other less important blocks.
>> The patch below implements a read-ahead for *all* dx blocks of a directory
>> if a single dx block is missing in the cache. That also helps the LRU
>> to cache important dx blocks.
>>
>> Unfortunately, it also has a performance trade-off for the first access to
>> a directory, although the READA flag is set already.
>> Therefore at least for now, this option is disabled by default, but may
>> be enabled using 'mount -o dx_read_ahead' or 'mount -odx_read_ahead=1'
>>
>> Signed-off-by: Bernd Schubert <[email protected]>
>> ---
>
> A question is, is there any performance number for dx dir read ahead ?
> My concern is, if buffer cache replacement behavior is not ideal, which may replace a dx block by other (maybe) more hot blocks, dx dir readahead will
> introduce more I/Os. In this case, we may focus on exploring why dx block is
> replaced out of buffer cache, other than using dx readahead.
There was an issue we observed in our testing, where the kernel per-CPU buffer LRU was too small, and for large htree directories the buffer cache was always thrashing. Currently the kernel has:
#define BH_LRU_SIZE 8
but it should be larger (16 improved performance for us by about 10%) on a
16-core system in our testing (excerpt below):
> - name find of ext4 will consume about 3 slots
> - creating inode will take about 3 slots
> - name insert of ext4 will consume another 3-4 slots.
> - we also have some attr_set/xattr_set, which will access the LRU as well.
>
> So some BHs will be popped out of the LRU before they can be used again; profiles actually show __find_get_block_slow() and __find_get_block() as the top time-consuming functions. I tried increasing BH_LRU_SIZE to 16 and saw about an 8% increase in the opencreate+close rate on my branch, so I guess we actually get about a 10% improvement for opencreate (only, no close) just by increasing BH_LRU_SIZE.
> [snip]
>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>> index 6f32da4..78290f0 100644
>> --- a/fs/ext4/namei.c
>> +++ b/fs/ext4/namei.c
>> @@ -334,6 +334,35 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> #endif /* DX_DEBUG */
>>
>> /*
>> + * Read ahead directory index blocks
>> + */
>> +static void dx_ra_blocks(struct inode *dir, struct dx_entry * entries)
>> +{
>> + int i, err = 0;
>> + unsigned num_entries = dx_get_count(entries);
>> +
>> + if (num_entries < 2 || num_entries > dx_get_limit(entries)) {
>> + dxtrace(printk("dx read-ahead: invalid number of entries\n"));
>> + return;
>> + }
>> +
>> + dxtrace(printk("dx read-ahead: %d entries in dir-ino %lu \n",
>> + num_entries, dir->i_ino));
>> +
>> + i = 1; /* skip first entry, it was already read in by the caller */
>> + do {
>> + struct dx_entry *entry;
>> + ext4_lblk_t block;
>> +
>> + entry = entries + i;
>> +
>> + block = dx_get_block(entry);
>> + err = ext4_bread_ra(dir, dx_get_block(entry));
>> + i++;
>> + } while (i < num_entries && !err);
>> +}
Two objections here - this is potentially a LOT of readahead that might not
be accessed. Why not limit the number of readahead blocks to some reasonable
amount (e.g. 32 or 64, maybe (BH_LRU_SIZE-1) is best to avoid thrashing?)
and continue to submit more readahead as it traverses the directory.
It is also possible to have ext4_map_blocks() map an array of blocks at one
time, which might improve the efficiency of this code a bit (it needs to hold
i_data_sem during the mapping, so doing more work at once is better).
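Roughly, such a batched lookup could look like the sketch below (the helper
name dx_ra_map_range, the range selection, and the READA handling are
assumptions for illustration only, not part of the posted patch):

	/*
	 * Rough sketch: map a contiguous logical range of the directory with
	 * a single ext4_map_blocks() call and push READA for the returned
	 * physical blocks, instead of one block lookup per dx entry.
	 */
	static void dx_ra_map_range(struct inode *dir, ext4_lblk_t first,
				    unsigned int count)
	{
		struct ext4_map_blocks map = {
			.m_lblk = first,
			.m_len = count,
		};
		struct buffer_head *bh;
		int mapped, i;

		/* one lookup under i_data_sem instead of one per block */
		mapped = ext4_map_blocks(NULL, dir, &map, 0);
		if (mapped <= 0)
			return;

		for (i = 0; i < mapped; i++) {
			bh = sb_getblk(dir->i_sb, map.m_pblk + i);
			if (!bh)
				continue;
			if (!buffer_uptodate(bh))
				ll_rw_block(READA, 1, &bh);
			brelse(bh);
		}
	}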
I also observe some strange inefficiency going on in buffer lookup:
__getblk()
->__find_get_block()
->lookup_bh_lru()
->__find_get_block_slow()
but if that fails, __getblk() continues on to call:
->__getblk_slow()
->unlikely() error message
->__find_get_block()
->lookup_bh_lru()
->__find_get_block_slow()
->grow_buffers()
It appears there is absolutely no benefit to having the initial call to
__find_get_block() in the first place. The "unlikely() error message" is
out-of-line and shouldn't impact perf, and the "slow" part of __getblk_slow()
is skipped if __find_get_block() finds the buffer in the first place.
I could see possibly having __getblk->lookup_bh_lru() for the CPU-local
lookup avoiding some extra function calls (it would also need touch_buffer()
if it finds it via lookup_bh_lru()).
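For reference, a paraphrased sketch of the top-level path described above
(not the exact fs/buffer.c source, just the call structure in question):

	struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
				     unsigned size)
	{
		struct buffer_head *bh = __find_get_block(bdev, block, size);

		might_sleep();
		if (bh == NULL)		/* falls back to the slow path ... */
			bh = __getblk_slow(bdev, block, size);	/* ... which runs __find_get_block() again */
		return bh;
	}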
> I see sync reading here (CMIIW), this is performance killer. An async background reading ahead is better.
>
> [snip]
>
> Thanks.
>
> Coly
Cheers, Andreas
--
Andreas Dilger
Principal Engineer
Whamcloud, Inc.
On 06/17/2011 08:29 PM, Coly Li wrote:
> On 2011年06月18日 00:00, Bernd Schubert Wrote:
>> From: Bernd Schubert <[email protected]>
>>
>> Compilation of ext4/namei.c brought up an error and warning messages
>> when compiled with -DDX_DEBUG
>>
>>
>> Signed-off-by: Bernd Schubert <[email protected]>
>> ---
>> fs/ext4/namei.c | 6 +++---
>> 1 files changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>> index b754b77..6f32da4 100644
>> --- a/fs/ext4/namei.c
>> +++ b/fs/ext4/namei.c
>> @@ -288,8 +288,8 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
>> char *name = de->name;
>> while (len--) printk("%c", *name++);
>> ext4fs_dirhash(de->name, de->name_len, &h);
>> - printk(":%x.%u ", h.hash,
>> - ((char *) de - base));
>> + printk(":%x.%ld ", h.hash,
>> + (long) ((char *) de - base));
>> }
>
> How about use %p in printk like
>> + printk(":%x.%p ", h.hash,
>> + ((char *) de - base));
>
Fine with me :) I will send updated patches on Monday.
Cheers,
Bernd
On 06/17/2011 08:44 PM, Coly Li wrote:
> On 2011年06月18日 00:01, Bernd Schubert Wrote:
>> While creating files in large directories we noticed an endless number
>> of 4K reads. And those reads very much reduced file creation numbers
>> as shown by bonnie. While we would expect about 2000 creates/s, we
>> only got about 25 creates/s. Running the benchmarks for a long time
>> improved the numbers, but not above 200 creates/s.
>> It turned out those reads came from directory index block reads
>> and probably the bh cache never cached all dx blocks. Given by
>> the high number of directories we have (8192) and number of files required
>> to trigger the issue (16 million), rather probably bh cached dx blocks
>> got lost in favour of other less important blocks.
>> The patch below implements a read-ahead for *all* dx blocks of a directory
>> if a single dx block is missing in the cache. That also helps the LRU
>> to cache important dx blocks.
>>
>> Unfortunately, it also has a performance trade-off for the first access to
>> a directory, although the READA flag is set already.
>> Therefore at least for now, this option is disabled by default, but may
>> be enabled using 'mount -o dx_read_ahead' or 'mount -odx_read_ahead=1'
>>
>> Signed-off-by: Bernd Schubert <[email protected]>
>> ---
>
> A question is, is there any performance number for dx dir read ahead ?
Well, I have been benchmarking it all week now. But in between bonnie++ and
ext4 there is FhGFS... What exactly do you want to know?
> My concern is, if buffer cache replacement behavior is not ideal, which may replace a dx block by other (maybe) more hot
> blocks, dx dir readahead will introduce more I/Os. In this case, we may focus on exploring why dx block is replaced out
> of buffer cache, other than using dx readahead.
I think we have to differentiate between two different problems: firstly,
getting all the indexes into memory at all, and secondly, keeping them in
memory. Given the high number of index blocks we have, it is not easy to
distinguish between the two, and I had to add several printks or systemtap
probes to get an idea of why accessing the filesystem was so slow.
>
>
> [snip]
>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>> index 6f32da4..78290f0 100644
>> --- a/fs/ext4/namei.c
>> +++ b/fs/ext4/namei.c
>> @@ -334,6 +334,35 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> #endif /* DX_DEBUG */
>>
>> /*
>> + * Read ahead directory index blocks
>> + */
>> +static void dx_ra_blocks(struct inode *dir, struct dx_entry * entries)
>> +{
>> + int i, err = 0;
>> + unsigned num_entries = dx_get_count(entries);
>> +
>> + if (num_entries < 2 || num_entries > dx_get_limit(entries)) {
>> + dxtrace(printk("dx read-ahead: invalid number of entries\n"));
>> + return;
>> + }
>> +
>> + dxtrace(printk("dx read-ahead: %d entries in dir-ino %lu \n",
>> + num_entries, dir->i_ino));
>> +
>> + i = 1; /* skip first entry, it was already read in by the caller */
>> + do {
>> + struct dx_entry *entry;
>> + ext4_lblk_t block;
>> +
>> + entry = entries + i;
>> +
>> + block = dx_get_block(entry);
>> + err = ext4_bread_ra(dir, dx_get_block(entry));
>> + i++;
>> + } while (i < num_entries && !err);
>> +}
>> +
>
>
> I see sync reading here (CMIIW), this is performance killer. An async background reading ahead is better.
But isn't it async? Please see the new function ext4_bread_ra().
After ll_rw_block(READA, 1, &bh) we don't wait for the buffer to become
up-to-date, but return immediately. I also thought about handing it off to
worker threads, but then figured that would only be additional overhead
without any gain. I didn't test and benchmark that, though.
Thanks for your review!
Cheers,
Bernd
On 06/17/2011 09:29 PM, Andreas Dilger wrote:
> On 2011-06-17, at 12:44 PM, Coly Li wrote:
>> On 2011年06月18日 00:01, Bernd Schubert Wrote:
>>> While creating files in large directories we noticed an endless
>>> number of 4K reads. And those reads very much reduced file
>>> creation numbers as shown by bonnie. While we would expect about
>>> 2000 creates/s, we only got about 25 creates/s. Running the
>>> benchmarks for a long time improved the numbers, but not above
>>> 200 creates/s. It turned out those reads came from directory
>>> index block reads and probably the bh cache never cached all dx
>>> blocks. Given by the high number of directories we have (8192)
>>> and number of files required to trigger the issue (16 million),
>>> rather probably bh cached dx blocks got lost in favour of other
>>> less important blocks. The patch below implements a read-ahead
>>> for *all* dx blocks of a directory if a single dx block is
>>> missing in the cache. That also helps the LRU to cache important
>>> dx blocks.
>>>
>>> Unfortunately, it also has a performance trade-off for the first
>>> access to a directory, although the READA flag is set already.
>>> Therefore at least for now, this option is disabled by default,
>>> but may be enabled using 'mount -o dx_read_ahead' or 'mount
>>> -odx_read_ahead=1'
>>>
>>> Signed-off-by: Bernd Schubert
>>> <[email protected]> ---
>>
>> A question is, is there any performance number for dx dir read
>> ahead ? My concern is, if buffer cache replacement behavior is not
>> ideal, which may replace a dx block by other (maybe) more hot
>> blocks, dx dir readahead will introduce more I/Os. In this case, we
>> may focus on exploring why dx block is replaced out of buffer
>> cache, other than using dx readahead.
>
> There was an issue we observed in our testing, where the kernel
> per-CPU buffer LRU was too small, and for large htree directories the
> buffer cache was always thrashing. Currently the kernel has:
>
> #define BH_LRU_SIZE 8
>
> but it should be larger (16 improved performance for us by about 10%)
> on a 16-core system in our testing (excerpt below):
>> - name find of ext4 will consume about 3 slots
>> - creating inode will take about 3 slots
>> - name insert of ext4 will consume another 3-4 slots.
>> - we also have some attr_set/xattr_set, which will access the LRU as well.
>>
>> So some BHs will be popped out from LRU before it can be using
>> again, actually profile shows __find_get_block_slow() and
>> __find_get_block() are the top time consuming functions. I tried
>> to increase BH_LRU_SIZE to 16, and see about 8% increasing of
>> opencreate+close rate on my branch, so I guess we actually have
>> about 10% improvement for opencreate(only, no close) just by
>> increasing BH_LRU_SIZE.
I think we need to fix bh_lru_install() first, which uses BH_LRU_SIZE to
allocate memory on the stack. I already tried to play with a higher
BH_LRU_SIZE, but ran into stack memory issues...
Shouldn't be too difficult to fix, though.
>
>
>
>> [snip]
>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>>> index 6f32da4..78290f0 100644
>>> --- a/fs/ext4/namei.c
>>> +++ b/fs/ext4/namei.c
>>> @@ -334,6 +334,35 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>> #endif /* DX_DEBUG */
>>>
>>> /*
>>> + * Read ahead directory index blocks
>>> + */
>>> +static void dx_ra_blocks(struct inode *dir, struct dx_entry * entries)
>>> +{
>>> + int i, err = 0;
>>> + unsigned num_entries = dx_get_count(entries);
>>> +
>>> + if (num_entries < 2 || num_entries > dx_get_limit(entries)) {
>>> + dxtrace(printk("dx read-ahead: invalid number of entries\n"));
>>> + return;
>>> + }
>>> +
>>> + dxtrace(printk("dx read-ahead: %d entries in dir-ino %lu \n",
>>> + num_entries, dir->i_ino));
>>> +
>>> + i = 1; /* skip first entry, it was already read in by the caller */
>>> + do {
>>> + struct dx_entry *entry;
>>> + ext4_lblk_t block;
>>> +
>>> + entry = entries + i;
>>> +
>>> + block = dx_get_block(entry);
>>> + err = ext4_bread_ra(dir, dx_get_block(entry));
>>> + i++;
>>> + } while (i < num_entries && !err);
>>> +}
>
> Two objections here - this is potentially a LOT of readahead that
> might not be accessed. Why not limit the number of readahead blocks
> to some reasonable amount (e.g. 32 or 64, maybe (BH_LRU_SIZE-1) is
> best to avoid thrashing?) and continue to submit more readahead as it
> traverses the directory.
Do you suggest starting with a small read-ahead and increasing the number
over time? Where could we store the last number of read-aheads? Using a
fixed value and then always
min_block = current_dx_block - max_ra / 2
max_block = current_dx_block + max_ra / 2
(well, something like that...)
would be rather easy to add.
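A rough sketch of such a windowed read-ahead on top of the patch
(DX_RA_WINDOW and the helper name are made up here for illustration; this is
not part of the posted patch):

	#define DX_RA_WINDOW	32	/* or BH_LRU_SIZE - 1, as suggested */

	/* read ahead only a window of dx entries around the current one */
	static void dx_ra_window(struct inode *dir, struct dx_entry *entries,
				 unsigned int cur)
	{
		unsigned int num_entries = dx_get_count(entries);
		unsigned int first, last, i;

		if (num_entries < 2 || num_entries > dx_get_limit(entries))
			return;

		/* clamp [cur - W/2, cur + W/2] to the valid entry range */
		first = (cur > DX_RA_WINDOW / 2) ? cur - DX_RA_WINDOW / 2 : 1;
		last = cur + DX_RA_WINDOW / 2;
		if (last >= num_entries)
			last = num_entries - 1;

		for (i = first; i <= last; i++) {
			if (ext4_bread_ra(dir, dx_get_block(entries + i)))
				break;
		}
	}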
>
> It is also possible to have ext4_map_blocks() map an array of blocks
> at one time, which might improve the efficiency of this code a bit
> (it needs to hold i_data_sem during the mapping, so doing more work
> at once is better).
Hmm, that would be good of course, but will it really work? At first
glance I see that "struct ext4_map_blocks" needs an explanation of
the member names ;)
But then I don't see an array there, just a block number
(map->m_lblk) and the number of blocks to map (map->m_len). So doesn't
that mean the blocks need to be laid out linearly one after the other on disk?
But do dx blocks really have that layout? From my printks it didn't
look that way. It would be good if we could allocate large directories
up front, though (that should help Lustre's object-hash directories
as well).
It's a bit late here and I might be missing something, though.
Thanks for your review!
>
> I also observe some strange inefficiency going on in buffer lookup:
>
> __getblk()
> ->__find_get_block()
> ->lookup_bh_lru()
> ->__find_get_block_slow()
>
> but if that fails, __getblk() continues on to call:
>
> ->__getblk_slow()
> ->unlikely() error message
> ->__find_get_block()
> ->lookup_bh_lru()
> ->__find_get_block_slow()
> ->grow_buffers()
>
> It appears there is absolutely no benefit to having the initial call
> to __find_get_block() in the first place. The "unlikely() error
> message" is out-of-line and shouldn't impact perf, and the "slow"
> part of __getblk_slow() is skipped if __find_get_block() finds the
> buffer in the first place.
>
> I could see possibly having __getblk->lookup_bh_lru() for the
> CPU-local lookup avoiding some extra function calls (it would also
> need touch_buffer() if it finds it via lookup_bh_lru().
I will also look into it on Monday. It's too late for me to really follow
you here.
Have a nice weekend!
Cheers,
Bernd
2011/6/18 Bernd Schubert <[email protected]>:
> While creating files in large directories we noticed an endless number
> of 4K reads. And those reads very much reduced file creation numbers
> as shown by bonnie. While we would expect about 2000 creates/s, we
> only got about 25 creates/s. Running the benchmarks for a long time
> improved the numbers, but not above 200 creates/s.
> It turned out those reads came from directory index block reads
> and probably the bh cache never cached all dx blocks. Given by
> the high number of directories we have (8192) and number of files required
> to trigger the issue (16 million), rather probably bh cached dx blocks
> got lost in favour of other less important blocks.
> The patch below implements a read-ahead for *all* dx blocks of a directory
> if a single dx block is missing in the cache. That also helps the LRU
> to cache important dx blocks.
>
> Unfortunately, it also has a performance trade-off for the first access to
> a directory, although the READA flag is set already.
> Therefore at least for now, this option is disabled by default, but may
> be enabled using 'mount -o dx_read_ahead' or 'mount -odx_read_ahead=1'
>
> Signed-off-by: Bernd Schubert <[email protected]>
> ---
> [snip]
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6f32da4..78290f0 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -334,6 +334,35 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> #endif /* DX_DEBUG */
>
> /*
> + * Read ahead directory index blocks
> + */
> +static void dx_ra_blocks(struct inode *dir, struct dx_entry * entries)
> +{
> + int i, err = 0;
> + unsigned num_entries = dx_get_count(entries);
> +
> + if (num_entries < 2 || num_entries > dx_get_limit(entries)) {
> + dxtrace(printk("dx read-ahead: invalid number of entries\n"));
> + return;
> + }
> +
> + dxtrace(printk("dx read-ahead: %d entries in dir-ino %lu \n",
> + num_entries, dir->i_ino));
> +
> + i = 1; /* skip first entry, it was already read in by the caller */
> + do {
> + struct dx_entry *entry;
> + ext4_lblk_t block;
> +
> + entry = entries + i;
> +
> + block = dx_get_block(entry);
> + err = ext4_bread_ra(dir, dx_get_block(entry));
I think your meaning may be:
block = dx_get_block(entry);
err = ext4_bread_ra(dir, block);
> + i++;
> + } while (i < num_entries && !err);
> +}
> [snip]
--
Best Regard
Robin Dong