From: Wang Shilong <[email protected]>
Sometimes we need to figure out the progress of lazyinit
running in the background, so this patch adds stats support
for it. The output looks something like:
$ cat /sys/fs/ext4/vda/lazyinit_stats
groups_finished: 80
groups_total: 80
Signed-off-by: Wang Shilong <[email protected]>
---
Sorry if you received this twice; the first copy was
blocked by the mailing list.
---
fs/ext4/ext4.h | 4 ++++
fs/ext4/super.c | 19 ++++++++++++++++---
fs/ext4/sysfs.c | 20 ++++++++++++++++++++
3 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 349afeb..9aaae81 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1505,6 +1505,10 @@ struct ext4_sb_info {
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+
+ /* for lazyinit stats */
+ unsigned long lazyinit_finished_cnt;
+ unsigned long lazyinit_total_cnt;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712..b95e622 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2660,6 +2660,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
+ EXT4_SB(sb)->lazyinit_finished_cnt++;
}
sb_end_write(sb);
@@ -2864,10 +2865,12 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
+ ext4_group_t group, ngroups;
+ struct ext4_group_desc *gdp = NULL;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
if (!elr)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elr->lr_super = sb;
elr->lr_sbi = sbi;
@@ -2880,6 +2883,16 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
*/
elr->lr_next_sched = jiffies + (prandom_u32() %
(EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ ngroups = EXT4_SB(sb)->s_groups_count;
+ for (group = elr->lr_next_group; group < ngroups; group++) {
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (!gdp) {
+			kfree(elr);
+			return ERR_PTR(-EIO);
+ }
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+ sbi->lazyinit_total_cnt++;
+ }
return elr;
}
@@ -2907,8 +2920,8 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
- if (!elr) {
- ret = -ENOMEM;
+ if (IS_ERR(elr)) {
+ ret = PTR_ERR(elr);
goto out;
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 1420a3c..8d0c199 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -20,6 +20,7 @@ typedef enum {
attr_delayed_allocation_blocks,
attr_session_write_kbytes,
attr_lifetime_write_kbytes,
+ attr_lazyinit_stats,
attr_reserved_clusters,
attr_inode_readahead,
attr_trigger_test_error,
@@ -72,6 +73,21 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
+static ssize_t lazyinit_stats_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ int len = 0;
+ unsigned long total = sbi->lazyinit_total_cnt;
+ unsigned long finish = sbi->lazyinit_finished_cnt;
+
+	len += snprintf(buf + len, PAGE_SIZE - len,
+			"groups_finished: %lu\n", finish);
+	len += snprintf(buf + len, PAGE_SIZE - len,
+			"groups_total: %lu\n", total);
+
+ return len;
+}
+
static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
@@ -165,6 +181,7 @@ static struct ext4_attr ext4_attr_##_name = { \
EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
EXT4_ATTR_FUNC(session_write_kbytes, 0444);
EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+EXT4_ATTR_FUNC(lazyinit_stats, 0444);
EXT4_ATTR_FUNC(reserved_clusters, 0644);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
@@ -195,6 +212,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(lazyinit_stats),
ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
@@ -265,6 +283,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return session_write_kbytes_show(a, sbi, buf);
case attr_lifetime_write_kbytes:
return lifetime_write_kbytes_show(a, sbi, buf);
+ case attr_lazyinit_stats:
+ return lazyinit_stats_show(a, sbi, buf);
case attr_reserved_clusters:
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)
--
1.7.1
On 5/16/16 10:41 PM, Wang Shilong wrote:
> From: Wang Shilong <[email protected]>
>
> Sometimes we need to figure out the progress of lazyinit
> running in the background, so this patch adds stats support
> for it. The output looks something like:
>
> $ cat /sys/fs/ext4/vda/lazyinit_stats
> groups_finished: 80
> groups_total: 80
A few thoughts:
If this goes in, it should be documented in
Documentation/filesystems/ext4.txt, as well.
I suppose it might be nice, but when do you think this
might be needed in real life?
Also: sysfs is technically supposed to be one value per
file, and (I think) just a number, no text. At least,
that's what every other ext4 device sysfs file has today,
so this should follow that precedent.
And as far as I can tell, "groups total" is the total
uninit groups at mount time, not total in the filesystem,
so this would change on a remount? I think that's unexpected,
and not very useful.
Simply printing the remaining uninitialized block group
count might be enough, i.e.:
$ cat /sys/fs/ext4/vda/lazyinit_remaining
42
-Eric
> Signed-off-by: Wang Shilong <[email protected]>
> ---
> Sorry if you received this twice; the first copy was
> blocked by the mailing list.
> ---
> fs/ext4/ext4.h | 4 ++++
> fs/ext4/super.c | 19 ++++++++++++++++---
> fs/ext4/sysfs.c | 20 ++++++++++++++++++++
> 3 files changed, 40 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 349afeb..9aaae81 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1505,6 +1505,10 @@ struct ext4_sb_info {
> struct ratelimit_state s_err_ratelimit_state;
> struct ratelimit_state s_warning_ratelimit_state;
> struct ratelimit_state s_msg_ratelimit_state;
> +
> + /* for lazyinit stats */
> + unsigned long lazyinit_finished_cnt;
> + unsigned long lazyinit_total_cnt;
> };
>
> static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 304c712..b95e622 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2660,6 +2660,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
> }
> elr->lr_next_sched = jiffies + elr->lr_timeout;
> elr->lr_next_group = group + 1;
> + EXT4_SB(sb)->lazyinit_finished_cnt++;
> }
> sb_end_write(sb);
>
> @@ -2864,10 +2865,12 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
> {
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> struct ext4_li_request *elr;
> + ext4_group_t group, ngroups;
> + struct ext4_group_desc *gdp = NULL;
>
> elr = kzalloc(sizeof(*elr), GFP_KERNEL);
> if (!elr)
> - return NULL;
> + return ERR_PTR(-ENOMEM);
>
> elr->lr_super = sb;
> elr->lr_sbi = sbi;
> @@ -2880,6 +2883,16 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
> */
> elr->lr_next_sched = jiffies + (prandom_u32() %
> (EXT4_DEF_LI_MAX_START_DELAY * HZ));
> + ngroups = EXT4_SB(sb)->s_groups_count;
> + for (group = elr->lr_next_group; group < ngroups; group++) {
^whitespace issue here
> + gdp = ext4_get_group_desc(sb, group, NULL);
> + if (!gdp) {
> +			kfree(elr);
> +			return ERR_PTR(-EIO);
> + }
> + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
> + sbi->lazyinit_total_cnt++;
> + }
> return elr;
> }
>
> @@ -2907,8 +2920,8 @@ int ext4_register_li_request(struct super_block *sb,
> goto out;
>
> elr = ext4_li_request_new(sb, first_not_zeroed);
> - if (!elr) {
> - ret = -ENOMEM;
> + if (IS_ERR(elr)) {
> + ret = PTR_ERR(elr);
> goto out;
> }
>
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 1420a3c..8d0c199 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -20,6 +20,7 @@ typedef enum {
> attr_delayed_allocation_blocks,
> attr_session_write_kbytes,
> attr_lifetime_write_kbytes,
> + attr_lazyinit_stats,
> attr_reserved_clusters,
> attr_inode_readahead,
> attr_trigger_test_error,
> @@ -72,6 +73,21 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
> EXT4_SB(sb)->s_sectors_written_start) >> 1)));
> }
>
> +static ssize_t lazyinit_stats_show(struct ext4_attr *a,
> + struct ext4_sb_info *sbi, char *buf)
> +{
> + int len = 0;
> + unsigned long total = sbi->lazyinit_total_cnt;
> + unsigned long finish = sbi->lazyinit_finished_cnt;
> +
> +	len += snprintf(buf + len, PAGE_SIZE - len,
> +			"groups_finished: %lu\n", finish);
> +	len += snprintf(buf + len, PAGE_SIZE - len,
> +			"groups_total: %lu\n", total);
> +
> + return len;
> +}
> +
> static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
> struct ext4_sb_info *sbi,
> const char *buf, size_t count)
> @@ -165,6 +181,7 @@ static struct ext4_attr ext4_attr_##_name = { \
> EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
> EXT4_ATTR_FUNC(session_write_kbytes, 0444);
> EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
> +EXT4_ATTR_FUNC(lazyinit_stats, 0444);
> EXT4_ATTR_FUNC(reserved_clusters, 0644);
>
> EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
> @@ -195,6 +212,7 @@ static struct attribute *ext4_attrs[] = {
> ATTR_LIST(delayed_allocation_blocks),
> ATTR_LIST(session_write_kbytes),
> ATTR_LIST(lifetime_write_kbytes),
> + ATTR_LIST(lazyinit_stats),
> ATTR_LIST(reserved_clusters),
> ATTR_LIST(inode_readahead_blks),
> ATTR_LIST(inode_goal),
> @@ -265,6 +283,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
> return session_write_kbytes_show(a, sbi, buf);
> case attr_lifetime_write_kbytes:
> return lifetime_write_kbytes_show(a, sbi, buf);
> + case attr_lazyinit_stats:
> + return lazyinit_stats_show(a, sbi, buf);
> case attr_reserved_clusters:
> return snprintf(buf, PAGE_SIZE, "%llu\n",
> (unsigned long long)
>
On Tue, May 17, 2016 at 11:41:28AM +0800, Wang Shilong wrote:
> From: Wang Shilong <[email protected]>
>
> Sometimes we need to figure out the progress of lazyinit
> running in the background, so this patch adds stats support
> for it. The output looks something like:
>
> $ cat /sys/fs/ext4/vda/lazyinit_stats
> groups_finished: 80
> groups_total: 80
That's more of a procfs style of output. In general the sysfs idiom
would be to have two sysfs variables:
/sys/fs/ext4/vda/nr_block_groups
/sys/fs/ext4/vda/nr_lazyinit_bg_done
and each would return an integer value.
For example:
% cat /sys/block/sda/queue/discard_granularity
512
% cat /sys/block/sda/queue/discard_max_bytes
2147450880
% cat /sys/block/sda/queue/discard_zeroes_data
0
There are certainly some exceptions to this rule, but it simplifies
the sysfs support code, and it makes it a bit easier for userspace
programs to parse the output from those files.
- Ted
On Tue, May 17, 2016 at 11:59 AM, Eric Sandeen <[email protected]> wrote:
> On 5/16/16 10:41 PM, Wang Shilong wrote:
>> From: Wang Shilong <[email protected]>
>>
>> Sometimes we need to figure out the progress of lazyinit
>> running in the background, so this patch adds stats support
>> for it. The output looks something like:
>>
>> $ cat /sys/fs/ext4/vda/lazyinit_stats
>> groups_finished: 80
>> groups_total: 80
>
> A few thoughts:
>
> If this goes in, it should be documented in
> Documentation/filesystems/ext4.txt, as well.
>
> I suppose it might be nice, but when do you think this
> might be needed in real life?
In our performance benchmarking, we sometimes want to wait for
lazyinit to finish before starting the real benchmark (since the
lazyinit thread triggers some writes in the background), but there
is no way to see the progress of lazyinit; we can only tell it has
finished by watching the device write counters go quiet...
>
> Also: sysfs is technically supposed to be one value per
> file, and (I think) just a number, no text. At least,
> that's what every other ext4 device sysfs file has today,
> so this should follow that precedent.
>
> And as far as I can tell, "groups total" is the total
> uninit groups at mount time, not total in the filesystem,
> so this would change on a remount? I think that's unexpected,
> and not very useful.
Yeah, for our usage we need to know the progress of lazyinit,
so after remounting, 'total' will change dynamically.
We could store 'total' in the superblock etc., but IMO that is
a bit of overhead here.
>
> Simply printing the remaining uninitialized block group
> count might be enough, i.e.:
>
> $ cat /sys/fs/ext4/vda/lazyinit_remaining
> 42
Yup, this works fine for me.
Thank you for your comments!
>
> -Eric
>
>> Signed-off-by: Wang Shilong <[email protected]>
>> ---
>> Sorry if you received this twice; the first copy was
>> blocked by the mailing list.
>> ---
>> fs/ext4/ext4.h | 4 ++++
>> fs/ext4/super.c | 19 ++++++++++++++++---
>> fs/ext4/sysfs.c | 20 ++++++++++++++++++++
>> 3 files changed, 40 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 349afeb..9aaae81 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1505,6 +1505,10 @@ struct ext4_sb_info {
>> struct ratelimit_state s_err_ratelimit_state;
>> struct ratelimit_state s_warning_ratelimit_state;
>> struct ratelimit_state s_msg_ratelimit_state;
>> +
>> + /* for lazyinit stats */
>> + unsigned long lazyinit_finished_cnt;
>> + unsigned long lazyinit_total_cnt;
>> };
>>
>> static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 304c712..b95e622 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -2660,6 +2660,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
>> }
>> elr->lr_next_sched = jiffies + elr->lr_timeout;
>> elr->lr_next_group = group + 1;
>> + EXT4_SB(sb)->lazyinit_finished_cnt++;
>> }
>> sb_end_write(sb);
>>
>> @@ -2864,10 +2865,12 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
>> {
>> struct ext4_sb_info *sbi = EXT4_SB(sb);
>> struct ext4_li_request *elr;
>> + ext4_group_t group, ngroups;
>> + struct ext4_group_desc *gdp = NULL;
>>
>> elr = kzalloc(sizeof(*elr), GFP_KERNEL);
>> if (!elr)
>> - return NULL;
>> + return ERR_PTR(-ENOMEM);
>>
>> elr->lr_super = sb;
>> elr->lr_sbi = sbi;
>> @@ -2880,6 +2883,16 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
>> */
>> elr->lr_next_sched = jiffies + (prandom_u32() %
>> (EXT4_DEF_LI_MAX_START_DELAY * HZ));
>> + ngroups = EXT4_SB(sb)->s_groups_count;
>> + for (group = elr->lr_next_group; group < ngroups; group++) {
> ^whitespace issue here
>> + gdp = ext4_get_group_desc(sb, group, NULL);
>> + if (!gdp) {
>> +			kfree(elr);
>> +			return ERR_PTR(-EIO);
>> + }
>> + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
>> + sbi->lazyinit_total_cnt++;
>> + }
>> return elr;
>> }
>>
>> @@ -2907,8 +2920,8 @@ int ext4_register_li_request(struct super_block *sb,
>> goto out;
>>
>> elr = ext4_li_request_new(sb, first_not_zeroed);
>> - if (!elr) {
>> - ret = -ENOMEM;
>> + if (IS_ERR(elr)) {
>> + ret = PTR_ERR(elr);
>> goto out;
>> }
>>
>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>> index 1420a3c..8d0c199 100644
>> --- a/fs/ext4/sysfs.c
>> +++ b/fs/ext4/sysfs.c
>> @@ -20,6 +20,7 @@ typedef enum {
>> attr_delayed_allocation_blocks,
>> attr_session_write_kbytes,
>> attr_lifetime_write_kbytes,
>> + attr_lazyinit_stats,
>> attr_reserved_clusters,
>> attr_inode_readahead,
>> attr_trigger_test_error,
>> @@ -72,6 +73,21 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
>> EXT4_SB(sb)->s_sectors_written_start) >> 1)));
>> }
>>
>> +static ssize_t lazyinit_stats_show(struct ext4_attr *a,
>> + struct ext4_sb_info *sbi, char *buf)
>> +{
>> + int len = 0;
>> + unsigned long total = sbi->lazyinit_total_cnt;
>> + unsigned long finish = sbi->lazyinit_finished_cnt;
>> +
>> +	len += snprintf(buf + len, PAGE_SIZE - len,
>> +			"groups_finished: %lu\n", finish);
>> +	len += snprintf(buf + len, PAGE_SIZE - len,
>> +			"groups_total: %lu\n", total);
>> +
>> + return len;
>> +}
>> +
>> static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
>> struct ext4_sb_info *sbi,
>> const char *buf, size_t count)
>> @@ -165,6 +181,7 @@ static struct ext4_attr ext4_attr_##_name = { \
>> EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
>> EXT4_ATTR_FUNC(session_write_kbytes, 0444);
>> EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
>> +EXT4_ATTR_FUNC(lazyinit_stats, 0444);
>> EXT4_ATTR_FUNC(reserved_clusters, 0644);
>>
>> EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
>> @@ -195,6 +212,7 @@ static struct attribute *ext4_attrs[] = {
>> ATTR_LIST(delayed_allocation_blocks),
>> ATTR_LIST(session_write_kbytes),
>> ATTR_LIST(lifetime_write_kbytes),
>> + ATTR_LIST(lazyinit_stats),
>> ATTR_LIST(reserved_clusters),
>> ATTR_LIST(inode_readahead_blks),
>> ATTR_LIST(inode_goal),
>> @@ -265,6 +283,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>> return session_write_kbytes_show(a, sbi, buf);
>> case attr_lifetime_write_kbytes:
>> return lifetime_write_kbytes_show(a, sbi, buf);
>> + case attr_lazyinit_stats:
>> + return lazyinit_stats_show(a, sbi, buf);
>> case attr_reserved_clusters:
>> return snprintf(buf, PAGE_SIZE, "%llu\n",
>> (unsigned long long)
>>
>
On 5/16/16 11:14 PM, Wang Shilong wrote:
> On Tue, May 17, 2016 at 11:59 AM, Eric Sandeen <[email protected]> wrote:
>> On 5/16/16 10:41 PM, Wang Shilong wrote:
>>> From: Wang Shilong <[email protected]>
>>>
>>> Sometimes we need to figure out the progress of lazyinit
>>> running in the background, so this patch adds stats support
>>> for it. The output looks something like:
>>>
>>> $ cat /sys/fs/ext4/vda/lazyinit_stats
>>> groups_finished: 80
>>> groups_total: 80
>>
>> A few thoughts:
>>
>> If this goes in, it should be documented in
>> Documentation/filesystems/ext4.txt, as well.
>>
>> I suppose it might be nice, but when do you think this
>> might be needed in real life?
>
> In our performance benchmarking, we sometimes want to wait for
> lazyinit to finish before starting the real benchmark (since the
> lazyinit thread triggers some writes in the background), but there
> is no way to see the progress of lazyinit; we can only tell it has
> finished by watching the device write counters go quiet...
In that case you should just specify no lazyinit at mkfs time :)
>> Also: sysfs is technically supposed to be one value per
>> file, and (I think) just a number, no text. At least,
>> that's what every other ext4 device sysfs file has today,
>> so this should follow that precedent.
>>
>> And as far as I can tell, "groups total" is the total
>> uninit groups at mount time, not total in the filesystem,
>> so this would change on a remount? I think that's unexpected,
>> and not very useful.
>
> Yeah, for our usage we need to know the progress of lazyinit,
> so after remounting, 'total' will change dynamically.
>
> We could store 'total' in the superblock etc., but IMO that is
> a bit of overhead here.
Agreed.
>>
>> Simply printing the remaining uninitialized block group
>> count might be enough, i.e.:
>>
>> $ cat /sys/fs/ext4/vda/lazyinit_remaining
>> 42
>
> Yup, this works fine for me.
>
> Thank you for your comments!
Sure thing; I'm still on the fence about usefulness, because if
anyone really cares to wait for it to hit zero, they probably
should have just changed their mkfs options to disable lazyinit.
But if you simply print remaining groups I think it is a very
simple and low-risk patch, so I wouldn't NAK it, either.
-Eric
>>
>> -Eric
On Mon, May 16, 2016 at 11:22:35PM -0500, Eric Sandeen wrote:
>
> Sure thing; I'm still on the fence about usefulness, because if
> anyone really cares to wait for it to hit zero, they probably
> should have just changed their mkfs options to disable lazyinit.
Indeed, if you're going to stall your test startup until lazyinit is
done, it will be faster to let mke2fs initialize the inode table
blocks, since lazyinit deliberately rate limits how much I/O it sends
down to the disk in order to avoid impacting user latencies. So I'd
recommend "mke2fs -E lazy_itable_init=0,lazy_journal_init=0" instead
of patching the kernel and then waiting for lazy itable init to
complete. I think you'll be happier with how long this takes.
BTW, if you care about 99.9 percentile (long tail) latency as a
performance metric, it's a good idea to disable lazy inode table init
and just let mke2fs take its own sweet time. This is what we do
inside Google, since we care very much about long tail latencies on a
number of our workloads.
Cheers,
- Ted
Sure, disabling lazyinit is an option, but our single disks are more than 60TB, and we build several-petabyte systems with Lustre on top of them.
mkfs without lazyinit takes a long time to complete, and we can't do anything until mkfs is done.
lazyinit still helps: we can mount the filesystem and do other work (create the Lustre filesystem, run tests from clients, etc.) in parallel while lazyinit runs in the background.
So we still want to see how lazyinit is progressing and estimate when it will finish, so that we can start the really performance-intensive testing after lazyinit has finished.
Thanks
Ihara
On 5/17/16, 1:45 PM, "Theodore Ts'o" <[email protected]> wrote:
>On Mon, May 16, 2016 at 11:22:35PM -0500, Eric Sandeen wrote:
>>
>> Sure thing; I'm still on the fence about usefulness, because if
>> anyone really cares to wait for it to hit zero, they probably
>> should have just changed their mkfs options to disable lazyinit.
>
>Indeed, if you're going to stall your test startup until lazyinit is
>done, it will be faster to let mke2fs initialize the inode table
>blocks, since lazyinit deliberately rate limits how much I/O it sends
>down to the disk in order to avoid impacting user latencies. So I'd
>recommend "mke2fs -E lazy_itable_init=0,lazy_journal_init=0" instead
>of patching the kernel and then waiting for lazy itable init to
>complete. I think you'll be happier with how long this takes.
>
>BTW, if you care about 99.9 percentile (long tail) latency as a
>performance metric, it's a good idea to disable lazy inode table init
>and just let mke2fs take its own sweet time. This is what we do
>inside Google, since we care very much about long tail latencies on a
>number of our workloads.
>
>Cheers,
>
> - Ted
On Tue, May 17, 2016 at 05:36:57AM +0000, Shuichi Ihara wrote:
> Sure, disabling lazyinit is an option, but our single disks are more
> than 60TB, and we build several-petabyte systems with Lustre on top
> of them. mkfs without lazyinit takes a long time to complete, and we
> can't do anything until mkfs is done. lazyinit still helps: we can
> mount the filesystem and do other work (create the Lustre filesystem,
> run tests from clients, etc.) in parallel while lazyinit runs in the
> background.
Ah, I'm used to doing single disk benchmarks where in the interests of
getting numbers which are as reproducible as possible, I always run
the performance benchmarks on a freshly initialized file system, and
so I don't want to do anything between when the file system is
initialized and when the workload is started --- or if I'm going to be
running a test on an aged file system, I'll have a standardized file
system aging tool with a fixed random seed, and/or a file system image
which I'll dd into place so that the starting point for the
performance evaluation is exactly the same for each benchmark run.
It didn't occur to me that you might want to be using the file system
while lazy init was taking place, and then later on, doing the
performance benchmarks on a used, randomly dirtied file system. Hence
my assumption that the benchmarking shell script[1] would not be doing
anything at all while it waited for lazy init to be finished.
[1] In the interests of keeping the results as consistent as possible,
I tend to have benchmarking scripts that do all of the workload setup
and test running and data collection done in an automated fashion, so
I can kick off a benchmarking run and come back the next day (or after
the weekend) with all of the results collected.
- Ted