2020-05-15 10:08:24

by Alex Zhuravlev

Subject: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

This should significantly improve bitmap loading, especially for flex
groups, as it tries to load all bitmaps within a flex group instead of
one by one synchronously.

Prefetching is done in batches of 8 * flex_bg groups, so it should come
to 8 read-ahead reads for a single allocating thread. At the end of
allocation the thread waits for read-ahead completion and initializes
the buddy information so that the read-aheads are not lost in case of
memory pressure.

At cr=0 the number of prefetching IOs is limited per allocation context
to prevent a situation where mballoc loads thousands of bitmaps looking
for a perfect group while ignoring groups with good chunks.

Together with the patch "ext4: limit scanning of uninitialized groups"
the mount time (which includes a few tiny allocations) of a 1PB
filesystem is reduced significantly:

                  0% full    50%-full unpatched    patched
mount time            33s                 9279s       563s

Signed-off-by: Alex Zhuravlev <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
---
fs/ext4/balloc.c | 12 ++++-
fs/ext4/ext4.h | 8 +++-
fs/ext4/mballoc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/mballoc.h | 2 +
fs/ext4/sysfs.c | 4 ++
5 files changed, 138 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a32e5f7b5385..6712146195ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -413,7 +413,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
* Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -444,6 +445,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
if (bitmap_uptodate(bh))
goto verify;

+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, do not wait
+ * if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
@@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;

- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 91eb4381cae5..521fbcd8efc7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1483,6 +1483,8 @@ struct ext4_sb_info {
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;

/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -2420,7 +2422,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -3119,6 +3122,7 @@ struct ext4_group_info {
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4

#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3133,6 +3137,8 @@ struct ext4_group_info {
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))

#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index afb8bd9a10e9..ebfe258bfd0f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -2127,6 +2127,96 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}

+/*
+ * each allocation context (i.e. a thread doing allocation) has its own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of scanning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+ ext4_group_t start)
+{
+ struct super_block *sb = ac->ac_sb;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info *grp;
+ ext4_group_t nr, group = start;
+ struct buffer_head *bh;
+
+ /* limit prefetching at cr=0, otherwise mballoc can
+ * spend a lot of time loading imperfect groups */
+ if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+ return;
+
+ /* batch prefetching to get few READs in flight */
+ nr = ac->ac_prefetch - group;
+ if (ac->ac_prefetch < group)
+ /* wrapped to the first groups */
+ nr += ngroups;
+ if (nr > 0)
+ return;
+ BUG_ON(nr < 0);
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* align to flex_bg to get more bitmaps with a single IO */
+ nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+ nr = nr + sbi->s_mb_prefetch - group;
+ }
+ while (nr-- > 0) {
+ grp = ext4_get_group_info(sb, group);
+
+ /* prevent expensive getblk() on groups w/ IO in progress */
+ if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
+ goto next;
+
+ /* ignore empty groups - those will be skipped
+ * during the scanning as well */
+ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh))
+ ac->ac_prefetch_ios++;
+ brelse(bh);
+ }
+ }
+next:
+ if (++group >= ngroups)
+ group = 0;
+ }
+ ac->ac_prefetch = group;
+}
+
+static void
+ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+{
+ struct ext4_group_info *grp;
+ ext4_group_t group;
+ int nr, rc;
+
+ /* initialize last window of prefetched groups */
+ nr = ac->ac_prefetch_ios;
+ if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+ nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+ group = ac->ac_prefetch;
+ if (!group)
+ group = ext4_get_groups_count(ac->ac_sb);
+ group--;
+ while (nr-- > 0) {
+ grp = ext4_get_group_info(ac->ac_sb, group);
+ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+ rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ if (rc)
+ break;
+ }
+ if (!group)
+ group = ext4_get_groups_count(ac->ac_sb);
+ group--;
+ }
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -2200,6 +2290,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ ac->ac_prefetch = group;

for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2211,6 +2302,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
if (group >= ngroups)
group = 0;

+ ext4_mb_prefetch(ac, group);
+
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group(ac, group, cr);
if (ret <= 0) {
@@ -2283,6 +2376,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
out:
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
+ /* use prefetched bitmaps to init buddy so that read info is not lost */
+ ext4_mb_prefetch_fini(ac);
return err;
}

@@ -2542,6 +2637,25 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}

+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO */
+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* now many real IOs to prefetch within a single allocation at cr=0
+ * given cr=0 is a CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4 */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;

err_freebuddy:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..c96a2bd81f72 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -175,6 +175,8 @@ struct ext4_allocation_context {
struct page *ac_buddy_page;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
+ ext4_group_t ac_prefetch;
+ int ac_prefetch_ios; /* number of initialized prefetch IOs */
};

#define AC_STATUS_CONTINUE 1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 04bfaf63752c..5f443f9d54b8 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -240,6 +240,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);

static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -283,6 +285,8 @@ static struct attribute *ext4_attrs[] = {
#ifdef CONFIG_EXT4_DEBUG
ATTR_LIST(simulate_fail),
#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
--
2.21.3
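
For reference, the sizing hunk added to ext4_mb_init_backend() boils down
to simple arithmetic. Below is a standalone sketch with assumed inputs
(s_log_groups_per_flex = 4, i.e. 16 groups per flex_bg, and an 8192-group
filesystem); the numbers are illustrative, not taken from the benchmark
above:

#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;	/* assumed flex_bg size: 16 groups */
	unsigned int groups_count = 8192;	/* assumed number of block groups */
	unsigned int prefetch, limit;

	/* a single flex group is supposed to be read by a single IO,
	 * and up to 8 such read-ahead IOs may be in flight */
	prefetch = (1U << log_groups_per_flex) * 8;	/* 16 * 8 = 128 groups */
	if (prefetch > groups_count)
		prefetch = groups_count;

	/* per-context IO budget at cr=0: four windows, so that ~5ms
	 * random reads keep the prefetch phase well under a second */
	limit = prefetch * 4;	/* 128 * 4 = 512 groups */
	if (limit > groups_count)
		limit = groups_count;

	printf("s_mb_prefetch=%u s_mb_prefetch_limit=%u\n", prefetch, limit);
	return 0;
}

These are the same two values the new mb_prefetch and mb_prefetch_limit
sysfs attributes expose, so both defaults can be tuned at runtime.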


2020-05-20 04:50:10

by kernel test robot

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

Hi Alex,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ext4/dev]
[also build test WARNING on v5.7-rc6 next-20200519]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url: https://github.com/0day-ci/linux/commits/Alex-Zhuravlev/ext4-mballoc-prefetching-for-bitmaps/20200515-181552
base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: microblaze-randconfig-r024-20200519 (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=microblaze

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <[email protected]>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

In file included from include/linux/kernel.h:11,
from include/linux/list.h:9,
from include/linux/wait.h:7,
from include/linux/wait_bit.h:8,
from include/linux/fs.h:6,
from fs/ext4/ext4_jbd2.h:15,
from fs/ext4/mballoc.c:12:
fs/ext4/mballoc.c: In function 'ext4_mb_prefetch':
fs/ext4/mballoc.c:2137:12: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits]
2137 | BUG_ON(nr < 0);
| ^
include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
58 | #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
| ^~~~
>> include/asm-generic/bug.h:62:32: note: in expansion of macro 'if'
62 | #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
| ^~
include/linux/compiler.h:48:24: note: in expansion of macro '__branch_check__'
48 | # define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
| ^~~~~~~~~~~~~~~~
include/asm-generic/bug.h:62:36: note: in expansion of macro 'unlikely'
62 | #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
| ^~~~~~~~
fs/ext4/mballoc.c:2137:2: note: in expansion of macro 'BUG_ON'
2137 | BUG_ON(nr < 0);
| ^~~~~~

vim +/if +62 include/asm-generic/bug.h

^1da177e4c3f41 Linus Torvalds 2005-04-16 60
^1da177e4c3f41 Linus Torvalds 2005-04-16 61 #ifndef HAVE_ARCH_BUG_ON
2a41de48b81e61 Alexey Dobriyan 2007-07-17 @62 #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
^1da177e4c3f41 Linus Torvalds 2005-04-16 63 #endif
^1da177e4c3f41 Linus Torvalds 2005-04-16 64

:::::: The code at line 62 was first introduced by commit
:::::: 2a41de48b81e61fbe260ae5031ebcb6f935f35fb Fix sparse false positives re BUG_ON(ptr)

:::::: TO: Alexey Dobriyan <[email protected]>
:::::: CC: Linus Torvalds <[email protected]>

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
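
The warning itself is easy to reproduce outside the kernel tree; a
minimal sketch (a hypothetical standalone file, compiled with
gcc -Wtype-limits) using the same unsigned typedef as ext4:

typedef unsigned int ext4_group_t;	/* as in fs/ext4/ext4.h */

int repro(ext4_group_t nr)
{
	/* always false for an unsigned value; this is exactly what gcc
	 * flags, surfaced above through the BUG_ON() macro expansion */
	return nr < 0;
}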



2020-05-23 19:42:36

by kernel test robot

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

Hi Alex,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ext4/dev]
[also build test WARNING on v5.7-rc6 next-20200522]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url: https://github.com/0day-ci/linux/commits/Alex-Zhuravlev/ext4-mballoc-prefetching-for-bitmaps/20200515-181552
base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: x86_64-lkp (attached as .config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build):
# save the attached .config to linux build tree
make ARCH=x86_64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <[email protected]>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

In file included from include/linux/export.h:43:0,
from include/linux/linkage.h:7,
from include/linux/fs.h:5,
from fs/ext4/ext4_jbd2.h:15,
from fs/ext4/mballoc.c:12:
fs/ext4/mballoc.c: In function 'ext4_mb_prefetch':
>> fs/ext4/mballoc.c:2137:12: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits]
BUG_ON(nr < 0);
^
include/linux/compiler.h:78:42: note: in definition of macro 'unlikely'
# define unlikely(x) __builtin_expect(!!(x), 0)
^
fs/ext4/mballoc.c:2137:2: note: in expansion of macro 'BUG_ON'
BUG_ON(nr < 0);
^~~~~~

vim +2137 fs/ext4/mballoc.c

2106
2107 /*
2108 * each allocation context (i.e. a thread doing allocation) has its own
2109 * sliding prefetch window of @s_mb_prefetch size which starts at the
2110 * very first goal and moves ahead of scanning.
2111 * a side effect is that subsequent allocations will likely find
2112 * the bitmaps in cache or at least in-flight.
2113 */
2114 static void
2115 ext4_mb_prefetch(struct ext4_allocation_context *ac,
2116 ext4_group_t start)
2117 {
2118 struct super_block *sb = ac->ac_sb;
2119 ext4_group_t ngroups = ext4_get_groups_count(sb);
2120 struct ext4_sb_info *sbi = EXT4_SB(sb);
2121 struct ext4_group_info *grp;
2122 ext4_group_t nr, group = start;
2123 struct buffer_head *bh;
2124
2125 /* limit prefetching at cr=0, otherwise mballoc can
2126 * spend a lot of time loading imperfect groups */
2127 if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
2128 return;
2129
2130 /* batch prefetching to get few READs in flight */
2131 nr = ac->ac_prefetch - group;
2132 if (ac->ac_prefetch < group)
2133 /* wrapped to the first groups */
2134 nr += ngroups;
2135 if (nr > 0)
2136 return;
> 2137 BUG_ON(nr < 0);
2138
2139 nr = sbi->s_mb_prefetch;
2140 if (ext4_has_feature_flex_bg(sb)) {
2141 /* align to flex_bg to get more bitmaps with a single IO */
2142 nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
2143 nr = nr + sbi->s_mb_prefetch - group;
2144 }
2145 while (nr-- > 0) {
2146 grp = ext4_get_group_info(sb, group);
2147
2148 /* prevent expensive getblk() on groups w/ IO in progress */
2149 if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
2150 goto next;
2151
2152 /* ignore empty groups - those will be skipped
2153 * during the scanning as well */
2154 if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
2155 bh = ext4_read_block_bitmap_nowait(sb, group, true);
2156 if (bh && !IS_ERR(bh)) {
2157 if (!buffer_uptodate(bh))
2158 ac->ac_prefetch_ios++;
2159 brelse(bh);
2160 }
2161 }
2162 next:
2163 if (++group >= ngroups)
2164 group = 0;
2165 }
2166 ac->ac_prefetch = group;
2167 }
2168

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2020-05-28 14:48:25

by Theodore Ts'o

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

On Fri, May 15, 2020 at 10:07:06AM +0000, Alex Zhuravlev wrote:
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2127,6 +2127,96 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
> return 0;
> }
>
> +/*
> + * each allocation context (i.e. a thread doing allocation) has its own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of scanning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void
> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> + ext4_group_t start)
> +{
> + struct super_block *sb = ac->ac_sb;
> + ext4_group_t ngroups = ext4_get_groups_count(sb);
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_group_info *grp;
> + ext4_group_t nr, group = start;
> + struct buffer_head *bh;
> +
> + /* limit prefetching at cr=0, otherwise mballoc can
> + * spend a lot of time loading imperfect groups */
> + if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
> + return;
> +
> + /* batch prefetching to get few READs in flight */
> + nr = ac->ac_prefetch - group;
> + if (ac->ac_prefetch < group)
> + /* wrapped to the first groups */
> + nr += ngroups;
> + if (nr > 0)
> + return;
> + BUG_ON(nr < 0);

What are you trying to do here? If nr > 0, we return; if nr < 0, we
BUG() --- but nr is an unsigned int, so it can never trigger --- which
was the warning reported by the kbuild test bot. So we will only get
past this point if ac_prefetch == group. But ac_prefetch appears to
be the last group that we prefetched, so it's not clear that the logic
is correct here.

- Ted
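
A minimal standalone sketch of the arithmetic Ted describes, with the
types and names simplified from the patch (the helper and its assertions
are illustrative, not kernel code):

#include <assert.h>

typedef unsigned int ext4_group_t;

/* returns 1 when control would reach the prefetch loop in the patch */
static int reaches_prefetch_loop(ext4_group_t prefetch, ext4_group_t group,
				 ext4_group_t ngroups)
{
	ext4_group_t nr = prefetch - group;	/* wraps around if prefetch < group */

	if (prefetch < group)
		nr += ngroups;	/* wraps back to the in-range distance */
	if (nr > 0)
		return 0;
	/* BUG_ON(nr < 0) is dead code here: nr is unsigned */
	return 1;
}

int main(void)
{
	assert(reaches_prefetch_loop(7, 7, 128) == 1);	/* only equality passes */
	assert(reaches_prefetch_loop(9, 7, 128) == 0);	/* window ahead: skip */
	assert(reaches_prefetch_loop(5, 7, 128) == 0);	/* wrapped: nr = 126, skip */
	return 0;
}

In other words, assuming both group numbers are below ngroups, the whole
dance is equivalent to a direct "if (prefetch != group) return;", which
is the simplification Alex adopts later in the thread.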

2020-05-29 16:22:27

by Artem Blagodarenko

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

Hello Alex,

Some remarks below, after the code quote.

Also, we have encountered a directory creation rate drop with this patch (not exactly this one, but the Lustre FS version): from 70-80K to 30-40K.
Excluding this patch restores the rates to the original values.
I am investigating it now. Alex, do you expect this optimisation to have an impact on name creation?
Is creating plenty of files and directories a corner case for this optimisation?

Thanks,
Best regards,
Artem Blagodarenko.

> On 15 May 2020, at 13:07, Alex Zhuravlev <[email protected]> wrote:
>
> This should significantly improve bitmap loading, especially for flex
> groups, as it tries to load all bitmaps within a flex group instead of
> one by one synchronously.
>
> Prefetching is done in batches of 8 * flex_bg groups, so it should come
> to 8 read-ahead reads for a single allocating thread. At the end of
> allocation the thread waits for read-ahead completion and initializes
> the buddy information so that the read-aheads are not lost in case of
> memory pressure.
>
> At cr=0 the number of prefetching IOs is limited per allocation context
> to prevent a situation where mballoc loads thousands of bitmaps looking
> for a perfect group while ignoring groups with good chunks.
>
> Together with the patch "ext4: limit scanning of uninitialized groups"
> the mount time (which includes a few tiny allocations) of a 1PB
> filesystem is reduced significantly:
>
>                   0% full    50%-full unpatched    patched
> mount time            33s                 9279s       563s
>
> Signed-off-by: Alex Zhuravlev <[email protected]>
> Reviewed-by: Andreas Dilger <[email protected]>
> ---
> fs/ext4/balloc.c | 12 ++++-
> fs/ext4/ext4.h | 8 +++-
> fs/ext4/mballoc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/mballoc.h | 2 +
> fs/ext4/sysfs.c | 4 ++
> 5 files changed, 138 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index a32e5f7b5385..6712146195ed 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -413,7 +413,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
> * Return buffer_head on success or an ERR_PTR in case of failure.
> */
> struct buffer_head *
> -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
> + bool ignore_locked)
> {
> struct ext4_group_desc *desc;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -444,6 +445,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> if (bitmap_uptodate(bh))
> goto verify;
>
> + if (ignore_locked && buffer_locked(bh)) {
> + /* buffer under IO already, do not wait
> + * if called for prefetching */
> + put_bh(bh);
> + return NULL;
> + }
> +
> lock_buffer(bh);
> if (bitmap_uptodate(bh)) {
> unlock_buffer(bh);
> @@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
> struct buffer_head *bh;
> int err;
>
> - bh = ext4_read_block_bitmap_nowait(sb, block_group);
> + bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
> if (IS_ERR(bh))
> return bh;
> err = ext4_wait_block_bitmap(sb, block_group, bh);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 91eb4381cae5..521fbcd8efc7 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1483,6 +1483,8 @@ struct ext4_sb_info {
> /* where last allocation was done - for stream allocation */
> unsigned long s_mb_last_group;
> unsigned long s_mb_last_start;
> + unsigned int s_mb_prefetch;
> + unsigned int s_mb_prefetch_limit;
>
> /* stats for buddy allocator */
> atomic_t s_bal_reqs; /* number of reqs with len > 1 */
> @@ -2420,7 +2422,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
> extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
>
> extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
> - ext4_group_t block_group);
> + ext4_group_t block_group,
> + bool ignore_locked);
> extern int ext4_wait_block_bitmap(struct super_block *sb,
> ext4_group_t block_group,
> struct buffer_head *bh);
> @@ -3119,6 +3122,7 @@ struct ext4_group_info {
> (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
> #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
> (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
> +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
>
> #define EXT4_MB_GRP_NEED_INIT(grp) \
> (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
> @@ -3133,6 +3137,8 @@ struct ext4_group_info {
> (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
> (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
> + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
>
> #define EXT4_MAX_CONTENTION 8
> #define EXT4_CONTENTION_THRESHOLD 2
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index afb8bd9a10e9..ebfe258bfd0f 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
> bh[i] = NULL;
> continue;
> }
> - bh[i] = ext4_read_block_bitmap_nowait(sb, group);
> + bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
> if (IS_ERR(bh[i])) {
> err = PTR_ERR(bh[i]);
> bh[i] = NULL;
> @@ -2127,6 +2127,96 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
> return 0;
> }
>
> +/*
> + * each allocation context (i.e. a thread doing allocation) has its own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of scanning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void
> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> + ext4_group_t start)
> +{
> + struct super_block *sb = ac->ac_sb;
> + ext4_group_t ngroups = ext4_get_groups_count(sb);
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_group_info *grp;
> + ext4_group_t nr, group = start;
> + struct buffer_head *bh;
> +

Could it be useful to give an ability to disable this optimisation? As an option, by setting s_mb_prefetch to zero.
Right now a 0 in s_mb_prefetch only allows skipping the optimisation for cr=0 and cr=1.

Something like:

if (sbi->s_mb_prefetch == 0)
	return;

Some changes in ext4_mb_prefetch_fini() would probably also be required.

> + /* limit prefetching at cr=0, otherwise mballoc can
> + * spend a lot of time loading imperfect groups */
> + if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
> + return;

A comment above says prefetching is limited for cr=0, but the code limits it for cr=0 and cr=1.
Do you need to change the comment or the code?

> +
> + /* batch prefetching to get few READs in flight */
> + nr = ac->ac_prefetch - group;
> + if (ac->ac_prefetch < group)
> + /* wrapped to the first groups */
> + nr += ngroups;
> + if (nr > 0)
> + return;
> + BUG_ON(nr < 0);
> +
> + nr = sbi->s_mb_prefetch;
> + if (ext4_has_feature_flex_bg(sb)) {
> + /* align to flex_bg to get more bitmaps with a single IO */
> + nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
> + nr = nr + sbi->s_mb_prefetch - group;
> + }
> + while (nr-- > 0) {
> + grp = ext4_get_group_info(sb, group);
> +
> + /* prevent expensive getblk() on groups w/ IO in progress */
> + if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
> + goto next;
> +
> + /* ignore empty groups - those will be skipped
> + * during the scanning as well */
> + if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
> + bh = ext4_read_block_bitmap_nowait(sb, group, true);
> + if (bh && !IS_ERR(bh)) {
> + if (!buffer_uptodate(bh))
> + ac->ac_prefetch_ios++;
> + brelse(bh);
> + }
> + }
> +next:
> + if (++group >= ngroups)
> + group = 0;
> + }
> + ac->ac_prefetch = group;
> +}
> +
> +static void
> +ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
> +{
> + struct ext4_group_info *grp;
> + ext4_group_t group;
> + int nr, rc;
> +
> + /* initialize last window of prefetched groups */
> + nr = ac->ac_prefetch_ios;
> + if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
> + nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
> + group = ac->ac_prefetch;
> + if (!group)
> + group = ext4_get_groups_count(ac->ac_sb);
> + group--;
> + while (nr-- > 0) {
> + grp = ext4_get_group_info(ac->ac_sb, group);
> + if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
> + rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
> + if (rc)
> + break;
> + }
> + if (!group)
> + group = ext4_get_groups_count(ac->ac_sb);
> + group--;
> + }
> +}
> +
> static noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> @@ -2200,6 +2290,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> * from the goal value specified
> */
> group = ac->ac_g_ex.fe_group;
> + ac->ac_prefetch = group;
>
> for (i = 0; i < ngroups; group++, i++) {
> int ret = 0;
> @@ -2211,6 +2302,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> if (group >= ngroups)
> group = 0;
>
> + ext4_mb_prefetch(ac, group);
> +
> /* This now checks without needing the buddy page */
> ret = ext4_mb_good_group(ac, group, cr);
> if (ret <= 0) {
> @@ -2283,6 +2376,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> out:
> if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
> err = first_err;
> + /* use prefetched bitmaps to init buddy so that read info is not lost */
> + ext4_mb_prefetch_fini(ac);
> return err;
> }
>
> @@ -2542,6 +2637,25 @@ static int ext4_mb_init_backend(struct super_block *sb)
> goto err_freebuddy;
> }
>
> + if (ext4_has_feature_flex_bg(sb)) {
> + /* a single flex group is supposed to be read by a single IO */
> + sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
> + sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
> + } else {
> + sbi->s_mb_prefetch = 32;
> + }
> + if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
> + sbi->s_mb_prefetch = ext4_get_groups_count(sb);
> + /* now many real IOs to prefetch within a single allocation at cr=0

Do you mean “how many” here? At cr=0 and cr=1?

> + * given cr=0 is a CPU-related optimization we shouldn't try to
> + * load too many groups, at some point we should start to use what
> + * we've got in memory.
> + * with an average random access time 5ms, it'd take a second to get
> + * 200 groups (* N with flex_bg), so let's make this limit 4 */
> + sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
> + if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
> + sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
> +
> return 0;
>
> err_freebuddy:
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 88c98f17e3d9..c96a2bd81f72 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -175,6 +175,8 @@ struct ext4_allocation_context {
> struct page *ac_buddy_page;
> struct ext4_prealloc_space *ac_pa;
> struct ext4_locality_group *ac_lg;
> + ext4_group_t ac_prefetch;
> + int ac_prefetch_ios; /* number of initialized prefetch IOs */
> };
>
> #define AC_STATUS_CONTINUE 1
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 04bfaf63752c..5f443f9d54b8 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -240,6 +240,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
> EXT4_ATTR(first_error_time, 0444, first_error_time);
> EXT4_ATTR(last_error_time, 0444, last_error_time);
> EXT4_ATTR(journal_task, 0444, journal_task);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
>
> static unsigned int old_bump_val = 128;
> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -283,6 +285,8 @@ static struct attribute *ext4_attrs[] = {
> #ifdef CONFIG_EXT4_DEBUG
> ATTR_LIST(simulate_fail),
> #endif
> + ATTR_LIST(mb_prefetch),
> + ATTR_LIST(mb_prefetch_limit),
> NULL,
> };
> ATTRIBUTE_GROUPS(ext4);
> --
> 2.21.3
>

2020-05-30 04:58:30

by Alex Zhuravlev

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps



> On 28 May 2020, at 17:47, Theodore Y. Ts'o <[email protected]> wrote:
>
> What are you trying to do here? If nr > 0, we return; if nr < 0, we
> BUG() --- but nr is an unsigned int, so we never can trigger --- which
> was the warning reported by the kbuild test bot. So we will only get
> past this point if ac_prefetch == group. But ac_prefetch appears to
> be the last group that we prefetched, so it's not clear that the logic
> is correct here.

You’re right, this part “evolved” since the initial version, but I forgot to make it clear.
Basically this should be replaced with:

if (ac->ac_prefetch != group)
	return;

ac->ac_prefetch is just a cursor for the current process.

Thanks, Alex

2020-05-30 05:02:21

by Alex Zhuravlev

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps


Hi

> On 29 May 2020, at 19:19, Благодаренко Артём <[email protected]> wrote:
>
> Also, we have encountered a directory creation rate drop with this patch (not exactly this one, but the Lustre FS version): from 70-80K to 30-40K.
> Excluding this patch restores the rates to the original values.
> I am investigating it now. Alex, do you expect this optimisation to have an impact on name creation?
> Is creating plenty of files and directories a corner case for this optimisation?

Noticed as well, the last version posted to the list should have this problem fixed.

>
> Could it be useful to give an ability to disable this optimisation? As an option, by setting s_mb_prefetch to zero.
> Right now a 0 in s_mb_prefetch only allows skipping the optimisation for cr=0 and cr=1.

s_mb_prefetch_limit=0 can be used to disable prefetching?

>> + /* limit prefetching at cr=0, otherwise mballoc can
>> + * spend a lot of time loading imperfect groups */
>> + if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
>> + return;
>
> A comment above says prefetching is limited for cr=0, but the code limits it for cr=0 and cr=1.
> Do you need to change the comment or the code?

OK

>> + if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
>> + sbi->s_mb_prefetch = ext4_get_groups_count(sb);
>> + /* now many real IOs to prefetch within a single allocation at cr=0
>
> Do you mean “how many” here? At cr=0 and cr=1?

Yes, of course


Thanks, Alex
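
To spell out what that suggestion does under the patch as posted, here is
a minimal sketch of the early-exit guard (names simplified; only the
check quoted above is modelled): a zero s_mb_prefetch_limit exhausts the
per-context budget immediately for cr=0/1, while, as Artem notes, cr=2/3
scans still prefetch.

#include <assert.h>
#include <stdbool.h>

/* simplified form of the guard at the top of ext4_mb_prefetch() */
static bool may_prefetch(int cr, int prefetch_ios, unsigned int limit)
{
	/* at cr=0/1, stop once the per-context IO budget is exhausted */
	if (cr < 2 && prefetch_ios >= limit)
		return false;
	return true;
}

int main(void)
{
	assert(!may_prefetch(0, 0, 0));	/* limit=0: no prefetch at cr=0 */
	assert(!may_prefetch(1, 0, 0));	/* ...nor at cr=1 */
	assert(may_prefetch(2, 0, 0));	/* but cr>=2 still prefetches */
	return 0;
}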

2020-06-20 04:12:11

by Theodore Ts'o

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

On Sat, May 30, 2020 at 05:01:40AM +0000, Alex Zhuravlev wrote:
>
> Hi
>
> > On 29 May 2020, at 19:19, Благодаренко Артём <[email protected]> wrote:
> >
> > Also, we have encountered a directory creation rate drop with this patch (not exactly this one, but the Lustre FS version): from 70-80K to 30-40K.
> > Excluding this patch restores the rates to the original values.
> > I am investigating it now. Alex, do you expect this optimisation to have an impact on name creation?
> > Is creating plenty of files and directories a corner case for this optimisation?
>
> Noticed as well, the last version posted to the list should have this problem fixed.

I'm not seeing any newer version of this patch since the one on this
thread, posted on May 15th.

Am I missing something? What's the latest version of this patch that you have?

- Ted

2020-06-20 04:21:26

by Alex Zhuravlev

Subject: Re: [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

Please find it attached, rebased against the latest tree.

Thanks, Alex


> On 19 Jun 2020, at 20:16, Theodore Y. Ts'o <[email protected]> wrote:
>
> I'm not seeing any newer version of this patch since the one on this
> thread, posted on May 15th.
>
> Am I missing something? What's the latest version of this patch that you have?
>
> - Ted


From 023a44b30b1c77873401ee310705933482a89b02 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <[email protected]>
Date: Tue, 21 Apr 2020 10:54:07 +0300
Subject: [PATCH] [PATCH 1/2] ext4: mballoc - prefetching for bitmaps

This should significantly improve bitmap loading, especially for flex
groups, as it tries to load all bitmaps within a flex group instead of
one by one synchronously.

Prefetching is done in batches of 8 * flex_bg groups, so it should come
to 8 read-ahead reads for a single allocating thread. At the end of
allocation the thread waits for read-ahead completion and initializes
the buddy information so that the read-aheads are not lost in case of
memory pressure.

At cr=0 the number of prefetching IOs is limited per allocation context
to prevent a situation where mballoc loads thousands of bitmaps looking
for a perfect group while ignoring groups with good chunks.

Together with the patch "ext4: limit scanning of uninitialized groups"
the mount time (which includes a few tiny allocations) of a 1PB
filesystem is reduced significantly:

                  0% full    50%-full unpatched    patched
mount time            33s                 9279s       563s

Signed-off-by: Alex Zhuravlev <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
---
fs/ext4/balloc.c | 12 ++++-
fs/ext4/ext4.h | 8 +++-
fs/ext4/mballoc.c | 113 +++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/mballoc.h | 2 +
fs/ext4/sysfs.c | 4 ++
5 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ba46d87cdf1..c52011af4812 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -413,7 +413,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
* Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -444,6 +445,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
if (bitmap_uptodate(bh))
goto verify;

+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, do not wait
+ * if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
@@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;

- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42f5060f3cdf..7451662e092a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1505,6 +1505,8 @@ struct ext4_sb_info {
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;

/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -2446,7 +2448,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -3145,6 +3148,7 @@ struct ext4_group_info {
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4

#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3159,6 +3163,8 @@ struct ext4_group_info {
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))

#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9d69f0a1372d..88af980e945e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -2244,6 +2244,91 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
return ret;
}

+/*
+ * each allocation context (i.e. a thread doing allocation) has its own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of scanning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+ ext4_group_t start)
+{
+ struct super_block *sb = ac->ac_sb;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info *grp;
+ ext4_group_t nr, group = start;
+ struct buffer_head *bh;
+
+ /* limit prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups */
+ if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+ return;
+
+ /* batch prefetching to get few READs in flight */
+ if (ac->ac_prefetch != group)
+ return;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* align to flex_bg to get more bitmaps with a single IO */
+ nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+ nr = nr + sbi->s_mb_prefetch - group;
+ }
+ while (nr-- > 0) {
+ grp = ext4_get_group_info(sb, group);
+
+ /* prevent expensive getblk() on groups w/ IO in progress */
+ if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
+ goto next;
+
+ /* ignore empty groups - those will be skipped
+ * during the scanning as well */
+ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh))
+ ac->ac_prefetch_ios++;
+ brelse(bh);
+ }
+ }
+next:
+ if (++group >= ngroups)
+ group = 0;
+ }
+ ac->ac_prefetch = group;
+}
+
+static void
+ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+{
+ struct ext4_group_info *grp;
+ ext4_group_t group;
+ int nr, rc;
+
+ /* initialize last window of prefetched groups */
+ nr = ac->ac_prefetch_ios;
+ if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+ nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+ group = ac->ac_prefetch;
+ if (!group)
+ group = ext4_get_groups_count(ac->ac_sb);
+ group--;
+ while (nr-- > 0) {
+ grp = ext4_get_group_info(ac->ac_sb, group);
+ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+ rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ if (rc)
+ break;
+ }
+ if (!group)
+ group = ext4_get_groups_count(ac->ac_sb);
+ group--;
+ }
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -2317,6 +2402,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ ac->ac_prefetch = group;

for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2328,6 +2414,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
if (group >= ngroups)
group = 0;

+ ext4_mb_prefetch(ac, group);
+
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
@@ -2402,6 +2490,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
ac->ac_flags, cr, err);
+
+ /* use prefetched bitmaps to init buddy so that read info is not lost */
+ ext4_mb_prefetch_fini(ac);
+
return err;
}

@@ -2648,6 +2740,25 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}

+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO */
+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* how many real IOs to prefetch within a single allocation at cr=0/1
+ * given cr=0/1 is a CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4 */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;

err_freebuddy:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c2935d..45103cdf08cb 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -171,6 +171,8 @@ struct ext4_allocation_context {
struct page *ac_buddy_page;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
+ ext4_group_t ac_prefetch;
+ int ac_prefetch_ios; /* number of initialized prefetch IOs */
};

#define AC_STATUS_CONTINUE 1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6c9fc9e21c13..31e0db726d21 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -240,6 +240,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);

static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -283,6 +285,8 @@ static struct attribute *ext4_attrs[] = {
#ifdef CONFIG_EXT4_DEBUG
ATTR_LIST(simulate_fail),
#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
--
2.21.3