It appears that extents are not cached for inodes with depth == 0, which
results in suboptimal extent status population inside ext4_map_blocks():
the cache is filled from the map's result, where the requested size is
usually smaller than the extent size, so the cache becomes fragmented.

# Example: I have a plain file:
File size of /mnt/test is 33554432 (8192 blocks of 4096 bytes)
ext: logical_offset: physical_offset: length: expected: flags:
0: 0.. 8191: 40960.. 49151: 8192: last,eof
$ perf record -e 'ext4:ext4_es_*' /root/bin/fio --name=t --direct=0 --rw=randread --bs=4k --filesize=32M --size=32M --filename=/mnt/test
$ perf script | grep ext4_es_insert_extent | head -n 10
fio 131 [000] 13.975421: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [494/1) mapped 41454 status W
fio 131 [000] 13.975939: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6064/1) mapped 47024 status W
fio 131 [000] 13.976467: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6907/1) mapped 47867 status W
fio 131 [000] 13.976937: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [3850/1) mapped 44810 status W
fio 131 [000] 13.977440: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [3292/1) mapped 44252 status W
fio 131 [000] 13.977931: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6882/1) mapped 47842 status W
fio 131 [000] 13.978376: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [3117/1) mapped 44077 status W
fio 131 [000] 13.978957: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [2896/1) mapped 43856 status W
fio 131 [000] 13.979474: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [7479/1) mapped 48439 status W
This is wrong; we should cache extents inside ext4_find_extent(), as we already do for inodes with depth > 0.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/extents.c | 47 +++++++++++++++++++++++++++--------------------
1 file changed, 27 insertions(+), 20 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index fb0f99d..24d6bfd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -498,6 +498,30 @@ int ext4_ext_check_inode(struct inode *inode)
return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
+static void ext4_es_cache_extents(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_unwritten(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+}
+
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
struct inode *inode, ext4_fsblk_t pblk, int depth,
@@ -532,26 +556,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
*/
if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
struct ext4_extent_header *eh = ext_block_hdr(bh);
- struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t prev = 0;
- int i;
-
- for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
- unsigned int status = EXTENT_STATUS_WRITTEN;
- ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
- int len = ext4_ext_get_actual_len(ex);
-
- if (prev && (prev != lblk))
- ext4_es_cache_extent(inode, prev,
- lblk - prev, ~0,
- EXTENT_STATUS_HOLE);
-
- if (ext4_ext_is_unwritten(ex))
- status = EXTENT_STATUS_UNWRITTEN;
- ext4_es_cache_extent(inode, lblk, len,
- ext4_ext_pblock(ex), status);
- prev = lblk + len;
- }
+ ext4_es_cache_extents(inode, eh);
}
return bh;
errout:
@@ -899,6 +904,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[0].p_bh = NULL;
i = depth;
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
+ ext4_es_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
ext_debug("depth %d: num %d, max %d\n",
--
2.7.4
On Wed, Nov 06, 2019 at 12:25:02PM +0000, Dmitry Monakhov wrote:
> It appears that extents are not cached for inodes with depth == 0, which
> results in suboptimal extent status population inside ext4_map_blocks():
> the cache is filled from the map's result, where the requested size is
> usually smaller than the extent size, so the cache becomes fragmented.
>
> # Example: I have a plain file:
> File size of /mnt/test is 33554432 (8192 blocks of 4096 bytes)
> ext: logical_offset: physical_offset: length: expected: flags:
> 0: 0.. 8191: 40960.. 49151: 8192: last,eof
>
> $ perf record -e 'ext4:ext4_es_*' /root/bin/fio --name=t --direct=0 --rw=randread --bs=4k --filesize=32M --size=32M --filename=/mnt/test
> $ perf script | grep ext4_es_insert_extent | head -n 10
> fio 131 [000] 13.975421: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [494/1) mapped 41454 status W
> fio 131 [000] 13.976467: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6907/1) mapped 47867 status W
So this is certainly bad behavior, but the original intent was to not
cache extents that were in the inode's i_blocks[] array, because the
information was already in the inode cache; we could save memory by just
pulling the information out of the i_blocks array, so there was no need
to cache the extent in the es cache.
There are cases where we do need to track the extent in the es cache
--- for example, if we are writing the file and we need to track its
delayed allocation status.
So I wonder if we might be better off defining a new flag
EXT4_MAP_INROOT, which gets set by ext4_ext_map_blocks() and
ext4_ind_map_blocks() if the mapping is exclusively found in the
i_blocks array, and if EXT4_MAP_INROOT is set, and we don't need to
set EXTENT_STATUS_DELAYED, we skip the call to
ext4_es_insert_extent().
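Roughly, the check in ext4_map_blocks() might then look something like the
untested sketch below (EXT4_MAP_INROOT is of course a made-up flag name at
this point, and `status` is assumed to already have EXTENT_STATUS_DELAYED
folded in by the existing delalloc bookkeeping):

	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
	/* ... existing logic that may also set EXTENT_STATUS_DELAYED ... */

	/*
	 * The mapping lives entirely in i_blocks[], which the in-core
	 * inode already caches, so only duplicate it into the es cache
	 * when there is delayed allocation state to remember.
	 */
	if (!(map->m_flags & EXT4_MAP_INROOT) ||
	    (status & EXTENT_STATUS_DELAYED))
		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
					    map->m_pblk, status);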
What do you think? This should significantly reduce the memory
utilization of the es_cache, which would be good for low-memory
workloads, and those where there are a large number of inodes that fit
in the es_cache, which is probably true for most desktops, especially
those belonging to kernel developers. :-)
- Ted
"Theodore Y. Ts'o" <[email protected]> writes:
> On Wed, Nov 06, 2019 at 12:25:02PM +0000, Dmitry Monakhov wrote:
>> It appears that extents are not cached for inodes with depth == 0, which
>> results in suboptimal extent status population inside ext4_map_blocks():
>> the cache is filled from the map's result, where the requested size is
>> usually smaller than the extent size, so the cache becomes fragmented.
>>
>> # Example: I have a plain file:
>> File size of /mnt/test is 33554432 (8192 blocks of 4096 bytes)
>> ext: logical_offset: physical_offset: length: expected: flags:
>> 0: 0.. 8191: 40960.. 49151: 8192: last,eof
>>
>> $ perf record -e 'ext4:ext4_es_*' /root/bin/fio --name=t --direct=0 --rw=randread --bs=4k --filesize=32M --size=32M --filename=/mnt/test
>> $ perf script | grep ext4_es_insert_extent | head -n 10
>> fio 131 [000] 13.975421: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [494/1) mapped 41454 status W
>> fio 131 [000] 13.976467: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6907/1) mapped 47867 status W
>
> So this is certainly bad behavior, but the original intent was to not
> cache extents that were in the inode's i_blocks[] array, because the
> information was already in the inode cache; we could save memory by just
> pulling the information out of the i_blocks array, so there was no need
> to cache the extent in the es cache.
>
> There are cases where we do need to track the extent in the es cache
> --- for example, if we are writing the file and we need to track its
> delayed allocation status.
>
> So I wonder if we might be better off defining a new flag
> EXT4_MAP_INROOT, which gets set by ext4_ext_map_blocks() and
> ext4_ind_map_blocks() if the mapping is exclusively found in the
> i_blocks array, and if EXT4_MAP_INROOT is set, and we don't need to
> set EXTENT_STATUS_DELAYED, we skip the call to
> ext4_es_insert_extent().
>
> What do you think? This should significantly reduce the memory
> utilization of the es_cache, which would be good for low-memory
> workloads, and those where there are a large number of inodes that fit
> in the es_cache, which is probably true for most desktops, especially
> those belonging to kernel developers. :-)
Sounds reasonable; the same happens in ext4_ext_precache().
See my patch below.
But this also means that each ext4_map_blocks() call will fall back to a
regular block lookup:
down_read(&EXT4_I(inode)->i_data_sem)
  ext4_ext_map_blocks()
    ->ext4_find_extent()
        path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), GFP_NOFS)
        kfree(path)
up_read(&EXT4_I(inode)->i_data_sem)
I thought we could neglect this, but curiosity is a good thing.
That is why I've tested the nocache (see patch below) vs cache (the first patch) approach:
## Production server ##
# CPU: Intel-Gold-6230
# DEV: /dev/ram0
# TEST: fio --direct=1 --rw=randread --bs=1k --filesize=64M
IOPS(nocache/cache): 729k vs 764k => +5%
# DEV: /dev/nvme0n1 (Samsung DC grade)
# TEST: fio --direct=1 --rw=randread --bs=4k --filesize=64M --ioengine=libaio --iodepth=128
IOPS(nocache/cache): 366k vs 378k => +3%
## My notebook Carbon/X1 ##
# CPU: i7-7600U
# DEV: /dev/nvme0n1 (Samsung MZVLB512HAJQ-000L7)
# TEST: fio --direct=1 --rw=randread --bs=4k --filesize=64M --ioengine=libaio --iodepth=128
IOPS(nocache/cache): 270k vs 270k => No difference
The difference is invisible on my laptop, but visible on a fast NVMe device.
On the other hand, unconditional caching results in a memory overhead of
sizeof(struct extent_status)/sizeof(struct ext4_inode_info) => 3.5% for everybody.
>
> - Ted
On Wed, Nov 06, 2019 at 12:25:02PM +0000, Dmitry Monakhov wrote:
> It appears that extents are not cached for inodes with depth == 0, which
> results in suboptimal extent status population inside ext4_map_blocks():
> the cache is filled from the map's result, where the requested size is
> usually smaller than the extent size, so the cache becomes fragmented.
>
I've done some more performance testing and analysis, and while for
some workloads this will cause a slight increase in memory usage, most of
the time it's going to be a wash. The one case I can imagine is where a
large number of files are scanned to determine what type they are (e.g.,
using the file(1) command), causing the first block (and only the first
block) of each to be read. In that case, if there are four discontiguous
regions in the inode's i_blocks[] area, this will cause multiple
extent_status structs to be allocated that will never be used.
For most cases, though, the memory utilization will be the same or
better, and I can see how this could make a difference in the workload
that you described.
I experimented with a patch that only pulled the single physical extent
that was found into the extent status cache, e.g.:
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 393533ff0527..1aad7c0bc0af 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -901,9 +901,18 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
/* if not an empty leaf */
- if (path[ppos].p_ext)
- path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
-
+ if (path[ppos].p_ext) {
+ struct ext4_extent *ex = path[ppos].p_ext;
+
+ path[ppos].p_block = ext4_ext_pblock(ex);
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
+ ext4_es_cache_extent(inode, le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex),
+ ext4_ext_is_unwritten(ex) ?
+ EXTENT_STATUS_UNWRITTEN :
+ EXTENT_STATUS_WRITTEN);
+ }
ext4_ext_show_path(inode, path);
return path;
But as I experimented with it, I realized that it really didn't make
that much difference in practice, so I decided to go with your
original proposed patch. Apologies for it taking a while for me to do
the analysis/experiments.
Cheers,
- Ted