From: Tahsin Erdogan Subject: Re: [PATCH v2] ext4: make xattr inode reads faster Date: Sun, 16 Jul 2017 23:30:07 -0700 Message-ID: References: <705A22C6-4583-47D4-8FB0-B2820F7051F3@dilger.ca> <20170717062012.26331-1-tahsin@google.com> Mime-Version: 1.0 Content-Type: text/plain; charset="UTF-8" Cc: lkml , Tahsin Erdogan To: "Theodore Ts'o" , Andreas Dilger , Ext4 Developers List Return-path: Received: from mail-yb0-f174.google.com ([209.85.213.174]:34510 "EHLO mail-yb0-f174.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751253AbdGQGaJ (ORCPT ); Mon, 17 Jul 2017 02:30:09 -0400 Received: by mail-yb0-f174.google.com with SMTP id e186so912285ybb.1 for ; Sun, 16 Jul 2017 23:30:09 -0700 (PDT) In-Reply-To: <20170717062012.26331-1-tahsin@google.com> Sender: linux-ext4-owner@vger.kernel.org List-ID: > It surprises me that ext4/VFS doesn't already have a helper routine to do this. > It looks like ext4_find_entry() is doing something similar for read-ahead of > directory blocks, so it may be worthwhile to consider moving that code over > to use ext4_bread_batch(), but it looks like it would need to add a flag for > whether to wait on the buffers to be uptodate or not. Done. > It's nice to have the parameter annotation, since it can often be confusing if > there are multiple int/bool arguments to a function, but, I'd write this as: > > ret = ext4_getblk(NULL, inode, block + i, /* map_flags = */ 0); > > I don't know if there is some kind of convention for this kind of comment? I agree that yours is a bit more intuitive, however the version that puts the comment after the value is more common in the codebase. I prefer keeping it as is for consistency reasons. Let me know if you still prefer the prefix version. > >> + if (IS_ERR(bhs[i])) { >> + err = PTR_ERR(bhs[i]); >> + while (i--) >> + brelse(bhs[i]); > > It would be better to do the error cleanup once at the end of the function, > instead of multiple times in the code, like: > > if (IS_ERR(bhs[i])) { > err = PTR_ERR(bhs[i]); > bh_count = i; > goto out_brelse; > } Done. On Sun, Jul 16, 2017 at 11:20 PM, Tahsin Erdogan wrote: > ext4_xattr_inode_read() currently reads each block sequentially while > waiting for io operation to complete before moving on to the next > block. This prevents request merging in block layer. > > Add a ext4_bread_batch() function that starts reads for all blocks > then optionally waits for them to complete. A similar logic is used > in ext4_find_entry(), so update that code to use the new function. > > Signed-off-by: Tahsin Erdogan > --- > v2: > - updated ext4_find_entry() to also use ext4_bread_batch() > - added wait parameter to ext4_bread_batch() > > fs/ext4/ext4.h | 2 ++ > fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ > fs/ext4/namei.c | 43 ++++++++++++++----------------------------- > fs/ext4/xattr.c | 51 ++++++++++++++++++++++++++++++++------------------- > 4 files changed, 92 insertions(+), 48 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 9ebde0cd632e..e9440ed605c0 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -2462,6 +2462,8 @@ extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); > int ext4_inode_is_fast_symlink(struct inode *inode); > struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); > struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); > +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, > + bool wait, struct buffer_head **bhs); > int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, > struct buffer_head *bh_result, int create); > int ext4_get_block(struct inode *inode, sector_t iblock, > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 3c600f02673f..70699940e20d 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -1015,6 +1015,50 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, > return ERR_PTR(-EIO); > } > > +/* Read a contiguous batch of blocks. */ > +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, > + bool wait, struct buffer_head **bhs) > +{ > + int i, err; > + > + for (i = 0; i < bh_count; i++) { > + bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); > + if (IS_ERR(bhs[i])) { > + err = PTR_ERR(bhs[i]); > + bh_count = i; > + goto out_brelse; > + } > + } > + > + for (i = 0; i < bh_count; i++) > + /* Note that NULL bhs[i] is valid because of holes. */ > + if (bhs[i] && !buffer_uptodate(bhs[i])) > + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, > + &bhs[i]); > + > + if (!wait) > + return 0; > + > + for (i = 0; i < bh_count; i++) > + if (bhs[i]) > + wait_on_buffer(bhs[i]); > + > + for (i = 0; i < bh_count; i++) { > + if (bhs[i] && !buffer_uptodate(bhs[i])) { > + err = -EIO; > + goto out_brelse; > + } > + } > + return 0; > + > +out_brelse: > + for (i = 0; i < bh_count; i++) { > + brelse(bhs[i]); > + bhs[i] = NULL; > + } > + return err; > +} > + > int ext4_walk_page_buffers(handle_t *handle, > struct buffer_head *head, > unsigned from, > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index 13f0cadb1238..2758ff1b940f 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -1342,13 +1342,12 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, > struct super_block *sb; > struct buffer_head *bh_use[NAMEI_RA_SIZE]; > struct buffer_head *bh, *ret = NULL; > - ext4_lblk_t start, block, b; > + ext4_lblk_t start, block; > const u8 *name = d_name->name; > - int ra_max = 0; /* Number of bh's in the readahead > + size_t ra_max = 0; /* Number of bh's in the readahead > buffer, bh_use[] */ > - int ra_ptr = 0; /* Current index into readahead > + size_t ra_ptr = 0; /* Current index into readahead > buffer */ > - int num = 0; > ext4_lblk_t nblocks; > int i, namelen, retval; > struct ext4_filename fname; > @@ -1411,31 +1410,17 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, > if (ra_ptr >= ra_max) { > /* Refill the readahead buffer */ > ra_ptr = 0; > - b = block; > - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { > - /* > - * Terminate if we reach the end of the > - * directory and must wrap, or if our > - * search has finished at this block. > - */ > - if (b >= nblocks || (num && block == start)) { > - bh_use[ra_max] = NULL; > - break; > - } > - num++; > - bh = ext4_getblk(NULL, dir, b++, 0); > - if (IS_ERR(bh)) { > - if (ra_max == 0) { > - ret = bh; > - goto cleanup_and_exit; > - } > - break; > - } > - bh_use[ra_max] = bh; > - if (bh) > - ll_rw_block(REQ_OP_READ, > - REQ_META | REQ_PRIO, > - 1, &bh); > + if (block < start) > + ra_max = start - block; > + else > + ra_max = nblocks - block; > + ra_max = min(ra_max, ARRAY_SIZE(bh_use)); > + retval = ext4_bread_batch(dir, block, ra_max, > + false /* wait */, bh_use); > + if (retval) { > + ret = ERR_PTR(retval); > + ra_max = 0; > + goto cleanup_and_exit; > } > } > if ((bh = bh_use[ra_ptr++]) == NULL) > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c > index cff4f41ced61..68b5b4f9fbcb 100644 > --- a/fs/ext4/xattr.c > +++ b/fs/ext4/xattr.c > @@ -317,28 +317,41 @@ static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) > */ > static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) > { > - unsigned long block = 0; > - struct buffer_head *bh; > - int blocksize = ea_inode->i_sb->s_blocksize; > - size_t csize, copied = 0; > - void *copy_pos = buf; > - > - while (copied < size) { > - csize = (size - copied) > blocksize ? blocksize : size - copied; > - bh = ext4_bread(NULL, ea_inode, block, 0); > - if (IS_ERR(bh)) > - return PTR_ERR(bh); > - if (!bh) > - return -EFSCORRUPTED; > + int blocksize = 1 << ea_inode->i_blkbits; > + int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; > + int tail_size = (size % blocksize) ?: blocksize; > + struct buffer_head *bhs_inline[8]; > + struct buffer_head **bhs = bhs_inline; > + int i, ret; > + > + if (bh_count > ARRAY_SIZE(bhs_inline)) { > + bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); > + if (!bhs) > + return -ENOMEM; > + } > > - memcpy(copy_pos, bh->b_data, csize); > - brelse(bh); > + ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, > + true /* wait */, bhs); > + if (ret) > + goto free_bhs; > > - copy_pos += csize; > - block += 1; > - copied += csize; > + for (i = 0; i < bh_count; i++) { > + /* There shouldn't be any holes in ea_inode. */ > + if (!bhs[i]) { > + ret = -EFSCORRUPTED; > + goto put_bhs; > + } > + memcpy((char *)buf + blocksize * i, bhs[i]->b_data, > + i < bh_count - 1 ? blocksize : tail_size); > } > - return 0; > + ret = 0; > +put_bhs: > + for (i = 0; i < bh_count; i++) > + brelse(bhs[i]); > +free_bhs: > + if (bhs != bhs_inline) > + kfree(bhs); > + return ret; > } > > static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, > -- > 2.13.2.932.g7449e964c-goog >