Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
can cause a kernel panic when xfstest generic/300 is run on a file
system mounted with dioread_nolock. The panic is typically triggered
from check_irqs_on() (fs/buffer.c: 1272), and happens because
ext4_end_io_dio() is being called in an interrupt context rather than
from a workqueue for AIO. This suggests that buffer_defer_completion
may not be set properly when creating an unwritten extent for async
direct I/O.
Revert the locking changes until this problem can be resolved. Patch
applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.
Signed-off-by: Eric Whitney <[email protected]>
---
fs/ext4/ext4.h | 6 ++++--
fs/ext4/inode.c | 43 +++++++++++++++++++++++--------------------
include/trace/events/ext4.h | 1 +
3 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0662b28..36fcf2a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -563,10 +563,12 @@ enum {
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+ /* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
/* Write zeros to newly created written extents */
-#define EXT4_GET_BLOCKS_ZERO 0x0200
+#define EXT4_GET_BLOCKS_ZERO 0x0300
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bf..f72dc04 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -418,7 +418,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* out taking i_data_sem. So at the time the unwritten extent
* could be converted.
*/
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -426,7 +427,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
/*
* We don't check m_len because extent will be collpased in status
@@ -522,7 +524,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -553,7 +556,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (ret < 0)
retval = ret;
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -703,7 +707,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -898,6 +902,9 @@ int do_journal_get_write_access(handle_t *handle,
return ret;
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+
#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
@@ -3070,21 +3077,13 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
-static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- int ret;
-
- ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
inode->i_ino, create);
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result));
-
- return ret;
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
}
#ifdef CONFIG_FS_DAX
@@ -3230,8 +3229,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
- if (overwrite)
+ if (overwrite) {
+ down_read(&EXT4_I(inode)->i_data_sem);
inode_unlock(inode);
+ }
/*
* We could direct write to holes and fallocate.
@@ -3274,7 +3275,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
if (overwrite) {
- get_block_func = ext4_get_block_overwrite;
+ get_block_func = ext4_get_block_write_nolock;
} else {
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
@@ -3330,8 +3331,10 @@ retake_lock:
if (iov_iter_rw(iter) == WRITE)
inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite)
+ if (overwrite) {
+ up_read(&EXT4_I(inode)->i_data_sem);
inode_lock(inode);
+ }
return ret;
}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 4e4b2fa..6599e75 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -43,6 +43,7 @@ struct extent_status;
{ EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \
{ EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \
{ EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \
+ { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }, \
{ EXT4_GET_BLOCKS_ZERO, "ZERO" })
#define show_mflags(flags) __print_flags(flags, "", \
--
2.1.4
On Fri, Feb 12, 2016 at 01:25:06PM -0500, Eric Whitney wrote:
> Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
> can cause a kernel panic when xfstest generic/300 is run on a file
> system mounted with dioread_nolock. The panic is typically triggered
> from check_irqs_on() (fs/buffer.c: 1272), and happens because
> ext4_end_io_dio() is being called in an interrupt context rather than
> from a workqueue for AIO. This suggests that buffer_defer_completion
> may not be set properly when creating an unwritten extent for async
> direct I/O.
>
> Revert the locking changes until this problem can be resolved. Patch
> applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.
>
> Signed-off-by: Eric Whitney <[email protected]>
Applied, thanks.
I was able to reliably reproduce the problem without this revert patch
using a 32-bit x86 kvm-xfstests test appliance:
ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/kvm-xfstests/testing/root_fs.img.i386
Using the command: "kvm-xfstests -c dioread_nolock generic/300"
With this patch, the problem goes away, so reverting the patch is
clearly the right thing for now. There is something screwy going on,
since the original commit shouldn't have made a difference, but let's
revert it first, and figure it out when we have time to investigate
more deeply.
- Ted
On Tue 16-02-16 00:08:40, Ted Tso wrote:
> On Fri, Feb 12, 2016 at 01:25:06PM -0500, Eric Whitney wrote:
> > Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
> > can cause a kernel panic when xfstest generic/300 is run on a file
> > system mounted with dioread_nolock. The panic is typically triggered
> > from check_irqs_on() (fs/buffer.c: 1272), and happens because
> > ext4_end_io_dio() is being called in an interrupt context rather than
> > from a workqueue for AIO. This suggests that buffer_defer_completion
> > may not be set properly when creating an unwritten extent for async
> > direct I/O.
> >
> > Revert the locking changes until this problem can be resolved. Patch
> > applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.
> >
> > Signed-off-by: Eric Whitney <[email protected]>
>
> Applied, thanks.
>
> I was able to reliably reproduce the problem without this revert patch
> using a 32-bit x86 kvm-xfstests test appliance:
>
> ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/kvm-xfstests/testing/root_fs.img.i386
>
> Using the command: "kvm-xfstests -c dioread_nolock generic/300"
>
> With this patch, the problem goes away, so reverting the patch is
> clearly the right thing for now. There is something screwy going on,
> since the original commit shouldn't have made a difference, but let's
> revert it first, and figure it out when we have time to investigate
> more deeply.
OK, I had a look into this. So I'm not 100% what has happened but the
following looks likely: Current io_end handling can overwrite io_end
pointer in the inode in dioread_nolock mode (nothing prevents unlocked DIO
to overwrite pointer of locked DIO and then clear it out). I suspect that
the change in i_data_sem locking made this race more visible. Attached
patch should fix the issue (I don't see failures of generic/300 with it in
dioread_nolock mode). Can you consider this instead of a revert Eric sent?
I have also a more complete rewrite of io_end handling which makes the code
more comprehensible and avoids storing io_end pointer in the inode (thus
avoids similar pitfalls in future) but that is a 4.6 matter. I'll submit
the rewrite once xfstests runs complete.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu, Feb 18, 2016 at 11:09:56PM +0100, Jan Kara wrote:
> OK, I had a look into this. So I'm not 100% what has happened but the
> following looks likely: Current io_end handling can overwrite io_end
> pointer in the inode in dioread_nolock mode (nothing prevents unlocked DIO
> to overwrite pointer of locked DIO and then clear it out). I suspect that
> the change in i_data_sem locking made this race more visible. Attached
> patch should fix the issue (I don't see failures of generic/300 with it in
> dioread_nolock mode). Can you consider this instead of a revert Eric sent?
Thanks! That does appear to be it. I dropped the revert, confirmed
that I could still trivially reproduce the failure, applied patch,
and ran the test 10 times ("kvm-xfstests -C 10 -c dioread_nolock
generic/300") and it passed with flying colors.
> I have also a more complete rewrite of io_end handling which makes the code
> more comprehensible and avoids storing io_end pointer in the inode (thus
> avoids similar pitfalls in future) but that is a 4.6 matter. I'll submit
> the rewrite once xfstests runs complete.
Great, thanks!
- Ted
* Theodore Ts'o <[email protected]>:
> On Thu, Feb 18, 2016 at 11:09:56PM +0100, Jan Kara wrote:
> > OK, I had a look into this. So I'm not 100% what has happened but the
> > following looks likely: Current io_end handling can overwrite io_end
> > pointer in the inode in dioread_nolock mode (nothing prevents unlocked DIO
> > to overwrite pointer of locked DIO and then clear it out). I suspect that
> > the change in i_data_sem locking made this race more visible. Attached
> > patch should fix the issue (I don't see failures of generic/300 with it in
> > dioread_nolock mode). Can you consider this instead of a revert Eric sent?
>
> Thanks! That does appear to be it. I dropped the revert, confirmed
> that I could still trivially reproduce the failure, applied patch,
> and ran the test 10 times ("kvm-xfstests -C 10 -c dioread_nolock
> generic/300") and it passed with flying colors.
>
> > I have also a more complete rewrite of io_end handling which makes the code
> > more comprehensible and avoids storing io_end pointer in the inode (thus
> > avoids similar pitfalls in future) but that is a 4.6 matter. I'll submit
> > the rewrite once xfstests runs complete.
>
> Great, thanks!
>
> - Ted
I ran the same ten test runs (kvm-xfstests -c dioread_nolock generic/300) on
x86_64 and a full test run (kvm-xfstests -g auto) with the patch applied to
4.5-rc4 without regressions relative to my -rc4 baseline results. Looks
good to me.
Tested-by: Eric Whitney <[email protected]>