e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
updated writeback path to avoid kicking writeback work items if there
are no inodes to be written out; unfortunately, the avoidance logic
was too aggressive and made sync_inodes_sb() skip I_DIRTY_TIME inodes.
This patch fixes the breakage by
* Removing bdi_has_dirty_io() shortcut from bdi_split_work_to_wbs().
The callers are already testing the condition.
* Removing bdi_has_dirty_io() shortcut from sync_inodes_sb() so that
it always calls into bdi_split_work_to_wbs().
* Making bdi_split_work_to_wbs() consider the b_dirty_time list for
WB_SYNC_ALL writebacks.
Signed-off-by: Tejun Heo <[email protected]>
Fixes: e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
Cc: Ted Ts'o <[email protected]>
Cc: Jan Kara <[email protected]>
---
Hello,
So, this fixes I_DIRTY_TIME syncing problem for ext4 but AFAICS xfs
doesn't even use the generic inode metadata writeback path, so this
most likely won't do anything for the originally reported problem.
I'll post another patch for debugging.
Thanks.
fs/fs-writeback.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -844,14 +844,15 @@ static void bdi_split_work_to_wbs(struct
struct wb_iter iter;
might_sleep();
-
- if (!bdi_has_dirty_io(bdi))
- return;
restart:
rcu_read_lock();
bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
- if (!wb_has_dirty_io(wb) ||
- (skip_if_busy && writeback_in_progress(wb)))
+ /* SYNC_ALL writes out I_DIRTY_TIME too */
+ if (!wb_has_dirty_io(wb) &&
+ (base_work->sync_mode == WB_SYNC_NONE ||
+ list_empty(&wb->b_dirty_time)))
+ continue;
+ if (skip_if_busy && writeback_in_progress(wb))
continue;
base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
@@ -899,8 +900,7 @@ static void bdi_split_work_to_wbs(struct
{
might_sleep();
- if (bdi_has_dirty_io(bdi) &&
- (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+ if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
base_work->single_wait = 0;
base_work->single_done = 0;
@@ -2275,8 +2275,8 @@ void sync_inodes_sb(struct super_block *
};
struct backing_dev_info *bdi = sb->s_bdi;
- /* Nothing to do? */
- if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+ /* bdi_has_dirty() ignores I_DIRTY_TIME but we can't, always kick wbs */
+ if (bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
Hello,
On Thu 13-08-15 18:44:15, Tejun Heo wrote:
> e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> updated writeback path to avoid kicking writeback work items if there
> are no inodes to be written out; unfortunately, the avoidance logic
> was too aggressive and made sync_inodes_sb() skip I_DIRTY_TIME inodes.
> This patch fixes the breakage by
>
> * Removing bdi_has_dirty_io() shortcut from bdi_split_work_to_wbs().
> The callers are already testing the condition.
>
> * Removing bdi_has_dirty_io() shortcut from sync_inodes_sb() so that
> it always calls into bdi_split_work_to_wbs().
>
> * Making bdi_split_work_to_wbs() consider the b_dirty_time list for
> WB_SYNC_ALL writebacks.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Fixes: e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> Cc: Ted Ts'o <[email protected]>
> Cc: Jan Kara <[email protected]>
So the patch looks good to me. But the fact that it fixes Eryu's problem
means there is something fishy going on. Either inodes get wrongly attached
to b_dirty_time list or bdi_has_dirty_io() somehow misbehaves only
temporarily and we don't catch it with the debug patch.
Can we add a test to wb_has_dirty_io() to also check whether it matches
bdi_has_dirty_io()? Since Eryu doesn't use lazytime (I assume, Eryu, please
speak up if you do), we could also warn if b_dirty_time lists get
non-empty. Hmm?
Honza
> ---
> Hello,
>
> So, this fixes I_DIRTY_TIME syncing problem for ext4 but AFAICS xfs
> doesn't even use the generic inode metadata writeback path, so this
> most likely won't do anything for the originally reported problem.
> I'll post another patch for debugging.
>
> Thanks.
>
> fs/fs-writeback.c | 18 +++++++++---------
> 1 file changed, 9 insertions(+), 9 deletions(-)
>
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -844,14 +844,15 @@ static void bdi_split_work_to_wbs(struct
> struct wb_iter iter;
>
> might_sleep();
> -
> - if (!bdi_has_dirty_io(bdi))
> - return;
> restart:
> rcu_read_lock();
> bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
> - if (!wb_has_dirty_io(wb) ||
> - (skip_if_busy && writeback_in_progress(wb)))
> + /* SYNC_ALL writes out I_DIRTY_TIME too */
> + if (!wb_has_dirty_io(wb) &&
> + (base_work->sync_mode == WB_SYNC_NONE ||
> + list_empty(&wb->b_dirty_time)))
> + continue;
> + if (skip_if_busy && writeback_in_progress(wb))
> continue;
>
> base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
> @@ -899,8 +900,7 @@ static void bdi_split_work_to_wbs(struct
> {
> might_sleep();
>
> - if (bdi_has_dirty_io(bdi) &&
> - (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
> + if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
> base_work->auto_free = 0;
> base_work->single_wait = 0;
> base_work->single_done = 0;
> @@ -2275,8 +2275,8 @@ void sync_inodes_sb(struct super_block *
> };
> struct backing_dev_info *bdi = sb->s_bdi;
>
> - /* Nothing to do? */
> - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
> + /* bdi_has_dirty() ignores I_DIRTY_TIME but we can't, always kick wbs */
> + if (bdi == &noop_backing_dev_info)
> return;
> WARN_ON(!rwsem_is_locked(&sb->s_umount));
>
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
> On Thu 13-08-15 18:44:15, Tejun Heo wrote:
> > e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> > updated writeback path to avoid kicking writeback work items if there
> > are no inodes to be written out; unfortunately, the avoidance logic
> > was too aggressive and made sync_inodes_sb() skip I_DIRTY_TIME inodes.
> > This patch fixes the breakage by
> > * Removing bdi_has_dirty_io() shortcut from bdi_split_work_to_wbs().
> > The callers are already testing the condition.
> > * Removing bdi_has_dirty_io() shortcut from sync_inodes_sb() so that
> > it always calls into bdi_split_work_to_wbs().
> > * Making bdi_split_work_to_wbs() consider the b_dirty_time list for
> > WB_SYNC_ALL writebacks.
> > Signed-off-by: Tejun Heo <[email protected]>
> > Fixes: e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> > Cc: Ted Ts'o <[email protected]>
> > Cc: Jan Kara <[email protected]>
* Jan Kara <[email protected]> [2015-08-14 13:14]:
> So the patch looks good to me. But the fact that it fixes Eryu's problem
> means there is something fishy going on. Either inodes get wrongly attached
> to b_dirty_time list or bdi_has_dirty_io() somehow misbehaves only
> temporarily and we don't catch it with the debug patch.
> Can we add a test to wb_has_dirty_io() to also check whether it matches
> bdi_has_dirty_io()? Since Eryu doesn't use lazytime (I assume, Eryu, please
> speak up if you do), we could also warn if b_dirty_time lists get
> non-empty. Hmm?
Hi,
I had an unstable system when running latest Linus tree with Tejun's
patch applied on top. Nothing fishy in the logs after rebooting without
the patch, but remote access with ssh when patch applied did not work
(as if /home partition could not be read). This system has / as ext4 and
other partitions (including /home) as XFS. Trying to login on tty
instead of X resulted in hang of X. I could reboot with sysrq, but can't
do further tests at the moment.
Back to same tree without the patch resulted in normal system.
So just a heads up the patch doesn't seem OK in its current state.
Cheers
Damien
Hello, Damien.
On Fri, Aug 14, 2015 at 05:14:01PM +0200, Damien Wyart wrote:
> I had an unstable system when running latest Linus tree with Tejun's
> patch applied on top. Nothing fishy in the logs after rebooting without
> the patch, but remote access with ssh when patch applied did not work
> (as if /home partition could not be read). This system has / as ext4 and
> other partitions (including /home) as XFS. Trying to login on tty
> instead of X resulted in hang of X. I could reboot with sysrq, but can't
> do further tests at the moment.
>
> Back to same tree without the patch resulted in normal system.
>
> So just a heads up the patch doesn't seem OK in its current state.
Have you been able to reproduce the failure? That sounds like an
unlikely failure mode for the patch.
Thanks.
--
tejun
Hello, Jan.
On Fri, Aug 14, 2015 at 01:14:09PM +0200, Jan Kara wrote:
> So the patch looks good to me. But the fact that it fixes Eryu's problem
> means there is something fishy going on. Either inodes get wrongly attached
Seriously, it shouldn't affect size syncing or xfs but then again my
understanding of xfs is severely limited.
> to b_dirty_time list or bdi_has_dirty_io() somehow misbehaves only
> temporarily and we don't catch it with the debug patch.
>
> Can we add a test to wb_has_dirty_io() to also check whether it matches
> bdi_has_dirty_io()? Since Eryu doesn't use lazytime (I assume, Eryu, please
> speak up if you do), we could also warn if b_dirty_time lists get
> non-empty. Hmm?
Sure, will prep a patch soon.
Thanks.
--
tejun
> > I had an unstable system when running latest Linus tree with Tejun's
> > patch applied on top. Nothing fishy in the logs after rebooting without
> > the patch, but remote access with ssh when patch applied did not work
> > (as if /home partition could not be read). This system has / as ext4 and
> > other partitions (including /home) as XFS. Trying to login on tty
> > instead of X resulted in hang of X. I could reboot with sysrq, but can't
> > do further tests at the moment.
> > Back to same tree without the patch resulted in normal system.
> > So just a heads up the patch doesn't seem OK in its current state.
Hi Tejun,
> Have you been able to reproduce the failure? That sounds like an
> unlikely failure mode for the patch.
Unfortunately (as it would be nice to understand what happened), no.
I reapplied the patch on top of rc7 and could not reproduce the
instability after several reboots.
I will continue running with the patch and report if anything strange
appears again...
--
Damien
On Mon 17-08-15 16:02:54, Tejun Heo wrote:
> Hello, Jan.
>
> On Fri, Aug 14, 2015 at 01:14:09PM +0200, Jan Kara wrote:
> > So the patch looks good to me. But the fact that it fixes Eryu's problem
> > means there is something fishy going on. Either inodes get wrongly attached
>
> Seriously, it shouldn't affect size syncing or xfs but then again my
> understanding of xfs is severely limited.
Well, i_size == 0 in XFS usually means that writeback didn't get to
flushing delay allocated pages - inode size on disk gets increased only
after the pages are written out in ->end_io callback. So at least this part
makes some sense to me.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, Aug 18, 2015 at 11:16:03AM +0200, Jan Kara wrote:
> On Mon 17-08-15 16:02:54, Tejun Heo wrote:
> > Hello, Jan.
> >
> > On Fri, Aug 14, 2015 at 01:14:09PM +0200, Jan Kara wrote:
> > > So the patch looks good to me. But the fact that it fixes Eryu's problem
> > > means there is something fishy going on. Either inodes get wrongly attached
> >
> > Seriously, it shouldn't affect size syncing or xfs but then again my
> > understanding of xfs is severely limited.
>
> Well, i_size == 0 in XFS usually means that writeback didn't get to
> flushing delay allocated pages - inode size on disk gets increased only
> after the pages are written out in ->end_io callback. So at least this part
> makes some sense to me.
Hmm... the only possibility I can think of is tot_write_bandwidth
being zero when it shouldn't be. I've been staring at the code for a
while now but nothing rings a bell. Time for another debug patch, I
guess.
Thanks.
--
tejun
Hello,
On Tue, Aug 18, 2015 at 10:47:18AM -0700, Tejun Heo wrote:
> Hmm... the only possibility I can think of is tot_write_bandwidth
> being zero when it shouldn't be. I've been staring at the code for a
> while now but nothing rings a bell. Time for another debug patch, I
> guess.
So, I can now reproduce the bug (it takes a lot of trials but lowering
the number of tested files helps quite a bit) and instrumented all the
early exit paths w/o the fix patch. bdi_has_dirty_io() and
wb_has_dirty_io() are never out of sync with the actual dirty / io
lists even when the test 048 fails, so the bug at least is not caused
by writeback skipping due to buggy bdi/wb_has_dirty_io() result.
Whenever it skips, all the lists are actually empty (verified while
holding list_lock).
One suspicion I have is that this could be a subtle timing issue which
is being exposed by the new short-cut path. Anything which adds delay
seems to make the issue go away. Dave, does anything ring a bell?
As for the proposed I_DIRTY_TIME fix, I think it'd be a good idea to
merge it. It fixes a clear breakage regardless of this xfs issue.
Thanks.
--
tejun
On Tue, Aug 18, 2015 at 12:54:39PM -0700, Tejun Heo wrote:
> Hello,
>
> On Tue, Aug 18, 2015 at 10:47:18AM -0700, Tejun Heo wrote:
> > Hmm... the only possibility I can think of is tot_write_bandwidth
> > being zero when it shouldn't be. I've been staring at the code for a
> > while now but nothing rings a bell. Time for another debug patch, I
> > guess.
>
> So, I can now reproduce the bug (it takes a lot of trials but lowering
> the number of tested files helps quite a bit) and instrumented all the
> early exit paths w/o the fix patch. bdi_has_dirty_io() and
> wb_has_dirty_io() are never out of sync with the actual dirty / io
> lists even when the test 048 fails, so the bug at least is not caused
> by writeback skipping due to buggy bdi/wb_has_dirty_io() result.
> Whenever it skips, all the lists are actually empty (verified while
> holding list_lock).
>
> One suspicion I have is that this could be a subtle timing issue which
> is being exposed by the new short-cut path. Anything which adds delay
> seems to make the issue go away. Dave, does anything ring a bell?
No, it doesn't. The data writeback mechanisms XFS uses are all
generic. It marks inodes I_DIRTY_PAGES and lets the generic code
take care of everything else. Yes, we do delayed allocation during
writeback, and we log the inode size updates during IO completion,
so if inode sizes are not getting updated, then Occam's Razor
suggests that writeback is not happening.
I'd suggest looking at some of the XFS tracepoints during the test:
tracepoint trigger
xfs_file_buffered_write once per write syscall
xfs_file_sync once per fsync per inode
xfs_vm_writepage every ->writepage call
xfs_setfilesize every IO completion that updates inode size
And it's probably best to also include all the writeback
tracepoints, too, for context. That will tell you what inodes and
what part of them are getting written back and when....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Aug 19, 2015 at 07:56:11AM +1000, Dave Chinner wrote:
> On Tue, Aug 18, 2015 at 12:54:39PM -0700, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Aug 18, 2015 at 10:47:18AM -0700, Tejun Heo wrote:
> > > Hmm... the only possibility I can think of is tot_write_bandwidth
> > > being zero when it shouldn't be. I've been staring at the code for a
> > > while now but nothing rings a bell. Time for another debug patch, I
> > > guess.
> >
> > So, I can now reproduce the bug (it takes a lot of trials but lowering
> > the number of tested files helps quite a bit) and instrumented all the
> > early exit paths w/o the fix patch. bdi_has_dirty_io() and
> > wb_has_dirty_io() are never out of sync with the actual dirty / io
> > lists even when the test 048 fails, so the bug at least is not caused
> > by writeback skipping due to buggy bdi/wb_has_dirty_io() result.
> > Whenever it skips, all the lists are actually empty (verified while
> > holding list_lock).
> >
> > One suspicion I have is that this could be a subtle timing issue which
> > is being exposed by the new short-cut path. Anything which adds delay
> > seems to make the issue go away. Dave, does anything ring a bell?
>
> No, it doesn't. The data writeback mechanisms XFS uses are all
> generic. It marks inodes I_DIRTY_PAGES and lets the generic code
> take care of everything else. Yes, we do delayed allocation during
> writeback, and we log the inode size updates during IO completion,
> so if inode sizes are not getting updated, then Occam's Razor
> suggests that writeback is not happening.
>
> I'd suggest looking at some of the XFS tracepoints during the test:
>
> tracepoint trigger
> xfs_file_buffered_write once per write syscall
> xfs_file_sync once per fsync per inode
> xfs_vm_writepage every ->writepage call
> xfs_setfilesize every IO completion that updates inode size
I gave the tracepoints a try, but my root fs is xfs so I got many
noises. I'll try to install a new vm with ext4 as root fs. But I'm not
sure if the new vm could reproduce the failure, will see.
BTW, I guess xfs_vm_writepage should be xfs_writepage, and xfs_file_sync
should be xfs_file_fsync?
Thanks,
Eryu
On Thu, Aug 20, 2015 at 02:12:24PM +0800, Eryu Guan wrote:
> On Wed, Aug 19, 2015 at 07:56:11AM +1000, Dave Chinner wrote:
> > On Tue, Aug 18, 2015 at 12:54:39PM -0700, Tejun Heo wrote:
> > > Hello,
> > >
> > > On Tue, Aug 18, 2015 at 10:47:18AM -0700, Tejun Heo wrote:
> > > > Hmm... the only possibility I can think of is tot_write_bandwidth
> > > > being zero when it shouldn't be. I've been staring at the code for a
> > > > while now but nothing rings a bell. Time for another debug patch, I
> > > > guess.
> > >
> > > So, I can now reproduce the bug (it takes a lot of trials but lowering
> > > the number of tested files helps quite a bit) and instrumented all the
> > > early exit paths w/o the fix patch. bdi_has_dirty_io() and
> > > wb_has_dirty_io() are never out of sync with the actual dirty / io
> > > lists even when the test 048 fails, so the bug at least is not caused
> > > by writeback skipping due to buggy bdi/wb_has_dirty_io() result.
> > > Whenever it skips, all the lists are actually empty (verified while
> > > holding list_lock).
> > >
> > > One suspicion I have is that this could be a subtle timing issue which
> > > is being exposed by the new short-cut path. Anything which adds delay
> > > seems to make the issue go away. Dave, does anything ring a bell?
> >
> > No, it doesn't. The data writeback mechanisms XFS uses are all
> > generic. It marks inodes I_DIRTY_PAGES and lets the generic code
> > take care of everything else. Yes, we do delayed allocation during
> > writeback, and we log the inode size updates during IO completion,
> > so if inode sizes are not getting updated, then Occam's Razor
> > suggests that writeback is not happening.
> >
> > I'd suggest looking at some of the XFS tracepoints during the test:
> >
> > tracepoint trigger
> > xfs_file_buffered_write once per write syscall
> > xfs_file_sync once per fsync per inode
> > xfs_vm_writepage every ->writepage call
> > xfs_setfilesize every IO completion that updates inode size
>
> I gave the tracepoints a try, but my root fs is xfs so I got many
> noises. I'll try to install a new vm with ext4 as root fs. But I'm not
> sure if the new vm could reproduce the failure, will see.
I installed a new vm with ext4 as root fs and got some trace info.
On the new vm, only generic/048 is reproducible, generic/049 always
passes. And I can only reproduce generic/048 when xfs tracepoints are
enabled, if writeback tracepoints are enabled too, I can no longer
reproduce the failure.
All tests are done on 4.2-rc7 kernel.
This is the trace-cmd I'm using:
cd /mnt/ext4
trace-cmd record -e xfs_file_buffered_write \
-e xfs_file_fsync \
-e xfs_writepage \
-e xfs_setfilesize &
pushd /path/to/xfstests
./check generic/048
popd
kill -s 2 $!
trace-cmd report >trace_report.txt
I attached three files:
1) xfs-trace-generic-048.txt.bz2[1] trace report result
2) xfs-trace-generic-048.diff generic/048 failure diff output, which shows which files have an incorrect size
3) xfs-trace-generic-048.metadump.bz2 metadump of SCRATCH_DEV, which contains the test files
If more info is needed please let me know.
Thanks,
Eryu
[1] attach this file in a following mail, to avoid xfs list 500k limit
On Thu, Aug 20, 2015 at 10:36:26PM +0800, Eryu Guan wrote:
> On Thu, Aug 20, 2015 at 02:12:24PM +0800, Eryu Guan wrote:
[snip]
>
> I installed a new vm with ext4 as root fs and got some trace info.
>
> On the new vm, only generic/048 is reproducible, generic/049 always
> passes. And I can only reproduce generic/048 when xfs tracepoints are
> enabled, if writeback tracepoints are enabled too, I can no longer
> reproduce the failure.
>
> All tests are done on 4.2-rc7 kernel.
>
> This is the trace-cmd I'm using:
>
> cd /mnt/ext4
> trace-cmd record -e xfs_file_buffered_write \
> -e xfs_file_fsync \
> -e xfs_writepage \
> -e xfs_setfilesize &
> pushd /path/to/xfstests
> ./check generic/048
> popd
> kill -s 2 $!
> trace-cmd report >trace_report.txt
>
> I attached three files:
> 1) xfs-trace-generic-048.txt.bz2[1] trace report result
> 2) xfs-trace-generic-048.diff generic/048 failure diff output, which shows which files have an incorrect size
> 3) xfs-trace-generic-048.metadump.bz2 metadump of SCRATCH_DEV, which contains the test files
>
> If more info is needed please let me know.
>
> Thanks,
> Eryu
>
> [1] attach this file in a following mail, to avoid xfs list 500k limit
Attached this file in this mail.
Eryu
Hello, Eryu. Thanks a lot for the trace.
So, this is from the end of the trace from the failed test.
...
kworker/u8:1-1563 [002] 22016.987530: xfs_writepage: dev 253:6 ino 0xef64fe pgoff 0x9ff000 size 0xa00000 offset 0 length 0 delalloc 1 unwritten 0
kworker/2:1-49 [002] 22017.373595: xfs_setfilesize: dev 253:6 ino 0xef6504 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
...
Maybe I'm misunderstanding the code but all xfs_writepage() calls are
from unbound workqueues - the writeback workers - while
xfs_setfilesize() are from bound workqueues, so I wondered why that
was and looked at the code and the setsize functions are run off of a
separate work item which is queued from the end_bio callback and I
can't tell who would be waiting for them. Dave, what am I missing?
Thanks.
--
tejun
On Thu, Aug 20, 2015 at 09:55:37AM -0700, Tejun Heo wrote:
> Hello, Eryu. Thanks a lot for the trace.
>
> So, this is from the end of the trace from the failed test.
>
> ...
> kworker/u8:1-1563 [002] 22016.987530: xfs_writepage: dev 253:6 ino 0xef64fe pgoff 0x9ff000 size 0xa00000 offset 0 length 0 delalloc 1 unwritten 0
> kworker/2:1-49 [002] 22017.373595: xfs_setfilesize: dev 253:6 ino 0xef6504 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
> ...
>
> Maybe I'm misunderstanding the code but all xfs_writepage() calls are
> from unbound workqueues - the writeback workers - while
> xfs_setfilesize() are from bound workqueues, so I wondered why that
> was and looked at the code and the setsize functions are run off of a
> separate work item which is queued from the end_bio callback and I
> can't tell who would be waiting for them. Dave, what am I missing?
xfs_setfilesize runs transactions, so it can't be run from IO
completion context as it needs to block (i.e. on log space or inode
locks). It also can't block log IO completion, nor metadata Io
completion, as only log IO completion can free log space, and the
inode lock might be waiting on metadata buffer IO completion (e.g.
during delayed allocation). Hence we have multiple IO completion
workqueues to keep these things separated and deadlock free. i.e.
they all get punted to a workqueue where they are then processed in
a context that can block safely.
> kworker/u8:1-1563 [002] 22016.987530: xfs_writepage: dev 253:6 ino 0xef64fe pgoff 0x9ff000 size 0xa00000 offset 0 length 0 delalloc 1 unwritten 0
There will be one of these per page that the VM submits to XFS via
->writepage. There won't be one per page written, because XFS clusters
writes itself. This
trace is telling us that the page at offset 0x9ff000 was submitted,
the in-memory size of the inode at this time is 0xa00000 (i.e. this
is the last dirty page in memory) and that the it is a delayed
allocation extent (i.e. hasn't been written before).
> kworker/2:1-49 [002] 22017.373595: xfs_setfilesize: dev 253:6 ino 0xef6504 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
There will be one of these per IO completion that extends the inode
size. This one tells us the in-memory inode size is 0xa00000, the
current on-disk inode size is 0, and the IO being completed spans
the offsets 0 to 10481664 (0x9ff000). Which means it does not
include the page submitted by the above trace, and after the setsize
transaction, isize=0xa00000 and disize=0x9ff000.
Note that these two traces are from different inodes - you need to
match traces from "ino 0xef6504" with other traces from the same
inode.
Also, note that the trace is not complete - there are many, many
missing trace events in the output....
What is interesting from the trace is that all the file size updates
have this pattern:
kworker/2:1-49 [002] 22017.377918: xfs_setfilesize: dev 253:6 ino 0xef64fd isize 0xa00000 disize 0x0 offset 0x0 count 10481664
kworker/2:1-49 [002] 22017.378438: xfs_setfilesize: dev 253:6 ino 0xef64fd isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
There are two IOs being done - one for everything but the last page,
and one for the last page. This is either a result of the writeback
context limiting the number of pages per writeback slice, or the
page clustering that XFS does in xfs_vm_writepage() not quite
getting everything right (maybe an off-by-one?).
However, this doesn't appear to be a contributing factor. The 9
files that have the wrong file size at the end of the test match up
exactly with the last 9 writepage submissions and IO completions;
they happen after all the IO completions occur for all the good
files.
This implies that the sync is either not submitting all the inodes
for IO correctly or it is not waiting for all the inodes it
submitted to be marked clean. We really need the writeback control
tracepoints in the output to determine exactly what the sync was
doing when it submitted these last inodes for writeback....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Aug 19, 2015 at 07:56:11AM +1000, Dave Chinner wrote:
> On Tue, Aug 18, 2015 at 12:54:39PM -0700, Tejun Heo wrote:
[snip]
>
> I'd suggest looking at some of the XFS tracepoints during the test:
>
> tracepoint trigger
> xfs_file_buffered_write once per write syscall
> xfs_file_sync once per fsync per inode
> xfs_vm_writepage every ->writepage call
> xfs_setfilesize every IO completion that updates inode size
>
> And it's probably best to also include all the writeback
> tracepoints, too, for context. That will tell you what inodes and
> what part of them are getting written back and when....
I finally reproduced generic/048 with both xfs and writeback tracepoints
enabled, please download the trace dat file and trace report file from
http://128.199.137.77/writeback/
Thanks,
Eryu
On Fri, Aug 21, 2015 at 06:20:53PM +0800, Eryu Guan wrote:
> On Wed, Aug 19, 2015 at 07:56:11AM +1000, Dave Chinner wrote:
> > On Tue, Aug 18, 2015 at 12:54:39PM -0700, Tejun Heo wrote:
> [snip]
> >
> > I'd suggest looking at some of the XFS tracepoints during the test:
> >
> > tracepoint trigger
> > xfs_file_buffered_write once per write syscall
> > xfs_file_sync once per fsync per inode
> > xfs_vm_writepage every ->writepage call
> > xfs_setfilesize every IO completion that updates inode size
> >
> > And it's probably best to also include all the writeback
> > tracepoints, too, for context. That will tell you what inodes and
> > what part of them are getting written back and when....
>
> I finally reproduced generic/048 with both xfs and writeback tracepoints
> enabled, please download the trace dat file and trace report file from
>
> http://128.199.137.77/writeback/
OK, so only one inode has the wrong size this time. The writeback
tracing is too verbose - it captures everything on the backing
device so there's a huge amount of noise in the trace, and I can't
filter it easily because everything is recorded as "bdi 253:0" even
though we only want traces from "dev 253:6".
As such, there are lots of missing events in the trace again. We
do not need these writeback tracepoints:
writeback_mark_inode_dirty
writeback_dirty_inode_start
writeback_dirty_inode
writeback_dirty_page
writeback_write_inode
And they are the ones causing most of the noise. This brings the
trace down from 7.1 million events to ~90,000 events and brings
the test behaviour right into focus. The inode that had the short
length:
kworker/u8:1-1563 [002] 71028.844716: writeback_single_inode_start: bdi 253:0: ino=15688963 state=I_DIRTY_SYNC|I_DIRTY_DATASYNC|I_DIRTY_PAGES|I_SYNC dirtied_when=4356811543 age=18446744069352740 index=0 to_write=34816 wrote=0
kworker/u8:1-1563 [002] 71028.844718: wbc_writepage: bdi 253:0: towrt=34816 skip=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
kworker/u8:1-1563 [002] 71028.844740: xfs_writepage: dev 253:6 ino 0xef6503 pgoff 0x0 size 0xa00000 offset 0 length 0 delalloc 1 unwritten 0
kworker/u8:1-1563 [002] 71028.845740: wbc_writepage: bdi 253:0: towrt=32257 skip=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
kworker/u8:1-1563 [002] 71028.845741: xfs_writepage: dev 253:6 ino 0xef6503 pgoff 0x9ff000 size 0xa00000 offset 0 length 0 delalloc 1 unwritten 0
kworker/u8:1-1563 [002] 71028.845788: writeback_single_inode: bdi 253:0: ino=15688963 state=I_SYNC dirtied_when=4356811543 age=18446744069352740 index=2559 to_write=34816 wrote=2560
And so we can see that writeback pushed all 2560 pages of the file
to disk.
However, because of all the noise, the xfs io completion events are
missing for this inode. I know that at least one of them occurred,
because there is this transaction in the log:
INODE: #regs: 3 ino: 0xef6503 flags: 0x5 dsize: 16
size 0x9ff000 nblocks 0xa00 extsize 0x0 nextents 0x1
It is, however, the last inode to be updated in the log before
the unmount record, and it is the only one that does not have a size
of 0xa00000 bytes. It has the right block count, but it appears
that we haven't captured the final IO completion transaction. It was
most definitely not the last inode written by writeback; it was the
6th last, and that is ordered correctly given the file name was
"993", the 6th last file created by the test.
However, I see completions for the inode written before (0xef6502)
and after (0xef6504) but none for 0xef6503. Yet from the trace in
the log we know that at least one of them occurred, because there's
a transaction to say it happened.
As it is, there is an off-by-one in the page clustering mapping
check in XFS that is causing the last page of the inode to be issued
as a separate IO. That's not the cause of the problem however,
because we can see from the trace that the IO for the entire file
appears to be issued. What we don't see yet is what is happening on
the IO completion side, and hence why the sync code is not waiting
correctly for all the IO that was issued to be waited on properly.
Eryu, can you try again, this time manually specifying the writeback
tracepoints so you exclude the really noisy ones? You can also drop
the xfs_file_buffered_write and xfs_file_fsync tracepoints as well,
as we can see that the incoming side of the code is doing the right
thing....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Sat, Aug 22, 2015 at 10:30:25AM +1000, Dave Chinner wrote:
> On Fri, Aug 21, 2015 at 06:20:53PM +0800, Eryu Guan wrote:
[snip]
>
> Eryu, can you try again, this time manually specifying the writeback
> tracepoints so you exclude the really noisy ones? You can also drop
> the xfs_file_buffered_write and xfs_file_fsync tracepoints as well,
> as we can see that the incoming side of the code is doing the right
> thing....
I excluded the writeback tracepoints you mentioned
writeback_mark_inode_dirty
writeback_dirty_inode_start
writeback_dirty_inode
writeback_dirty_page
writeback_write_inode
and left all other writeback tracepoints enabled, also dropped
xfs_file_buffered_write and xfs_file_fsync.
This time I can reproduce generic/048 quickly and please download the
trace info from below
http://128.199.137.77/writeback-v2/
Thanks,
Eryu
On Sat, Aug 22, 2015 at 12:46:09PM +0800, Eryu Guan wrote:
> On Sat, Aug 22, 2015 at 10:30:25AM +1000, Dave Chinner wrote:
> > On Fri, Aug 21, 2015 at 06:20:53PM +0800, Eryu Guan wrote:
> [snip]
> >
> > Eryu, can you try again, this time manually specifying the writeback
> > tracepoints so you exclude the really noisy ones? You can also drop
> > the xfs_file_buffered_write and xfs_file_fsync tracepoints as well,
> > as we can see that the incoming side of the code is doing the right
> > thing....
>
> I excluded the writeback tracepoints you mentioned
>
> writeback_mark_inode_dirty
> writeback_dirty_inode_start
> writeback_dirty_inode
> writeback_dirty_page
> writeback_write_inode
>
> and left all other writeback tracepoints enabled, also dropped
> xfs_file_buffered_write and xfs_file_fsync.
>
> This time I can reproduce generic/048 quickly and please download the
> trace info from below
>
> http://128.199.137.77/writeback-v2/
ok:
$ ls -li /mnt/scr
total 102396
15688948 -rw------- 1 root root 0 Aug 22 14:31 978
15688950 -rw------- 1 root root 0 Aug 22 14:31 980
15688952 -rw------- 1 root root 10481664 Aug 22 14:31 982
15688957 -rw------- 1 root root 0 Aug 22 14:31 987
15688961 -rw------- 1 root root 0 Aug 22 14:31 991
15688963 -rw------- 1 root root 0 Aug 22 14:31 993
15688964 -rw------- 1 root root 0 Aug 22 14:31 994
15688966 -rw------- 1 root root 0 Aug 22 14:31 996
15688967 -rw------- 1 root root 0 Aug 22 14:31 997
15688968 -rw------- 1 root root 0 Aug 22 14:31 998
$
So, looking at what is on disk and what is in the log:
Inode # Size block count flushiter
dec hex inode log inode log inode log
15688948 0xef64f4 0 0 0xa00 0xa00 0 0
15688950 0xef64f6 0 0 0xa00 0xa00 0 0
15688952 0xef64f8 0x9ff000 0x9ff000 0x9ff 0xa00 1 0
15688957 0xef64fd 0 0 0xa00 0xa00 0 0
15688961 0xef6501 0 0 0xa00 0xa00 0 0
15688963 0xef6503 0 0 0xa00 0xa00 0 0
15688964 0xef6504 0 0 0xa00 0xa00 0 0
15688966 0xef6506 0 0 0xa00 0xa00 0 0
15688967 0xef6507 0 0 0xa00 0xa00 0 0
15688968 0xef6508 0 0 0xa00 0xa00 0 0
Now, inode #15688952 looks like there's some weirdness going on
there with a non-zero flushiter and a block count that doesn't match
between what is in the log and what is on disk. However, this is a
result of the second mount that checks the file sizes and extent
counts - it loads the inode into memory, checks it, and then when
it is purged from the cache on unmount the blocks beyond EOF are
punched away and the inode writen to disk. Hence there is a second
transaction in the log for that inode after all the other inodes
have been unlinked:
INODE: #regs: 3 ino: 0xef64f8 flags: 0x5 dsize: 16
.....
size 0x9ff000 nblocks 0x9ff extsize 0x0 nextents 0x1
It is preceded in the log by modifications to the AGF and free
space btree buffers. It's then followed by the superblock buffer and
the unmount record. Hence this is not unexpected.
What it does tell us, though, is that the log never recorded file
size changes for all of the inodes with zero size. We see the block
count of 0xa00, which means the delayed allocation transaction
during IO submission has hit the disk, but there are none of the
IO completion transactions in the log.
So let's go look at the event trace now that we know the EOF
size update transactions were not run before the filesystem shut
down.
Inode # writeback completion
hex first last first last
0xef64f4 0-0x9ff000 yes no no
0xef64f6 0-0x9ff000 yes no no
0xef64f8 0-0x9ff000 yes no no
0xef64fd 0-0x9ff000 yes no no
0xef6501 0-0x9ff000 yes no no
0xef6503 no no no no
0xef6504 no no no no
0xef6506 no no no no
0xef6507 no no no no
0xef6508 no no no no
Ok, so we still can't trust the event trace to be complete - we know
from the log and the on-disk state that that allocation occurred for
those last 5 inodes, so we can't read anything into the fact the
traces for completions are missing.
Eryu, can you change the way you run the event trace to be:
$ sudo trace-cmd <options> -o <outfile location> ./check <test options>
rather than running the trace as a background operation elsewhere?
Maybe that will give better results.
Also, it would be informative to us if you can reproduce this with a
v5 filesystem (i.e. mkfs.xfs -m crc=1) because it has much better
on-disk information for sequence-of-event triage like this. If you
can reproduce it with a v5 filesystem, can you post the trace and
metadump?
Other things to check (separately):
- change godown to godown -f
- add a "sleep 5" before running godown after sync
- add a "sleep 5; sync" before running godown
i.e. I'm wondering if sync is not waiting for everything, and so we
aren't capturing the IO completions because the filesystem is
already shut down by the time they are delivered...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Mon, Aug 24, 2015 at 11:11:23AM +1000, Dave Chinner wrote:
>
> Eryu, can you change the way you run the event trace to be:
>
> $ sudo trace-cmd <options> -o <outfile location> ./check <test options>
>
> rather than running the trace as a background operation elsewhere?
> Maybe that will give better results.
The results are here
http://128.199.137.77/writeback-v3/
>
> Also, it would be informative to us if you can reproduce this with a
> v5 filesystem (i.e. mkfs.xfs -m crc=1) because it has much better
> on-disk information for sequence-of-event triage like this. If you
> can reproduce it with a v5 filesystem, can you post the trace and
> metadump?
This seems to be harder to reproduce with tracepoints enabled, but I'll
keep trying, and the tests below.
Thanks,
Eryu
>
> Other things to check (separately):
>
> - change godown to godown -f
> - add a "sleep 5" before running godown after sync
> - add a "sleep 5; sync" before running godown
>
> i.e. I'm wondering if sync is not waiting for everything, and so we
> aren't capturing the IO completions because the filesystem is
> already shut down by the time they are delivered...
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
>
> _______________________________________________
> xfs mailing list
> [email protected]
> http://oss.sgi.com/mailman/listinfo/xfs
On Mon, Aug 24, 2015 at 11:18:16AM +0800, Eryu Guan wrote:
> On Mon, Aug 24, 2015 at 11:11:23AM +1000, Dave Chinner wrote:
> >
> > Eryu, can you change the way you run the event trace to be:
> >
> > $ sudo trace-cmd <options> -o <outfile location> ./check <test options>
> >
> > rather than running the trace as a background operation elsewhere?
> > Maybe that will give better results.
>
> The results are here
>
> http://128.199.137.77/writeback-v3/
OK, the trace is still missing a few events, but it's much more
complete, and contains the events that tell more of the story:
$ ls -li /mnt/scr
total 51196
15688955 -rw------- 1 root root 10481664 Aug 24 13:06 985
15688959 -rw------- 1 root root 0 Aug 24 13:06 989
15688961 -rw------- 1 root root 0 Aug 24 13:06 991
15688964 -rw------- 1 root root 0 Aug 24 13:06 994
15688966 -rw------- 1 root root 0 Aug 24 13:06 996
$
Inode # Size block count flushiter
dec hex inode log inode log inode log
15688955 0xef64fb 0x9ff000 0x9ff000 0x9ff 0xa00 1 0
15688959 0xef64ff 0 0 0xa00 0xa00 0 0
15688961 0xef6501 0 0 0xa00 0xa00 0 0
15688964 0xef6504 0 0 0xa00 0xa00 0 0
15688966 0xef6506 0 0 0xa00 0xa00 0 0
Ok, that tallies with the previous symptoms. However, the trace
tells us something different via the xfs_setfilesize events. Here are the
last 12 xfs_setfilesize events delivered in the trace (trimmed for
brevity and focus):
ino 0xef64fa isize 0xa00000 disize 0x0 offset 0x0 count 10481664
ino 0xef64fa isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 409
ino 0xef64fb isize 0xa00000 disize 0x0 offset 0x0 count 10481664
ino 0xef64fb isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
ino 0xef64ff isize 0xa00000 disize 0x0 offset 0x0 count 10481664
ino 0xef64ff isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
ino 0xef6501 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
ino 0xef6501 isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
ino 0xef6504 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
ino 0xef6504 isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
ino 0xef6506 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
ino 0xef6506 isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
Yeah, the last 9 transactions corresponding to these events are not
in the log. Which means, most likely, that they occurred after
->sync_fs forced the log out. The next thing that happens is the fs
is shut down, and that's preventing the dirty log from being written
to disk.
Eryu, this can be confirmed by adding the xfs_log_force event to the
trace.
What I can't see in the traces is where sync is doing a blocking
sync pass on the filesystem. The wbc control structure being passed
to XFS is:
wbc_writepage: bdi 253:0: towrt=45569 skip=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
Which is not coming from sync_inodes_sb() as the sync mode is
incorrect (i.e. not WB_SYNC_ALL). It looks to me that writeback is
coming from a generic bdi flusher command rather than a directed
superblock sync. i.e. through wakeup_flusher_threads() which sets:
work->sync_mode = WB_SYNC_NONE;
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
work->reason = reason;
work->auto_free = 1;
as the reason is "sync":
sync-18849 writeback_queue: bdi 253:0: sb_dev 0:0 nr_pages=308986 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
sync-18849 writeback_queue: bdi 253:0: sb_dev 253:1 nr_pages=9223372036854775807 sync_mode=1 kupdate=0 range_cyclic=0 background=0 reason=sync
....
kworker/u8:1-1563 writeback_exec: bdi 253:0: sb_dev 0:0 nr_pages=308986 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
kworker/u8:1-1563 writeback_start: bdi 253:0: sb_dev 0:0 nr_pages=308986 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
The next writeback_queue/writeback_exec tracepoint pair are:
....
kworker/2:1-17163 xfs_setfilesize: dev 253:6 ino 0xef6506 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
kworker/2:1-17163 xfs_setfilesize: dev 253:6 ino 0xef6506 isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775798 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775797 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775796 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775795 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
umount-18852 writeback_queue: bdi 253:0: sb_dev 253:6 nr_pages=22059 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
kworker/u8:1-1563 writeback_exec: bdi 253:0: sb_dev 253:6 nr_pages=22059 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
....
which shows unmount being the next writeback event queued and
executed after the IO completions have come in (that missed the
log). What is missing is the specific queue/exec events for
sync_sb_inodes() from the sync code for each filesystem.
Anyway, Eryu, the long and short of it is that you don't need to worry
about testing all the different combinations - we now know that the
completion events are occurring, so let's focus on whether the sync
code is not waiting for them correctly. Can you trace the following
events:
xfs_log_force
xfs_setfilesize
writeback_queue
writeback_exec
writeback_start
writeback_queue_io
writeback_written
writeback_pages_written
basically I'm trying to see if we've got all the BDI events as we'd
expect them to be queued and run for sync, and when the ->sync_fs
call occurs during the sync process before shutdown and unmount...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Mon, Aug 24, 2015 at 04:24:25PM +1000, Dave Chinner wrote:
> On Mon, Aug 24, 2015 at 11:18:16AM +0800, Eryu Guan wrote:
> > On Mon, Aug 24, 2015 at 11:11:23AM +1000, Dave Chinner wrote:
> > >
> > > Eryu, can you change the way you run the event trace to be:
> > >
> > > $ sudo trace-cmd <options> -o <outfile location> ./check <test options>
> > >
> > > rather than running the trace as a background operation elsewhere?
> > > Maybe that will give better results.
[snip]
> Anyway, Eryu, the long and short of it is that you don't need to worry
> about testing all the different combinations - we now know that the
> completion events are occurring, so let's focus on whether the sync
> code is not waiting for them correctly. Can you trace the following
> events:
>
> xfs_log_force
> xfs_setfilesize
> writeback_queue
> writeback_exec
> writeback_start
> writeback_queue_io
> writeback_written
> writeback_pages_written
>
> basically I'm trying to see if we've got all the BDI events as we'd
> expect them to be queued and run for sync, and when the ->sync_fs
> call occurs during the sync process before shutdown and unmount...
I collected two versions of trace info with crc enabled.
http://128.199.137.77/writeback-crc/
This version traced the same events as previous runs.
http://128.199.137.77/writeback-crc-v2/
And this version only traced the events you listed above.
And the results of other tests to check(all done with v4 xfs, with no
tracepoints enabled):
> Other things to check (separately):
> - change godown to godown -f
Passed 100 loops.
> - add a "sleep 5" before running godown after sync
Failed, if you need the trace info please let me know.
> - add a "sleep 5; sync" before running godown
Passed 100 loops.
Thanks,
Eryu
On Mon, Aug 24, 2015 at 04:34:37PM +0800, Eryu Guan wrote:
> On Mon, Aug 24, 2015 at 04:24:25PM +1000, Dave Chinner wrote:
> > On Mon, Aug 24, 2015 at 11:18:16AM +0800, Eryu Guan wrote:
> > > On Mon, Aug 24, 2015 at 11:11:23AM +1000, Dave Chinner wrote:
> > > >
> > > > Eryu, can you change the way you run the event trace to be:
> > > >
> > > > $ sudo trace-cmd <options> -o <outfile location> ./check <test options>
> > > >
> > > > rather than running the trace as a background operation elsewhere?
> > > > Maybe that will give better results.
> [snip]
> > Anyway, Eryu, the long and short of it is that you don't need to worry
> > about testing all the different combinations - we now know that the
> > completion events are occurring, so let's focus on whether the sync
> > code is not waiting for them correctly. Can you trace the following
> > events:
> >
> > xfs_log_force
> > xfs_setfilesize
> > writeback_queue
> > writeback_exec
> > writeback_start
> > writeback_queue_io
> > writeback_written
> > writeback_pages_written
> >
> > basically I'm trying to see if we've got all the BDI events as we'd
> > expect them to be queued and run for sync, and when the ->sync_fs
> > call occurs during the sync process before shutdown and unmount...
>
> I collected two versions of trace info with crc enabled.
>
> http://128.199.137.77/writeback-crc/
>
> This version traced the same events as previous runs.
>
> http://128.199.137.77/writeback-crc-v2/
>
> And this version only traced the events you listed above.
OK, I'll look into these later.
> And the results of other tests to check(all done with v4 xfs, with no
> tracepoints enabled):
>
> > Other things to check (separately):
> > - change godown to godown -f
>
> Passed 100 loops.
Yup, I expected that from the last set of traces - the "-f" flag
triggers a log force before shutdown, and that flushes out
transactions that sync missed.
> > - add a "sleep 5" before running godown after sync
>
> Failed, if you need the trace info please let me know.
Expected, still nothing to flush transactions before shutdown.
> > - add a "sleep 5; sync" before running godown
>
> Passed 100 loops.
expected - sync flushed the transactions it missed on the first
pass.
Thanks for running these tests!
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Mon 24-08-15 16:24:25, Dave Chinner wrote:
> On Mon, Aug 24, 2015 at 11:18:16AM +0800, Eryu Guan wrote:
> > On Mon, Aug 24, 2015 at 11:11:23AM +1000, Dave Chinner wrote:
> > >
> > > Eryu, can you change the way you run the event trace to be:
> > >
> > > $ sudo trace-cmd <options> -o <outfile location> ./check <test options>
> > >
> > > rather than running the trace as a background operation elsewhere?
> > > Maybe that will give better results.
> >
> > The results are here
> >
> > http://128.199.137.77/writeback-v3/
<snip>
> What I can't see in the traces is where sync is doing a blocking
> sync pass on the filesystem. The wbc control structure being passed
> to XFS is:
>
> wbc_writepage: bdi 253:0: towrt=45569 skip=0 mode=0 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
>
> Which is not coming from sync_inodes_sb() as the sync mode is
> incorrect (i.e. not WB_SYNC_ALL). It looks to me that writeback is
> coming from a generic bdi flusher command rather than a directed
> superblock sync. i.e. through wakeup_flusher_threads() which sets:
>
> work->sync_mode = WB_SYNC_NONE;
> work->nr_pages = nr_pages;
> work->range_cyclic = range_cyclic;
> work->reason = reason;
> work->auto_free = 1;
>
> as the reason is "sync":
>
> sync-18849 writeback_queue: bdi 253:0: sb_dev 0:0 nr_pages=308986 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
> sync-18849 writeback_queue: bdi 253:0: sb_dev 253:1 nr_pages=9223372036854775807 sync_mode=1 kupdate=0 range_cyclic=0 background=0 reason=sync
> ....
> kworker/u8:1-1563 writeback_exec: bdi 253:0: sb_dev 0:0 nr_pages=308986 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
> kworker/u8:1-1563 writeback_start: bdi 253:0: sb_dev 0:0 nr_pages=308986 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
>
> The next writeback_queue/writeback_exec tracepoint pair are:
>
> ....
> kworker/2:1-17163 xfs_setfilesize: dev 253:6 ino 0xef6506 isize 0xa00000 disize 0x0 offset 0x0 count 10481664
> kworker/2:1-17163 xfs_setfilesize: dev 253:6 ino 0xef6506 isize 0xa00000 disize 0x9ff000 offset 0x9ff000 count 4096
> sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775798 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
> sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775797 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
> sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775796 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
> sync-18849 wbc_writepage: bdi 253:0: towrt=9223372036854775795 skip=0 mode=1 kupd=0 bgrd=0 reclm=0 cyclic=0 start=0x0 end=0x7fffffffffffffff
> umount-18852 writeback_queue: bdi 253:0: sb_dev 253:6 nr_pages=22059 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
> kworker/u8:1-1563 writeback_exec: bdi 253:0: sb_dev 253:6 nr_pages=22059 sync_mode=0 kupdate=0 range_cyclic=0 background=0 reason=sync
> ....
>
> which shows unmount being the next writeback event queued and
> executed after the IO completions have come in (that missed the
> log). What is missing is the specific queue/exec events for
> sync_sb_inodes() from the sync code for each filesystem.
Bah, I see the problem and indeed it was introduced by commit e79729123f639
"writeback: don't issue wb_writeback_work if clean". The problem is that
we bail out of sync_inodes_sb() if there is no dirty IO. Which is wrong
because we have to wait for any outstanding IO (i.e. call wait_sb_inodes())
regardless of dirty state! And that also explains why Tejun's patch fixes
the problem because it backs out the change to the exit condition in
sync_inodes_sb().
So Tejun's patch from this thread is indeed fixing the real problem but the
comment in sync_inodes_sb() should be fixed to mention wait_sb_inodes()
must be called in all cases... Tejun, will you fixup the comment please?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hello, Jan.
On Mon, Aug 24, 2015 at 11:19:59AM +0200, Jan Kara wrote:
> > which shows unmount being the next writeback event queued and
> > executed after the IO completions have come in (that missed the
> > log). What is missing is the specific queue/exec events for
> > sync_sb_inodes() from the sync code for each filesystem.
>
> Bah, I see the problem and indeed it was introduced by commit e79729123f639
> "writeback: don't issue wb_writeback_work if clean". The problem is that
> we bail out of sync_inodes_sb() if there is no dirty IO. Which is wrong
> because we have to wait for any outstanding IO (i.e. call wait_sb_inodes())
> regardless of dirty state! And that also explains why Tejun's patch fixes
> the problem because it backs out the change to the exit condition in
> sync_inodes_sb().
Dang, I'm an idiot sandwich.
> So Tejun's patch from this thread is indeed fixing the real problem but the
> comment in sync_inodes_sb() should be fixed to mention wait_sb_inodes()
> must be called in all cases... Tejun, will you fixup the comment please?
Will post an updated patch. Kudos to Eryu and Dave for chasing it
down.
Thanks a lot.
--
tejun
Hello,
On Mon, Aug 24, 2015 at 10:51:50AM -0400, Tejun Heo wrote:
> > Bah, I see the problem and indeed it was introduced by commit e79729123f639
> > "writeback: don't issue wb_writeback_work if clean". The problem is that
> > we bail out of sync_inodes_sb() if there is no dirty IO. Which is wrong
> > because we have to wait for any outstanding IO (i.e. call wait_sb_inodes())
> > regardless of dirty state! And that also explains why Tejun's patch fixes
> > the problem because it backs out the change to the exit condition in
> > sync_inodes_sb().
>
> Dang, I'm an idiot sandwich.
A question tho, so this means that an inode may contain dirty or
writeback pages w/o the inode being on one of the dirty lists.
Looking at the generic filesystem and writeback code, this doesn't
seem true in general. Is this something xfs specific?
Thanks.
--
tejun
Hello, Dave.
On Fri, Aug 21, 2015 at 09:04:51AM +1000, Dave Chinner wrote:
> > Maybe I'm misunderstanding the code but all xfs_writepage() calls are
> > from unbound workqueues - the writeback workers - while
> > xfs_setfilesize() are from bound workqueues, so I wondered why that
> > was and looked at the code and the setsize functions are run off of a
> > separate work item which is queued from the end_bio callback and I
> > can't tell who would be waiting for them. Dave, what am I missing?
>
> xfs_setfilesize runs transactions, so it can't be run from IO
> completion context as it needs to block (i.e. on log space or inode
> locks). It also can't block log IO completion, nor metadata IO
> completion, as only log IO completion can free log space, and the
> inode lock might be waiting on metadata buffer IO completion (e.g.
> during delayed allocation). Hence we have multiple IO completion
> workqueues to keep these things separated and deadlock free. i.e.
> they all get punted to a workqueue where they are then processed in
> a context that can block safely.
I'm still a bit confused. What prevents the following from happening?
1. io completion of last dirty page of an inode and work item for
xfs_setfilesize() is queued.
2. inode removed from dirty list.
3. __sync_filesystem() invokes sync_inodes_sb(). There are no dirty
pages, so it finishes.
4. xfs_fs_sync_fs() is called which calls _xfs_log_force() but the
work item from #1 hasn't run yet, so the size update isn't written
out.
5. Crash.
Is it that _xfs_log_force() waits for the setfilesize transaction
created during writepage?
Thanks.
--
tejun
On Mon 24-08-15 13:11:44, Tejun Heo wrote:
> Hello,
>
> On Mon, Aug 24, 2015 at 10:51:50AM -0400, Tejun Heo wrote:
> > > Bah, I see the problem and indeed it was introduced by commit e79729123f639
> > > "writeback: don't issue wb_writeback_work if clean". The problem is that
> > > we bail out of sync_inodes_sb() if there is no dirty IO. Which is wrong
> > > because we have to wait for any outstanding IO (i.e. call wait_sb_inodes())
> > > regardless of dirty state! And that also explains why Tejun's patch fixes
> > > the problem because it backs out the change to the exit condition in
> > > sync_inodes_sb().
> >
> > Dang, I'm an idiot sandwich.
>
> A question tho, so this means that an inode may contain dirty or
> writeback pages w/o the inode being on one of the dirty lists.
> Looking at the generic filesystem and writeback code, this doesn't
> seem true in general. Is this something xfs specific?
An inode may contain writeback pages (but not dirty pages) without being on
any of the dirty lists. That is correct. Josef Bacik had patches to create
a list to track inodes with pages under writeback but they clashed with
your patch series and they didn't get rebased yet AFAIR.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hello, Jan.
On Mon, Aug 24, 2015 at 09:08:47PM +0200, Jan Kara wrote:
> Inode may contain writeback pages (but not dirty pages) without being on
> any of the dirty lists. That is correct. Josef Bacik had patches to create
Hmmm... Can you please expand on how / why that happens? It's kinda
weird to require writeback to walk all inodes regardless of their
dirty states.
> a list to track inodes with pages under writeback but they clashed with
> your patch series and they didn't get rebased yet AFAIR.
Wouldn't it make more sense to simply put them on one of the existing
b_* lists?
Thanks.
--
tejun
On Mon 24-08-15 15:32:42, Tejun Heo wrote:
> Hello, Jan.
>
> On Mon, Aug 24, 2015 at 09:08:47PM +0200, Jan Kara wrote:
> > Inode may contain writeback pages (but not dirty pages) without being on
> > any of the dirty lists. That is correct. Josef Bacik had patches to create
>
> Hmmm... Can you please expand on how / why that happens? It's kinda
> weird to require writeback to walk all inodes regardless of their
> dirty states.
It is inefficient, yes. But note that 'writeback' and 'dirty' states are
completely independent. Page can be in any of the !dirty & !writeback,
dirty & !writeback, !dirty & writeback, dirty & writeback states. So mixing
tracking of writeback and dirty state of an inode just makes the code even
messier.
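For reference, the two states live in independent page flags (PG_dirty and
PG_writeback), so all four combinations above are representable. A minimal
illustrative sketch using only the standard page-flag helpers:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* PG_dirty and PG_writeback are separate bits; neither implies the other. */
static const char *page_wb_state(struct page *page)
{
	int dirty = PageDirty(page);
	int wb = PageWriteback(page);

	if (dirty && wb)
		return "dirty && writeback";	/* redirtied while under IO */
	if (dirty)
		return "dirty && !writeback";	/* waiting for submission */
	if (wb)
		return "!dirty && writeback";	/* IO in flight */
	return "!dirty && !writeback";		/* clean */
}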
> > a list to track inodes with pages under writeback but they clashed with
> > your patch series and they didn't get rebased yet AFAIR.
>
> Wouldn't it make more sense to simply put them on one of the existing
> b_* lists?
Logically it just doesn't make sense because as I wrote above dirty and
writeback states are completely independent. Also you'd have to detect &
skip inodes that don't really have any dirty pages to write and all the
detection of "is there any data to write" would get more complicated. A
separate list for inodes under writeback as Josef did is IMO the cleanest
solution.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hello,
On Mon, Aug 24, 2015 at 11:09:27PM +0200, Jan Kara wrote:
> It is inefficient, yes. But note that 'writeback' and 'dirty' states are
> completely independent. Page can be in any of the !dirty & !writeback,
That isn't true for pages being dirtied through set_page_dirty().
It's guaranteed that a dirty inode remains on one of the b_* lists
till there's no dirty page and writeback is complete.
> dirty & !writeback, !dirty & writeback, dirty & writeback states. So mixing
> tracking of writeback and dirty state of an inode just makes the code even
> messier.
I'm curious where and why they would deviate. Can you give me some
examples? AFAICS, anything which uses the usual set_page_dirty() path
shouldn't do that.
> > > a list to track inodes with pages under writeback but they clashed with
> > > your patch series and they didn't get rebased yet AFAIR.
> >
> > Wouldn't it make more sense to simply put them on one of the existing
> > b_* lists?
>
> Logically it just doesn't make sense because as I wrote above dirty and
> writeback states are completely independent. Also you'd have to detect &
> skip inodes that don't really have any dirty pages to write and all the
> detection of "is there any data to write" would get more complicated. A
> separate list for inodes under writeback as Josef did is IMO the cleanest
> solution.
Given that the usual code path tracks dirty and writeback together, I
don't think it's nonsensical; however, I'm more curious how common
the writeback w/o dirtying case is.
Thanks.
--
tejun
On Mon, Aug 24, 2015 at 02:10:38PM -0400, Tejun Heo wrote:
> Hello, Dave.
>
> On Fri, Aug 21, 2015 at 09:04:51AM +1000, Dave Chinner wrote:
> > > Maybe I'm misunderstanding the code but all xfs_writepage() calls are
> > > from unbound workqueues - the writeback workers - while
> > > xfs_setfilesize() are from bound workqueues, so I wondered why that
> > > was and looked at the code and the setsize functions are run off of a
> > > separate work item which is queued from the end_bio callback and I
> > > can't tell who would be waiting for them. Dave, what am I missing?
> >
> > xfs_setfilesize runs transactions, so it can't be run from IO
> > completion context as it needs to block (i.e. on log space or inode
> > locks). It also can't block log IO completion, nor metadata IO
> > completion, as only log IO completion can free log space, and the
> > inode lock might be waiting on metadata buffer IO completion (e.g.
> > during delayed allocation). Hence we have multiple IO completion
> > workqueues to keep these things separated and deadlock free. i.e.
> > they all get punted to a workqueue where they are then processed in
> > a context that can block safely.
>
> I'm still a bit confused. What prevents the following from happening?
>
> 1. io completion of last dirty page of an inode and work item for
> xfs_setfilesize() is queued.
>
> 2. inode removed from dirty list.
The inode has already been removed from the dirty list - that
happens at inode writeback submission time, not IO completion.
> 3. __sync_filesystem() invokes sync_inodes_sb(). There are no dirty
> pages, so it finishes.
There are no dirty pages, but the pages aren't clean, either, i.e.
they are still under writeback. Hence we need to invoke
wait_sb_inodes() to wait for writeback on all pages to complete
before returning.
> 4. xfs_fs_sync_fs() is called which calls _xfs_log_force() but the
> work item from #1 hasn't run yet, so the size update isn't written
> out.
The bug here is that wait_sb_inodes() has not been run, therefore
->syncfs is being run before IO completions have been processed and
pages marked clean.
> 5. Crash.
>
> Is it that _xfs_log_force() waits for the setfilesize transaction
> created during writepage?
No, it's wait_sb_inodes() that does the waiting for data IO
completion for sync.
Cheers,
Dave.
--
Dave Chinner
[email protected]
Hello, Dave.
On Tue, Aug 25, 2015 at 08:27:20AM +1000, Dave Chinner wrote:
> > I'm still a bit confused. What prevents the following from happening?
> >
> > 1. io completion of last dirty page of an inode and work item for
> > xfs_setfilesize() is queued.
> >
> > 2. inode removed from dirty list.
>
> The inode has already been removed from the dirty list - that
> happens at inode writeback submission time, not IO completion.
Ah, yeah, right, somehow I was thinking requeue_io() was being called
from the completion path. That's where I was confused.
Thanks.
--
tejun
Hello,
On Mon, Aug 24, 2015 at 05:45:35PM -0400, Tejun Heo wrote:
> That isn't true for pages being dirtied through set_page_dirty().
> It's guaranteed that a dirty inode remains on one of the b_* lists
> till there's no dirty page and writeback is complete.
I got confused here. Inodes get removed from the b_* lists once all
IOs are issued.
Thanks.
--
tejun
On Mon, Aug 24, 2015 at 05:45:35PM -0400, Tejun Heo wrote:
> Hello,
>
> On Mon, Aug 24, 2015 at 11:09:27PM +0200, Jan Kara wrote:
> > It is inefficient, yes. But note that 'writeback' and 'dirty' states are
> > completely independent. Page can be in any of the !dirty & !writeback,
>
> That isn't true for pages being dirtied through set_page_dirty().
> It's guaranteed that a dirty inode remains on one of the b_* lists
> till there's no dirty page and writeback is complete.
IO submission calls clear_page_dirty_for_io(), so by the time that
all the pages have been submitted for IO, there are no dirty pages.
IO submission also calls set_page_writeback() once the filesystem
has decided to do IO on the page, and then IO completion calls
end_page_writeback() to clear that state.
IOWs, the page transitions from (dirty && !writeback) before
submission to (!dirty && writeback) after submission, and to (!dirty
&& !writeback) once IO completion occurs.
And you'll note that filemap_fdatawait() blocks on pages with the
PAGECACHE_TAG_WRITEBACK tag set in the mapping tree, not dirty
pages. Hence sync has to wait for all pages to transition out of
writeback before we can consider the inode to be clean.
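To put that in code terms, here is a condensed sketch of the
submission/completion/wait transitions (page locking, the actual bio
submission and error handling are all elided; this is not any particular
filesystem's ->writepage):

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>

/* Submission: (dirty && !writeback) -> (!dirty && writeback). */
static void submit_one_page(struct page *page)
{
	if (!clear_page_dirty_for_io(page))	/* clears PG_dirty */
		return;				/* someone else cleaned it */
	set_page_writeback(page);		/* sets PG_writeback and tags the
						 * mapping PAGECACHE_TAG_WRITEBACK */
	/* ... build and submit the bio here ... */
}

/* IO completion: (!dirty && writeback) -> (!dirty && !writeback). */
static void one_page_io_done(struct page *page)
{
	end_page_writeback(page);		/* clears PG_writeback, wakes waiters */
}

/* Sync side: waits on PAGECACHE_TAG_WRITEBACK pages, not on PG_dirty. */
static int wait_for_data(struct address_space *mapping)
{
	return filemap_fdatawait(mapping);
}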
> > dirty & !writeback, !dirty & writeback, dirty & writeback states. So mixing
> > tracking of writeback and dirty state of an inode just makes the code even
> > messier.
>
> I'm curious where and why they would deviate. Can you give me some
> examples? AFAICS, anything which uses the usual set_page_dirty() path
> shouldn't do that.
mmaped files.
page_mkwrite dirties page (dirty && !writeback)
writepage clear_page_dirty_for_io (!dirty && !writeback)
writepage starts writeback (!dirty && writeback)
page_mkwrite dirties page (dirty && writeback)
io completes (dirty && !writeback)
This is done so we don't lose dirty state from page faults whilst
the page is under IO and hence have sync miss the page next time
through....
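In terms of the same helpers, a minimal sketch of that interleaving
(again purely illustrative, with locking and the real IO path omitted):

#include <linux/mm.h>
#include <linux/page-flags.h>

static void writeback_side(struct page *page)
{
	clear_page_dirty_for_io(page);	/* dirty && !writeback -> !dirty && !writeback */
	set_page_writeback(page);	/* -> !dirty && writeback; bio goes out */
}

static void fault_side(struct page *page)	/* page_mkwrite while IO is in flight */
{
	set_page_dirty(page);		/* -> dirty && writeback */
}

static void completion_side(struct page *page)
{
	end_page_writeback(page);	/* -> dirty && !writeback: the next sync
					 * pass still sees the page as dirty */
}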
Of course, this behaviour is different if you have a filesystem or
block device that requires stable pages (e.g. btrfs for data CRC
validity). In this case, the page fault will block until the
writeback state goes away...
> > > > a list to track inodes with pages under writeback but they clashed with
> > > > your patch series and they didn't get rebased yet AFAIR.
> > >
> > > Wouldn't it make more sense to simply put them on one of the existing
> > > b_* lists?
> >
> > Logically it just doesn't make sense because as I wrote above dirty and
> > writeback states are completely independent. Also you'd have to detect &
> > skip inodes that don't really have any dirty pages to write and all the
> > detection of "is there any data to write" would get more complicated. A
> > separate list for inodes under writeback as Josef did is IMO the cleanest
> > solution.
>
> Given that the usual code path tracks dirty and writeback together, I
> don't think it's nonsensical; however, I'm more curious how common
> the writeback w/o dirtying case is.
I suspect you've misunderstood the progression here. You can't get
writeback without first going through dirty. But the transition to
writeback clears the dirty page state so that we can capture page
state changes while writeback is in progress.
Cheers,
Dave.
--
Dave Chinner
[email protected]
e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
updated writeback path to avoid kicking writeback work items if there
are no inodes to be written out; unfortunately, the avoidance logic
was too aggressive and broke sync_inodes_sb().
* sync_inodes_sb() must write out I_DIRTY_TIME inodes but I_DIRTY_TIME
inodes don't contribute to bdi/wb_has_dirty_io() tests and were
being skipped over.
* inodes are taken off wb->b_dirty/io/more_io lists after writeback
starts on them. sync_inodes_sb() skipping wait_sb_inodes() when
bdi_has_dirty_io() breaks it by making it return while writebacks
are in-flight.
This patch fixes the breakages by
* Removing bdi_has_dirty_io() shortcut from bdi_split_work_to_wbs().
The callers are already testing the condition.
* Removing bdi_has_dirty_io() shortcut from sync_inodes_sb() so that
it always calls into bdi_split_work_to_wbs() and wait_sb_inodes().
* Making bdi_split_work_to_wbs() consider the b_dirty_time list for
WB_SYNC_ALL writebacks.
Kudos to Eryu, Dave and Jan for tracking down the issue.
Signed-off-by: Tejun Heo <[email protected]>
Fixes: e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
Link: http://lkml.kernel.org/g/[email protected]
Reported-and-bisected-by: Eryu Guan <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: Ted Ts'o <[email protected]>
---
fs/fs-writeback.c | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -844,14 +844,15 @@ static void bdi_split_work_to_wbs(struct
struct wb_iter iter;
might_sleep();
-
- if (!bdi_has_dirty_io(bdi))
- return;
restart:
rcu_read_lock();
bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
- if (!wb_has_dirty_io(wb) ||
- (skip_if_busy && writeback_in_progress(wb)))
+ /* SYNC_ALL writes out I_DIRTY_TIME too */
+ if (!wb_has_dirty_io(wb) &&
+ (base_work->sync_mode == WB_SYNC_NONE ||
+ list_empty(&wb->b_dirty_time)))
+ continue;
+ if (skip_if_busy && writeback_in_progress(wb))
continue;
base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
@@ -899,8 +900,7 @@ static void bdi_split_work_to_wbs(struct
{
might_sleep();
- if (bdi_has_dirty_io(bdi) &&
- (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+ if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
base_work->single_wait = 0;
base_work->single_done = 0;
@@ -2275,8 +2275,12 @@ void sync_inodes_sb(struct super_block *
};
struct backing_dev_info *bdi = sb->s_bdi;
- /* Nothing to do? */
- if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+ /*
+ * Can't skip on !bdi_has_dirty() because we should wait for !dirty
+ * inodes under writeback and I_DIRTY_TIME inodes ignored by
+ * bdi_has_dirty() need to be written out too.
+ */
+ if (bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
On 08/25/2015 12:11 PM, Tejun Heo wrote:
> e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> updated writeback path to avoid kicking writeback work items if there
> are no inodes to be written out; unfortunately, the avoidance logic
> was too aggressive and broke sync_inodes_sb().
>
> * sync_inodes_sb() must write out I_DIRTY_TIME inodes but I_DIRTY_TIME
> inodes don't contribute to bdi/wb_has_dirty_io() tests and were
> being skipped over.
>
> * inodes are taken off wb->b_dirty/io/more_io lists after writeback
> starts on them. sync_inodes_sb() skipping wait_sb_inodes() when
> bdi_has_dirty_io() breaks it by making it return while writebacks
> are in-flight.
>
> This patch fixes the breakages by
>
> * Removing bdi_has_dirty_io() shortcut from bdi_split_work_to_wbs().
> The callers are already testing the condition.
>
> * Removing bdi_has_dirty_io() shortcut from sync_inodes_sb() so that
> it always calls into bdi_split_work_to_wbs() and wait_sb_inodes().
>
> * Making bdi_split_work_to_wbs() consider the b_dirty_time list for
> WB_SYNC_ALL writebacks.
>
> Kudos to Eryu, Dave and Jan for tracking down the issue.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Fixes: e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> Link: http://lkml.kernel.org/g/[email protected]
> Reported-and-bisected-by: Eryu Guan <[email protected]>
> Cc: Dave Chinner <[email protected]>
> Cc: Jan Kara <[email protected]>
> Cc: Ted Ts'o <[email protected]>
Added for 4.2.
--
Jens Axboe
On Tue 25-08-15 14:11:52, Tejun Heo wrote:
> e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> updated writeback path to avoid kicking writeback work items if there
> are no inodes to be written out; unfortunately, the avoidance logic
> was too aggressive and broke sync_inodes_sb().
>
> * sync_inodes_sb() must write out I_DIRTY_TIME inodes but I_DIRTY_TIME
> inodes don't contribute to bdi/wb_has_dirty_io() tests and were
> being skipped over.
>
> * inodes are taken off wb->b_dirty/io/more_io lists after writeback
> starts on them. sync_inodes_sb() skipping wait_sb_inodes() when
> bdi_has_dirty_io() breaks it by making it return while writebacks
> are in-flight.
>
> This patch fixes the breakages by
>
> * Removing bdi_has_dirty_io() shortcut from bdi_split_work_to_wbs().
> The callers are already testing the condition.
>
> * Removing bdi_has_dirty_io() shortcut from sync_inodes_sb() so that
> it always calls into bdi_split_work_to_wbs() and wait_sb_inodes().
>
> * Making bdi_split_work_to_wbs() consider the b_dirty_time list for
> WB_SYNC_ALL writebacks.
>
> Kudos to Eryu, Dave and Jan for tracking down the issue.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Fixes: e79729123f63 ("writeback: don't issue wb_writeback_work if clean")
> Link: http://lkml.kernel.org/g/[email protected]
> Reported-and-bisected-by: Eryu Guan <[email protected]>
> Cc: Dave Chinner <[email protected]>
> Cc: Jan Kara <[email protected]>
> Cc: Ted Ts'o <[email protected]>
> ---
> fs/fs-writeback.c | 22 +++++++++++++---------
> 1 file changed, 13 insertions(+), 9 deletions(-)
The patch looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -844,14 +844,15 @@ static void bdi_split_work_to_wbs(struct
> struct wb_iter iter;
>
> might_sleep();
> -
> - if (!bdi_has_dirty_io(bdi))
> - return;
> restart:
> rcu_read_lock();
> bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
> - if (!wb_has_dirty_io(wb) ||
> - (skip_if_busy && writeback_in_progress(wb)))
> + /* SYNC_ALL writes out I_DIRTY_TIME too */
> + if (!wb_has_dirty_io(wb) &&
> + (base_work->sync_mode == WB_SYNC_NONE ||
> + list_empty(&wb->b_dirty_time)))
> + continue;
> + if (skip_if_busy && writeback_in_progress(wb))
> continue;
>
> base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
> @@ -899,8 +900,7 @@ static void bdi_split_work_to_wbs(struct
> {
> might_sleep();
>
> - if (bdi_has_dirty_io(bdi) &&
> - (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
> + if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
> base_work->auto_free = 0;
> base_work->single_wait = 0;
> base_work->single_done = 0;
> @@ -2275,8 +2275,12 @@ void sync_inodes_sb(struct super_block *
> };
> struct backing_dev_info *bdi = sb->s_bdi;
>
> - /* Nothing to do? */
> - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
> + /*
> + * Can't skip on !bdi_has_dirty() because we should wait for !dirty
> + * inodes under writeback and I_DIRTY_TIME inodes ignored by
> + * bdi_has_dirty() need to be written out too.
> + */
> + if (bdi == &noop_backing_dev_info)
> return;
> WARN_ON(!rwsem_is_locked(&sb->s_umount));
>
>
--
Jan Kara <[email protected]>
SUSE Labs, CR