2018-03-28 04:40:21

by Qu Wenruo

Subject: [PATCH v2 1/3] fstests: log-writes: Add support to output human readable flags

Also change the numeric output of the flags to hex.

Signed-off-by: Qu Wenruo <[email protected]>
Reviewed-by: Amir Goldstein <[email protected]>
---
changelog:
v2:
Add Amir's reviewed-by tag.
---
src/log-writes/log-writes.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
index 09391574c4d2..a872429d3f41 100644
--- a/src/log-writes/log-writes.c
+++ b/src/log-writes/log-writes.c
@@ -120,6 +120,58 @@ int log_discard(struct log *log, struct log_write_entry *entry)
return 0;
}

+#define DEFINE_LOG_FLAGS_STR_ENTRY(x) \
+ {LOG_##x##_FLAG, #x}
+
+struct flags_to_str_entry {
+ u64 flags;
+ const char *str;
+} log_flags_table[] = {
+ DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
+ DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
+ DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
+ DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
+};
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define LOG_FLAGS_BUF_SIZE 128
+/*
+ * Convert numeric flags to human readable flags.
+ * @flags: numeric flags
+ * @buf: output buffer for human readable string.
+ * must have enough space (LOG_FLAGS_BUF_SIZE) to contain all
+ * the string
+ */
+static void entry_flags_to_str(u64 flags, char *buf)
+{
+ int empty = 1;
+ int left_len;
+ int i;
+
+ buf[0] = '\0';
+ for (i = 0; i < ARRAY_SIZE(log_flags_table); i++) {
+ if (flags & log_flags_table[i].flags) {
+ if (!empty)
+ strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+ empty = 0;
+ strncat(buf, log_flags_table[i].str, LOG_FLAGS_BUF_SIZE);
+ flags &= ~log_flags_table[i].flags;
+ }
+ }
+ if (flags) {
+ if (!empty)
+ strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+ empty = 0;
+ left_len = LOG_FLAGS_BUF_SIZE - strnlen(buf,
+ LOG_FLAGS_BUF_SIZE);
+ if (left_len > 0)
+ snprintf(buf + strnlen(buf, LOG_FLAGS_BUF_SIZE),
+ left_len, "UNKNOWN.0x%llx", flags);
+ }
+ if (empty)
+ strncpy(buf, "NONE", LOG_FLAGS_BUF_SIZE);
+}
+
/*
* @log: the log we are replaying.
* @entry: entry to be replayed.
@@ -179,6 +231,7 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
size_t read_size = read_data ? log->sectorsize :
sizeof(struct log_write_entry);
char *buf;
+ char flags_buf[LOG_FLAGS_BUF_SIZE];
ssize_t ret;
off_t offset;
int skip = 0;
@@ -210,19 +263,20 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
log->cur_pos += read_size;
}

+ flags = le64_to_cpu(entry->flags);
+ entry_flags_to_str(flags, flags_buf);
skip = log_should_skip(log, entry);
if (log_writes_verbose > 1 || (log_writes_verbose && !skip)) {
- printf("%s %d@%llu: sector %llu, size %llu, flags %llu\n",
+ printf("%s %d@%llu: sector %llu, size %llu, flags 0x%llx(%s)\n",
skip ? "skipping" : "replaying",
(int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
(unsigned long long)le64_to_cpu(entry->sector),
(unsigned long long)size,
- (unsigned long long)le64_to_cpu(entry->flags));
+ (unsigned long long)flags, flags_buf);
}
if (!size)
return 0;

- flags = le64_to_cpu(entry->flags);
if (flags & LOG_DISCARD_FLAG)
return log_discard(log, entry);

@@ -301,7 +355,7 @@ int log_seek_entry(struct log *log, u64 entry_num)
return -1;
}
if (log_writes_verbose > 1)
- printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+ printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx\n",
(int)i, log->cur_pos / log->sectorsize,
(unsigned long long)le64_to_cpu(entry.sector),
(unsigned long long)le64_to_cpu(entry.nr_sectors),
@@ -339,6 +393,7 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
size_t read_size = read_data ? log->sectorsize :
sizeof(struct log_write_entry);
u64 flags;
+ char flags_buf[LOG_FLAGS_BUF_SIZE];
ssize_t ret;

if (log->cur_entry >= log->nr_entries)
@@ -366,14 +421,15 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
} else {
log->cur_pos += read_size;
}
+ flags = le64_to_cpu(entry->flags);
+ entry_flags_to_str(flags, flags_buf);
if (log_writes_verbose > 1)
- printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+ printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx(%s)\n",
(int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
(unsigned long long)le64_to_cpu(entry->sector),
(unsigned long long)le64_to_cpu(entry->nr_sectors),
- (unsigned long long)le64_to_cpu(entry->flags));
+ (unsigned long long)flags, flags_buf);

- flags = le64_to_cpu(entry->flags);
read_size = le64_to_cpu(entry->nr_sectors) * log->sectorsize;
if (!read_size || (flags & LOG_DISCARD_FLAG))
return 0;
--
2.16.2
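For illustration, a minimal sketch of how the new output could be exercised
(a sketch only: it assumes replay-log accepts a -v/--verbose switch to raise
log_writes_verbose, which is not shown in this patch, and the sample entry
numbers are made up):

  # Replay the first few entries verbosely against the scratch device,
  # using the variables set up by common/dmlogwrites.
  $here/src/log-writes/replay-log -vv --log $LOGWRITES_DEV \
          --replay $SCRATCH_DEV --limit 16
  # Before this patch an entry printed as:
  #   replaying 5@42: sector 8192, size 4096, flags 2
  # With it applied, the same entry prints as:
  #   replaying 5@42: sector 8192, size 4096, flags 0x2(FUA)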



2018-03-28 04:40:23

by Qu Wenruo

Subject: [PATCH v2 3/3] fstests: generic: Check the fs after each FUA writes

Basic test case which runs fsstress on top of dm-log-writes, and then
checks the fs after each FUA write.
It includes the needed infrastructure and special handling for journal-based fs.

Signed-off-by: Qu Wenruo <[email protected]>
---
changelog:
v2:
Use proper "SUSE Linux Products GmbH" instead of "SuSE"
Get rid of dm-snapshot, which is pretty slow if we're creating and
deleting snapshots repeatedly.
(Maybe LVM thin provisioning would be much better, but the current replay
solution is good enough so far, and no slower than dm-snapshot.)
Add back run_check so that we can get the seed.
---
Unfortunately, neither xfs nor ext4 survives this test even once,
while btrfs does.
(Although that's not what I wanted; I'm just trying my luck
to reproduce a serious btrfs corruption situation.)

Although btrfs may be the fastest fs for this test, since it has a fixed,
small amount of writes at mkfs time and almost nothing to replay, it still
takes about 240s~300s to finish (the same level as with dm-snapshot).

It would take longer for ext4, due to its large amount of writes during
mkfs, if it could survive the test in the first place.

As a comparison, btrfs takes about 5 seconds to replay the log, mount,
unmount and run fsck at the end of the test.
Ext4 already takes about 5 seconds to do the same thing before it
triggers the fsck error.

Fsck failure for ext4:
_check_generic_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent
*** fsck.ext4 output ***
fsck from util-linux 2.31.1
e2fsck 1.43.8 (1-Jan-2018)
Pass 1: Checking inodes, blocks, and sizes
Inode 131076 extent tree (at level 1) could be shorter. Fix? no

Inode 131262, i_size is 0, should be 258048. Fix? no

Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information

For xfs:
_check_xfs_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent (r)
*** xfs_repair -n output ***
Phase 1 - find and verify superblock...
Phase 2 - using internal log
- zero log...
- scan filesystem freespace and inode maps...
- found root inode chunk
Phase 3 - for each AG...
- scan (but don't clear) agi unlinked lists...
- process known inodes and perform inode discovery...
- agno = 0
- agno = 1
- agno = 2
bad symlink header ino 8409190, file block 0, disk block 1051147
problem with symbolic link in inode 8409190
would have cleared inode 8409190
- agno = 3
- process newly discovered inodes...
Phase 4 - check for duplicate blocks...
- setting up duplicate extent list...
- check for inodes claiming duplicate blocks...
- agno = 0
- agno = 1
- agno = 3
- agno = 2
entry "lb" in shortform directory 8409188 references free inode 8409190
would have junked entry "lb" in directory inode 8409188
bad symlink header ino 8409190, file block 0, disk block 1051147
problem with symbolic link in inode 8409190
would have cleared inode 8409190
No modify flag set, skipping phase 5
Phase 6 - check inode connectivity...
- traversing filesystem ...
entry "lb" in shortform directory inode 8409188 points to free inode 8409190
would junk entry
- traversal finished ...
- moving disconnected inodes to lost+found ...
Phase 7 - verify link counts...
Maximum metadata LSN (1:396) is ahead of log (1:63).
Would format log to cycle 4.
No modify flag set, skipping filesystem flush and exiting.

And as a special note for the XFS folks, I also hit an XFS internal metadata
corruption warning during journal replay:
[ 7901.423659] XFS (dm-4): Starting recovery (logdev: internal)
[ 7901.577511] XFS (dm-4): Metadata corruption detected at xfs_dinode_verify+0x467/0x570 [xfs], inode 0x805067 dinode
[ 7901.580529] XFS (dm-4): Unmount and run xfs_repair
[ 7901.581901] XFS (dm-4): First 128 bytes of corrupted metadata buffer:
[ 7901.583205] 00000000b8963f41: 49 4e a1 ff 03 02 00 00 00 00 00 00 00 00 00 00 IN..............
[ 7901.584659] 00000000f35a50e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
[ 7901.586075] 00000000386eea9e: 5a b2 0e 69 0a f3 43 27 5a b2 0e 69 0a f3 43 27 Z..i..C'Z..i..C'
[ 7901.587561] 00000000ac636661: 5a b2 0e 69 0d 92 bc 00 00 00 00 00 00 00 00 00 Z..i............
[ 7901.588969] 00000000d75f9093: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
[ 7901.590475] 00000000d2af5688: 00 00 00 02 00 00 00 00 00 00 00 00 84 d7 6a e9 ..............j.
[ 7901.591907] 00000000e8dd8211: ff ff ff ff 34 93 a9 3a 00 00 00 00 00 00 00 04 ....4..:........
[ 7901.593319] 00000000b7610e4e: 00 00 00 01 00 00 00 45 00 00 00 00 00 00 00 00 .......E........
---

common/dmlogwrites | 56 +++++++++++++++++++++++++++
tests/generic/481 | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++
tests/generic/481.out | 2 +
tests/generic/group | 1 +
4 files changed, 163 insertions(+)
create mode 100755 tests/generic/481
create mode 100644 tests/generic/481.out

diff --git a/common/dmlogwrites b/common/dmlogwrites
index 467b872e246f..bf643a77ad09 100644
--- a/common/dmlogwrites
+++ b/common/dmlogwrites
@@ -126,3 +126,59 @@ _log_writes_cleanup()
$UDEV_SETTLE_PROG >/dev/null 2>&1
_log_writes_remove
}
+
+# Convert log writes mark to entry number
+# Result entry number is output to stdout, could be empty if not found
+_log_writes_mark_to_entry_number()
+{
+ local mark=$1
+ local ret
+
+ [ -z "$mark" ] && _fatal \
+ "mark must be given for _log_writes_mark_to_entry_number"
+
+ ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
+ --end-mark $mark 2> /dev/null)
+ [ -z "$ret" ] && return
+ ret=$(echo "$ret" | cut -f1 -d\@)
+ echo "mark $mark has entry number $ret" >> $seqres.full
+ echo "$ret"
+}
+
+# Find next fua write entry number
+# Result entry number is output to stdout, could be empty if not found
+_log_writes_find_next_fua()
+{
+ local start_entry=$1
+ local ret
+
+ [ -z "$start_entry" ] && start_entry=0
+ ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
+ --next-fua --start-entry $start_entry 2> /dev/null)
+ [ -z "$ret" ] && return
+
+ # Result should be something like "1024@offset" where 1024 is the
+ # entry number we need
+ ret=$(echo "$ret" | cut -f1 -d\@)
+ echo "next fua is entry number $ret" >> $seqres.full
+ echo "$ret"
+}
+
+# Replay log range to specified entry
+# $1: End entry. The entry with this number *WILL* be replayed
+_log_writes_replay_log_range()
+{
+ local end=$1
+
+ [ -z "$end" ] && _fail \
+ "end entry must be specified for _log_writes_replay_log_range"
+
+ # To ensure we replay the last entry,
+ # we need to manually increase the end entry number to ensure
+ # it's played
+ echo "=== replay to $end ===" >> $seqres.full
+ $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
+ --replay $SCRATCH_DEV --limit $(($end + 1)) \
+ >> $seqres.full 2>&1
+ [ $? -ne 0 ] && _fail "replay failed"
+}
diff --git a/tests/generic/481 b/tests/generic/481
new file mode 100755
index 000000000000..6b03786c8cb4
--- /dev/null
+++ b/tests/generic/481
@@ -0,0 +1,104 @@
+#! /bin/bash
+# FS QA Test 481
+#
+# Test filesystem consistency after each FUA operation
+#
+# Will do log replay and check the filesystem.
+#
+#-----------------------------------------------------------------------
+# Copyright (c) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#-----------------------------------------------------------------------
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1 # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+ cd /
+ $KILLALL_PROG -KILL -q $FSSTRESS_PROG &> /dev/null
+ _log_writes_cleanup &> /dev/null
+ rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmlogwrites
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs generic
+_supported_os Linux
+
+_require_command "$KILLALL_PROG" killall
+# Use $SCRATCH_DEV as replay device
+_require_scratch
+# and we need extra device as log device
+_require_log_writes
+
+
+nr_cpus=$("$here/src/feature" -o)
+fsstress_args=$(_scale_fsstress_args -w -d $SCRATCH_MNT -n 512 -p $nr_cpus \
+ $FSSTRESS_AVOID)
+
+_test_unmount
+_log_writes_init
+_log_writes_mkfs >> $seqres.full 2>&1
+_log_writes_mark mkfs
+
+_log_writes_mount
+run_check $FSSTRESS_PROG $fsstress_args > /dev/null 2>&1
+_log_writes_unmount
+
+_log_writes_remove
+prev=$(_log_writes_mark_to_entry_number mkfs)
+[ -z "$prev" ] && _fail "failed to locate entry mark 'mkfs'"
+cur=$(_log_writes_find_next_fua $prev)
+[ -z "$cur" ] && _fail "failed to locate next FUA write"
+
+while [ ! -z "$cur" ]; do
+ _log_writes_replay_log_range $cur >> $seqres.full
+
+ # Here we need extra mount to replay the log, mainly for journal based
+ # fs, as their fsck will report dirty log as error.
+ # We don't care to preserve any data on $SCRATCH_DEV, as we can replay
+ # back to the point we need, and in fact sometimes creating/deleting
+ # snapshots repeatedly can be slower than replaying the log.
+ _scratch_mount
+ _scratch_unmount
+ _check_scratch_fs
+
+ prev=$cur
+ cur=$(_log_writes_find_next_fua $(($cur + 1)))
+ [ -z "$cur" ] && break
+done
+
+echo "Silence is golden"
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/481.out b/tests/generic/481.out
new file mode 100644
index 000000000000..206e11635bd6
--- /dev/null
+++ b/tests/generic/481.out
@@ -0,0 +1,2 @@
+QA output created by 481
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index ea2056b1e93c..c4cc800aa2aa 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -483,3 +483,4 @@
478 auto quick
479 auto quick metadata
480 auto quick metadata
+481 auto replay
--
2.16.2
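In case the diff is hard to read in the archive, the core of the new test
boils down to the loop below (a condensed sketch of what tests/generic/481
does with the new common/dmlogwrites helpers, not a drop-in replacement):

  # Walk every FUA write recorded after the "mkfs" mark, replay the log up
  # to it, then mount/unmount (to let journal filesystems replay their own
  # log) and fsck the result.
  prev=$(_log_writes_mark_to_entry_number mkfs)
  cur=$(_log_writes_find_next_fua $prev)
  while [ -n "$cur" ]; do
          _log_writes_replay_log_range $cur >> $seqres.full
          _scratch_mount
          _scratch_unmount
          _check_scratch_fs
          cur=$(_log_writes_find_next_fua $(($cur + 1)))
  done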


2018-03-28 04:40:22

by Qu Wenruo

Subject: [PATCH v2 2/3] fstests: log-writes: Add support for METADATA flag

Signed-off-by: Qu Wenruo <[email protected]>
Reviewed-by: Amir Goldstein <[email protected]>
---
changelog:
v2:
Add Amir's reviewed-by tag.
---
src/log-writes/log-writes.c | 3 ++-
src/log-writes/log-writes.h | 9 +++++----
2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
index a872429d3f41..5dc22c244cea 100644
--- a/src/log-writes/log-writes.c
+++ b/src/log-writes/log-writes.c
@@ -130,7 +130,8 @@ struct flags_to_str_entry {
DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
- DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
+ DEFINE_LOG_FLAGS_STR_ENTRY(MARK),
+ DEFINE_LOG_FLAGS_STR_ENTRY(METADATA)
};

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/src/log-writes/log-writes.h b/src/log-writes/log-writes.h
index 35ca35838aac..75fb8ac0bf79 100644
--- a/src/log-writes/log-writes.h
+++ b/src/log-writes/log-writes.h
@@ -20,10 +20,11 @@ typedef __u32 u32;
/*
* Constants copied from kernel file drivers/md/dm-log-writes.c
*/
-#define LOG_FLUSH_FLAG (1 << 0)
-#define LOG_FUA_FLAG (1 << 1)
-#define LOG_DISCARD_FLAG (1 << 2)
-#define LOG_MARK_FLAG (1 << 3)
+#define LOG_FLUSH_FLAG (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG (1 << 3)
+#define LOG_METADATA_FLAG (1 << 4)

#define WRITE_LOG_VERSION 1
#define WRITE_LOG_MAGIC 0x6a736677736872
--
2.16.2
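A quick sanity check of the new bit using the values from log-writes.h above
(plain shell arithmetic; the combined value is only an illustrative example):

  # LOG_FUA_FLAG is 1<<1 and LOG_METADATA_FLAG is 1<<4, so a FUA metadata
  # write carries:
  printf 'FUA|METADATA = 0x%x\n' $(( (1 << 1) | (1 << 4) ))
  # -> FUA|METADATA = 0x12
  # With patch 1/3 applied, such an entry prints as "flags 0x12(FUA|METADATA)".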


2018-03-28 05:04:28

by Amir Goldstein

Subject: Re: [PATCH v2 3/3] fstests: generic: Check the fs after each FUA writes

On Wed, Mar 28, 2018 at 7:40 AM, Qu Wenruo <[email protected]> wrote:
> Basic test case which triggers fsstress with dm-log-writes, and then
> check the fs after each FUA writes.
> With needed infrastructure and special handlers for journal based fs.
>
> Signed-off-by: Qu Wenruo <[email protected]>
> ---
> changelog:
> v2:
> Use proper "SUSE Linux Products GmbH" instead of "SuSE"
> Get rid of dm-snapshot which is pretty slow if we're creating and
> deleting snapshots repeatedly.
> (Maybe LVM thin provision would be much better, but current replay
> solution is good so far, and no slower than dm-snapshot)
> Add back run_check so that we can get the seed.
> ---
> Unfortunately, neither xfs nor ext4 survies this test for even single
> success, while btrfs survives.
> (Although not what I want, I'm just trying my luck
> to reproduce a serious btrfs corruption situation)
>
> Although btrfs may be the fastest fs for the test, since it has fixed
> small amount of write in mkfs and almost nothing to replay, it still
> takes about 240s~300s to finish (the same level using snapshot).
>
> It would take longer time for ext4 for its large amount of write during
> mkfs, if it can survive the test in the first space.

Hmm, how much time would the total test take if you don't fail
it on fsck? I am asking because it may be possible to improve this with
only a single snapshot after mkfs.

Anyway, if the total test run time is expected to be under 10min I wouldn't
bother with this optimization, at least not right now. IMO it is more
important to get the test out there so the corruption bugs start surfacing.

Thanks for working on this!
You can add
Reviewed-by: Amir Goldstein <[email protected]>


>
> As a comparison, btrfs takes about 5 seconds to replay log, mount,
> unmount and run fsck at the end of the test.
> But for ext4, it already takes about 5 seconds to do the same thing
> before triggering fsck error.
>
> Fsck fail for ext4:
> _check_generic_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent
> *** fsck.ext4 output ***
> fsck from util-linux 2.31.1
> e2fsck 1.43.8 (1-Jan-2018)
> Pass 1: Checking inodes, blocks, and sizes
> Inode 131076 extent tree (at level 1) could be shorter. Fix? no
>
> Inode 131262, i_size is 0, should be 258048. Fix? no
>
> Pass 2: Checking directory structure
> Pass 3: Checking directory connectivity
> Pass 4: Checking reference counts
> Pass 5: Checking group summary information
>
> For xfs:
> _check_xfs_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent (r)
> *** xfs_repair -n output ***
> Phase 1 - find and verify superblock...
> Phase 2 - using internal log
> - zero log...
> - scan filesystem freespace and inode maps...
> - found root inode chunk
> Phase 3 - for each AG...
> - scan (but don't clear) agi unlinked lists...
> - process known inodes and perform inode discovery...
> - agno = 0
> - agno = 1
> - agno = 2
> bad symlink header ino 8409190, file block 0, disk block 1051147
> problem with symbolic link in inode 8409190
> would have cleared inode 8409190
> - agno = 3
> - process newly discovered inodes...
> Phase 4 - check for duplicate blocks...
> - setting up duplicate extent list...
> - check for inodes claiming duplicate blocks...
> - agno = 0
> - agno = 1
> - agno = 3
> - agno = 2
> entry "lb" in shortform directory 8409188 references free inode 8409190
> would have junked entry "lb" in directory inode 8409188
> bad symlink header ino 8409190, file block 0, disk block 1051147
> problem with symbolic link in inode 8409190
> would have cleared inode 8409190
> No modify flag set, skipping phase 5
> Phase 6 - check inode connectivity...
> - traversing filesystem ...
> entry "lb" in shortform directory inode 8409188 points to free inode 8409190
> would junk entry
> - traversal finished ...
> - moving disconnected inodes to lost+found ...
> Phase 7 - verify link counts...
> Maximum metadata LSN (1:396) is ahead of log (1:63).
> Would format log to cycle 4.
> No modify flag set, skipping filesystem flush and exiting.
>
> And special note for XFS guys, I also hit an XFS internal metadata
> warning during journal replay:
> [ 7901.423659] XFS (dm-4): Starting recovery (logdev: internal)
> [ 7901.577511] XFS (dm-4): Metadata corruption detected at xfs_dinode_verify+0x467/0x570 [xfs], inode 0x805067 dinode
> [ 7901.580529] XFS (dm-4): Unmount and run xfs_repair
> [ 7901.581901] XFS (dm-4): First 128 bytes of corrupted metadata buffer:
> [ 7901.583205] 00000000b8963f41: 49 4e a1 ff 03 02 00 00 00 00 00 00 00 00 00 00 IN..............
> [ 7901.584659] 00000000f35a50e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
> [ 7901.586075] 00000000386eea9e: 5a b2 0e 69 0a f3 43 27 5a b2 0e 69 0a f3 43 27 Z..i..C'Z..i..C'
> [ 7901.587561] 00000000ac636661: 5a b2 0e 69 0d 92 bc 00 00 00 00 00 00 00 00 00 Z..i............
> [ 7901.588969] 00000000d75f9093: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
> [ 7901.590475] 00000000d2af5688: 00 00 00 02 00 00 00 00 00 00 00 00 84 d7 6a e9 ..............j.
> [ 7901.591907] 00000000e8dd8211: ff ff ff ff 34 93 a9 3a 00 00 00 00 00 00 00 04 ....4..:........
> [ 7901.593319] 00000000b7610e4e: 00 00 00 01 00 00 00 45 00 00 00 00 00 00 00 00 .......E........
> ---
>
> common/dmlogwrites | 56 +++++++++++++++++++++++++++
> tests/generic/481 | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/generic/481.out | 2 +
> tests/generic/group | 1 +
> 4 files changed, 163 insertions(+)
> create mode 100755 tests/generic/481
> create mode 100644 tests/generic/481.out
>
> diff --git a/common/dmlogwrites b/common/dmlogwrites
> index 467b872e246f..bf643a77ad09 100644
> --- a/common/dmlogwrites
> +++ b/common/dmlogwrites
> @@ -126,3 +126,59 @@ _log_writes_cleanup()
> $UDEV_SETTLE_PROG >/dev/null 2>&1
> _log_writes_remove
> }
> +
> +# Convert log writes mark to entry number
> +# Result entry number is output to stdout, could be empty if not found
> +_log_writes_mark_to_entry_number()
> +{
> + local mark=$1
> + local ret
> +
> + [ -z "$mark" ] && _fatal \
> + "mark must be given for _log_writes_mark_to_entry_number"
> +
> + ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> + --end-mark $mark 2> /dev/null)
> + [ -z "$ret" ] && return
> + ret=$(echo "$ret" | cut -f1 -d\@)
> + echo "mark $mark has entry number $ret" >> $seqres.full
> + echo "$ret"
> +}
> +
> +# Find next fua write entry number
> +# Result entry number is output to stdout, could be empty if not found
> +_log_writes_find_next_fua()
> +{
> + local start_entry=$1
> + local ret
> +
> + [ -z "$start_entry" ] && start_entry=0
> + ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> + --next-fua --start-entry $start_entry 2> /dev/null)
> + [ -z "$ret" ] && return
> +
> + # Result should be something like "1024@offset" where 1024 is the
> + # entry number we need
> + ret=$(echo "$ret" | cut -f1 -d\@)
> + echo "next fua is entry number $ret" >> $seqres.full
> + echo "$ret"
> +}
> +
> +# Replay log range to specified entry
> +# $1: End entry. The entry with this number *WILL* be replayed
> +_log_writes_replay_log_range()
> +{
> + local end=$1
> +
> + [ -z "$end" ] && _fail \
> + "end entry must be specified for _log_writes_replay_log_range"
> +
> + # To ensure we replay the last entry,
> + # we need to manually increase the end entry number to ensure
> + # it's played
> + echo "=== replay to $end ===" >> $seqres.full
> + $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> + --replay $SCRATCH_DEV --limit $(($end + 1)) \
> + >> $seqres.full 2>&1
> + [ $? -ne 0 ] && _fail "replay failed"
> +}
> diff --git a/tests/generic/481 b/tests/generic/481
> new file mode 100755
> index 000000000000..6b03786c8cb4
> --- /dev/null
> +++ b/tests/generic/481
> @@ -0,0 +1,104 @@
> +#! /bin/bash
> +# FS QA Test 481
> +#
> +# Test filesystem consistency after each FUA operation
> +#
> +# Will do log replay and check the filesystem.
> +#
> +#-----------------------------------------------------------------------
> +# Copyright (c) 2018 SUSE Linux Products GmbH. All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
> +#-----------------------------------------------------------------------
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1 # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> + cd /
> + $KILLALL_PROG -KILL -q $FSSTRESS_PROG &> /dev/null
> + _log_writes_cleanup &> /dev/null
> + rm -f $tmp.*
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +. ./common/dmlogwrites
> +
> +# remove previous $seqres.full before test
> +rm -f $seqres.full
> +
> +# real QA test starts here
> +
> +# Modify as appropriate.
> +_supported_fs generic
> +_supported_os Linux
> +
> +_require_command "$KILLALL_PROG" killall
> +# Use $SCRATCH_DEV as replay device
> +_require_scratch
> +# and we need extra device as log device
> +_require_log_writes
> +
> +
> +nr_cpus=$("$here/src/feature" -o)
> +fsstress_args=$(_scale_fsstress_args -w -d $SCRATCH_MNT -n 512 -p $nr_cpus \
> + $FSSTRESS_AVOID)
> +
> +_test_unmount
> +_log_writes_init
> +_log_writes_mkfs >> $seqres.full 2>&1
> +_log_writes_mark mkfs
> +
> +_log_writes_mount
> +run_check $FSSTRESS_PROG $fsstress_args > /dev/null 2>&1

I am not sure that is what Eryu meant, but he can fix that on commit.

> +_log_writes_unmount
> +
> +_log_writes_remove
> +prev=$(_log_writes_mark_to_entry_number mkfs)
> +[ -z "$prev" ] && _fail "failed to locate entry mark 'mkfs'"
> +cur=$(_log_writes_find_next_fua $prev)
> +[ -z "$cur" ] && _fail "failed to locate next FUA write"
> +
> +while [ ! -z "$cur" ]; do
> + _log_writes_replay_log_range $cur >> $seqres.full
> +
> + # Here we need extra mount to replay the log, mainly for journal based
> + # fs, as their fsck will report dirty log as error.
> + # We don't care to preserve any data on $SCRATCH_DEV, as we can replay
> + # back to the point we need, and in fact sometimes creating/deleting
> + # snapshots repeatedly can be slower than replaying the log.
> + _scratch_mount
> + _scratch_unmount
> + _check_scratch_fs
> +
> + prev=$cur
> + cur=$(_log_writes_find_next_fua $(($cur + 1)))
> + [ -z "$cur" ] && break
> +done
> +
> +echo "Silence is golden"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/481.out b/tests/generic/481.out
> new file mode 100644
> index 000000000000..206e11635bd6
> --- /dev/null
> +++ b/tests/generic/481.out
> @@ -0,0 +1,2 @@
> +QA output created by 481
> +Silence is golden
> diff --git a/tests/generic/group b/tests/generic/group
> index ea2056b1e93c..c4cc800aa2aa 100644
> --- a/tests/generic/group
> +++ b/tests/generic/group
> @@ -483,3 +483,4 @@
> 478 auto quick
> 479 auto quick metadata
> 480 auto quick metadata
> +481 auto replay

IMO the metadata group is also appropriate.

Thanks,
Amir.

2018-03-28 05:51:44

by Qu Wenruo

Subject: Re: [PATCH v2 3/3] fstests: generic: Check the fs after each FUA writes



On 2018-03-28 13:04, Amir Goldstein wrote:
> On Wed, Mar 28, 2018 at 7:40 AM, Qu Wenruo <[email protected]> wrote:
>> Basic test case which triggers fsstress with dm-log-writes, and then
>> check the fs after each FUA writes.
>> With needed infrastructure and special handlers for journal based fs.
>>
>> Signed-off-by: Qu Wenruo <[email protected]>
>> ---
>> changelog:
>> v2:
>> Use proper "SUSE Linux Products GmbH" instead of "SuSE"
>> Get rid of dm-snapshot which is pretty slow if we're creating and
>> deleting snapshots repeatedly.
>> (Maybe LVM thin provision would be much better, but current replay
>> solution is good so far, and no slower than dm-snapshot)
>> Add back run_check so that we can get the seed.
>> ---
>> Unfortunately, neither xfs nor ext4 survies this test for even single
>> success, while btrfs survives.
>> (Although not what I want, I'm just trying my luck
>> to reproduce a serious btrfs corruption situation)
>>
>> Although btrfs may be the fastest fs for the test, since it has fixed
>> small amount of write in mkfs and almost nothing to replay, it still
>> takes about 240s~300s to finish (the same level using snapshot).
>>
>> It would take longer time for ext4 for its large amount of write during
>> mkfs, if it can survive the test in the first space.
>
> Hmm, how much time would the total test would take if you don't fail
> it on fsck? I am asking because it may be possible to improve this with
> only a single snapshot after mkfs.

The only fs which can pass the test right now is btrfs, so estimates
for the others are mostly guesswork.

>
> Anyway, if total test run time is expected to be under 10min I wouldn't
> bother with this optimization, at least not right now. IMO it is more
> important to get the test out there to get the corruption bugs floating.

I'd say, judging from the current status, that if XFS didn't fail it would
definitely finish within 10min.
For ext4 I'm not so sure.

I'd like to keep the current test case as simple as possible right now;
for later enhancement, I have several different ideas:

1) Reflink fs + loopback
Use btrfs/xfs as the base filesystem, create copies using reflink,
then use such files as loopback devices (a rough sketch follows below).
The good thing is, AFAIK btrfs/xfs reflink is really fast;
much, much faster than dm-snapshot or even LVM thin.

The much smaller block size (4K by default) also keeps the CoW overhead
down (LVM thin uses 64K; not sure about dm-snapshot though).

The problem is, such a setup needs an extra mount point and can be a
little hard to set up, and we'd be introducing another layer of fs:
if that fs itself has some hidden bug, it would screw up the test
case.

2) LVM thin provisioning
LVM thin provisioning looks much like btrfs/xfs at the block level, and
its smaller default block size (64K vs the original 2M) keeps the CoW
overhead smaller.

I'm currently testing this method; the good thing is it's a little
easier to set up and we can use a single mount point.

Anyway, with a proper and efficient snapshot mechanism implemented, I will
definitely convert this test case, and add a FLUSH test case.
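A rough sketch of what idea 1) could look like (illustrative only, not part
of any patch; the paths, sizes and mkfs choice are made up):

  # The base image lives on a btrfs/xfs mount that supports reflink.
  truncate -s 10G /mnt/base/scratch.img
  mkfs.ext4 -F /mnt/base/scratch.img        # whatever fs is under test

  # Taking a "snapshot" is just a reflink copy, which is nearly instant
  # and CoWs at 4K granularity.
  cp --reflink=always /mnt/base/scratch.img /mnt/base/snap.img

  # Attach the copy as a loop device and check it; the base image stays
  # pristine and can simply be re-copied for the next iteration.
  loopdev=$(losetup -f --show /mnt/base/snap.img)
  fsck -n "$loopdev"
  losetup -d "$loopdev"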

Thanks for your review too,
Qu

>
> Thanks for working on this!
> You can add
> Reviewed-by: Amir Goldstein <[email protected]>
>
>
>>
>> As a comparison, btrfs takes about 5 seconds to replay log, mount,
>> unmount and run fsck at the end of the test.
>> But for ext4, it already takes about 5 seconds to do the same thing
>> before triggering fsck error.
>>
>> Fsck fail for ext4:
>> _check_generic_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent
>> *** fsck.ext4 output ***
>> fsck from util-linux 2.31.1
>> e2fsck 1.43.8 (1-Jan-2018)
>> Pass 1: Checking inodes, blocks, and sizes
>> Inode 131076 extent tree (at level 1) could be shorter. Fix? no
>>
>> Inode 131262, i_size is 0, should be 258048. Fix? no
>>
>> Pass 2: Checking directory structure
>> Pass 3: Checking directory connectivity
>> Pass 4: Checking reference counts
>> Pass 5: Checking group summary information
>>
>> For xfs:
>> _check_xfs_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent (r)
>> *** xfs_repair -n output ***
>> Phase 1 - find and verify superblock...
>> Phase 2 - using internal log
>> - zero log...
>> - scan filesystem freespace and inode maps...
>> - found root inode chunk
>> Phase 3 - for each AG...
>> - scan (but don't clear) agi unlinked lists...
>> - process known inodes and perform inode discovery...
>> - agno = 0
>> - agno = 1
>> - agno = 2
>> bad symlink header ino 8409190, file block 0, disk block 1051147
>> problem with symbolic link in inode 8409190
>> would have cleared inode 8409190
>> - agno = 3
>> - process newly discovered inodes...
>> Phase 4 - check for duplicate blocks...
>> - setting up duplicate extent list...
>> - check for inodes claiming duplicate blocks...
>> - agno = 0
>> - agno = 1
>> - agno = 3
>> - agno = 2
>> entry "lb" in shortform directory 8409188 references free inode 8409190
>> would have junked entry "lb" in directory inode 8409188
>> bad symlink header ino 8409190, file block 0, disk block 1051147
>> problem with symbolic link in inode 8409190
>> would have cleared inode 8409190
>> No modify flag set, skipping phase 5
>> Phase 6 - check inode connectivity...
>> - traversing filesystem ...
>> entry "lb" in shortform directory inode 8409188 points to free inode 8409190
>> would junk entry
>> - traversal finished ...
>> - moving disconnected inodes to lost+found ...
>> Phase 7 - verify link counts...
>> Maximum metadata LSN (1:396) is ahead of log (1:63).
>> Would format log to cycle 4.
>> No modify flag set, skipping filesystem flush and exiting.
>>
>> And special note for XFS guys, I also hit an XFS internal metadata
>> warning during journal replay:
>> [ 7901.423659] XFS (dm-4): Starting recovery (logdev: internal)
>> [ 7901.577511] XFS (dm-4): Metadata corruption detected at xfs_dinode_verify+0x467/0x570 [xfs], inode 0x805067 dinode
>> [ 7901.580529] XFS (dm-4): Unmount and run xfs_repair
>> [ 7901.581901] XFS (dm-4): First 128 bytes of corrupted metadata buffer:
>> [ 7901.583205] 00000000b8963f41: 49 4e a1 ff 03 02 00 00 00 00 00 00 00 00 00 00 IN..............
>> [ 7901.584659] 00000000f35a50e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
>> [ 7901.586075] 00000000386eea9e: 5a b2 0e 69 0a f3 43 27 5a b2 0e 69 0a f3 43 27 Z..i..C'Z..i..C'
>> [ 7901.587561] 00000000ac636661: 5a b2 0e 69 0d 92 bc 00 00 00 00 00 00 00 00 00 Z..i............
>> [ 7901.588969] 00000000d75f9093: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
>> [ 7901.590475] 00000000d2af5688: 00 00 00 02 00 00 00 00 00 00 00 00 84 d7 6a e9 ..............j.
>> [ 7901.591907] 00000000e8dd8211: ff ff ff ff 34 93 a9 3a 00 00 00 00 00 00 00 04 ....4..:........
>> [ 7901.593319] 00000000b7610e4e: 00 00 00 01 00 00 00 45 00 00 00 00 00 00 00 00 .......E........
>> ---
>>
>> common/dmlogwrites | 56 +++++++++++++++++++++++++++
>> tests/generic/481 | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++
>> tests/generic/481.out | 2 +
>> tests/generic/group | 1 +
>> 4 files changed, 163 insertions(+)
>> create mode 100755 tests/generic/481
>> create mode 100644 tests/generic/481.out
>>
>> diff --git a/common/dmlogwrites b/common/dmlogwrites
>> index 467b872e246f..bf643a77ad09 100644
>> --- a/common/dmlogwrites
>> +++ b/common/dmlogwrites
>> @@ -126,3 +126,59 @@ _log_writes_cleanup()
>> $UDEV_SETTLE_PROG >/dev/null 2>&1
>> _log_writes_remove
>> }
>> +
>> +# Convert log writes mark to entry number
>> +# Result entry number is output to stdout, could be empty if not found
>> +_log_writes_mark_to_entry_number()
>> +{
>> + local mark=$1
>> + local ret
>> +
>> + [ -z "$mark" ] && _fatal \
>> + "mark must be given for _log_writes_mark_to_entry_number"
>> +
>> + ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> + --end-mark $mark 2> /dev/null)
>> + [ -z "$ret" ] && return
>> + ret=$(echo "$ret" | cut -f1 -d\@)
>> + echo "mark $mark has entry number $ret" >> $seqres.full
>> + echo "$ret"
>> +}
>> +
>> +# Find next fua write entry number
>> +# Result entry number is output to stdout, could be empty if not found
>> +_log_writes_find_next_fua()
>> +{
>> + local start_entry=$1
>> + local ret
>> +
>> + [ -z "$start_entry" ] && start_entry=0
>> + ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> + --next-fua --start-entry $start_entry 2> /dev/null)
>> + [ -z "$ret" ] && return
>> +
>> + # Result should be something like "1024@offset" where 1024 is the
>> + # entry number we need
>> + ret=$(echo "$ret" | cut -f1 -d\@)
>> + echo "next fua is entry number $ret" >> $seqres.full
>> + echo "$ret"
>> +}
>> +
>> +# Replay log range to specified entry
>> +# $1: End entry. The entry with this number *WILL* be replayed
>> +_log_writes_replay_log_range()
>> +{
>> + local end=$1
>> +
>> + [ -z "$end" ] && _fail \
>> + "end entry must be specified for _log_writes_replay_log_range"
>> +
>> + # To ensure we replay the last entry,
>> + # we need to manually increase the end entry number to ensure
>> + # it's played
>> + echo "=== replay to $end ===" >> $seqres.full
>> + $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>> + --replay $SCRATCH_DEV --limit $(($end + 1)) \
>> + >> $seqres.full 2>&1
>> + [ $? -ne 0 ] && _fail "replay failed"
>> +}
>> diff --git a/tests/generic/481 b/tests/generic/481
>> new file mode 100755
>> index 000000000000..6b03786c8cb4
>> --- /dev/null
>> +++ b/tests/generic/481
>> @@ -0,0 +1,104 @@
>> +#! /bin/bash
>> +# FS QA Test 481
>> +#
>> +# Test filesystem consistency after each FUA operation
>> +#
>> +# Will do log replay and check the filesystem.
>> +#
>> +#-----------------------------------------------------------------------
>> +# Copyright (c) 2018 SUSE Linux Products GmbH. All Rights Reserved.
>> +#
>> +# This program is free software; you can redistribute it and/or
>> +# modify it under the terms of the GNU General Public License as
>> +# published by the Free Software Foundation.
>> +#
>> +# This program is distributed in the hope that it would be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program; if not, write the Free Software Foundation,
>> +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
>> +#-----------------------------------------------------------------------
>> +#
>> +
>> +seq=`basename $0`
>> +seqres=$RESULT_DIR/$seq
>> +echo "QA output created by $seq"
>> +
>> +here=`pwd`
>> +tmp=/tmp/$$
>> +status=1 # failure is the default!
>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>> +
>> +_cleanup()
>> +{
>> + cd /
>> + $KILLALL_PROG -KILL -q $FSSTRESS_PROG &> /dev/null
>> + _log_writes_cleanup &> /dev/null
>> + rm -f $tmp.*
>> +}
>> +
>> +# get standard environment, filters and checks
>> +. ./common/rc
>> +. ./common/filter
>> +. ./common/dmlogwrites
>> +
>> +# remove previous $seqres.full before test
>> +rm -f $seqres.full
>> +
>> +# real QA test starts here
>> +
>> +# Modify as appropriate.
>> +_supported_fs generic
>> +_supported_os Linux
>> +
>> +_require_command "$KILLALL_PROG" killall
>> +# Use $SCRATCH_DEV as replay device
>> +_require_scratch
>> +# and we need extra device as log device
>> +_require_log_writes
>> +
>> +
>> +nr_cpus=$("$here/src/feature" -o)
>> +fsstress_args=$(_scale_fsstress_args -w -d $SCRATCH_MNT -n 512 -p $nr_cpus \
>> + $FSSTRESS_AVOID)
>> +
>> +_test_unmount
>> +_log_writes_init
>> +_log_writes_mkfs >> $seqres.full 2>&1
>> +_log_writes_mark mkfs
>> +
>> +_log_writes_mount
>> +run_check $FSSTRESS_PROG $fsstress_args > /dev/null 2>&1
>
> I am not sure that is what Eryu meant, but he can fix that on commit.
>
>> +_log_writes_unmount
>> +
>> +_log_writes_remove
>> +prev=$(_log_writes_mark_to_entry_number mkfs)
>> +[ -z "$prev" ] && _fail "failed to locate entry mark 'mkfs'"
>> +cur=$(_log_writes_find_next_fua $prev)
>> +[ -z "$cur" ] && _fail "failed to locate next FUA write"
>> +
>> +while [ ! -z "$cur" ]; do
>> + _log_writes_replay_log_range $cur >> $seqres.full
>> +
>> + # Here we need extra mount to replay the log, mainly for journal based
>> + # fs, as their fsck will report dirty log as error.
>> + # We don't care to preserve any data on $SCRATCH_DEV, as we can replay
>> + # back to the point we need, and in fact sometimes creating/deleting
>> + # snapshots repeatedly can be slower than replaying the log.
>> + _scratch_mount
>> + _scratch_unmount
>> + _check_scratch_fs
>> +
>> + prev=$cur
>> + cur=$(_log_writes_find_next_fua $(($cur + 1)))
>> + [ -z "$cur" ] && break
>> +done
>> +
>> +echo "Silence is golden"
>> +
>> +# success, all done
>> +status=0
>> +exit
>> diff --git a/tests/generic/481.out b/tests/generic/481.out
>> new file mode 100644
>> index 000000000000..206e11635bd6
>> --- /dev/null
>> +++ b/tests/generic/481.out
>> @@ -0,0 +1,2 @@
>> +QA output created by 481
>> +Silence is golden
>> diff --git a/tests/generic/group b/tests/generic/group
>> index ea2056b1e93c..c4cc800aa2aa 100644
>> --- a/tests/generic/group
>> +++ b/tests/generic/group
>> @@ -483,3 +483,4 @@
>> 478 auto quick
>> 479 auto quick metadata
>> 480 auto quick metadata
>> +481 auto replay
>
> IMO metadata group is also appropriate.
>
> Thanks,
> Amir.



2018-03-28 06:19:45

by Eryu Guan

Subject: Re: [PATCH v2 3/3] fstests: generic: Check the fs after each FUA writes

On Wed, Mar 28, 2018 at 01:51:44PM +0800, Qu Wenruo wrote:
>
>
> > On 2018-03-28 13:04, Amir Goldstein wrote:
> > On Wed, Mar 28, 2018 at 7:40 AM, Qu Wenruo <[email protected]> wrote:
> >> Basic test case which triggers fsstress with dm-log-writes, and then
> >> check the fs after each FUA writes.
> >> With needed infrastructure and special handlers for journal based fs.
> >>
> >> Signed-off-by: Qu Wenruo <[email protected]>
> >> ---
> >> changelog:
> >> v2:
> >> Use proper "SUSE Linux Products GmbH" instead of "SuSE"
> >> Get rid of dm-snapshot which is pretty slow if we're creating and
> >> deleting snapshots repeatedly.
> >> (Maybe LVM thin provision would be much better, but current replay
> >> solution is good so far, and no slower than dm-snapshot)
> >> Add back run_check so that we can get the seed.
> >> ---
> >> Unfortunately, neither xfs nor ext4 survies this test for even single
> >> success, while btrfs survives.
> >> (Although not what I want, I'm just trying my luck
> >> to reproduce a serious btrfs corruption situation)
> >>
> >> Although btrfs may be the fastest fs for the test, since it has fixed
> >> small amount of write in mkfs and almost nothing to replay, it still
> >> takes about 240s~300s to finish (the same level using snapshot).
> >>
> >> It would take longer time for ext4 for its large amount of write during
> >> mkfs, if it can survive the test in the first space.
> >
> > Hmm, how much time would the total test would take if you don't fail
> > it on fsck? I am asking because it may be possible to improve this with
> > only a single snapshot after mkfs.
>
> The only fs which can pass the test right now is btrfs, so other
> estimation is mostly based on guess.
>
> >
> > Anyway, if total test run time is expected to be under 10min I wouldn't
> > bother with this optimization, at least not right now. IMO it is more
> > important to get the test out there to get the corruption bugs floating.
>
> I'd say from current status, if XFS doesn't fail, it would definitely
> finish in 10min.
> For EXT4 I'm not pretty sure.

10min might be a bit long; 5min would be good enough. I may need to
adjust the fsstress "-n" param based on test results when I get some
time, hopefully this week.

And I noticed that the fsstress "-p" value is based on nr_cpus; I'd like to
cap it at a maximum allowed number, so the test won't run for too long on
hosts with hundreds of CPUs. It could always be scaled with
_scale_fsstress_args (one possible cap is sketched after the snippet below).

+nr_cpus=$("$here/src/feature" -o)
+fsstress_args=$(_scale_fsstress_args -w -d $SCRATCH_MNT -n 512 -p $nr_cpus \
+ $FSSTRESS_AVOID)
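One possible way to cap it (a sketch only; the limit of 8 is an arbitrary
example, not a number agreed on in this thread):

  nr_cpus=$("$here/src/feature" -o)
  # Cap the fsstress parallelism so huge machines don't blow up the runtime.
  [ "$nr_cpus" -gt 8 ] && nr_cpus=8
  fsstress_args=$(_scale_fsstress_args -w -d $SCRATCH_MNT -n 512 -p $nr_cpus \
          $FSSTRESS_AVOID)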

>
> I'd like to keep current test case as simple as possible right now, and
> for later enhancement, I have several different ideas:

Please make new tests then :)

>
> 1) Reflink fs + loopback
> Yep, use btrfs/xfs as base filesystem and create copy using reflink,
> then use such files as loopback device.
> The good thing is, AFAIK btrfs/xfs reflink is really fast.
> Much much faster than dm-snapshot or even LVM thin.
>
> The much much smaller block size (4K default) makes CoW overhead
> (LVM thin is 64K, not sure about dm-snapshot though).
>
> The problem is, such setup needs extra mount point and can be a
> little hard to setup, and we're introducing another layer of fs,
> if the fs itself has some hidden bug, it would screw up the test
> case.
>
> 2) LVM thin provision
> LVM thin provision looks much like btrfs/xfs for block level, and
> smaller default block size (64K vs original 2M) makes CoW overhead
> smaller.
>
> I'm currently testing this method, the good thing is it's a little
> easier to setup and we can use single mount point.
>
> Anyway, with proper and efficient snapshot ability implemented, I will
> definitely convert this test case, and add Flush test case.
>
> Thanks for your review too,
> Qu
>
> >
> > Thanks for working on this!
> > You can add
> > Reviewed-by: Amir Goldstein <[email protected]>

Thank you both!

Eryu

2018-03-28 06:55:22

by Qu Wenruo

Subject: Re: [PATCH v2 3/3] fstests: generic: Check the fs after each FUA writes



On 2018-03-28 14:19, Eryu Guan wrote:
> On Wed, Mar 28, 2018 at 01:51:44PM +0800, Qu Wenruo wrote:
>>
>>
>> On 2018-03-28 13:04, Amir Goldstein wrote:
>>> On Wed, Mar 28, 2018 at 7:40 AM, Qu Wenruo <[email protected]> wrote:
>>>> Basic test case which triggers fsstress with dm-log-writes, and then
>>>> check the fs after each FUA writes.
>>>> With needed infrastructure and special handlers for journal based fs.
>>>>
>>>> Signed-off-by: Qu Wenruo <[email protected]>
>>>> ---
>>>> changelog:
>>>> v2:
>>>> Use proper "SUSE Linux Products GmbH" instead of "SuSE"
>>>> Get rid of dm-snapshot which is pretty slow if we're creating and
>>>> deleting snapshots repeatedly.
>>>> (Maybe LVM thin provision would be much better, but current replay
>>>> solution is good so far, and no slower than dm-snapshot)
>>>> Add back run_check so that we can get the seed.
>>>> ---
>>>> Unfortunately, neither xfs nor ext4 survies this test for even single
>>>> success, while btrfs survives.
>>>> (Although not what I want, I'm just trying my luck
>>>> to reproduce a serious btrfs corruption situation)
>>>>
>>>> Although btrfs may be the fastest fs for the test, since it has fixed
>>>> small amount of write in mkfs and almost nothing to replay, it still
>>>> takes about 240s~300s to finish (the same level using snapshot).
>>>>
>>>> It would take longer time for ext4 for its large amount of write during
>>>> mkfs, if it can survive the test in the first space.
>>>
>>> Hmm, how much time would the total test would take if you don't fail
>>> it on fsck? I am asking because it may be possible to improve this with
>>> only a single snapshot after mkfs.
>>
>> The only fs which can pass the test right now is btrfs, so other
>> estimation is mostly based on guess.
>>
>>>
>>> Anyway, if total test run time is expected to be under 10min I wouldn't
>>> bother with this optimization, at least not right now. IMO it is more
>>> important to get the test out there to get the corruption bugs floating.
>>
>> I'd say from current status, if XFS doesn't fail, it would definitely
>> finish in 10min.
>> For EXT4 I'm not pretty sure.
>
> 10min might be a bit long, 5min would be good enough. I may need to
> adjust the fsstress "-n" param based on test results when I got some
> time, hopefully this week..
>
> And I noticed that fsstress "-p" is based on nr_cpus, I'd like to cap it
> with a max allowed number, so test won't run for too long on hosts with
> hundreds of cpus. It could always be scaled with _scale_fsstress_args.
>
> +nr_cpus=$("$here/src/feature" -o)
> +fsstress_args=$(_scale_fsstress_args -w -d $SCRATCH_MNT -n 512 -p $nr_cpus \
> + $FSSTRESS_AVOID)

This makes sense.

(I used to think 4 cores was enough, and now mainstream desktops are pushing
8 cores.)

Thanks,
Qu

>
>>
>> I'd like to keep current test case as simple as possible right now, and
>> for later enhancement, I have several different ideas:
>
> Please make new tests then :)
>
>>
>> 1) Reflink fs + loopback
>> Yep, use btrfs/xfs as base filesystem and create copy using reflink,
>> then use such files as loopback device.
>> The good thing is, AFAIK btrfs/xfs reflink is really fast.
>> Much much faster than dm-snapshot or even LVM thin.
>>
>> The much much smaller block size (4K default) makes CoW overhead
>> (LVM thin is 64K, not sure about dm-snapshot though).
>>
>> The problem is, such setup needs extra mount point and can be a
>> little hard to setup, and we're introducing another layer of fs,
>> if the fs itself has some hidden bug, it would screw up the test
>> case.
>>
>> 2) LVM thin provision
>> LVM thin provision looks much like btrfs/xfs for block level, and
>> smaller default block size (64K vs original 2M) makes CoW overhead
>> smaller.
>>
>> I'm currently testing this method, the good thing is it's a little
>> easier to setup and we can use single mount point.
>>
>> Anyway, with proper and efficient snapshot ability implemented, I will
>> definitely convert this test case, and add Flush test case.
>>
>> Thanks for your review too,
>> Qu
>>
>>>
>>> Thanks for working on this!
>>> You can add
>>> Reviewed-by: Amir Goldstein <[email protected]>
>
> Thank you both!
>
> Eryu



2018-03-28 23:36:57

by Dave Chinner

Subject: Re: [PATCH v2 3/3] fstests: generic: Check the fs after each FUA writes

On Wed, Mar 28, 2018 at 12:40:23PM +0800, Qu Wenruo wrote:
> Basic test case which triggers fsstress with dm-log-writes, and then
> check the fs after each FUA writes.
> With needed infrastructure and special handlers for journal based fs.
>
> Signed-off-by: Qu Wenruo <[email protected]>
.....
> For xfs:
> _check_xfs_filesystem: filesystem on /dev/mapper/test-scratch1 is inconsistent (r)
> *** xfs_repair -n output ***
> Phase 1 - find and verify superblock...
> Phase 2 - using internal log
> - zero log...
> - scan filesystem freespace and inode maps...
> - found root inode chunk
> Phase 3 - for each AG...
> - scan (but don't clear) agi unlinked lists...
> - process known inodes and perform inode discovery...
> - agno = 0
> - agno = 1
> - agno = 2
> bad symlink header ino 8409190, file block 0, disk block 1051147
> problem with symbolic link in inode 8409190
> would have cleared inode 8409190
> - agno = 3
> - process newly discovered inodes...
> Phase 4 - check for duplicate blocks...
> - setting up duplicate extent list...
> - check for inodes claiming duplicate blocks...
> - agno = 0
> - agno = 1
> - agno = 3
> - agno = 2
> entry "lb" in shortform directory 8409188 references free inode 8409190
> would have junked entry "lb" in directory inode 8409188
> bad symlink header ino 8409190, file block 0, disk block 1051147
> problem with symbolic link in inode 8409190
> would have cleared inode 8409190
> No modify flag set, skipping phase 5
> Phase 6 - check inode connectivity...
> - traversing filesystem ...
> entry "lb" in shortform directory inode 8409188 points to free inode 8409190
> would junk entry
> - traversal finished ...
> - moving disconnected inodes to lost+found ...
> Phase 7 - verify link counts...
> Maximum metadata LSN (1:396) is ahead of log (1:63).

That warning implies a write ordering problem - there's a write with
an LSN on disk that does not yet exist in the log. i.e. the last FUA
write to the log had 1:63 in it, yet there's metadata on disk that
could only be *issued* after a REQ_FLUSH|REQ_FUA log write with
1:396 in it was *completed*. If we've only replayed up to the
FUA write with 1:63 in it, then no metadata writes should have been
*issued* with 1:396 in it as the LSN that is stamped into metadata
is only updated on log IO completion....

At first glance, this implies a bug in the underlying device write
replay code....

Cheers,

Dave.
--
Dave Chinner
[email protected]
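To restate the invariant Dave describes in executable form: at any replay
point, the highest LSN stamped into metadata must not be ahead of the log
head, because metadata carrying LSN L is only issued after the log write
covering L has completed. A toy comparison of two cycle:block LSNs, as
printed by xfs_repair above, might look like this (illustration only, not
part of any patch or of xfs_repair):

  # Succeeds if LSN $1 ("cycle:block") is ahead of LSN $2.
  lsn_ahead_of() {
          local c1=${1%%:*} b1=${1##*:} c2=${2%%:*} b2=${2##*:}
          [ "$c1" -gt "$c2" ] || { [ "$c1" -eq "$c2" ] && [ "$b1" -gt "$b2" ]; }
  }

  # The values from the xfs_repair output quoted earlier:
  if lsn_ahead_of 1:396 1:63; then
          echo "metadata LSN 1:396 is ahead of log head 1:63: ordering violated"
  fi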