Greeting,
FYI, we noticed the following commit (built with gcc-9):
commit: 322c4293ecc58110227b49d7e47ae37b9b03566f ("loop: make autoclear operation asynchronous")
https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master
in testcase: xfstests
version: xfstests-x86_64-972d710-1_20211216
with following parameters:
disk: 4HDD
fs: xfs
test: xfs-group-04
ucode: 0x21
test-description: xfstests is a regression test suite for xfs and other files ystems.
test-url: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
on test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz with 8G memory
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>
2021-12-18 14:24:15 export TEST_DIR=/fs/sda2
2021-12-18 14:24:15 export TEST_DEV=/dev/sda2
2021-12-18 14:24:15 export FSTYP=xfs
2021-12-18 14:24:15 export SCRATCH_MNT=/fs/scratch
2021-12-18 14:24:15 mkdir /fs/scratch -p
2021-12-18 14:24:15 export SCRATCH_DEV=/dev/sda5
2021-12-18 14:24:15 export SCRATCH_LOGDEV=/dev/sda3
2021-12-18 14:24:15 export SCRATCH_XFS_LIST_METADATA_FIELDS=u3.sfdir3.hdr.parent.i4
2021-12-18 14:24:15 export SCRATCH_XFS_LIST_FUZZ_VERBS=random
2021-12-18 14:24:15 sed "s:^:xfs/:" //lkp/benchmarks/xfstests/tests/xfs-group-04
2021-12-18 14:24:15 ./check xfs/040 xfs/041 xfs/042 xfs/043 xfs/045 xfs/046 xfs/047 xfs/048 xfs/049
FSTYP -- xfs (debug)
PLATFORM -- Linux/x86_64 lkp-ivb-d04 5.16.0-rc3-00080-g322c4293ecc5 #1 SMP Sat Dec 18 11:24:43 CST 2021
MKFS_OPTIONS -- -f /dev/sda5
MOUNT_OPTIONS -- /dev/sda5 /fs/scratch
xfs/040 [not run] Can't run libxfs-diff without KWORKAREA set
xfs/041 32s
xfs/042 45s
xfs/043 [not run] No dump tape specified
xfs/045 6s
xfs/046 15s
xfs/047 26s
xfs/048 1s
xfs/049 [failed, exit status 1]- output mismatch (see /lkp/benchmarks/xfstests/results//xfs/049.out.bad)
--- tests/xfs/049.out 2021-12-16 21:14:11.000000000 +0000
+++ /lkp/benchmarks/xfstests/results//xfs/049.out.bad 2021-12-18 14:27:49.146706650 +0000
@@ -13,3 +13,5 @@
--- clean
--- umount ext2 on xfs
--- umount xfs
+!!! umount xfs failed
+(see /lkp/benchmarks/xfstests/results//xfs/049.full for details)
...
(Run 'diff -u /lkp/benchmarks/xfstests/tests/xfs/049.out /lkp/benchmarks/xfstests/results//xfs/049.out.bad' to see the entire diff)
Ran: xfs/040 xfs/041 xfs/042 xfs/043 xfs/045 xfs/046 xfs/047 xfs/048 xfs/049
Not run: xfs/040 xfs/043
Failures: xfs/049
Failed 1 of 9 tests
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
Thanks,
Oliver Sang
On 2021/12/20 0:09, kernel test robot wrote:
> @@ -13,3 +13,5 @@
> --- clean
> --- umount ext2 on xfs
> --- umount xfs
> +!!! umount xfs failed
> +(see /lkp/benchmarks/xfstests/results//xfs/049.full for details)
> ...
> (Run 'diff -u /lkp/benchmarks/xfstests/tests/xfs/049.out /lkp/benchmarks/xfstests/results//xfs/049.out.bad' to see the entire diff)
Yes, we know this race condition can happen.
https://lkml.kernel.org/r/[email protected]
https://lkml.kernel.org/r/[email protected]
Should we try to wait for autoclear operation to complete?
On Mon 20-12-21 00:45:46, Tetsuo Handa wrote:
> On 2021/12/20 0:09, kernel test robot wrote:
> > @@ -13,3 +13,5 @@
> > --- clean
> > --- umount ext2 on xfs
> > --- umount xfs
> > +!!! umount xfs failed
> > +(see /lkp/benchmarks/xfstests/results//xfs/049.full for details)
> > ...
> > (Run 'diff -u /lkp/benchmarks/xfstests/tests/xfs/049.out /lkp/benchmarks/xfstests/results//xfs/049.out.bad' to see the entire diff)
>
> Yes, we know this race condition can happen.
>
> https://lkml.kernel.org/r/[email protected]
> https://lkml.kernel.org/r/[email protected]
>
> Should we try to wait for autoclear operation to complete?
So I think we should try to fix this because as Dave writes in the
changelog for a1ecac3b0656 ("loop: Make explicit loop device destruction
lazy") which started all this, having random EBUSY failures (either from
losetup or umount) is annoying and you need to work it around it lots of
unexpected places.
We cannot easily wait for work completion in the loop device code without
reintroducing the deadlock - whole lo_release() is called under
disk->open_mutex which you also need to grab in __loop_clr_fd(). So to
avoid holding backing file busy longer than expected, we could use
task_work instead of ordinary work as I suggested - but you were right that
we need to be somewhat careful and in case we are running in a kthread, we
would still need to offload to a normal work (but in that case we don't
care about delaying file release anyway).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On 2021/12/20 20:58, Jan Kara wrote:
> On Mon 20-12-21 00:45:46, Tetsuo Handa wrote:
>> On 2021/12/20 0:09, kernel test robot wrote:
>>> @@ -13,3 +13,5 @@
>>> --- clean
>>> --- umount ext2 on xfs
>>> --- umount xfs
>>> +!!! umount xfs failed
>>> +(see /lkp/benchmarks/xfstests/results//xfs/049.full for details)
>>> ...
>>> (Run 'diff -u /lkp/benchmarks/xfstests/tests/xfs/049.out /lkp/benchmarks/xfstests/results//xfs/049.out.bad' to see the entire diff)
>>
>> Yes, we know this race condition can happen.
>>
>> https://lkml.kernel.org/r/[email protected]
>> https://lkml.kernel.org/r/[email protected]
>>
>> Should we try to wait for autoclear operation to complete?
>
> So I think we should try to fix this because as Dave writes in the
> changelog for a1ecac3b0656 ("loop: Make explicit loop device destruction
> lazy") which started all this, having random EBUSY failures (either from
> losetup or umount) is annoying and you need to work it around it lots of
> unexpected places.
OK. Let's wait for autoclear operation to complete.
>
> We cannot easily wait for work completion in the loop device code without
> reintroducing the deadlock - whole lo_release() is called under
> disk->open_mutex which you also need to grab in __loop_clr_fd(). So to
> avoid holding backing file busy longer than expected, we could use
> task_work instead of ordinary work as I suggested - but you were right that
> we need to be somewhat careful and in case we are running in a kthread, we
> would still need to offload to a normal work (but in that case we don't
> care about delaying file release anyway).
Like fput_many() shows, it is not easy to implement fallback correctly.
I worry that exporting task_work_add() to loadable kernel modules results in
random abuse of task_work which does not implement fallback.
Instead of exporting task_work_add(), I think we can apply below diff.
I chose not to rely on WQ context, for there is fput(filp) in __loop_clr_fd()
which we should make sure that __fput(filp) is processed by a thread which
triggered autoclear operation. If this __fput(filp) is scheduled by other thread
because fput(filp) is called by a thread which did not trigger autoclear operation,
I think it is possible that a thread which triggered autoclear operation fails to
wait for completion of __fput(filp), and results in the same problem.
---
block/bdev.c | 2 ++
drivers/block/loop.c | 42 ++++++++++++------------------------------
drivers/block/loop.h | 2 +-
include/linux/blkdev.h | 5 +++++
4 files changed, 20 insertions(+), 31 deletions(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c80cfaefc0a8..b252b1d87471 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1227,6 +1227,11 @@ struct block_device_operations {
* driver.
*/
int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector);
+ /*
+ * Special callback for doing post-release callback without
+ * disk->open_mutex held. Used by loop driver.
+ */
+ void (*post_release)(struct gendisk *disk);
};
#ifdef CONFIG_COMPAT
diff --git a/block/bdev.c b/block/bdev.c
index 8bf93a19041b..0cb638d81a27 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -948,6 +948,8 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
else
blkdev_put_whole(bdev, mode);
mutex_unlock(&disk->open_mutex);
+ if (bdev->bd_disk->fops->post_release)
+ bdev->bd_disk->fops->post_release(bdev->bd_disk);
module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 918a7a2dc025..f2e9f38694dc 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,7 +56,7 @@ struct loop_device {
struct gendisk *lo_disk;
struct mutex lo_mutex;
bool idr_visible;
- struct work_struct rundown_work;
+ struct task_struct *rundown_owner; /* current or NULL */
};
struct loop_cmd {
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b1b05c45c07c..faa3a3097b22 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1165,40 +1165,12 @@ static void __loop_clr_fd(struct loop_device *lo)
lo->lo_disk->flags |= GENHD_FL_NO_PART;
fput(filp);
-}
-
-static void loop_rundown_completed(struct loop_device *lo)
-{
mutex_lock(&lo->lo_mutex);
lo->lo_state = Lo_unbound;
mutex_unlock(&lo->lo_mutex);
module_put(THIS_MODULE);
}
-static void loop_rundown_workfn(struct work_struct *work)
-{
- struct loop_device *lo = container_of(work, struct loop_device,
- rundown_work);
- struct block_device *bdev = lo->lo_device;
- struct gendisk *disk = lo->lo_disk;
-
- __loop_clr_fd(lo);
- kobject_put(&bdev->bd_device.kobj);
- module_put(disk->fops->owner);
- loop_rundown_completed(lo);
-}
-
-static void loop_schedule_rundown(struct loop_device *lo)
-{
- struct block_device *bdev = lo->lo_device;
- struct gendisk *disk = lo->lo_disk;
-
- __module_get(disk->fops->owner);
- kobject_get(&bdev->bd_device.kobj);
- INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
- queue_work(system_long_wq, &lo->rundown_work);
-}
-
static int loop_clr_fd(struct loop_device *lo)
{
int err;
@@ -1229,7 +1201,6 @@ static int loop_clr_fd(struct loop_device *lo)
mutex_unlock(&lo->lo_mutex);
__loop_clr_fd(lo);
- loop_rundown_completed(lo);
return 0;
}
@@ -1754,7 +1725,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
* In autoclear mode, stop the loop thread
* and remove configuration after last close.
*/
- loop_schedule_rundown(lo);
+ lo->rundown_owner = current;
return;
} else if (lo->lo_state == Lo_bound) {
/*
@@ -1769,10 +1740,21 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
mutex_unlock(&lo->lo_mutex);
}
+static void lo_post_release(struct gendisk *disk)
+{
+ struct loop_device *lo = disk->private_data;
+
+ if (lo->rundown_owner != current)
+ return;
+ lo->rundown_owner = NULL;
+ __loop_clr_fd(lo);
+}
+
static const struct block_device_operations lo_fops = {
.owner = THIS_MODULE,
.open = lo_open,
.release = lo_release,
+ .post_release = lo_post_release,
.ioctl = lo_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = lo_compat_ioctl,
--
2.32.0