2021-03-16 16:50:33

by Dan Schatzberg

[permalink] [raw]
Subject: [PATCH 1/3] loop: Use worker per cgroup instead of kworker

Existing uses of loop device may have multiple cgroups reading/writing
to the same device. Simply charging resources for I/O to the backing
file could result in priority inversion where one cgroup gets
synchronously blocked, holding up all other I/O to the loop device.

In order to avoid this priority inversion, we use a single workqueue
where each work item is a "struct loop_worker" which contains a queue of
struct loop_cmds to issue. The loop device maintains a tree mapping blk
css_id -> loop_worker. This allows each cgroup to independently make
forward progress issuing I/O to the backing file.

There is also a single queue for I/O associated with the rootcg which
can be used in cases of extreme memory shortage where we cannot allocate
a loop_worker.

The locking for the tree and queues is fairly heavy handed - we acquire
a per-loop-device spinlock any time either is accessed. The existing
implementation serializes all I/O through a single thread anyways, so I
don't believe this is any worse.

Signed-off-by: Dan Schatzberg <[email protected]>
---
drivers/block/loop.c | 207 ++++++++++++++++++++++++++++++++++++-------
drivers/block/loop.h | 12 ++-
2 files changed, 182 insertions(+), 37 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a370cde3ddd4..5c080af73caa 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -70,7 +70,6 @@
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
-#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
@@ -83,6 +82,8 @@

#include <linux/uaccess.h>

+#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
+
static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_ctl_mutex);

@@ -920,27 +921,83 @@ static void loop_config_discard(struct loop_device *lo)
q->limits.discard_alignment = 0;
}

-static void loop_unprepare_queue(struct loop_device *lo)
-{
- kthread_flush_worker(&lo->worker);
- kthread_stop(lo->worker_task);
-}
+struct loop_worker {
+ struct rb_node rb_node;
+ struct work_struct work;
+ struct list_head cmd_list;
+ struct list_head idle_list;
+ struct loop_device *lo;
+ struct cgroup_subsys_state *css;
+ unsigned long last_ran_at;
+};

-static int loop_kthread_worker_fn(void *worker_ptr)
-{
- current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
- return kthread_worker_fn(worker_ptr);
-}
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+static void loop_free_idle_workers(struct timer_list *timer);

-static int loop_prepare_queue(struct loop_device *lo)
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
{
- kthread_init_worker(&lo->worker);
- lo->worker_task = kthread_run(loop_kthread_worker_fn,
- &lo->worker, "loop%d", lo->lo_number);
- if (IS_ERR(lo->worker_task))
- return -ENOMEM;
- set_user_nice(lo->worker_task, MIN_NICE);
- return 0;
+ struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+ struct loop_worker *cur_worker, *worker = NULL;
+ struct work_struct *work;
+ struct list_head *cmd_list;
+
+ spin_lock_irq(&lo->lo_work_lock);
+
+ if (!cmd->css)
+ goto queue_work;
+
+ node = &lo->worker_tree.rb_node;
+
+ while (*node) {
+ parent = *node;
+ cur_worker = container_of(*node, struct loop_worker, rb_node);
+ if (cur_worker->css == cmd->css) {
+ worker = cur_worker;
+ break;
+ } else if ((long)cur_worker->css < (long)cmd->css) {
+ node = &(*node)->rb_left;
+ } else {
+ node = &(*node)->rb_right;
+ }
+ }
+ if (worker)
+ goto queue_work;
+
+ worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
+ /*
+ * In the event we cannot allocate a worker, just queue on the
+ * rootcg worker
+ */
+ if (!worker)
+ goto queue_work;
+
+ worker->css = cmd->css;
+ css_get(worker->css);
+ INIT_WORK(&worker->work, loop_workfn);
+ INIT_LIST_HEAD(&worker->cmd_list);
+ INIT_LIST_HEAD(&worker->idle_list);
+ worker->lo = lo;
+ rb_link_node(&worker->rb_node, parent, node);
+ rb_insert_color(&worker->rb_node, &lo->worker_tree);
+queue_work:
+ if (worker) {
+ /*
+ * We need to remove from the idle list here while
+ * holding the lock so that the idle timer doesn't
+ * free the worker
+ */
+ if (!list_empty(&worker->idle_list))
+ list_del_init(&worker->idle_list);
+ work = &worker->work;
+ cmd_list = &worker->cmd_list;
+ } else {
+ work = &lo->rootcg_work;
+ cmd_list = &lo->rootcg_cmd_list;
+ }
+ list_add_tail(&cmd->list_entry, cmd_list);
+ queue_work(lo->workqueue, work);
+ spin_unlock_irq(&lo->lo_work_lock);
}

static void loop_update_rotational(struct loop_device *lo)
@@ -1126,12 +1183,27 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
!file->f_op->write_iter)
lo->lo_flags |= LO_FLAGS_READ_ONLY;

- error = loop_prepare_queue(lo);
- if (error)
+ error = -EFBIG;
+ size = get_loop_size(lo, file);
+ if ((loff_t)(sector_t)size != size)
goto out_unlock;
+ lo->workqueue = alloc_workqueue("loop%d",
+ WQ_UNBOUND | WQ_FREEZABLE |
+ WQ_MEM_RECLAIM,
+ lo->lo_number);
+ if (!lo->workqueue) {
+ error = -ENOMEM;
+ goto out_unlock;
+ }

set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);

+ INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+ INIT_LIST_HEAD(&lo->rootcg_cmd_list);
+ INIT_LIST_HEAD(&lo->idle_worker_list);
+ lo->worker_tree = RB_ROOT;
+ timer_setup(&lo->timer, loop_free_idle_workers,
+ TIMER_DEFERRABLE);
lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
lo->lo_device = bdev;
lo->lo_backing_file = file;
@@ -1199,6 +1271,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
int err = 0;
bool partscan = false;
int lo_number;
+ struct loop_worker *pos, *worker;

mutex_lock(&lo->lo_mutex);
if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
@@ -1218,6 +1291,18 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
/* freeze request queue during the transition */
blk_mq_freeze_queue(lo->lo_queue);

+ destroy_workqueue(lo->workqueue);
+ spin_lock_irq(&lo->lo_work_lock);
+ list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+ idle_list) {
+ list_del(&worker->idle_list);
+ rb_erase(&worker->rb_node, &lo->worker_tree);
+ css_put(worker->css);
+ kfree(worker);
+ }
+ spin_unlock_irq(&lo->lo_work_lock);
+ del_timer_sync(&lo->timer);
+
spin_lock_irq(&lo->lo_lock);
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1254,7 +1339,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)

partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
lo_number = lo->lo_number;
- loop_unprepare_queue(lo);
out_unlock:
mutex_unlock(&lo->lo_mutex);
if (partscan) {
@@ -2025,7 +2109,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
} else
#endif
cmd->css = NULL;
- kthread_queue_work(&lo->worker, &cmd->work);
+ loop_queue_work(lo, cmd);

return BLK_STS_OK;
}
@@ -2055,26 +2139,82 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
}
}

-static void loop_queue_work(struct kthread_work *work)
+static void loop_set_timer(struct loop_device *lo)
+{
+ timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_process_work(struct loop_worker *worker,
+ struct list_head *cmd_list, struct loop_device *lo)
{
- struct loop_cmd *cmd =
- container_of(work, struct loop_cmd, work);
+ int orig_flags = current->flags;
+ struct loop_cmd *cmd;

- loop_handle_cmd(cmd);
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+ spin_lock_irq(&lo->lo_work_lock);
+ while (!list_empty(cmd_list)) {
+ cmd = container_of(
+ cmd_list->next, struct loop_cmd, list_entry);
+ list_del(cmd_list->next);
+ spin_unlock_irq(&lo->lo_work_lock);
+
+ loop_handle_cmd(cmd);
+ cond_resched();
+
+ spin_lock_irq(&lo->lo_work_lock);
+ }
+
+ /*
+ * We only add to the idle list if there are no pending cmds
+ * *and* the worker will not run again which ensures that it
+ * is safe to free any worker on the idle list
+ */
+ if (worker && !work_pending(&worker->work)) {
+ worker->last_ran_at = jiffies;
+ list_add_tail(&worker->idle_list, &lo->idle_worker_list);
+ loop_set_timer(lo);
+ }
+ spin_unlock_irq(&lo->lo_work_lock);
+ current->flags = orig_flags;
}

-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
{
- struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct loop_worker *worker =
+ container_of(work, struct loop_worker, work);
+ loop_process_work(worker, &worker->cmd_list, worker->lo);
+}

- kthread_init_work(&cmd->work, loop_queue_work);
- return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, rootcg_work);
+ loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
+}
+
+static void loop_free_idle_workers(struct timer_list *timer)
+{
+ struct loop_device *lo = container_of(timer, struct loop_device, timer);
+ struct loop_worker *pos, *worker;
+
+ spin_lock_irq(&lo->lo_work_lock);
+ list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+ idle_list) {
+ if (time_is_after_jiffies(worker->last_ran_at +
+ LOOP_IDLE_WORKER_TIMEOUT))
+ break;
+ list_del(&worker->idle_list);
+ rb_erase(&worker->rb_node, &lo->worker_tree);
+ css_put(worker->css);
+ kfree(worker);
+ }
+ if (!list_empty(&lo->idle_worker_list))
+ loop_set_timer(lo);
+ spin_unlock_irq(&lo->lo_work_lock);
}

static const struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
- .init_request = loop_init_request,
.complete = lo_complete_rq,
};

@@ -2163,6 +2303,7 @@ static int loop_add(struct loop_device **l, int i)
mutex_init(&lo->lo_mutex);
lo->lo_number = i;
spin_lock_init(&lo->lo_lock);
+ spin_lock_init(&lo->lo_work_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
disk->fops = &lo_fops;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index a3c04f310672..9289c1cd6374 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -14,7 +14,6 @@
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <uapi/linux/loop.h>

/* Possible states of device */
@@ -54,8 +53,13 @@ struct loop_device {

spinlock_t lo_lock;
int lo_state;
- struct kthread_worker worker;
- struct task_struct *worker_task;
+ spinlock_t lo_work_lock;
+ struct workqueue_struct *workqueue;
+ struct work_struct rootcg_work;
+ struct list_head rootcg_cmd_list;
+ struct list_head idle_worker_list;
+ struct rb_root worker_tree;
+ struct timer_list timer;
bool use_dio;
bool sysfs_inited;

@@ -66,7 +70,7 @@ struct loop_device {
};

struct loop_cmd {
- struct kthread_work work;
+ struct list_head list_entry;
bool use_aio; /* use AIO interface to handle I/O */
atomic_t ref; /* only for aio */
long ret;
--
2.30.2


2021-03-22 06:07:07

by kernel test robot

[permalink] [raw]
Subject: [loop] eaba742710: WARNING:at_kernel/workqueue.c:#check_flush_dependency



Greeting,

FYI, we noticed the following commit (built with gcc-9):

commit: eaba7427107045752f7454f94a40839c0880cf02 ("[PATCH 1/3] loop: Use worker per cgroup instead of kworker")
url: https://github.com/0day-ci/linux/commits/Dan-Schatzberg/Charge-loop-device-i-o-to-issuing-cgroup/20210316-233842
base: https://git.kernel.org/cgit/linux/kernel/git/axboe/linux-block.git for-next

in testcase: xfstests
version: xfstests-x86_64-73c0871-1_20210318
with following parameters:

disk: 4HDD
fs: xfs
test: generic-group-18
ucode: 0xe2

test-description: xfstests is a regression test suite for xfs and other file systems.
test-url: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git


on test machine: 4 threads Intel(R) Xeon(R) CPU E3-1225 v5 @ 3.30GHz with 16G memory

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):



If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>


[ 50.428387] WARNING: CPU: 0 PID: 35 at kernel/workqueue.c:2613 check_flush_dependency (kbuild/src/consumer/kernel/workqueue.c:2613 (discriminator 9))
[ 50.450013] Modules linked in: loop xfs dm_mod btrfs blake2b_generic xor zstd_compress raid6_pq libcrc32c sd_mod t10_pi sg ipmi_devintf ipmi_msghandler intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal i915 intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul hp_wmi sparse_keymap intel_gtt crc32c_intel ghash_clmulni_intel mei_wdt rfkill wmi_bmof rapl drm_kms_helper ahci intel_cstate syscopyarea mei_me libahci sysfillrect sysimgblt fb_sys_fops intel_uncore serio_raw mei drm libata intel_pch_thermal ie31200_edac wmi video tpm_infineon intel_pmc_core acpi_pad ip_tables
[ 50.500731] CPU: 0 PID: 35 Comm: kworker/u8:3 Not tainted 5.12.0-rc2-00093-geaba74271070 #1
[ 50.509081] Hardware name: HP HP Z238 Microtower Workstation/8183, BIOS N51 Ver. 01.63 10/05/2017
[ 50.517963] Workqueue: loop0 loop_rootcg_workfn [loop]
[ 50.523109] RIP: 0010:check_flush_dependency (kbuild/src/consumer/kernel/workqueue.c:2613 (discriminator 9))
[ 50.528427] Code: 8d 8b b0 00 00 00 48 8b 50 18 49 89 e8 48 8d b1 b0 00 00 00 48 c7 c7 80 ed 34 82 4c 89 c9 c6 05 8a 11 ab 01 01 e8 29 9a a9 00 <0f> 0b e9 07 ff ff ff 80 3d 78 11 ab 01 00 75 92 e9 3b ff ff ff 66
All code
========
0: 8d 8b b0 00 00 00 lea 0xb0(%rbx),%ecx
6: 48 8b 50 18 mov 0x18(%rax),%rdx
a: 49 89 e8 mov %rbp,%r8
d: 48 8d b1 b0 00 00 00 lea 0xb0(%rcx),%rsi
14: 48 c7 c7 80 ed 34 82 mov $0xffffffff8234ed80,%rdi
1b: 4c 89 c9 mov %r9,%rcx
1e: c6 05 8a 11 ab 01 01 movb $0x1,0x1ab118a(%rip) # 0x1ab11af
25: e8 29 9a a9 00 callq 0xa99a53
2a:* 0f 0b ud2 <-- trapping instruction
2c: e9 07 ff ff ff jmpq 0xffffffffffffff38
31: 80 3d 78 11 ab 01 00 cmpb $0x0,0x1ab1178(%rip) # 0x1ab11b0
38: 75 92 jne 0xffffffffffffffcc
3a: e9 3b ff ff ff jmpq 0xffffffffffffff7a
3f: 66 data16

Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: e9 07 ff ff ff jmpq 0xffffffffffffff0e
7: 80 3d 78 11 ab 01 00 cmpb $0x0,0x1ab1178(%rip) # 0x1ab1186
e: 75 92 jne 0xffffffffffffffa2
10: e9 3b ff ff ff jmpq 0xffffffffffffff50
15: 66 data16
[ 50.547207] RSP: 0018:ffffc9000018fba0 EFLAGS: 00010086
[ 50.552434] RAX: 0000000000000000 RBX: ffff88843a39b600 RCX: 0000000000000027
[ 50.559567] RDX: 0000000000000027 RSI: 00000000ffff7fff RDI: ffff88843f4177f8
[ 50.566713] RBP: ffffffffc091c760 R08: ffff88843f4177f0 R09: ffffc9000018f9b8
[ 50.573875] R10: 0000000000000001 R11: 0000000000000001 R12: ffff88810d0fa800
[ 50.581006] R13: 0000000000000001 R14: ffff88843f42ab80 R15: ffff88810d0fa800
[ 50.588139] FS: 0000000000000000(0000) GS:ffff88843f400000(0000) knlGS:0000000000000000
[ 50.596228] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 50.601975] CR2: 00007fbda63ea7f8 CR3: 000000043da0a002 CR4: 00000000003706f0
[ 50.609105] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 50.616240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 50.623379] Call Trace:
[ 50.625837] __flush_work (kbuild/src/consumer/kernel/workqueue.c:2669 kbuild/src/consumer/kernel/workqueue.c:3011 kbuild/src/consumer/kernel/workqueue.c:3051)
[ 50.629418] ? __queue_work (kbuild/src/consumer/arch/x86/include/asm/paravirt.h:559 kbuild/src/consumer/arch/x86/include/asm/qspinlock.h:56 kbuild/src/consumer/include/linux/spinlock.h:212 kbuild/src/consumer/include/linux/spinlock_api_smp.h:151 kbuild/src/consumer/kernel/workqueue.c:1500)
[ 50.633261] xfs_file_buffered_write (kbuild/src/consumer/fs/xfs/xfs_file.c:761) xfs
[ 50.638468] do_iter_readv_writev (kbuild/src/consumer/fs/read_write.c:741)
[ 50.642833] do_iter_write (kbuild/src/consumer/fs/read_write.c:866 kbuild/src/consumer/fs/read_write.c:847)
[ 50.646513] lo_write_bvec (kbuild/src/consumer/include/linux/fs.h:2903 kbuild/src/consumer/drivers/block/loop.c:286) loop
[ 50.650804] loop_process_work (kbuild/src/consumer/drivers/block/loop.c:307 kbuild/src/consumer/drivers/block/loop.c:630 kbuild/src/consumer/drivers/block/loop.c:2129 kbuild/src/consumer/drivers/block/loop.c:2161) loop
[ 50.655543] ? newidle_balance (kbuild/src/consumer/kernel/sched/fair.c:10635)
[ 50.659647] process_one_work (kbuild/src/consumer/arch/x86/include/asm/jump_label.h:25 kbuild/src/consumer/include/linux/jump_label.h:200 kbuild/src/consumer/include/trace/events/workqueue.h:108 kbuild/src/consumer/kernel/workqueue.c:2280)
[ 50.663696] worker_thread (kbuild/src/consumer/include/linux/list.h:282 kbuild/src/consumer/kernel/workqueue.c:2422)
[ 50.667365] ? process_one_work (kbuild/src/consumer/kernel/workqueue.c:2364)
[ 50.671568] kthread (kbuild/src/consumer/kernel/kthread.c:292)
[ 50.674813] ? kthread_park (kbuild/src/consumer/kernel/kthread.c:245)
[ 50.678476] ret_from_fork (kbuild/src/consumer/arch/x86/entry/entry_64.S:300)
[ 50.682070] ---[ end trace cdbf6e08ae434f6c ]---
[ 56.951496] loop: Write error at byte offset 1032134656, length 4096.
[ 56.957973] blk_update_request: I/O error, dev loop0, sector 2015816 op 0x1:(WRITE) flags 0x4000 phys_seg 21 prio class 0
[ 56.968947] loop: Write error at byte offset 1033408512, length 4096.
[ 56.975421] blk_update_request: I/O error, dev loop0, sector 2018376 op 0x1:(WRITE) flags 0x4000 phys_seg 20 prio class 0
[ 56.976319] loop: Write error at byte offset 1034719232, length 4096.
[ 56.992849] blk_update_request: I/O error, dev loop0, sector 2020936 op 0x1:(WRITE) flags 0x4000 phys_seg 20 prio class 0
[ 56.994344] loop: Write error at byte offset 1036029952, length 4096.
[ 57.010337] blk_update_request: I/O error, dev loop0, sector 2023496 op 0x1:(WRITE) flags 0x4000 phys_seg 21 prio class 0
[ 57.011345] loop: Write error at byte offset 1037340672, length 4096.
[ 57.027789] blk_update_request: I/O error, dev loop0, sector 2026056 op 0x1:(WRITE) flags 0x4000 phys_seg 20 prio class 0
[ 57.029342] loop: Write error at byte offset 1038651392, length 4096.
[ 57.045247] blk_update_request: I/O error, dev loop0, sector 2028616 op 0x1:(WRITE) flags 0x4000 phys_seg 20 prio class 0
[ 57.047344] loop: Write error at byte offset 1039962112, length 4096.
[ 57.062715] blk_update_request: I/O error, dev loop0, sector 2031176 op 0x1:(WRITE) flags 0x4000 phys_seg 21 prio class 0
[ 57.065323] loop: Write error at byte offset 1041272832, length 4096.
[ 57.080148] blk_update_request: I/O error, dev loop0, sector 2033736 op 0x1:(WRITE) flags 0x4000 phys_seg 20 prio class 0
[ 57.081357] loop: Write error at byte offset 1042583552, length 4096.
[ 57.097612] blk_update_request: I/O error, dev loop0, sector 2036296 op 0x1:(WRITE) flags 0x4000 phys_seg 20 prio class 0
[ 57.098342] loop: Write error at byte offset 1043894272, length 4096.
[ 57.115063] blk_update_request: I/O error, dev loop0, sector 2038856 op 0x1:(WRITE) flags 0x4000 phys_seg 21 prio class 0
[ 57.117369] loop0: writeback error on inode 131, offset 536248320, sector 17792
[ 57.138492] loop0: writeback error on inode 131, offset 268124160, sector 2096616
[ 57.477745] XFS (loop0): Unmounting Filesystem
[ 57.676457] XFS (sda1): Unmounting Filesystem
[ 57.812233] XFS (sda4): Unmounting Filesystem
[ 58.538531] XFS (sda4): Mounting V5 Filesystem
[ 58.555208] XFS (sda4): Ending clean mount
[ 58.562478] xfs filesystem being mounted at /fs/scratch supports timestamps until 2038 (0x7fffffff)
[ 58.574138] XFS (sda4): Unmounting Filesystem
[ 58.707339] generic/361 _check_dmesg: something found in dmesg (see /lkp/benchmarks/xfstests/results//generic/361.dmesg)
[ 58.707355]
[ 58.719955]
[ 58.719956]
[ 58.726562] run fstests generic/362 at 2021-03-19 23:56:59
[ 59.025589] XFS (sda1): Mounting V5 Filesystem
[ 59.113481] XFS (sda1): Ending clean mount
[ 59.120573] xfs filesystem being mounted at /fs/sda1 supports timestamps until 2038 (0x7fffffff)
[ 59.171890] generic/362 [not run] mkfs.xfs doesn't have richacl feature
[ 59.171893]
[ 59.188841] run fstests generic/363 at 2021-03-19 23:56:59
[ 59.479869] generic/363 [not run] mkfs.xfs doesn't have richacl feature
[ 59.479872]
[ 59.496814] run fstests generic/364 at 2021-03-19 23:56:59
[ 59.771886] generic/364 [not run] mkfs.xfs doesn't have richacl feature
[ 59.771889]
[ 59.788796] run fstests generic/365 at 2021-03-19 23:57:00
[ 60.058891] generic/365 [not run] mkfs.xfs doesn't have richacl feature
[ 60.058894]
[ 60.075830] run fstests generic/366 at 2021-03-19 23:57:00
[ 60.346927] generic/366 [not run] mkfs.xfs doesn't have richacl feature
[ 60.346930]
[ 60.363892] run fstests generic/367 at 2021-03-19 23:57:00
[ 60.635809] generic/367 [not run] mkfs.xfs doesn't have richacl feature
[ 60.635812]
[ 60.652791] run fstests generic/368 at 2021-03-19 23:57:01
[ 60.928853] generic/368 [not run] mkfs.xfs doesn't have richacl feature
[ 60.928857]
[ 60.945736] run fstests generic/369 at 2021-03-19 23:57:01
[ 61.214037] generic/369 [not run] mkfs.xfs doesn't have richacl feature
[ 61.214040]
[ 61.231028] run fstests generic/370 at 2021-03-19 23:57:01
[ 61.503865] generic/370 [not run] mkfs.xfs doesn't have richacl feature
[ 61.503868]
[ 61.520803] run fstests generic/371 at 2021-03-19 23:57:02
[ 62.013442] XFS (sda4): Mounting V5 Filesystem
[ 62.122308] XFS (sda4): Ending clean mount
[ 62.129243] xfs filesystem being mounted at /fs/scratch supports timestamps until 2038 (0x7fffffff)
[ 65.996069] XFS (sda1): Unmounting Filesystem
[ 66.214239] XFS (sda4): Unmounting Filesystem
[ 66.810579] XFS (sda4): Mounting V5 Filesystem
[ 66.854787] XFS (sda4): Ending clean mount
[ 66.860848] xfs filesystem being mounted at /fs/scratch supports timestamps until 2038 (0x7fffffff)
[ 66.872560] XFS (sda4): Unmounting Filesystem
[ 66.957505] generic/371 5s
[ 66.957507]
[ 66.973880] run fstests generic/372 at 2021-03-19 23:57:07
[ 67.270516] XFS (sda1): Mounting V5 Filesystem
[ 67.391362] XFS (sda1): Ending clean mount
[ 67.398502] xfs filesystem being mounted at /fs/sda1 supports timestamps until 2038 (0x7fffffff)
[ 69.203230] XFS (sda4): Mounting V5 Filesystem
[ 69.360064] XFS (sda4): Ending clean mount
[ 69.365981] xfs filesystem being mounted at /fs/scratch supports timestamps until 2038 (0x7fffffff)
[ 69.446605] XFS (sda4): Unmounting Filesystem
[ 69.602242] generic/372 [not run] Reflink not supported by scratch filesystem type: xfs
[ 69.602246]
[ 69.619022] run fstests generic/373 at 2021-03-19 23:57:10
[ 71.779682] XFS (sda4): Mounting V5 Filesystem
[ 72.056627] XFS (sda4): Ending clean mount
[ 72.091290] xfs filesystem being mounted at /fs/scratch supports timestamps until 2038 (0x7fffffff)
[ 72.169088] XFS (sda4): Unmounting Filesystem
[ 72.324408] generic/373 [not run] Reflink not supported by scratch filesystem type: xfs
[ 72.324412]
[ 72.341201] run fstests generic/374 at 2021-03-19 23:57:12


To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml
bin/lkp run compatible-job.yaml



---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation

Thanks,
Oliver Sang


Attachments:
(No filename) (12.62 kB)
config-5.12.0-rc2-00093-geaba74271070 (175.56 kB)
job-script (5.76 kB)
dmesg.xz (25.61 kB)
xfstests (2.31 kB)
job.yaml (4.76 kB)
reproduce (0.99 kB)
Download all attachments

2021-03-22 13:49:05

by Dan Schatzberg

[permalink] [raw]
Subject: Re: [loop] eaba742710: WARNING:at_kernel/workqueue.c:#check_flush_dependency

On Mon, Mar 22, 2021 at 02:03:34PM +0800, kernel test robot wrote:
>
>
> Greeting,
>
> FYI, we noticed the following commit (built with gcc-9):
>
> commit: eaba7427107045752f7454f94a40839c0880cf02 ("[PATCH 1/3] loop: Use worker per cgroup instead of kworker")
> url: https://github.com/0day-ci/linux/commits/Dan-Schatzberg/Charge-loop-device-i-o-to-issuing-cgroup/20210316-233842
> base: https://git.kernel.org/cgit/linux/kernel/git/axboe/linux-block.git for-next
>
> in testcase: xfstests
> version: xfstests-x86_64-73c0871-1_20210318
> with following parameters:
>
> disk: 4HDD
> fs: xfs
> test: generic-group-18
> ucode: 0xe2
>
> test-description: xfstests is a regression test suite for xfs and other file systems.
> test-url: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
>
>
> on test machine: 4 threads Intel(R) Xeon(R) CPU E3-1225 v5 @ 3.30GHz with 16G memory
>
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
> ...
> [ 50.428387] WARNING: CPU: 0 PID: 35 at kernel/workqueue.c:2613 check_flush_dependency (kbuild/src/consumer/kernel/workqueue.c:2613 (discriminator 9))
> [ 50.450013] Modules linked in: loop xfs dm_mod btrfs blake2b_generic xor zstd_compress raid6_pq libcrc32c sd_mod t10_pi sg ipmi_devintf ipmi_msghandler intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal i915 intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul hp_wmi sparse_keymap intel_gtt crc32c_intel ghash_clmulni_intel mei_wdt rfkill wmi_bmof rapl drm_kms_helper ahci intel_cstate syscopyarea mei_me libahci sysfillrect sysimgblt fb_sys_fops intel_uncore serio_raw mei drm libata intel_pch_thermal ie31200_edac wmi video tpm_infineon intel_pmc_core acpi_pad ip_tables
> [ 50.500731] CPU: 0 PID: 35 Comm: kworker/u8:3 Not tainted 5.12.0-rc2-00093-geaba74271070 #1
> [ 50.509081] Hardware name: HP HP Z238 Microtower Workstation/8183, BIOS N51 Ver. 01.63 10/05/2017
> [ 50.517963] Workqueue: loop0 loop_rootcg_workfn [loop]
> [ 50.523109] RIP: 0010:check_flush_dependency (kbuild/src/consumer/kernel/workqueue.c:2613 (discriminator 9))
> ...
> [ 50.625837] __flush_work (kbuild/src/consumer/kernel/workqueue.c:2669 kbuild/src/consumer/kernel/workqueue.c:3011 kbuild/src/consumer/kernel/workqueue.c:3051)
> [ 50.629418] ? __queue_work (kbuild/src/consumer/arch/x86/include/asm/paravirt.h:559 kbuild/src/consumer/arch/x86/include/asm/qspinlock.h:56 kbuild/src/consumer/include/linux/spinlock.h:212 kbuild/src/consumer/include/linux/spinlock_api_smp.h:151 kbuild/src/consumer/kernel/workqueue.c:1500)
> [ 50.633261] xfs_file_buffered_write (kbuild/src/consumer/fs/xfs/xfs_file.c:761) xfs
> [ 50.638468] do_iter_readv_writev (kbuild/src/consumer/fs/read_write.c:741)
> [ 50.642833] do_iter_write (kbuild/src/consumer/fs/read_write.c:866 kbuild/src/consumer/fs/read_write.c:847)
> [ 50.646513] lo_write_bvec (kbuild/src/consumer/include/linux/fs.h:2903 kbuild/src/consumer/drivers/block/loop.c:286) loop
> [ 50.650804] loop_process_work (kbuild/src/consumer/drivers/block/loop.c:307 kbuild/src/consumer/drivers/block/loop.c:630 kbuild/src/consumer/drivers/block/loop.c:2129 kbuild/src/consumer/drivers/block/loop.c:2161) loop
> [ 50.655543] ? newidle_balance (kbuild/src/consumer/kernel/sched/fair.c:10635)
> [ 50.659647] process_one_work (kbuild/src/consumer/arch/x86/include/asm/jump_label.h:25 kbuild/src/consumer/include/linux/jump_label.h:200 kbuild/src/consumer/include/trace/events/workqueue.h:108 kbuild/src/consumer/kernel/workqueue.c:2280)
> [ 50.663696] worker_thread (kbuild/src/consumer/include/linux/list.h:282 kbuild/src/consumer/kernel/workqueue.c:2422)
> [ 50.667365] ? process_one_work (kbuild/src/consumer/kernel/workqueue.c:2364)
> [ 50.671568] kthread (kbuild/src/consumer/kernel/kthread.c:292)
> [ 50.674813] ? kthread_park (kbuild/src/consumer/kernel/kthread.c:245)
> [ 50.678476] ret_from_fork (kbuild/src/consumer/arch/x86/entry/entry_64.S:300)

My understanding is that this warning is firing because the loop
workqueue sets WQ_MEM_RECLAIM but the XFS workqueue (m_sync_workqueue)
does not. I believe that the WQ_MEM_RECLAIM on the loop device is
sensible because reclaim may flush dirty writes through the loop
device. I'm not familiar with xfs and its not clear why
m_sync_workqueue (flushed from xfs_flush_inodes) wouldn't have the
same reclaim dependency. I'll keep digging, but if anyone has
insights, please let me know.