2023-04-20 11:34:36

by Yu Kuai

Subject: [PATCH -next 0/8] md/raid1-10: limit the number of plugged bio

From: Yu Kuai <[email protected]>

This patchset tries to limit the number of plugged bios for raid1 and
raid10; this is done in the last patch, and the other patches are
refactoring and optimizations.

Yu Kuai (8):
md/raid10: prevent soft lockup while flush writes
md/raid1-10: rename raid1-10.c to raid1-10.h
md/raid1-10: factor out a helper to add bio to plug
md/raid1-10: factor out a helper to submit normal write
md/raid1-10: submit write io directly if bitmap is not enabled
md/md-bitmap: support to unplug bitmap asynchronously
md/raid1{,0}: Revert "md/raid1{,0}: fix deadlock in bitmap_unplug."
md/raid1-10: limit the number of plugged bio

drivers/md/md-bitmap.c | 61 ++++++++++++++++++++++++---
drivers/md/md-bitmap.h | 10 +++++
drivers/md/{raid1-10.c => raid1-10.h} | 61 +++++++++++++++++++++++++--
drivers/md/raid1.c | 32 +++-----------
drivers/md/raid1.h | 2 +
drivers/md/raid10.c | 45 ++++----------------
drivers/md/raid10.h | 2 +
7 files changed, 143 insertions(+), 70 deletions(-)
rename drivers/md/{raid1-10.c => raid1-10.h} (64%)

--
2.39.2


2023-04-20 11:34:38

by Yu Kuai

Subject: [PATCH -next 2/8] md/raid1-10: rename raid1-10.c to raid1-10.h

From: Yu Kuai <[email protected]>

raid1-10.c contains definitions that are used both for raid1 and raid10,
it's werid to use ".c" suffix.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/{raid1-10.c => raid1-10.h} | 10 +++++++---
drivers/md/raid1.c | 2 --
drivers/md/raid1.h | 2 ++
drivers/md/raid10.c | 2 --
drivers/md/raid10.h | 2 ++
5 files changed, 11 insertions(+), 7 deletions(-)
rename drivers/md/{raid1-10.c => raid1-10.h} (92%)

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.h
similarity index 92%
rename from drivers/md/raid1-10.c
rename to drivers/md/raid1-10.h
index e61f6cad4e08..04beef35142d 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.h
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#ifndef _RAID1_10_H
+#define _RAID1_10_H
+
/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
@@ -33,7 +36,7 @@ struct raid1_plug_cb {
struct bio_list pending;
};

-static void rbio_pool_free(void *rbio, void *data)
+static inline void rbio_pool_free(void *rbio, void *data)
{
kfree(rbio);
}
@@ -91,8 +94,8 @@ static inline struct resync_pages *get_resync_pages(struct bio *bio)
}

/* generally called after bio_reset() for reseting bvec */
-static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
- int size)
+static inline void md_bio_reset_resync_pages(struct bio *bio,
+ struct resync_pages *rp, int size)
{
int idx = 0;

@@ -109,3 +112,4 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
size -= len;
} while (idx++ < RESYNC_PAGES && size > 0);
}
+#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2f1011ffdf09..84724b9b20b8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -49,8 +49,6 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)

-#include "raid1-10.c"
-
#define START(node) ((node)->start)
#define LAST(node) ((node)->last)
INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 468f189da7a0..80de4d66f010 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -2,6 +2,8 @@
#ifndef _RAID1_H
#define _RAID1_H

+#include "raid1-10.h"
+
/*
* each barrier unit size is 64MB fow now
* note: it must be larger than RESYNC_DEPTH
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a116b7c9d9f3..50d56b6af42f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -77,8 +77,6 @@ static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

-#include "raid1-10.c"
-
#define NULL_CMD
#define cmd_before(conf, cmd) \
do { \
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 63e48b11b552..63e88dd774f7 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,6 +2,8 @@
#ifndef _RAID10_H
#define _RAID10_H

+#include "raid1-10.h"
+
/* Note: raid10_info.rdev can be set to NULL asynchronously by
* raid10_remove_disk.
* There are three safe ways to access raid10_info.rdev.
--
2.39.2

2023-04-20 11:34:47

by Yu Kuai

Subject: [PATCH -next 4/8] md/raid1-10: factor out a helper to submit normal write

From: Yu Kuai <[email protected]>

There are multiple places that do the same thing; factor out a helper to
avoid redundant code. The helper will also be used in a following patch.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1-10.h | 17 +++++++++++++++++
drivers/md/raid1.c | 13 ++-----------
drivers/md/raid10.c | 26 ++++----------------------
3 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/drivers/md/raid1-10.h b/drivers/md/raid1-10.h
index 664646a3591a..9dc53d8a8129 100644
--- a/drivers/md/raid1-10.h
+++ b/drivers/md/raid1-10.h
@@ -113,6 +113,22 @@ static inline void md_bio_reset_resync_pages(struct bio *bio,
} while (idx++ < RESYNC_PAGES && size > 0);
}

+static inline void md_submit_write(struct bio *bio)
+{
+ struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
+
+ bio->bi_next = NULL;
+ bio_set_dev(bio, rdev->bdev);
+ if (test_bit(Faulty, &rdev->flags))
+ bio_io_error(bio);
+ else if (unlikely(bio_op(bio) == REQ_OP_DISCARD &&
+ !bdev_max_discard_sectors(bio->bi_bdev)))
+ /* Just ignore it */
+ bio_endio(bio);
+ else
+ submit_bio_noacct(bio);
+}
+
static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
blk_plug_cb_fn unplug)
{
@@ -128,4 +144,5 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,

return true;
}
+
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 44c8d113621f..c068ed3e6c96 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -797,17 +797,8 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)

while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void *)bio->bi_bdev;
- bio->bi_next = NULL;
- bio_set_dev(bio, rdev->bdev);
- if (test_bit(Faulty, &rdev->flags)) {
- bio_io_error(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev)))
- /* Just ignore it */
- bio_endio(bio);
- else
- submit_bio_noacct(bio);
+
+ md_submit_write(bio);
bio = next;
cond_resched();
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d67c5672933c..fd625026c97b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -907,17 +907,8 @@ static void flush_pending_writes(struct r10conf *conf)

while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio_set_dev(bio, rdev->bdev);
- if (test_bit(Faulty, &rdev->flags)) {
- bio_io_error(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev)))
- /* Just ignore it */
- bio_endio(bio);
- else
- submit_bio_noacct(bio);
+
+ md_submit_write(bio);
bio = next;
cond_resched();
}
@@ -1127,17 +1118,8 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)

while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio_set_dev(bio, rdev->bdev);
- if (test_bit(Faulty, &rdev->flags)) {
- bio_io_error(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev)))
- /* Just ignore it */
- bio_endio(bio);
- else
- submit_bio_noacct(bio);
+
+ md_submit_write(bio);
bio = next;
cond_resched();
}
--
2.39.2

2023-04-20 11:34:49

by Yu Kuai

Subject: [PATCH -next 1/8] md/raid10: prevent soft lockup while flush writes

From: Yu Kuai <[email protected]>

Currently, there is no limit for raid1/raid10 plugged bio. While flushing
writes, raid1 has cond_resched() while raid10 doesn't, and too many
writes can cause soft lockup.

Follow up soft lockup can be triggered easily with writeback test for
raid10 with ramdisks:

watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293]
Call Trace:
<TASK>
call_rcu+0x16/0x20
put_object+0x41/0x80
__delete_object+0x50/0x90
delete_object_full+0x2b/0x40
kmemleak_free+0x46/0xa0
slab_free_freelist_hook.constprop.0+0xed/0x1a0
kmem_cache_free+0xfd/0x300
mempool_free_slab+0x1f/0x30
mempool_free+0x3a/0x100
bio_free+0x59/0x80
bio_put+0xcf/0x2c0
free_r10bio+0xbf/0xf0
raid_end_bio_io+0x78/0xb0
one_write_done+0x8a/0xa0
raid10_end_write_request+0x1b4/0x430
bio_endio+0x175/0x320
brd_submit_bio+0x3b9/0x9b7 [brd]
__submit_bio+0x69/0xe0
submit_bio_noacct_nocheck+0x1e6/0x5a0
submit_bio_noacct+0x38c/0x7e0
flush_pending_writes+0xf0/0x240
raid10d+0xac/0x1ed0

This patch fix the problem by adding cond_resched() to raid10 like what
raid1 did.

Note that unlimited plugged bio still need to be optimized because in
the case of writeback lots of dirty pages, this will take lots of memory
and io latecy is quite bad.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid10.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6590aa49598c..a116b7c9d9f3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -921,6 +921,7 @@ static void flush_pending_writes(struct r10conf *conf)
else
submit_bio_noacct(bio);
bio = next;
+ cond_resched();
}
blk_finish_plug(&plug);
} else
@@ -1140,6 +1141,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
else
submit_bio_noacct(bio);
bio = next;
+ cond_resched();
}
kfree(plug);
}
--
2.39.2

2023-04-20 11:34:58

by Yu Kuai

Subject: [PATCH -next 7/8] md/raid1{,0}: Revert "md/raid1{,0}: fix deadlock in bitmap_unplug."

From: Yu Kuai <[email protected]>

This reverts commit 874807a83139abc094f939e93623c5623573d543.

The deadlock no longer exists after commit a214b949d8e3 ("blk-mq: only
flush requests from the plug in blk_mq_submit_bio"), and since bitmap io
is now handled asynchronously, the deadlock can't be triggered anymore.

Besides, for performance reasons, plugged bios should not be handled
asynchronously.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1.c | 2 +-
drivers/md/raid10.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7389e599f34e..91e1dbc48228 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1164,7 +1164,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r1conf *conf = mddev->private;
struct bio *bio;

- if (from_schedule || current->bio_list) {
+ if (from_schedule) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9f307ff5d4f6..d92b1efe9eee 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1101,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r10conf *conf = mddev->private;
struct bio *bio;

- if (from_schedule || current->bio_list) {
+ if (from_schedule) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
--
2.39.2

2023-04-20 11:35:09

by Yu Kuai

Subject: [PATCH -next 3/8] md/raid1-10: factor out a helper to add bio to plug

From: Yu Kuai <[email protected]>

The code in raid1 and raid10 is identical; factor out a helper in
preparation for limiting the number of plugged bios.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1-10.h | 16 ++++++++++++++++
drivers/md/raid1.c | 12 +-----------
drivers/md/raid10.c | 11 +----------
3 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid1-10.h b/drivers/md/raid1-10.h
index 04beef35142d..664646a3591a 100644
--- a/drivers/md/raid1-10.h
+++ b/drivers/md/raid1-10.h
@@ -112,4 +112,20 @@ static inline void md_bio_reset_resync_pages(struct bio *bio,
size -= len;
} while (idx++ < RESYNC_PAGES && size > 0);
}
+
+static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
+ blk_plug_cb_fn unplug)
+{
+ struct raid1_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb = blk_check_plugged(unplug, mddev,
+ sizeof(*plug));
+
+ if (!cb)
+ return false;
+
+ plug = container_of(cb, struct raid1_plug_cb, cb);
+ bio_list_add(&plug->pending, bio);
+
+ return true;
+}
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 84724b9b20b8..44c8d113621f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1341,8 +1341,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
struct md_rdev *blocked_rdev;
- struct blk_plug_cb *cb;
- struct raid1_plug_cb *plug = NULL;
int first_clone;
int max_sectors;
bool write_behind = false;
@@ -1571,15 +1569,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
-
- cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid1_plug_cb, cb);
- else
- plug = NULL;
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- } else {
+ if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 50d56b6af42f..d67c5672933c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1279,8 +1279,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
unsigned long flags;
- struct blk_plug_cb *cb;
- struct raid1_plug_cb *plug = NULL;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
int devnum = r10_bio->devs[n_copy].devnum;
@@ -1320,14 +1318,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,

atomic_inc(&r10_bio->remaining);

- cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid1_plug_cb, cb);
- else
- plug = NULL;
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- } else {
+ if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
--
2.39.2

2023-04-20 11:35:26

by Yu Kuai

Subject: [PATCH -next 6/8] md/md-bitmap: support to unplug bitmap asynchronously

From: Yu Kuai <[email protected]>

If a bitmap is enabled, the bitmap must be updated before submitting
write io; this is why the unplug callback must move these ios to
'conf->pending_bio_list' if 'current->bio_list' is not empty, which
causes performance degradation.

Add a new helper md_bitmap_unplug_async() to submit bitmap io from a
kworker, so that submitting bitmap io in raid10_unplug() doesn't require
'current->bio_list' to be empty.

This prepares for limiting the number of plugged bios.
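
For context, the mechanism below pairs an on-stack work item with a
completion: the bitmap write-out moves to a dedicated workqueue, but the
caller still waits for it to finish, so "asynchronously" here means "in
another context" rather than "fire and forget". A distilled sketch of the
caller-side pattern (names taken from the diff below; not meant to build
on its own):

DECLARE_COMPLETION_ONSTACK(done);
struct bitmap_unplug_work unplug_work;

/* hand the bitmap and the completion to the worker */
INIT_WORK(&unplug_work.work, md_bitmap_unplug_fn);
unplug_work.bitmap = bitmap;
unplug_work.done = &done;

/* write out dirty bitmap pages from the dedicated workqueue ... */
queue_work(bitmap->unplug_wq, &unplug_work.work);
/* ... and block until md_bitmap_unplug_fn() calls complete(&done) */
wait_for_completion(&done);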

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md-bitmap.c | 59 +++++++++++++++++++++++++++++++++++++++---
drivers/md/md-bitmap.h | 3 +++
drivers/md/raid1.c | 3 ++-
drivers/md/raid10.c | 2 +-
4 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 4bd980b272ef..da8ad2e95e88 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1000,10 +1000,18 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
return set;
}

-/* this gets called when the md device is ready to unplug its underlying
+struct bitmap_unplug_work {
+ struct work_struct work;
+ struct bitmap *bitmap;
+ struct completion *done;
+};
+
+/*
+ * This gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
- * sync the dirty pages of the bitmap file to disk */
-void md_bitmap_unplug(struct bitmap *bitmap)
+ * sync the dirty pages of the bitmap file to disk.
+ */
+static void md_do_bitmap_unplug(struct bitmap *bitmap)
{
unsigned long i;
int dirty, need_write;
@@ -1035,9 +1043,45 @@ void md_bitmap_unplug(struct bitmap *bitmap)

if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
md_bitmap_file_kick(bitmap);
+
+}
+static void md_bitmap_unplug_fn(struct work_struct *work)
+{
+ struct bitmap_unplug_work *unplug_work =
+ container_of(work, struct bitmap_unplug_work, work);
+
+ md_do_bitmap_unplug(unplug_work->bitmap);
+ complete(unplug_work->done);
+}
+
+static void __md_bitmap_unplug(struct bitmap *bitmap, bool async)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct bitmap_unplug_work unplug_work;
+
+ if (!async)
+ return md_do_bitmap_unplug(bitmap);
+
+ INIT_WORK(&unplug_work.work, md_bitmap_unplug_fn);
+ unplug_work.bitmap = bitmap;
+ unplug_work.done = &done;
+
+ queue_work(bitmap->unplug_wq, &unplug_work.work);
+ wait_for_completion(&done);
+}
+
+void md_bitmap_unplug(struct bitmap *bitmap)
+{
+ return __md_bitmap_unplug(bitmap, false);
}
EXPORT_SYMBOL(md_bitmap_unplug);

+void md_bitmap_unplug_async(struct bitmap *bitmap)
+{
+ return __md_bitmap_unplug(bitmap, true);
+}
+EXPORT_SYMBOL(md_bitmap_unplug_async);
+
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
* the in-memory bitmap from the on-disk bitmap -- also, sets up the
@@ -1753,6 +1797,9 @@ void md_bitmap_free(struct bitmap *bitmap)
if (!bitmap) /* there was no bitmap */
return;

+ if (bitmap->unplug_wq)
+ destroy_workqueue(bitmap->unplug_wq);
+
if (bitmap->sysfs_can_clear)
sysfs_put(bitmap->sysfs_can_clear);

@@ -1843,6 +1890,12 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
if (!bitmap)
return ERR_PTR(-ENOMEM);

+ bitmap->unplug_wq = create_workqueue("md_bitmap");
+ if (!bitmap->unplug_wq) {
+ err = -ENOMEM;
+ goto error;
+ }
+
spin_lock_init(&bitmap->counts.lock);
atomic_set(&bitmap->pending_writes, 0);
init_waitqueue_head(&bitmap->write_wait);
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 3a4750952b3a..55531669db24 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -231,6 +231,8 @@ struct bitmap {

struct kernfs_node *sysfs_can_clear;
int cluster_slot; /* Slot offset for clustered env */
+
+ struct workqueue_struct *unplug_wq;
};

/* the bitmap API */
@@ -264,6 +266,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev,
sector_t new_lo, sector_t new_hi);

void md_bitmap_unplug(struct bitmap *bitmap);
+void md_bitmap_unplug_async(struct bitmap *bitmap);
void md_bitmap_daemon_work(struct mddev *mddev);

int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c068ed3e6c96..7389e599f34e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -792,7 +792,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
- md_bitmap_unplug(conf->mddev->bitmap);
wake_up(&conf->wait_barrier);

while (bio) { /* submit pending writes */
@@ -829,6 +828,7 @@ static void flush_pending_writes(struct r1conf *conf)
*/
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
+ md_bitmap_unplug(conf->mddev->bitmap);
flush_bio_list(conf, bio);
blk_finish_plug(&plug);
} else
@@ -1176,6 +1176,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)

/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
+ md_bitmap_unplug_async(conf->mddev->bitmap);
flush_bio_list(conf, bio);
kfree(plug);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fd625026c97b..9f307ff5d4f6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1113,7 +1113,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)

/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- md_bitmap_unplug(mddev->bitmap);
+ md_bitmap_unplug_async(mddev->bitmap);
wake_up(&conf->wait_barrier);

while (bio) { /* submit pending writes */
--
2.39.2

2023-04-20 11:35:42

by Yu Kuai

Subject: [PATCH -next 8/8] md/raid1-10: limit the number of plugged bio

From: Yu Kuai <[email protected]>

bios can be added to the plug indefinitely, and the following writeback
test can trigger a huge number of plugged bios:

Test script:
modprobe brd rd_nr=4 rd_size=10485760
mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
echo 0 > /proc/sys/vm/dirty_background_ratio
echo 60 > /proc/sys/vm/dirty_ratio
fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k -numjobs=1 -iodepth=128 -name=xxx

Test result:
Monitoring /sys/block/md0/inflight shows that the inflight count keeps
increasing until fio finishes writing; after running for about 2 minutes:

[root@fedora ~]# cat /sys/block/md0/inflight
0 4474191

Fix the problem by limiting the number of plugged bios based on the
number of copies for the original bio.
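
As a rough illustration of the resulting bound, here is a stand-alone
user-space sketch (not part of the patch): it only mirrors the
"++plug->count / MAX_PLUG_BIO >= copies" check added below, assumes
MAX_PLUG_BIO stays at 32, and uses a hypothetical plug_full() helper name.

#include <stdio.h>

#define MAX_PLUG_BIO 32	/* same value as in the patch below */

/* mirrors the flush condition in md_add_bio_to_plug() */
static int plug_full(unsigned int count, int copies)
{
	return count / MAX_PLUG_BIO >= copies;
}

int main(void)
{
	unsigned int count = 0;
	int copies = 2;	/* e.g. a 4-disk raid10 with 2 copies */

	while (!plug_full(++count, copies))
		;
	/* prints 64: the plug is flushed after MAX_PLUG_BIO * copies bios */
	printf("plug flushed after %u plugged bios\n", count);
	return 0;
}

So a two-copy array keeps at most about 64 writes in a single plug instead
of letting the inflight count grow unbounded as in the test result above.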

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1-10.h | 9 ++++++++-
drivers/md/raid1.c | 2 +-
drivers/md/raid10.c | 2 +-
3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1-10.h b/drivers/md/raid1-10.h
index 95b2fb4dd9aa..2785ae805953 100644
--- a/drivers/md/raid1-10.h
+++ b/drivers/md/raid1-10.h
@@ -33,9 +33,12 @@ struct resync_pages {
struct page *pages[RESYNC_PAGES];
};

+#define MAX_PLUG_BIO 32
+
struct raid1_plug_cb {
struct blk_plug_cb cb;
struct bio_list pending;
+ unsigned int count;
};

static inline void rbio_pool_free(void *rbio, void *data)
@@ -132,7 +135,7 @@ static inline void md_submit_write(struct bio *bio)
}

static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
- blk_plug_cb_fn unplug)
+ blk_plug_cb_fn unplug, int copies)
{
struct raid1_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
@@ -152,6 +155,10 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,

plug = container_of(cb, struct raid1_plug_cb, cb);
bio_list_add(&plug->pending, bio);
+ if (++plug->count / MAX_PLUG_BIO >= copies) {
+ list_del(&cb->list);
+ cb->callback(cb, false);
+ }

return true;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 91e1dbc48228..6a38104a7b89 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1561,7 +1561,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
- if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
+ if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d92b1efe9eee..721d50646043 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1300,7 +1300,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,

atomic_inc(&r10_bio->remaining);

- if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
+ if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
--
2.39.2

2023-04-20 11:36:02

by Yu Kuai

Subject: [PATCH -next 5/8] md/raid1-10: submit write io directly if bitmap is not enabled

From: Yu Kuai <[email protected]>

Commit 6cce3b23f6f8 ("[PATCH] md: write intent bitmap support for raid10")
added bitmap support and changed write io to be submitted through the
daemon thread, because the bitmap needs to be updated before the write io.
Later, a plug was used to fix the resulting performance regression, since
routing all write io through the daemon thread means io can't be issued
concurrently.

However, if the bitmap is not enabled, the write io should not go through
the daemon thread in the first place, and the plug is not needed either.

Fixes: 6cce3b23f6f8 ("[PATCH] md: write intent bitmap support for raid10")
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md-bitmap.c | 4 +---
drivers/md/md-bitmap.h | 7 +++++++
drivers/md/raid1-10.h | 15 +++++++++++++--
3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ab27f66dbb1f..4bd980b272ef 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1000,7 +1000,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
return set;
}

-
/* this gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
* sync the dirty pages of the bitmap file to disk */
@@ -1010,8 +1009,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
int dirty, need_write;
int writing = 0;

- if (!bitmap || !bitmap->storage.filemap ||
- test_bit(BITMAP_STALE, &bitmap->flags))
+ if (!md_bitmap_enabled(bitmap))
return;

/* look at each page to see if there are any set bits that need to be
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index cfd7395de8fd..3a4750952b3a 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -273,6 +273,13 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
void md_bitmap_free(struct bitmap *bitmap);
void md_bitmap_wait_behind_writes(struct mddev *mddev);
+
+static inline bool md_bitmap_enabled(struct bitmap *bitmap)
+{
+ return bitmap && bitmap->storage.filemap &&
+ !test_bit(BITMAP_STALE, &bitmap->flags);
+}
+
#endif

#endif
diff --git a/drivers/md/raid1-10.h b/drivers/md/raid1-10.h
index 9dc53d8a8129..95b2fb4dd9aa 100644
--- a/drivers/md/raid1-10.h
+++ b/drivers/md/raid1-10.h
@@ -2,6 +2,8 @@
#ifndef _RAID1_10_H
#define _RAID1_10_H

+#include "md-bitmap.h"
+
/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
@@ -133,9 +135,18 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
blk_plug_cb_fn unplug)
{
struct raid1_plug_cb *plug = NULL;
- struct blk_plug_cb *cb = blk_check_plugged(unplug, mddev,
- sizeof(*plug));
+ struct blk_plug_cb *cb;
+
+ /*
+ * If bitmap is not enabled, it's safe to submit the io directly, and
+ * this can get optimal performance.
+ */
+ if (!md_bitmap_enabled(mddev->bitmap)) {
+ md_submit_write(bio);
+ return true;
+ }

+ cb = blk_check_plugged(unplug, mddev, sizeof(*plug));
if (!cb)
return false;

--
2.39.2

2023-04-25 00:01:24

by Song Liu

Subject: Re: [PATCH -next 1/8] md/raid10: prevent soft lockup while flush writes

On Thu, Apr 20, 2023 at 4:31 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Currently, there is no limit for raid1/raid10 plugged bio. While flushing
> writes, raid1 has cond_resched() while raid10 doesn't, and too many
> writes can cause soft lockup.
>
> Follow up soft lockup can be triggered easily with writeback test for
> raid10 with ramdisks:
>
> watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293]
> Call Trace:
> <TASK>
> call_rcu+0x16/0x20
> put_object+0x41/0x80
> __delete_object+0x50/0x90
> delete_object_full+0x2b/0x40
> kmemleak_free+0x46/0xa0
> slab_free_freelist_hook.constprop.0+0xed/0x1a0
> kmem_cache_free+0xfd/0x300
> mempool_free_slab+0x1f/0x30
> mempool_free+0x3a/0x100
> bio_free+0x59/0x80
> bio_put+0xcf/0x2c0
> free_r10bio+0xbf/0xf0
> raid_end_bio_io+0x78/0xb0
> one_write_done+0x8a/0xa0
> raid10_end_write_request+0x1b4/0x430
> bio_endio+0x175/0x320
> brd_submit_bio+0x3b9/0x9b7 [brd]
> __submit_bio+0x69/0xe0
> submit_bio_noacct_nocheck+0x1e6/0x5a0
> submit_bio_noacct+0x38c/0x7e0
> flush_pending_writes+0xf0/0x240
> raid10d+0xac/0x1ed0
>
> This patch fix the problem by adding cond_resched() to raid10 like what
> raid1 did.

nit: per submitting-patches.rst:

Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
to do frotz", as if you are giving orders to the codebase to change
its behaviour.

>
> Note that unlimited plugged bio still need to be optimized because in
> the case of writeback lots of dirty pages, this will take lots of memory
> and io latecy is quite bad.

typo: latency.

>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid10.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 6590aa49598c..a116b7c9d9f3 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -921,6 +921,7 @@ static void flush_pending_writes(struct r10conf *conf)
> else
> submit_bio_noacct(bio);
> bio = next;
> + cond_resched();
> }
> blk_finish_plug(&plug);
> } else
> @@ -1140,6 +1141,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
> else
> submit_bio_noacct(bio);
> bio = next;
> + cond_resched();
> }
> kfree(plug);
> }
> --
> 2.39.2
>

2023-04-25 00:11:06

by Song Liu

Subject: Re: [PATCH -next 2/8] md/raid1-10: rename raid1-10.c to raid1-10.h

On Thu, Apr 20, 2023 at 4:31 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> raid1-10.c contains definitions that are used both for raid1 and raid10,
> it's werid to use ".c" suffix.

typo: weird.

Please see the original discussion about raid1-10.c here:

https://lore.kernel.org/linux-raid/[email protected]/

Let's keep raid1-10.c for now.

Thanks,
Song

>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/{raid1-10.c => raid1-10.h} | 10 +++++++---
> drivers/md/raid1.c | 2 --
> drivers/md/raid1.h | 2 ++
> drivers/md/raid10.c | 2 --
> drivers/md/raid10.h | 2 ++
> 5 files changed, 11 insertions(+), 7 deletions(-)
> rename drivers/md/{raid1-10.c => raid1-10.h} (92%)
>
> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.h
> similarity index 92%
> rename from drivers/md/raid1-10.c
> rename to drivers/md/raid1-10.h
> index e61f6cad4e08..04beef35142d 100644
> --- a/drivers/md/raid1-10.c
> +++ b/drivers/md/raid1-10.h
> @@ -1,4 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0
> +#ifndef _RAID1_10_H
> +#define _RAID1_10_H
> +
> /* Maximum size of each resync request */
> #define RESYNC_BLOCK_SIZE (64*1024)
> #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
> @@ -33,7 +36,7 @@ struct raid1_plug_cb {
> struct bio_list pending;
> };
>
> -static void rbio_pool_free(void *rbio, void *data)
> +static inline void rbio_pool_free(void *rbio, void *data)
> {
> kfree(rbio);
> }
> @@ -91,8 +94,8 @@ static inline struct resync_pages *get_resync_pages(struct bio *bio)
> }
>
> /* generally called after bio_reset() for reseting bvec */
> -static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
> - int size)
> +static inline void md_bio_reset_resync_pages(struct bio *bio,
> + struct resync_pages *rp, int size)
> {
> int idx = 0;
>
> @@ -109,3 +112,4 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
> size -= len;
> } while (idx++ < RESYNC_PAGES && size > 0);
> }
> +#endif
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 2f1011ffdf09..84724b9b20b8 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -49,8 +49,6 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
> #define raid1_log(md, fmt, args...) \
> do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
>
> -#include "raid1-10.c"
> -
> #define START(node) ((node)->start)
> #define LAST(node) ((node)->last)
> INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
> index 468f189da7a0..80de4d66f010 100644
> --- a/drivers/md/raid1.h
> +++ b/drivers/md/raid1.h
> @@ -2,6 +2,8 @@
> #ifndef _RAID1_H
> #define _RAID1_H
>
> +#include "raid1-10.h"
> +
> /*
> * each barrier unit size is 64MB fow now
> * note: it must be larger than RESYNC_DEPTH
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index a116b7c9d9f3..50d56b6af42f 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -77,8 +77,6 @@ static void end_reshape(struct r10conf *conf);
> #define raid10_log(md, fmt, args...) \
> do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
>
> -#include "raid1-10.c"
> -
> #define NULL_CMD
> #define cmd_before(conf, cmd) \
> do { \
> diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
> index 63e48b11b552..63e88dd774f7 100644
> --- a/drivers/md/raid10.h
> +++ b/drivers/md/raid10.h
> @@ -2,6 +2,8 @@
> #ifndef _RAID10_H
> #define _RAID10_H
>
> +#include "raid1-10.h"
> +
> /* Note: raid10_info.rdev can be set to NULL asynchronously by
> * raid10_remove_disk.
> * There are three safe ways to access raid10_info.rdev.
> --
> 2.39.2
>

2023-04-25 00:26:11

by Song Liu

Subject: Re: [PATCH -next 1/8] md/raid10: prevent soft lockup while flush writes

On Thu, Apr 20, 2023 at 4:31 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Currently, there is no limit for raid1/raid10 plugged bio. While flushing
> writes, raid1 has cond_resched() while raid10 doesn't, and too many
> writes can cause soft lockup.
>
> Follow up soft lockup can be triggered easily with writeback test for
> raid10 with ramdisks:
>
> watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293]
> Call Trace:
> <TASK>
> call_rcu+0x16/0x20
> put_object+0x41/0x80
> __delete_object+0x50/0x90
> delete_object_full+0x2b/0x40
> kmemleak_free+0x46/0xa0
> slab_free_freelist_hook.constprop.0+0xed/0x1a0
> kmem_cache_free+0xfd/0x300
> mempool_free_slab+0x1f/0x30
> mempool_free+0x3a/0x100
> bio_free+0x59/0x80
> bio_put+0xcf/0x2c0
> free_r10bio+0xbf/0xf0
> raid_end_bio_io+0x78/0xb0
> one_write_done+0x8a/0xa0
> raid10_end_write_request+0x1b4/0x430
> bio_endio+0x175/0x320
> brd_submit_bio+0x3b9/0x9b7 [brd]
> __submit_bio+0x69/0xe0
> submit_bio_noacct_nocheck+0x1e6/0x5a0
> submit_bio_noacct+0x38c/0x7e0
> flush_pending_writes+0xf0/0x240
> raid10d+0xac/0x1ed0

Is it possible to trigger this with a mdadm test?

Thanks,
Song

>
> This patch fix the problem by adding cond_resched() to raid10 like what
> raid1 did.
>
> Note that unlimited plugged bio still need to be optimized because in
> the case of writeback lots of dirty pages, this will take lots of memory
> and io latecy is quite bad.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid10.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 6590aa49598c..a116b7c9d9f3 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -921,6 +921,7 @@ static void flush_pending_writes(struct r10conf *conf)
> else
> submit_bio_noacct(bio);
> bio = next;
> + cond_resched();
> }
> blk_finish_plug(&plug);
> } else
> @@ -1140,6 +1141,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
> else
> submit_bio_noacct(bio);
> bio = next;
> + cond_resched();
> }
> kfree(plug);
> }
> --
> 2.39.2
>

2023-04-25 06:33:17

by Yu Kuai

Subject: Re: [PATCH -next 1/8] md/raid10: prevent soft lockup while flush writes

Hi,

在 2023/04/25 8:23, Song Liu 写道:
> On Thu, Apr 20, 2023 at 4:31 AM Yu Kuai <[email protected]> wrote:
>>
>> From: Yu Kuai <[email protected]>
>>
>> Currently, there is no limit for raid1/raid10 plugged bio. While flushing
>> writes, raid1 has cond_resched() while raid10 doesn't, and too many
>> writes can cause soft lockup.
>>
>> Follow up soft lockup can be triggered easily with writeback test for
>> raid10 with ramdisks:
>>
>> watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293]
>> Call Trace:
>> <TASK>
>> call_rcu+0x16/0x20
>> put_object+0x41/0x80
>> __delete_object+0x50/0x90
>> delete_object_full+0x2b/0x40
>> kmemleak_free+0x46/0xa0
>> slab_free_freelist_hook.constprop.0+0xed/0x1a0
>> kmem_cache_free+0xfd/0x300
>> mempool_free_slab+0x1f/0x30
>> mempool_free+0x3a/0x100
>> bio_free+0x59/0x80
>> bio_put+0xcf/0x2c0
>> free_r10bio+0xbf/0xf0
>> raid_end_bio_io+0x78/0xb0
>> one_write_done+0x8a/0xa0
>> raid10_end_write_request+0x1b4/0x430
>> bio_endio+0x175/0x320
>> brd_submit_bio+0x3b9/0x9b7 [brd]
>> __submit_bio+0x69/0xe0
>> submit_bio_noacct_nocheck+0x1e6/0x5a0
>> submit_bio_noacct+0x38c/0x7e0
>> flush_pending_writes+0xf0/0x240
>> raid10d+0xac/0x1ed0
>
> Is it possible to trigger this with a mdadm test?
>

The test I mentioned in patch 8 can trigger this problem reliably, so I
think adding a new test can achieve this.

Thanks,
Kuai
> Thanks,
> Song
>
>>
>> This patch fix the problem by adding cond_resched() to raid10 like what
>> raid1 did.
>>
>> Note that unlimited plugged bio still need to be optimized because in
>> the case of writeback lots of dirty pages, this will take lots of memory
>> and io latecy is quite bad.
>>
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>> drivers/md/raid10.c | 2 ++
>> 1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index 6590aa49598c..a116b7c9d9f3 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -921,6 +921,7 @@ static void flush_pending_writes(struct r10conf *conf)
>> else
>> submit_bio_noacct(bio);
>> bio = next;
>> + cond_resched();
>> }
>> blk_finish_plug(&plug);
>> } else
>> @@ -1140,6 +1141,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>> else
>> submit_bio_noacct(bio);
>> bio = next;
>> + cond_resched();
>> }
>> kfree(plug);
>> }
>> --
>> 2.39.2
>>
> .
>

2023-04-25 06:50:46

by Song Liu

Subject: Re: [PATCH -next 1/8] md/raid10: prevent soft lockup while flush writes

On Mon, Apr 24, 2023 at 11:16 PM Yu Kuai <[email protected]> wrote:
>
> Hi,
>
> 在 2023/04/25 8:23, Song Liu 写道:
> > On Thu, Apr 20, 2023 at 4:31 AM Yu Kuai <[email protected]> wrote:
> >>
> >> From: Yu Kuai <[email protected]>
> >>
> >> Currently, there is no limit for raid1/raid10 plugged bio. While flushing
> >> writes, raid1 has cond_resched() while raid10 doesn't, and too many
> >> writes can cause soft lockup.
> >>
> >> Follow up soft lockup can be triggered easily with writeback test for
> >> raid10 with ramdisks:
> >>
> >> watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293]
> >> Call Trace:
> >> <TASK>
> >> call_rcu+0x16/0x20
> >> put_object+0x41/0x80
> >> __delete_object+0x50/0x90
> >> delete_object_full+0x2b/0x40
> >> kmemleak_free+0x46/0xa0
> >> slab_free_freelist_hook.constprop.0+0xed/0x1a0
> >> kmem_cache_free+0xfd/0x300
> >> mempool_free_slab+0x1f/0x30
> >> mempool_free+0x3a/0x100
> >> bio_free+0x59/0x80
> >> bio_put+0xcf/0x2c0
> >> free_r10bio+0xbf/0xf0
> >> raid_end_bio_io+0x78/0xb0
> >> one_write_done+0x8a/0xa0
> >> raid10_end_write_request+0x1b4/0x430
> >> bio_endio+0x175/0x320
> >> brd_submit_bio+0x3b9/0x9b7 [brd]
> >> __submit_bio+0x69/0xe0
> >> submit_bio_noacct_nocheck+0x1e6/0x5a0
> >> submit_bio_noacct+0x38c/0x7e0
> >> flush_pending_writes+0xf0/0x240
> >> raid10d+0xac/0x1ed0
> >
> > Is it possible to trigger this with a mdadm test?
> >
>
> The test I mentioned in patch 8 can trigger this problem reliablity, so
> I this add a new test can achieve this.

To be clear, by "mdadm test" I mean the tests included in mdadm:

https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git/tree/tests

Could you please try to add a test? If it works, we should add it to
mdadm.

Thanks,
Song

2023-04-25 06:55:40

by Yu Kuai

Subject: Re: [PATCH -next 1/8] md/raid10: prevent soft lockup while flush writes

Hi,

在 2023/04/25 14:39, Song Liu 写道:
> On Mon, Apr 24, 2023 at 11:16 PM Yu Kuai <[email protected]> wrote:
>>
>> Hi,
>>
>> 在 2023/04/25 8:23, Song Liu 写道:
>>> On Thu, Apr 20, 2023 at 4:31 AM Yu Kuai <[email protected]> wrote:
>>>>
>>>> From: Yu Kuai <[email protected]>
>>>>
>>>> Currently, there is no limit for raid1/raid10 plugged bio. While flushing
>>>> writes, raid1 has cond_resched() while raid10 doesn't, and too many
>>>> writes can cause soft lockup.
>>>>
>>>> Follow up soft lockup can be triggered easily with writeback test for
>>>> raid10 with ramdisks:
>>>>
>>>> watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293]
>>>> Call Trace:
>>>> <TASK>
>>>> call_rcu+0x16/0x20
>>>> put_object+0x41/0x80
>>>> __delete_object+0x50/0x90
>>>> delete_object_full+0x2b/0x40
>>>> kmemleak_free+0x46/0xa0
>>>> slab_free_freelist_hook.constprop.0+0xed/0x1a0
>>>> kmem_cache_free+0xfd/0x300
>>>> mempool_free_slab+0x1f/0x30
>>>> mempool_free+0x3a/0x100
>>>> bio_free+0x59/0x80
>>>> bio_put+0xcf/0x2c0
>>>> free_r10bio+0xbf/0xf0
>>>> raid_end_bio_io+0x78/0xb0
>>>> one_write_done+0x8a/0xa0
>>>> raid10_end_write_request+0x1b4/0x430
>>>> bio_endio+0x175/0x320
>>>> brd_submit_bio+0x3b9/0x9b7 [brd]
>>>> __submit_bio+0x69/0xe0
>>>> submit_bio_noacct_nocheck+0x1e6/0x5a0
>>>> submit_bio_noacct+0x38c/0x7e0
>>>> flush_pending_writes+0xf0/0x240
>>>> raid10d+0xac/0x1ed0
>>>
>>> Is it possible to trigger this with a mdadm test?
>>>
>>
>> The test I mentioned in patch 8 can trigger this problem reliablity, so
>> I this add a new test can achieve this.
>
> To be clear, by "mdadm test" I mean the tests included in mdadm:
>
> https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git/tree/tests
>
> Could you please try to add a test? If it works, we should add it to
> mdadm.

Yes, of course. However, I'm not familiar with how the mdadm tests work
yet, so it might take some time. By the way, it would be good if I could
add the test to blktests, if possible.

Thanks,
Kuai
>
> Thanks,
> Song
> .
>