2020-06-29 19:47:36

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 18/20] block: refactor submit_bio_noacct

Split out a __submit_bio_noacct helper for the actual de-recursion
algorithm, and simplify the loop by using a continue when we can't
enter the queue for a bio.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-core.c | 131 +++++++++++++++++++++++++----------------------
1 file changed, 71 insertions(+), 60 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 1caeb01e127768..b82f48c86e6f7a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1085,6 +1085,74 @@ static blk_qc_t do_make_request(struct bio *bio)
return ret;
}

+/*
+ * The loop in this function may be a bit non-obvious, and so deserves some
+ * explanation:
+ *
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
+ * that), so we have a list with a single bio.
+ * - We pretend that we have just taken it off a longer list, so we assign
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
+ * non-NULL value in bio_list and re-enter the loop from the top.
+ * - In this case we really did just take the bio off the top of the list (no
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
+ * again.
+ *
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
+ * bio_list_on_stack[1] contains bios that were submitted before the current
+ * ->submit_bio, but that haven't been processed yet.
+ */
+static blk_qc_t __submit_bio_noacct(struct bio *bio)
+{
+ struct bio_list bio_list_on_stack[2];
+ blk_qc_t ret = BLK_QC_T_NONE;
+
+ BUG_ON(bio->bi_next);
+
+ bio_list_init(&bio_list_on_stack[0]);
+ current->bio_list = bio_list_on_stack;
+
+ do {
+ struct request_queue *q = bio->bi_disk->queue;
+ struct bio_list lower, same;
+
+ if (unlikely(bio_queue_enter(bio) != 0))
+ continue;
+
+ /*
+ * Create a fresh bio_list for all subordinate requests.
+ */
+ bio_list_on_stack[1] = bio_list_on_stack[0];
+ bio_list_init(&bio_list_on_stack[0]);
+
+ ret = do_make_request(bio);
+
+ /*
+ * Sort new bios into those for a lower level and those for the
+ * same level.
+ */
+ bio_list_init(&lower);
+ bio_list_init(&same);
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
+ if (q == bio->bi_disk->queue)
+ bio_list_add(&same, bio);
+ else
+ bio_list_add(&lower, bio);
+
+ /*
+ * Now assemble so we handle the lowest level first.
+ */
+ bio_list_merge(&bio_list_on_stack[0], &lower);
+ bio_list_merge(&bio_list_on_stack[0], &same);
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+
+ current->bio_list = NULL;
+ return ret;
+}
+
/**
* submit_bio_noacct - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
@@ -1096,17 +1164,8 @@ static blk_qc_t do_make_request(struct bio *bio)
*/
blk_qc_t submit_bio_noacct(struct bio *bio)
{
- /*
- * bio_list_on_stack[0] contains bios submitted by the current
- * ->submit_bio.
- * bio_list_on_stack[1] contains bios that were submitted before the
- * current ->submit_bio_bio, but that haven't been processed yet.
- */
- struct bio_list bio_list_on_stack[2];
- blk_qc_t ret = BLK_QC_T_NONE;
-
if (!submit_bio_checks(bio))
- goto out;
+ return BLK_QC_T_NONE;

/*
* We only want one ->submit_bio to be active at a time, else
@@ -1120,58 +1179,10 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
*/
if (current->bio_list) {
bio_list_add(&current->bio_list[0], bio);
- goto out;
+ return BLK_QC_T_NONE;
}

- /* following loop may be a bit non-obvious, and so deserves some
- * explanation.
- * Before entering the loop, bio->bi_next is NULL (as all callers
- * ensure that) so we have a list with a single bio.
- * We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
- * thus initialising the bio_list of new bios to be
- * added. ->submit_bio() may indeed add some more bios
- * through a recursive call to submit_bio_noacct. If it
- * did, we find a non-NULL value in bio_list and re-enter the loop
- * from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so remove it from
- * bio_list, and call into ->submit_bio() again.
- */
- BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack[0]);
- current->bio_list = bio_list_on_stack;
- do {
- struct request_queue *q = bio->bi_disk->queue;
-
- if (likely(bio_queue_enter(bio) == 0)) {
- struct bio_list lower, same;
-
- /* Create a fresh bio_list for all subordinate requests */
- bio_list_on_stack[1] = bio_list_on_stack[0];
- bio_list_init(&bio_list_on_stack[0]);
- ret = do_make_request(bio);
-
- /* sort new bios into those for a lower level
- * and those for the same level
- */
- bio_list_init(&lower);
- bio_list_init(&same);
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bio->bi_disk->queue)
- bio_list_add(&same, bio);
- else
- bio_list_add(&lower, bio);
- /* now assemble so we handle the lowest level first */
- bio_list_merge(&bio_list_on_stack[0], &lower);
- bio_list_merge(&bio_list_on_stack[0], &same);
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- }
- bio = bio_list_pop(&bio_list_on_stack[0]);
- } while (bio);
- current->bio_list = NULL; /* deactivate */
-
-out:
- return ret;
+ return __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);

--
2.26.2


2020-07-02 14:10:51

by Qian Cai

[permalink] [raw]
Subject: Re: [PATCH 18/20] block: refactor submit_bio_noacct

On Mon, Jun 29, 2020 at 09:39:45PM +0200, Christoph Hellwig wrote:
> Split out a __submit_bio_noacct helper for the actual de-recursion
> algorithm, and simplify the loop by using a continue when we can't
> enter the queue for a bio.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Reverting this commit and its dependencies,

5a6c35f9af41 block: remove direct_make_request
ff93ea0ce763 block: shortcut __submit_bio_noacct for blk-mq drivers

fixed the stack-out-of-bounds during boot,

https://lore.kernel.org/linux-block/[email protected]/

[ 55.573431][ T1373] BUG: KASAN: stack-out-of-bounds in bio_alloc_bioset+0x493/0x4a0
bio_alloc_bioset+0x493/0x4a0:
bio_list_empty at include/linux/bio.h:561
(inlined by) bio_alloc_bioset at block/bio.c:482
[ 55.581140][ T1373] Read of size 8 at addr ffffc9000a7df1e0 by task mount/1373
[ 55.588409][ T1373]
[ 55.590615][ T1373] CPU: 2 PID: 1373 Comm: mount Not tainted 5.8.0-rc3-next-20200702 #2
[ 55.598672][ T1373] Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019
[ 55.607972][ T1373] Call Trace:
[ 55.607980][ T1373] dump_stack+0x9d/0xe0
[ 55.607984][ T1373] ? bio_alloc_bioset+0x493/0x4a0
[ 55.607992][ T1373] ? bio_alloc_bioset+0x493/0x4a0
[ 55.625007][ T1373] print_address_description.constprop.8.cold.10+0x56/0x44e
[ 55.632191][ T1373] ? bio_alloc_bioset+0x493/0x4a0
[ 55.637100][ T1373] ? bio_alloc_bioset+0x493/0x4a0
[ 55.642011][ T1373] kasan_report.cold.11+0x37/0x7c
[ 55.646923][ T1373] ? bio_alloc_bioset+0x493/0x4a0
[ 55.651968][ T1373] bio_alloc_bioset+0x493/0x4a0
[ 55.651971][ T1373] ? bvec_alloc+0x290/0x290
[ 55.651975][ T1373] ? mark_lock+0x147/0x1800
[ 55.651978][ T1373] ? mark_lock+0x147/0x1800
[ 55.651981][ T1373] bio_clone_fast+0xe/0x30
[ 55.651983][ T1373] bio_split+0x8a/0x4c0
[ 55.651986][ T1373] ? print_irqtrace_events+0x270/0x270
[ 55.651990][ T1373] __blk_queue_split+0xc42/0x13e0
[ 55.651998][ T1373] ? __lock_acquire+0xc57/0x4da0
Startin[ 55.693322][ T1373] ? __blk_rq_map_sg+0x14c0/0x14c0
[ 55.699711][ T1373] ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 55.705602][ T1373] ? mark_held_locks+0xb0/0x110
[ 55.705605][ T1373] ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 55.705608][ T1373] ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 55.705611][ T1373] ? find_held_lock+0x33/0x1c0
[ 55.705614][ T1373] ? find_held_lock+0x33/0x1c0
[ 55.705618][ T1373] blk_mq_submit_bio+0x19e/0x1e20
[ 55.705621][ T1373] ? lock_downgrade+0x720/0x720
[ 55.705624][ T1373] ? blk_mq_try_issue_directly+0x140/0x140
[ 55.705628][ T1373] ? rcu_read_lock_sched_held+0xaa/0xd0
[ 55.705631][ T1373] ? rcu_read_lock_bh_held+0xc0/0xc0
[ 55.705635][ T1373] ? blk_queue_enter+0x83c/0x9a0
[ 55.705647][ T1373] ? submit_bio_checks+0x1cc0/0x1cc0
[ 55.767384][ T1373] submit_bio_noacct+0x9c0/0xeb0
[ 55.772212][ T1373] ? blk_queue_enter+0x9a0/0x9a0
[ 55.777038][ T1373] ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 55.782913][ T1373] ? trace_hardirqs_on+0x20/0x1b5
[ 55.787825][ T1373] ? submit_bio+0xe7/0x480
[ 55.792125][ T1373] submit_bio+0xe7/0x480
[ 55.796252][ T1373] ? bio_associate_blkg_from_css+0x4a3/0xd30
[ 55.802124][ T1373] ? submit_bio_noacct+0xeb0/0xeb0
[ 55.807124][ T1373] ? lock_downgrade+0x720/0x720
[ 55.811862][ T1373] ? rcu_read_unlock+0x50/0x50
[ 55.816512][ T1373] ? lockdep_init_map_waits+0x267/0x7b0
[ 55.821948][ T1373] ? lockdep_init_map_waits+0x267/0x7b0
g LVM event acti[ 55.827386][ T1373] ? __raw_spin_lock_init+0x34/0x100
[ 55.833957][ T1373] submit_bio_wait+0xf9/0x200
vation on device[ 55.838521][ T1373] ? submit_bio_wait_endio+0x30/0x30
[ 55.845091][ T1373] xfs_rw_bdev+0x3ca/0x4d0
[ 55.849396][ T1373] xlog_do_io+0x149/0x320
[ 55.853611][ T1373] xlog_bread+0x1e/0xb0
[ 55.857651][ T1373] xlog_find_verify_log_record+0xba/0x4c0
[ 55.863264][ T1373] ? xlog_header_check_mount+0xb0/0xb0
[ 55.868615][ T1373] xlog_find_zeroed+0x2bc/0x4c0
8:3...
[ 55.873356][ T1373] ? print_irqtrace_events+0x270/0x270
[ 55.880093][ T1373] ? xlog_find_verify_log_record+0x4c0/0x4c0
[ 55.885966][ T1373] ? __lock_acquire+0x1920/0x4da0
[ 55.890881][ T1373] xlog_find_head+0xd4/0x790
[ 55.895355][ T1373] ? xlog_find_zeroed+0x4c0/0x4c0
[ 55.900269][ T1373] ? rcu_read_lock_sched_held+0xaa/0xd0
[ 55.905708][ T1373] ? rcu_read_lock_bh_held+0xc0/0xc0
[ 55.910885][ T1373] ? sugov_update_single+0x18d/0x4f0
[ 55.916058][ T1373] xlog_find_tail+0xc2/0x810
[ 55.920534][ T1373] ? mark_lock+0x147/0x1800
[ 55.924921][ T1373] ? xlog_verify_head+0x4c0/0x4c0
[ 55.929834][ T1373] ? debug_show_held_locks+0x30/0x50
[ 55.935007][ T1373] ? print_irqtrace_events+0x270/0x270
[ 55.940358][ T1373] ? try_to_wake_up+0x6d1/0xf40
[ 55.945094][ T1373] ? mark_held_locks+0xb0/0x110
[ 55.949835][ T1373] ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 55.955708][ T1373] ? _raw_spin_unlock_irqrestore+0x39/0x40
[ 55.961410][ T1373] ? trace_hardirqs_on+0x20/0x1b5
[ 55.966324][ T1373] xlog_recover+0x7c/0x480
[ 55.970627][ T1373] ? xlog_buf_readahead+0x110/0x110
[ 55.975715][ T1373] ? migrate_swap_stop+0xbf0/0xbf0
[ 55.980718][ T1373] ? lockdep_init_map_waits+0x267/0x7b0
[ 55.986156][ T1373] ? __raw_spin_lock_init+0x34/0x100
[ 55.991333][ T1373] xfs_log_mount+0x541/0x660
[ 55.995809][ T1373] xfs_mountfs+0xccd/0x1a00
[ 56.000202][ T1373] ? queue_work_node+0x190/0x190
[ 56.005028][ T1373] ? rcu_read_lock_sched_held+0xaa/0xd0
[ 56.010466][ T1373] ? xfs_default_resblks+0x50/0x50
[ 56.015464][ T1373] ? xfs_filestream_get_parent+0xa0/0xa0
[ 56.020989][ T1373] ? init_timer_key+0x285/0x320
[ 56.025727][ T1373] ? lockdep_init_map_waits+0x267/0x7b0
[ 56.031165][ T1373] ? xfs_filestream_get_parent+0xa0/0xa0
[ 56.036689][ T1373] ? xfs_mru_cache_create+0x358/0x560
[ 56.041951][ T1373] xfs_fc_fill_super+0x6d3/0xd50
[ 56.046777][ T1373] get_tree_bdev+0x40a/0x690
[ 56.051257][ T1373] ? xfs_fs_inode_init_once+0xc0/0xc0
[ 56.056523][ T1373] vfs_get_tree+0x84/0x2c0
[ 56.060827][ T1373] do_mount+0xf93/0x1630
[ 56.064953][ T1373] ? rcu_read_lock_bh_held+0xc0/0xc0
[ 56.070129][ T1373] ? copy_mount_string+0x20/0x20
[ 56.074956][ T1373] ? _copy_from_user+0xbe/0x100
[ 56.079696][ T1373] ? memdup_user+0x4f/0x80
[ 56.083999][ T1373] __x64_sys_mount+0x15d/0x1b0
2m OK ] St[ 56.088654][ T1373] do_syscall_64+0x5f/0x310
[ 56.094437][ T1373] ? trace_hardirqs_off+0x12/0x1a0
[ 56.099439][ T1373] ? asm_exc_page_fault+0x8/0x30
[ 56.104267][ T1373] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 56.110055][ T1373] RIP: 0033:0x7f3bc2c8a9ee
[ 56.114357][ T1373] Code: Bad RIP value.
[ 56.118309][ T1373] RSP: 002b:00007fffd4675718 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
arted File Syste[ 56.126629][ T1373] RAX: ffffffffffffffda RBX: 000055a59d34c9c0 RCX: 00007f3bc2c8a9ee
[ 56.135900][ T1373] RDX: 000055a59d34cba0 RSI: 000055a59d34cc00 RDI: 000055a59d34e900
[ 56.143779][ T1373] RBP: 00007f3bc3a36184 R08: 0000000000000000 R09: 0000000000000003
[ 56.151661][ T1373] R10: 00000000c0ed0000 R11: 0000000000000246 R12: 0000000000000000
[ 56.159541][ T1373] R13: 00000000c0ed0000 R14: 000055a59d34e900 R15: 000055a59d34cba0
[ 56.167422][ T1373]
[ 56.169626][ T1373]
[ 56.171831][ T1373] addr ffffc9000a7df1e0 is located in stack of task mount/1373 at offset 48 in frame:
[ 56.181287][ T1373] submit_bio_noacct+0x0/0xeb0
submit_bio_noacct at block/blk-core.c:1198
[ 56.185939][ T1373]
[ 56.188144][ T1373] this frame has 2 objects:
m Check on /dev/[ 56.192532][ T1373] [32, 48) 'bio_list'
[ 56.192534][ T1373] [96, 128) 'bio_list_on_stack'
[ 56.197872][ T1373]
[ 56.204894][ T1373] Memory state around the buggy address:
[ 56.210420][ T1373] ffffc9000a7df080: f2 f2 f2 f2 f2 00 f2 f2 f2 f2 f2 f2 f2 00 00 00
[ 56.218389][ T1373] ffffc9000a7df100: 00 00 f2 f2 f2 00 00 00 00 00 00 00 00 00 00 00
disk/by-uuid/D10[ 56.226359][ T1373] >ffffc9000a7df180: 00 00 00 00 00 00 f1 f1 f1 f1 00 00 f2 f2 f2 f2
[ 56.235718][ T1373] ^
[ 56.242817][ T1373] ffffc9000a7df200: f2 f2 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00
[ 56.250790][ T1373] ffffc9000a7df280: 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f2
[ 56.258757][ T1373] ==================================================================

> ---
> block/blk-core.c | 131 +++++++++++++++++++++++++----------------------
> 1 file changed, 71 insertions(+), 60 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 1caeb01e127768..b82f48c86e6f7a 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1085,6 +1085,74 @@ static blk_qc_t do_make_request(struct bio *bio)
> return ret;
> }
>
> +/*
> + * The loop in this function may be a bit non-obvious, and so deserves some
> + * explanation:
> + *
> + * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
> + * that), so we have a list with a single bio.
> + * - We pretend that we have just taken it off a longer list, so we assign
> + * bio_list to a pointer to the bio_list_on_stack, thus initialising the
> + * bio_list of new bios to be added. ->submit_bio() may indeed add some more
> + * bios through a recursive call to submit_bio_noacct. If it did, we find a
> + * non-NULL value in bio_list and re-enter the loop from the top.
> + * - In this case we really did just take the bio of the top of the list (no
> + * pretending) and so remove it from bio_list, and call into ->submit_bio()
> + * again.
> + *
> + * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
> + * bio_list_on_stack[1] contains bios that were submitted before the current
> + * ->submit_bio_bio, but that haven't been processed yet.
> + */
> +static blk_qc_t __submit_bio_noacct(struct bio *bio)
> +{
> + struct bio_list bio_list_on_stack[2];
> + blk_qc_t ret = BLK_QC_T_NONE;
> +
> + BUG_ON(bio->bi_next);
> +
> + bio_list_init(&bio_list_on_stack[0]);
> + current->bio_list = bio_list_on_stack;
> +
> + do {
> + struct request_queue *q = bio->bi_disk->queue;
> + struct bio_list lower, same;
> +
> + if (unlikely(bio_queue_enter(bio) != 0))
> + continue;
> +
> + /*
> + * Create a fresh bio_list for all subordinate requests.
> + */
> + bio_list_on_stack[1] = bio_list_on_stack[0];
> + bio_list_init(&bio_list_on_stack[0]);
> +
> + ret = do_make_request(bio);
> +
> + /*
> + * Sort new bios into those for a lower level and those for the
> + * same level.
> + */
> + bio_list_init(&lower);
> + bio_list_init(&same);
> + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
> + if (q == bio->bi_disk->queue)
> + bio_list_add(&same, bio);
> + else
> + bio_list_add(&lower, bio);
> +
> + /*
> + * Now assemble so we handle the lowest level first.
> + */
> + bio_list_merge(&bio_list_on_stack[0], &lower);
> + bio_list_merge(&bio_list_on_stack[0], &same);
> + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
> + } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
> +
> + current->bio_list = NULL;
> + return ret;
> +}
> +
> /**
> * submit_bio_noacct - re-submit a bio to the block device layer for I/O
> * @bio: The bio describing the location in memory and on the device.
> @@ -1096,17 +1164,8 @@ static blk_qc_t do_make_request(struct bio *bio)
> */
> blk_qc_t submit_bio_noacct(struct bio *bio)
> {
> - /*
> - * bio_list_on_stack[0] contains bios submitted by the current
> - * ->submit_bio.
> - * bio_list_on_stack[1] contains bios that were submitted before the
> - * current ->submit_bio_bio, but that haven't been processed yet.
> - */
> - struct bio_list bio_list_on_stack[2];
> - blk_qc_t ret = BLK_QC_T_NONE;
> -
> if (!submit_bio_checks(bio))
> - goto out;
> + return BLK_QC_T_NONE;
>
> /*
> * We only want one ->submit_bio to be active at a time, else
> @@ -1120,58 +1179,10 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
> */
> if (current->bio_list) {
> bio_list_add(&current->bio_list[0], bio);
> - goto out;
> + return BLK_QC_T_NONE;
> }
>
> - /* following loop may be a bit non-obvious, and so deserves some
> - * explanation.
> - * Before entering the loop, bio->bi_next is NULL (as all callers
> - * ensure that) so we have a list with a single bio.
> - * We pretend that we have just taken it off a longer list, so
> - * we assign bio_list to a pointer to the bio_list_on_stack,
> - * thus initialising the bio_list of new bios to be
> - * added. ->submit_bio() may indeed add some more bios
> - * through a recursive call to submit_bio_noacct. If it
> - * did, we find a non-NULL value in bio_list and re-enter the loop
> - * from the top. In this case we really did just take the bio
> - * of the top of the list (no pretending) and so remove it from
> - * bio_list, and call into ->submit_bio() again.
> - */
> - BUG_ON(bio->bi_next);
> - bio_list_init(&bio_list_on_stack[0]);
> - current->bio_list = bio_list_on_stack;
> - do {
> - struct request_queue *q = bio->bi_disk->queue;
> -
> - if (likely(bio_queue_enter(bio) == 0)) {
> - struct bio_list lower, same;
> -
> - /* Create a fresh bio_list for all subordinate requests */
> - bio_list_on_stack[1] = bio_list_on_stack[0];
> - bio_list_init(&bio_list_on_stack[0]);
> - ret = do_make_request(bio);
> -
> - /* sort new bios into those for a lower level
> - * and those for the same level
> - */
> - bio_list_init(&lower);
> - bio_list_init(&same);
> - while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
> - if (q == bio->bi_disk->queue)
> - bio_list_add(&same, bio);
> - else
> - bio_list_add(&lower, bio);
> - /* now assemble so we handle the lowest level first */
> - bio_list_merge(&bio_list_on_stack[0], &lower);
> - bio_list_merge(&bio_list_on_stack[0], &same);
> - bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
> - }
> - bio = bio_list_pop(&bio_list_on_stack[0]);
> - } while (bio);
> - current->bio_list = NULL; /* deactivate */
> -
> -out:
> - return ret;
> + return __submit_bio_noacct(bio);
> }
> EXPORT_SYMBOL(submit_bio_noacct);
>
> --
> 2.26.2
>

2020-07-02 15:16:22

by Naresh Kamboju

[permalink] [raw]
Subject: Re: [PATCH 18/20] block: refactor submit_bio_noacct

On Thu, 2 Jul 2020 at 19:40, Qian Cai <[email protected]> wrote:
>
> On Mon, Jun 29, 2020 at 09:39:45PM +0200, Christoph Hellwig wrote:
> > Split out a __submit_bio_noacct helper for the actual de-recursion
> > algorithm, and simplify the loop by using a continue when we can't
> > enter the queue for a bio.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>

Kernel BUG on arm64 and x86_64 devices running linux-next 5.8.0-rc3-next-20200702
with KASAN enabled, seen while running mkfs -t ext4.

metadata:
git branch: master
git repo: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
git commit: d37d57041350dff35dd17cbdf9aef4011acada38
git describe: next-20200702
make_kernelversion: 5.8.0-rc3
kernel-config:
https://builds.tuxbuild.com/DnjQHvYrx586eUoFxtYZxQ/kernel.config

steps to reproduce:
# mkfs -t ext4 /dev/disk/by-id/ata-SanDisk_SDSSDA120G_165193445014


BUG: KASAN: stack-out-of-bounds in bio_alloc_bioset+0x28c/0x2c8
[ 59.398307] Read of size 8 at addr ffff0009084277e0 by task mkfs.ext4/417
[ 59.405121]
[ 59.406644] CPU: 5 PID: 417 Comm: mkfs.ext4 Not tainted
5.8.0-rc3-next-20200702 #1
[ 59.414248] Hardware name: ARM Juno development board (r2) (DT)
[ 59.420195] Call trace:
[ 59.422683] dump_backtrace+0x0/0x2b8
[ 59.426386] show_stack+0x18/0x28
[ 59.429741] dump_stack+0xec/0x144
[ 59.433183] print_address_description.isra.0+0x6c/0x448
[ 59.438531] kasan_report+0x134/0x200
[ 59.442226] __asan_load8+0x9c/0xd8
[ 59.445751] bio_alloc_bioset+0x28c/0x2c8
[ 59.449796] bio_clone_fast+0x28/0x98
[ 59.453492] bio_split+0x64/0x138
[ 59.456842] __blk_queue_split+0x534/0x698
[ 59.460979] blk_mq_submit_bio+0x10c/0x680
[ 59.465118] submit_bio_noacct+0x57c/0x640
[ 59.469253] submit_bio+0xc0/0x358
[ 59.472688] submit_bio_wait+0xc0/0x110
[ 59.476561] blkdev_issue_discard+0xd0/0x138
[ 59.480877] blk_ioctl_discard+0x1b8/0x238
[ 59.485008] blkdev_common_ioctl+0x594/0xd38
[ 59.489312] blkdev_ioctl+0x130/0x578
[ 59.493010] block_ioctl+0x78/0x98
[ 59.496453] ksys_ioctl+0xb8/0xf8
[ 59.499808] __arm64_sys_ioctl+0x44/0x60
[ 59.503781] el0_svc_common.constprop.0+0xa4/0x1e0
[ 59.508615] do_el0_svc+0x38/0xa0
[ 59.511967] el0_sync_handler+0x98/0x1a8
[ 59.515922] el0_sync+0x158/0x180
[ 59.519255]
[ 59.520761] The buggy address belongs to the page:
[ 59.525590] page:fffffe00240109c0 refcount:0 mapcount:0
mapping:0000000000000000 index:0x0
[ 59.533895] flags: 0x2ffff00000000000()
[ 59.537779] raw: 2ffff00000000000 0000000000000000 fffffe00240109c8
0000000000000000
[ 59.545575] raw: 0000000000000000 0000000000000000 00000000ffffffff
0000000000000000
[ 59.553352] page dumped because: kasan: bad access detected
[ 59.558947]
[ 59.560463] addr ffff0009084277e0 is located in stack of task
mkfs.ext4/417 at offset 48 in frame:
[ 59.569475] submit_bio_noacct+0x0/0x640
[ 59.573423]
[ 59.574930] this frame has 2 objects:
[ 59.578624] [32, 48) 'bio_list'
[ 59.578644] [64, 96) 'bio_list_on_stack'
[ 59.581889]
[ 59.587412] Memory state around the buggy address:
[ 59.592243] ffff000908427680: 00 00 00 f2 00 00 00 f2 f2 f2 00 00
00 00 00 f3
[ 59.599510] ffff000908427700: f3 f3 f3 f3 00 00 00 00 00 00 00 00
00 00 00 00
[ 59.606777] >ffff000908427780: 00 00 00 00 00 00 f1 f1 f1 f1 00 00
f2 f2 00 00
[ 59.614031] ^
[ 59.620427] ffff000908427800: 00 00 f3 f3 f3 f3 00 00 00 00 00 00
00 00 00 00
[ 59.627694] ffff000908427880: 00 00 00 00 00 00 f1 f1 f1 f1 00 00
00 00 f3 f3
[ 59.634946] ==================================================================
[ 59.642198] Disabling lock debugging due to kernel taint


Kernel BUG on x86_64:

[ 17.809563] ==================================================================
[ 17.816786] BUG: KASAN: stack-out-of-bounds in bio_alloc_bioset+0x31f/0x340
[ 17.823750] Read of size 8 at addr ffff888225f9f450 by task systemd-udevd/361
[ 17.830881]
[ 17.832384] CPU: 0 PID: 361 Comm: systemd-udevd Not tainted
5.8.0-rc3-next-20200702 #1
[ 17.840294] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
2.2 05/23/2018
[ 17.847686] Call Trace:
[ 17.850143] dump_stack+0x84/0xba
[ 17.853462] print_address_description.constprop.0+0x1f/0x210
[ 17.859212] ? _raw_spin_lock_irqsave+0x7c/0xd0
[ 17.859214] ? _raw_write_lock_irqsave+0xd0/0xd0
[ 17.859217] ? bio_alloc_bioset+0x31f/0x340
[ 17.859220] kasan_report.cold+0x37/0x7c
[ 17.859222] ? bio_alloc_bioset+0x31f/0x340
[ 17.859224] __asan_load8+0x86/0xb0
[ 17.859226] bio_alloc_bioset+0x31f/0x340
[ 17.859228] ? bvec_alloc+0x160/0x160
[ 17.859230] ? bio_alloc_bioset+0x253/0x340
[ 17.859232] ? mpage_alloc.isra.0+0x37/0x120
[ 17.859234] ? do_mpage_readpage+0x740/0xd40
[ 17.859236] ? mpage_readahead+0x196/0x280
[ 17.859238] ? blkdev_readahead+0x10/0x20
[ 17.859241] ? read_pages+0x149/0x470
[ 17.859243] ? page_cache_readahead_unbounded+0x2de/0x360
[ 17.859246] ? __do_page_cache_readahead+0x6c/0x80
[ 17.859248] bio_clone_fast+0x14/0x30
[ 17.859250] bio_split+0x64/0x1b0
[ 17.859252] __blk_queue_split+0x417/0x8d0
[ 17.859255] ? __blk_rq_map_sg+0x820/0x820
[ 17.859258] ? kmem_cache_alloc+0xc6/0x4b0
[ 17.859260] ? mempool_alloc_slab+0x12/0x20
[ 17.859262] blk_mq_submit_bio+0x150/0xb90
[ 17.859265] ? blk_mq_try_issue_directly+0xe0/0xe0
[ 17.859267] ? blk_queue_enter+0xea/0x460
[ 17.859269] ? submit_bio_checks+0x4cc/0xa00
[ 17.859272] ? bio_add_page+0x78/0x110
[ 17.859274] submit_bio_noacct+0x5ff/0x6c0
[ 17.859276] ? mpage_alloc.isra.0+0xab/0x120
[ 17.859279] ? blk_queue_enter+0x460/0x460
[ 17.859281] ? do_mpage_readpage+0xc02/0xd40
[ 17.859283] submit_bio+0xb5/0x2e0
[ 17.859286] ? submit_bio_noacct+0x6c0/0x6c0
[ 17.859288] ? __disk_get_part+0x3d/0x50
[ 17.859290] mpage_readahead+0x227/0x280
[ 17.859293] ? do_mpage_readpage+0xd40/0xd40
[ 17.859295] ? bdev_evict_inode+0x130/0x130
[ 17.859297] ? find_get_pages_contig+0x340/0x340
[ 17.859299] blkdev_readahead+0x10/0x20
[ 17.859302] read_pages+0x149/0x470
[ 17.859304] ? lru_cache_add+0xde/0xf0
[ 17.859306] ? read_cache_pages+0x280/0x280
[ 17.859309] ? add_to_page_cache_locked+0x10/0x10
[ 17.859310] ? alloc_pages_current+0x98/0x110
[ 17.859313] page_cache_readahead_unbounded+0x2de/0x360
[ 17.859316] ? read_pages+0x470/0x470
[ 17.859319] ? xas_load+0xee/0x110
[ 17.859321] ? find_get_entry+0xbf/0x250
[ 17.859323] __do_page_cache_readahead+0x6c/0x80
[ 17.859326] force_page_cache_readahead+0xee/0x180
[ 17.859329] page_cache_sync_readahead+0x131/0x140
[ 17.859331] generic_file_buffered_read+0x698/0x1130
[ 17.859334] ? get_page_from_freelist+0x1b13/0x1e60
[ 17.859337] ? pagecache_get_page+0x3a0/0x3a0
[ 17.859340] ? __isolate_free_page+0x210/0x210
[ 17.859342] ? __ia32_sys_mmap_pgoff+0x90/0x90
[ 17.859345] generic_file_read_iter+0x17f/0x1f0
[ 17.859347] ? memory_high_write+0x1c0/0x1c0
[ 17.859349] blkdev_read_iter+0x76/0x90
[ 17.859352] new_sync_read+0x298/0x3c0
[ 17.859354] ? __ia32_sys_llseek+0x230/0x230
[ 17.859357] ? asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 17.859359] ? fsnotify+0x12c/0x5f0
[ 17.859361] ? __vfs_read+0x30/0x90
[ 17.859363] __vfs_read+0x76/0x90
[ 17.859365] vfs_read+0xc8/0x1e0
[ 17.859368] ksys_read+0xc8/0x170
[ 17.859370] ? kernel_write+0xc0/0xc0
[ 17.859372] ? syscall_trace_enter+0x166/0x280
[ 17.859375] __x64_sys_read+0x3e/0x50
[ 17.859377] do_syscall_64+0x43/0x70
[ 17.859379] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 17.859381] RIP: 0033:0x7fe23cf4b56e
[ 17.859382] Code: Bad RIP value.
[ 17.859383] RSP: 002b:00007fff586583c8 EFLAGS: 00000246 ORIG_RAX:
0000000000000000
[ 17.859386] RAX: ffffffffffffffda RBX: 00005620318bd8a0 RCX: 00007fe23cf4b56e
[ 17.859387] RDX: 0000000000040000 RSI: 00007fe23dd56038 RDI: 000000000000000f
[ 17.859388] RBP: 0000000000040000 R08: 00007fe23dd56010 R09: 0000000000000000
[ 17.859390] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000
[ 17.859391] R13: 00005620318bd8f0 R14: 00007fe23dd56028 R15: 00007fe23dd56010
[ 17.859392]
[ 17.859393] The buggy address belongs to the page:
[ 17.859396] page:ffffea000897e7c0 refcount:0 mapcount:0
mapping:0000000000000000 index:0x0
[ 17.859397] flags: 0x200000000000000()
[ 17.859400] raw: 0200000000000000 0000000000000000 ffffea000897e7c8
0000000000000000
[ 17.859403] raw: 0000000000000000 0000000000000000 00000000ffffffff
0000000000000000
[ 17.859403] page dumped because: kasan: bad access detected
[ 17.859404]
[ 17.859406] addr ffff888225f9f450 is located in stack of task
systemd-udevd/361 at offset 48 in frame:
[ 17.859408] submit_bio_noacct+0x0/0x6c0
[ 17.859409]
[ 17.859410] this frame has 2 objects:
[ 17.859412] [32, 48) 'bio_list'
[ 17.859414] [64, 96) 'bio_list_on_stack'
[ 17.859414]
[ 17.859415] Memory state around the buggy address:
[ 17.859417] ffff888225f9f300: f2 00 00 00 f2 00 00 00 f2 f2 f2 00
00 00 00 00
[ 17.859418] ffff888225f9f380: f3 f3 f3 f3 f3 00 00 00 00 00 00 00
00 00 00 00
[ 17.859420] >ffff888225f9f400: 00 00 00 00 f1 f1 f1 f1 00 00 f2 f2
00 00 00 00
[ 17.859421] ^
[ 17.859422] ffff888225f9f480: f3 f3 f3 f3 00 00 00 00 00 00 00 00
00 00 00 00
[ 17.859424] ffff888225f9f500: 00 00 00 f1 f1 f1 f1 00 00 00 00 f3
f3 f3 f3 00
[ 17.859425] ==================================================================
[ 17.859425] Disabling lock debugging due to kernel taint

2020-07-02 15:17:30

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 18/20] block: refactor submit_bio_noacct

On Thu, Jul 02, 2020 at 10:10:10AM -0400, Qian Cai wrote:
> On Mon, Jun 29, 2020 at 09:39:45PM +0200, Christoph Hellwig wrote:
> > Split out a __submit_bio_noacct helper for the actual de-recursion
> > algorithm, and simplify the loop by using a continue when we can't
> > enter the queue for a bio.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
>
> Reverting this commit and its dependencies,
>
> 5a6c35f9af41 block: remove direct_make_request
> ff93ea0ce763 block: shortcut __submit_bio_noacct for blk-mq drivers
>
> fixed the stack-out-of-bounds during boot,
>
> https://lore.kernel.org/linux-block/[email protected]/

Yikes. bio_alloc_bioset pokes into bio_list[1] in a totally
undocumented way. But even with that the problem should only show
up with "block: shortcut __submit_bio_noacct for blk-mq drivers".

Can you try this patch?

diff --git a/block/blk-core.c b/block/blk-core.c
index bf882b8d84450c..9f1bf8658b611a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1155,11 +1155,10 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
{
struct gendisk *disk = bio->bi_disk;
- struct bio_list bio_list;
+ struct bio_list bio_list[2] = { };
blk_qc_t ret = BLK_QC_T_NONE;

- bio_list_init(&bio_list);
- current->bio_list = &bio_list;
+ current->bio_list = bio_list;

do {
WARN_ON_ONCE(bio->bi_disk != disk);
@@ -1174,7 +1173,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
}

ret = blk_mq_submit_bio(bio);
- } while ((bio = bio_list_pop(&bio_list)));
+ } while ((bio = bio_list_pop(&bio_list[0])));

current->bio_list = NULL;
return ret;

2020-07-02 15:56:29

by Naresh Kamboju

[permalink] [raw]
Subject: Re: [PATCH 18/20] block: refactor submit_bio_noacct

On Thu, 2 Jul 2020 at 20:45, Christoph Hellwig <[email protected]> wrote:
>
> On Thu, Jul 02, 2020 at 10:10:10AM -0400, Qian Cai wrote:
> > On Mon, Jun 29, 2020 at 09:39:45PM +0200, Christoph Hellwig wrote:
> > > Split out a __submit_bio_noacct helper for the actual de-recursion
> > > algorithm, and simplify the loop by using a continue when we can't
> > > enter the queue for a bio.
> > >
> > > Signed-off-by: Christoph Hellwig <[email protected]>
> >
> > Reverting this commit and its dependencies,
> >
> > 5a6c35f9af41 block: remove direct_make_request
> > ff93ea0ce763 block: shortcut __submit_bio_noacct for blk-mq drivers
> >
> > fixed the stack-out-of-bounds during boot,
> >
> > https://lore.kernel.org/linux-block/[email protected]/
>
> Yikes. bio_alloc_bioset pokes into bio_list[1] in a totally
> undocumented way. But even with that the problem should only show
> up with "block: shortcut __submit_bio_noacct for blk-mq drivers".
>
> Can you try this patch?

Applied your patch on top of linux-next 20200702 and tested it on
arm64 and x86_64 devices; the reported BUG is fixed.

Reported-by: Naresh Kamboju <[email protected]>
Tested-by: Naresh Kamboju <[email protected]>

>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index bf882b8d84450c..9f1bf8658b611a 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1155,11 +1155,10 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> {
> struct gendisk *disk = bio->bi_disk;
> - struct bio_list bio_list;
> + struct bio_list bio_list[2] = { };
> blk_qc_t ret = BLK_QC_T_NONE;
>
> - bio_list_init(&bio_list);
> - current->bio_list = &bio_list;
> + current->bio_list = bio_list;
>
> do {
> WARN_ON_ONCE(bio->bi_disk != disk);
> @@ -1174,7 +1173,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> }
>
> ret = blk_mq_submit_bio(bio);
> - } while ((bio = bio_list_pop(&bio_list)));
> + } while ((bio = bio_list_pop(&bio_list[0])));
>
> current->bio_list = NULL;
> return ret;

ref:
https://lkft.validation.linaro.org/scheduler/job/1538359#L288
https://lkft.validation.linaro.org/scheduler/job/1538360#L572


- Naresh