Our test reported a NULL pointer dereference:
[ 168.534653] ==================================================================
[ 168.535614] Disabling lock debugging due to kernel taint
[ 168.536346] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 168.537274] #PF: supervisor read access in kernel mode
[ 168.537964] #PF: error_code(0x0000) - not-present page
[ 168.538667] PGD 0 P4D 0
[ 168.539025] Oops: 0000 [#1] PREEMPT SMP KASAN
[ 168.539656] CPU: 13 PID: 759 Comm: bash Tainted: G B 5.15.0-rc2-next-202100
[ 168.540954] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_0738364
[ 168.542736] RIP: 0010:bfq_pd_init+0x88/0x1e0
[ 168.543318] Code: 98 00 00 00 e8 c9 e4 5b ff 4c 8b 65 00 49 8d 7c 24 08 e8 bb e4 5b ff 4d0
[ 168.545803] RSP: 0018:ffff88817095f9c0 EFLAGS: 00010002
[ 168.546497] RAX: 0000000000000001 RBX: ffff888101a1c000 RCX: 0000000000000000
[ 168.547438] RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff888106553428
[ 168.548402] RBP: ffff888106553400 R08: ffffffff961bcaf4 R09: 0000000000000001
[ 168.549365] R10: ffffffffa2e16c27 R11: fffffbfff45c2d84 R12: 0000000000000000
[ 168.550291] R13: ffff888101a1c098 R14: ffff88810c7a08c8 R15: ffffffffa55541a0
[ 168.551221] FS: 00007fac75227700(0000) GS:ffff88839ba80000(0000) knlGS:0000000000000000
[ 168.552278] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 168.553040] CR2: 0000000000000008 CR3: 0000000165ce7000 CR4: 00000000000006e0
[ 168.554000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 168.554929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 168.555888] Call Trace:
[ 168.556221] <TASK>
[ 168.556510] blkg_create+0x1c0/0x8c0
[ 168.556989] blkg_conf_prep+0x574/0x650
[ 168.557502] ? stack_trace_save+0x99/0xd0
[ 168.558033] ? blkcg_conf_open_bdev+0x1b0/0x1b0
[ 168.558629] tg_set_conf.constprop.0+0xb9/0x280
[ 168.559231] ? kasan_set_track+0x29/0x40
[ 168.559758] ? kasan_set_free_info+0x30/0x60
[ 168.560344] ? tg_set_limit+0xae0/0xae0
[ 168.560853] ? do_sys_openat2+0x33b/0x640
[ 168.561383] ? do_sys_open+0xa2/0x100
[ 168.561877] ? __x64_sys_open+0x4e/0x60
[ 168.562383] ? __kasan_check_write+0x20/0x30
[ 168.562951] ? copyin+0x48/0x70
[ 168.563390] ? _copy_from_iter+0x234/0x9e0
[ 168.563948] tg_set_conf_u64+0x17/0x20
[ 168.564467] cgroup_file_write+0x1ad/0x380
[ 168.565014] ? cgroup_file_poll+0x80/0x80
[ 168.565568] ? __mutex_lock_slowpath+0x30/0x30
[ 168.566165] ? pgd_free+0x100/0x160
[ 168.566649] kernfs_fop_write_iter+0x21d/0x340
[ 168.567246] ? cgroup_file_poll+0x80/0x80
[ 168.567796] new_sync_write+0x29f/0x3c0
[ 168.568314] ? new_sync_read+0x410/0x410
[ 168.568840] ? __handle_mm_fault+0x1c97/0x2d80
[ 168.569425] ? copy_page_range+0x2b10/0x2b10
[ 168.570007] ? _raw_read_lock_bh+0xa0/0xa0
[ 168.570622] vfs_write+0x46e/0x630
[ 168.571091] ksys_write+0xcd/0x1e0
[ 168.571563] ? __x64_sys_read+0x60/0x60
[ 168.572081] ? __kasan_check_write+0x20/0x30
[ 168.572659] ? do_user_addr_fault+0x446/0xff0
[ 168.573264] __x64_sys_write+0x46/0x60
[ 168.573774] do_syscall_64+0x35/0x80
[ 168.574264] entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 168.574960] RIP: 0033:0x7fac74915130
[ 168.575456] Code: 73 01 c3 48 8b 0d 58 ed 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 444
[ 168.577969] RSP: 002b:00007ffc3080e288 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 168.578986] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007fac74915130
[ 168.579937] RDX: 0000000000000009 RSI: 000056007669f080 RDI: 0000000000000001
[ 168.580884] RBP: 000056007669f080 R08: 000000000000000a R09: 00007fac75227700
[ 168.581841] R10: 000056007655c8f0 R11: 0000000000000246 R12: 0000000000000009
[ 168.582796] R13: 0000000000000001 R14: 00007fac74be55e0 R15: 00007fac74be08c0
[ 168.583757] </TASK>
[ 168.584063] Modules linked in:
[ 168.584494] CR2: 0000000000000008
[ 168.584964] ---[ end trace 2475611ad0f77a1a ]---
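The faulting address 0000000000000008 is offset 8 into a NULL
'struct elevator_queue', i.e. its elevator_data member: bfq_pd_init()
reads q->elevator->elevator_data after the elevator has been torn down.
For reference, a trimmed sketch of the 5.15-era block/bfq-cgroup.c
(illustration only, not part of this patch):

static void bfq_pd_init(struct blkg_policy_data *pd)
{
        struct blkcg_gq *blkg = pd_to_blkg(pd);
        struct bfq_group *bfqg = blkg_to_bfqg(blkg);
        /* q->elevator is NULL here, so this read faults at offset 8 */
        struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
        ...
}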
This is because blkg_alloc() is called from blkg_conf_prep() without
holding 'q->queue_lock', and elevator is exited before blkg_create():
thread 1                            thread 2
blkg_conf_prep
 spin_lock_irq(&q->queue_lock);
 blkg_lookup_check -> return NULL
 spin_unlock_irq(&q->queue_lock);

blkg_alloc
 blkcg_policy_enabled -> true
 pd = ->pd_alloc_fn
 blkg->pd[i] = pd
                                    blk_mq_exit_sched
                                     bfq_exit_queue
                                      blkcg_deactivate_policy
                                       spin_lock_irq(&q->queue_lock);
                                       __clear_bit(pol->plid, q->blkcg_pols);
                                       spin_unlock_irq(&q->queue_lock);
                                    q->elevator = NULL;
spin_lock_irq(&q->queue_lock);
blkg_create
 if (blkg->pd[i])
  ->pd_init_fn -> q->elevator is NULL
spin_unlock_irq(&q->queue_lock);
Fix the problem by checking that policy is still enabled in
blkg_create().
Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-cgroup.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb48090eefce..00e1d97621ea 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
+static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
+{
+ int i;
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
+ pol->pd_free_fn(blkg->pd[i]);
+ blkg->pd[i] = NULL;
+ }
+ }
+}
+
/*
* If @new_blkg is %NULL, this function tries to allocate a new one as
* necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
@@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
+ if (new_blkg)
+ blkg_check_pd(q, new_blkg);
+
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
--
2.31.1
Hello.
On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai <[email protected]> wrote:
> This is because blkg_alloc() is called from blkg_conf_prep() without
> holding 'q->queue_lock', and elevator is exited before blkg_create():
IIUC the problematic interleaving is this one (I've added `blkg->pd[i]
= NULL` to the thread 2 call trace):
> thread 1                            thread 2
> blkg_conf_prep
>  spin_lock_irq(&q->queue_lock);
>  blkg_lookup_check -> return NULL
>  spin_unlock_irq(&q->queue_lock);
>
> blkg_alloc
>  blkcg_policy_enabled -> true
>  pd = ->pd_alloc_fn
>                                     blk_mq_exit_sched
>                                      bfq_exit_queue
>                                       blkcg_deactivate_policy
>                                        spin_lock_irq(&q->queue_lock);
>                                        __clear_bit(pol->plid, q->blkcg_pols);
>
                                         pol->pd_free_fn(blkg->pd[i]);
                                         blkg->pd[i] = NULL;
>
>                                        spin_unlock_irq(&q->queue_lock);
>                                     q->elevator = NULL;
  blkg->pd[i] = pd
> spin_lock_irq(&q->queue_lock);
> blkg_create
>  if (blkg->pd[i])
>   ->pd_init_fn -> q->elevator is NULL
> spin_unlock_irq(&q->queue_lock);
In high-level terms, is this a race between (blk)io controller attribute
write and a device scheduler (elevator) switch?
If so, I'd add it to the commit message.
> Fix the problem by checking that policy is still enabled in
> blkg_create().
Is this sufficient wrt some other q->elevator users later?
> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
> goto err_free_blkg;
> }
>
I'd add a comment here like:
> Re-check policies are still enabled, since the caller blkg_conf_prep()
> temporarily drops q->queue_lock and we can race with
> blk_mq_exit_sched() removing policies.
> + if (new_blkg)
> + blkg_check_pd(q, new_blkg);
> +
Thanks,
Michal
On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai wrote:
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index eb48090eefce..00e1d97621ea 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
> }
> EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
>
> +static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
> +{
> + int i;
> +
> + for (i = 0; i < BLKCG_MAX_POLS; i++) {
> + struct blkcg_policy *pol = blkcg_policy[i];
> +
> + if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
> + pol->pd_free_fn(blkg->pd[i]);
> + blkg->pd[i] = NULL;
> + }
> + }
> +}
> +
> /*
> * If @new_blkg is %NULL, this function tries to allocate a new one as
> * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
> goto err_free_blkg;
> }
>
> + if (new_blkg)
> + blkg_check_pd(q, new_blkg);
> +
Can't this happen the other way around too? i.e. linking a pd which
doesn't have an entry for a policy which got enabled in between? And
what if an existing policy was de-registered and another policy got the
policy id in between? I think the correct solution here would be
synchronizing alloc-create blocks against policy deactivation rather
than trying to patch an allocated blkg later. Deactivation being a
really slow path, there are plenty of options. The main challenge would
be making it difficult to make mistakes with, I guess.
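For illustration only, one hypothetical shape of that (the mutex name
is invented here, untested):

        /*
         * Hypothetical: a per-queue mutex serializing the unlocked
         * alloc+create window against policy deactivation.
         */
        mutex_lock(&q->blkcg_pol_change_mutex);   /* invented name */
        /* blkg_lookup_check() + blkg_alloc() + blkg_create() all run
         * with the set of enabled policies stable */
        mutex_unlock(&q->blkcg_pol_change_mutex);

with blkcg_deactivate_policy() taking the same mutex before clearing
pol->plid from q->blkcg_pols; deactivation being a slow path, the
extra lock would be cheap.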
Thanks.
--
tejun
On 2021/10/11 23:23, Michal Koutný wrote:
> Hello.
>
> On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai <[email protected]> wrote:
>> This is because blkg_alloc() is called from blkg_conf_prep() without
>> holding 'q->queue_lock', and elevator is exited before blkg_create():
>
> IIUC the problematic interleaving is this one (I've added `blkg->pd[i]
> = NULL` to the thread 2 call trace):
The new blkg is not added to blkg_list until pd_init_fn() is done in
blkg_create(), thus blkcg_deactivate_policy() can't access this blkg.
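Roughly, from the 5.15-era blkcg_deactivate_policy() (trimmed sketch
for illustration):

        spin_lock_irq(&q->queue_lock);
        __clear_bit(pol->plid, q->blkcg_pols);

        /* only blkgs already linked on q->blkg_list are touched */
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (blkg->pd[pol->plid]) {
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[pol->plid]);
                        pol->pd_free_fn(blkg->pd[pol->plid]);
                        blkg->pd[pol->plid] = NULL;
                }
        }
        spin_unlock_irq(&q->queue_lock);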
>
>> thread 1                            thread 2
>> blkg_conf_prep
>>  spin_lock_irq(&q->queue_lock);
>>  blkg_lookup_check -> return NULL
>>  spin_unlock_irq(&q->queue_lock);
>>
>> blkg_alloc
>>  blkcg_policy_enabled -> true
>>  pd = ->pd_alloc_fn
>>                                     blk_mq_exit_sched
>>                                      bfq_exit_queue
>>                                       blkcg_deactivate_policy
>>                                        spin_lock_irq(&q->queue_lock);
>>                                        __clear_bit(pol->plid, q->blkcg_pols);
>>
>                                         pol->pd_free_fn(blkg->pd[i]);
>                                         blkg->pd[i] = NULL;
>>
>>                                        spin_unlock_irq(&q->queue_lock);
>>                                     q->elevator = NULL;
>  blkg->pd[i] = pd
>> spin_lock_irq(&q->queue_lock);
>> blkg_create
>>  if (blkg->pd[i])
>>   ->pd_init_fn -> q->elevator is NULL
>> spin_unlock_irq(&q->queue_lock);
>
> In high-level terms, is this a race between (blk)io controller attribute
> write and a device scheduler (elevator) switch?
> If so, I'd add it to the commit message.
>
>> Fix the problem by checking that policy is still enabled in
>> blkg_create().
>
> Is this sufficient wrt some other q->elevator users later?
>
>> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>> goto err_free_blkg;
>> }
>>
>
> I'd add a comment here like:
>
>> Re-check policies are still enabled, since the caller blkg_conf_prep()
>> temporarily drops q->queue_lock and we can race with
>> blk_mq_exit_sched() removing policies.
Thanks for your advice.
Best regards,
Kuai
>
>> + if (new_blkg)
>> + blkg_check_pd(q, new_blkg);
>> +
>
> Thanks,
> Michal
On 2021/10/12 1:16, Tejun Heo wrote:
> On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai wrote:
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index eb48090eefce..00e1d97621ea 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
>> }
>> EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
>>
>> +static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < BLKCG_MAX_POLS; i++) {
>> + struct blkcg_policy *pol = blkcg_policy[i];
>> +
>> + if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
>> + pol->pd_free_fn(blkg->pd[i]);
>> + blkg->pd[i] = NULL;
>> + }
>> + }
>> +}
>> +
>> /*
>> * If @new_blkg is %NULL, this function tries to allocate a new one as
>> * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
>> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>> goto err_free_blkg;
>> }
>>
>> + if (new_blkg)
>> + blkg_check_pd(q, new_blkg);
>> +
>
> Can't this happen the other way around too? i.e. linking a pd which
> doesn't have an entry for a policy which got enabled in between? And
> what if an existing policy was de-registered and another policy got
> the policy id in between? I think the correct solution here would be
> synchronizing alloc-create blocks against policy deactivation rather
> than trying to patch an allocated blkg later. Deactivation being a
> really slow path, there are plenty of options. The main challenge
> would be making it difficult to make mistakes with, I guess.
For the case where the policy was de-registered, I think there won't be
a problem: pd_init_fn() is not called yet and the blkg is not on the
blkg_list, so it's fine to use this blkg for the new policy.

For the case where a policy got enabled in between, the problem is that
the pd still doesn't have an entry for the policy; perhaps we can call
pd_alloc_fn() additionally in blkg_create?
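A rough, untested sketch of that idea, running under q->queue_lock in
blkg_create():

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                /* policy got enabled while the lock was dropped:
                 * allocate the missing pd now; GFP_NOWAIT because we
                 * cannot sleep under q->queue_lock */
                if (pol && !blkg->pd[i] && blkcg_policy_enabled(q, pol)) {
                        struct blkg_policy_data *pd;

                        pd = pol->pd_alloc_fn(GFP_NOWAIT, q, blkcg);
                        if (!pd)
                                goto err_free_blkg;
                        blkg->pd[i] = pd;
                        pd->blkg = blkg;
                        pd->plid = i;
                }
        }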
If checking the blkg in blkg_create() is not a good solution, and we
decide to synchronize alloc-create blkg against policy deactivation:
since only the bfq policy can be deactivated or activated while the
queue is not dying, and the queue is frozen during activation and
deactivation, can we grab q->q_usage_counter and put it after
blkg_create() is done, to prevent concurrent bfq policy activation and
deactivation?
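A rough sketch of what I mean (untested):

        if (!percpu_ref_tryget(&q->q_usage_counter))
                return -ENODEV;

        /* blkg_alloc() + blkg_create() run here; holding the ref is
         * meant to exclude bfq policy (de)activation, which freezes
         * the queue -- but see the follow-up below */

        percpu_ref_put(&q->q_usage_counter);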
Thanks,
Kuai
>
> Thanks.
>
On 2021/10/12 9:39, yukuai (C) wrote:
> On 2021/10/12 1:16, Tejun Heo wrote:
>> On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai wrote:
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index eb48090eefce..00e1d97621ea 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
>>> }
>>> EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
>>> +static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
>>> +{
>>> + int i;
>>> +
>>> + for (i = 0; i < BLKCG_MAX_POLS; i++) {
>>> + struct blkcg_policy *pol = blkcg_policy[i];
>>> +
>>> + if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
>>> + pol->pd_free_fn(blkg->pd[i]);
>>> + blkg->pd[i] = NULL;
>>> + }
>>> + }
>>> +}
>>> +
>>> /*
>>> * If @new_blkg is %NULL, this function tries to allocate a new one as
>>> * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
>>> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>>> goto err_free_blkg;
>>> }
>>> + if (new_blkg)
>>> + blkg_check_pd(q, new_blkg);
>>> +
>>
>> Can't this happen the other way around too? i.e. linking a pd which
>> doesn't have an entry for a policy which got enabled in between? And
>> what if an existing policy was de-registered and another policy got
>> the policy id in between? I think the correct solution here would be
>> synchronizing alloc-create blocks against policy deactivation rather
>> than trying to patch an allocated blkg later. Deactivation being a
>> really slow path, there are plenty of options. The main challenge
>> would be making it difficult to make mistakes with, I guess.
>
> For the case where the policy was de-registered, I think there won't
> be a problem: pd_init_fn() is not called yet and the blkg is not on
> the blkg_list, so it's fine to use this blkg for the new policy.
>
> For the case where a policy got enabled in between, the problem is
> that the pd still doesn't have an entry for the policy; perhaps we
> can call pd_alloc_fn() additionally in blkg_create?
>
> If checking the blkg in blkg_create() is not a good solution, and we
> decide to synchronize alloc-create blkg against policy deactivation:
> since only the bfq policy can be deactivated or activated while the
> queue is not dying, and the queue is frozen during activation and
> deactivation, can we grab q->q_usage_counter and put it after
> blkg_create() is done, to prevent concurrent bfq policy activation
> and deactivation?
Just found that blkcg_deactivate_policy() will call
blk_mq_freeze_queue(), thus grabbing q->q_usage_counter is wrong...
Thanks,
Kuai