Li Nan (4):
blk-iocost: fix divide by 0 error in calc_lcoefs()
blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in
ioc_refresh_params()
blk-iocost: fix UAF in ioc_pd_free
block: fix null-pointer dereference in ioc_pd_init
Yu Kuai (5):
blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
blk-iocost: improve hanlder of match_u64()
blk-iocost: don't allow to configure bio based device
blk-iocost: read params inside lock in sysfs apis
blk-iocost: fix walk_list corruption
block/blk-iocost.c | 117 ++++++++++++++++++++++++++++-----------------
block/genhd.c | 2 +-
2 files changed, 75 insertions(+), 44 deletions(-)
--
2.31.1
From: Yu Kuai <[email protected]>
1) There is one place where the return value of match_u64() is not checked.
2) If match_u64() fails, the return value is set to -EINVAL even though
there are other possible errnos.
Signed-off-by: Yu Kuai <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Li Nan <[email protected]>
---
block/blk-iocost.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index fd495e823db2..c532129a1456 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3202,6 +3202,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
substring_t args[MAX_OPT_ARGS];
char buf[32];
int tok;
+ int err;
s64 v;
if (!*p)
@@ -3209,7 +3210,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
switch (match_token(p, qos_ctrl_tokens, args)) {
case QOS_ENABLE:
- match_u64(&args[0], &v);
+ err = match_u64(&args[0], &v);
+ if (err) {
+ ret = err;
+ goto out_unlock;
+ }
+
enable = v;
continue;
case QOS_CTRL:
@@ -3238,8 +3244,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
break;
case QOS_RLAT:
case QOS_WLAT:
- if (match_u64(&args[0], &v))
+ err = match_u64(&args[0], &v);
+ if (err) {
+ ret = err;
goto out_unlock;
+ }
+
qos[tok] = v;
break;
case QOS_MIN:
@@ -3374,6 +3384,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
substring_t args[MAX_OPT_ARGS];
char buf[32];
int tok;
+ int err;
u64 v;
if (!*p)
@@ -3399,8 +3410,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
tok = match_token(p, i_lcoef_tokens, args);
if (tok == NR_I_LCOEFS)
goto out_unlock;
- if (match_u64(&args[0], &v))
+
+ err = match_u64(&args[0], &v);
+ if (err) {
+ ret = err;
goto out_unlock;
+ }
+
u[tok] = v;
user = true;
}
--
2.31.1
Our test found the following problem in kernel 5.10, and the same problem
should exist in mainline:
BUG: KASAN: use-after-free in _raw_spin_lock_irqsave+0x71/0xe0
Write of size 4 at addr ffff8881432000e0 by task swapper/4/0
...
Call Trace:
<IRQ>
dump_stack+0x9c/0xd3
print_address_description.constprop.0+0x19/0x170
__kasan_report.cold+0x6c/0x84
kasan_report+0x3a/0x50
check_memory_region+0xfd/0x1f0
_raw_spin_lock_irqsave+0x71/0xe0
ioc_pd_free+0x9d/0x250
blkg_free.part.0+0x80/0x100
__blkg_release+0xf3/0x1c0
rcu_do_batch+0x292/0x700
rcu_core+0x270/0x2d0
__do_softirq+0xfd/0x402
</IRQ>
asm_call_irq_on_stack+0x12/0x20
do_softirq_own_stack+0x37/0x50
irq_exit_rcu+0x134/0x1a0
sysvec_apic_timer_interrupt+0x36/0x80
asm_sysvec_apic_timer_interrupt+0x12/0x20
Freed by task 57:
kfree+0xba/0x680
rq_qos_exit+0x5a/0x80
blk_cleanup_queue+0xce/0x1a0
virtblk_remove+0x77/0x130 [virtio_blk]
virtio_dev_remove+0x56/0xe0
__device_release_driver+0x2ba/0x450
device_release_driver+0x29/0x40
bus_remove_device+0x1d8/0x2c0
device_del+0x333/0x7e0
device_unregister+0x27/0x90
unregister_virtio_device+0x22/0x40
virtio_pci_remove+0x53/0xb0
pci_device_remove+0x7a/0x130
__device_release_driver+0x2ba/0x450
device_release_driver+0x29/0x40
pci_stop_bus_device+0xcf/0x100
pci_stop_and_remove_bus_device+0x16/0x20
disable_slot+0xa1/0x110
acpiphp_disable_and_eject_slot+0x35/0xe0
hotplug_event+0x1b8/0x3c0
acpiphp_hotplug_notify+0x37/0x70
acpi_device_hotplug+0xee/0x320
acpi_hotplug_work_fn+0x69/0x80
process_one_work+0x3c5/0x730
worker_thread+0x93/0x650
kthread+0x1ba/0x210
ret_from_fork+0x22/0x30
It happened as follows:
T1 T2 T3
//delete device
del_gendisk
bdi_unregister
bdi_remove_from_list
synchronize_rcu_expedited
//rmdir cgroup
blkcg_destroy_blkgs
blkg_destroy
percpu_ref_kill
blkg_release
call_rcu
rq_qos_exit
ioc_rqos_exit
kfree(ioc)
__blkg_release
blkg_free
blkg_free_workfn
pd_free_fn
ioc_pd_free
spin_lock_irqsave
->ioc is freed
Fix the problem by moving the operation on ioc in ioc_pd_free() to
ioc_pd_offline(), and just free resource in ioc_pd_free() like iolatency
and throttle.
Signed-off-by: Li Nan <[email protected]>
---
block/blk-iocost.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 2316ba93e7d6..710cf63a1643 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2978,7 +2978,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
spin_unlock_irqrestore(&ioc->lock, flags);
}
-static void ioc_pd_free(struct blkg_policy_data *pd)
+static void ioc_pd_offline(struct blkg_policy_data *pd)
{
struct ioc_gq *iocg = pd_to_iocg(pd);
struct ioc *ioc = iocg->ioc;
@@ -3002,6 +3002,12 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
hrtimer_cancel(&iocg->waitq_timer);
}
+}
+
+static void ioc_pd_free(struct blkg_policy_data *pd)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+
free_percpu(iocg->pcpu_stat);
kfree(iocg);
}
@@ -3488,6 +3494,7 @@ static struct blkcg_policy blkcg_policy_iocost = {
.cpd_free_fn = ioc_cpd_free,
.pd_alloc_fn = ioc_pd_alloc,
.pd_init_fn = ioc_pd_init,
+ .pd_offline_fn = ioc_pd_offline,
.pd_free_fn = ioc_pd_free,
.pd_stat_fn = ioc_pd_stat,
};
--
2.31.1
On 11/30/22 6:21 AM, Li Nan wrote:
> Li Nan (4):
> blk-iocost: fix divide by 0 error in calc_lcoefs()
> blk-iocost: change div64_u64 to DIV64_U64_ROUND_UP in
> ioc_refresh_params()
> blk-iocost: fix UAF in ioc_pd_free
> block: fix null-pointer dereference in ioc_pd_init
>
> Yu Kuai (5):
> blk-iocost: cleanup ioc_qos_write() and ioc_cost_model_write()
> blk-iocost: improve hanlder of match_u64()
> blk-iocost: don't allow to configure bio based device
> blk-iocost: read params inside lock in sysfs apis
> blk-iocost: fix walk_list corruption
>
> block/blk-iocost.c | 117 ++++++++++++++++++++++++++++-----------------
> block/genhd.c | 2 +-
> 2 files changed, 75 insertions(+), 44 deletions(-)
Please include a changelog when posting later versions of a patchset.
--
Jens Axboe
On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
> From: Yu Kuai <[email protected]>
>
> 1) There are one place that return value of match_u64() is not checked.
> 2) If match_u64() failed, return value is set to -EINVAL despite that
> there are other possible errnos.
Ditto. Does this matter?
--
tejun
On Wed, Nov 30, 2022 at 09:21:54PM +0800, Li Nan wrote:
> T1 T2 T3
> //delete device
> del_gendisk
> bdi_unregister
> bdi_remove_from_list
> synchronize_rcu_expedited
>
> //rmdir cgroup
> blkcg_destroy_blkgs
> blkg_destroy
> percpu_ref_kill
> blkg_release
> call_rcu
> rq_qos_exit
> ioc_rqos_exit
> kfree(ioc)
> __blkg_release
> blkg_free
> blkg_free_workfn
> pd_free_fn
> ioc_pd_free
> spin_lock_irqsave
> ->ioc is freed
>
> Fix the problem by moving the operation on ioc in ioc_pd_free() to
> ioc_pd_offline(), and just free resource in ioc_pd_free() like iolatency
> and throttle.
>
> Signed-off-by: Li Nan <[email protected]>
I wonder what we really wanna do is pinning ioc while blkgs are still around
but I think this should work too.
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
Hi,
在 2022/12/01 4:32, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
>> From: Yu Kuai <[email protected]>
>>
>> 1) There are one place that return value of match_u64() is not checked.
>> 2) If match_u64() failed, return value is set to -EINVAL despite that
>> there are other possible errnos.
>
> Ditto. Does this matter?
>
It's not a big deal, but I think at least return value of match_u64()
should be checked, we don't want to continue with invalid input, right?
By the way, match_u64() can return -ERANGE, which can provide a more
specific error message to the user.
Thanks,
Kuai
On Thu, Dec 01, 2022 at 10:15:53AM +0800, Yu Kuai wrote:
> Hi,
>
> 在 2022/12/01 4:32, Tejun Heo 写道:
> > On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
> > > From: Yu Kuai <[email protected]>
> > >
> > > 1) There are one place that return value of match_u64() is not checked.
> > > 2) If match_u64() failed, return value is set to -EINVAL despite that
> > > there are other possible errnos.
> >
> > Ditto. Does this matter?
> >
>
> It's not a big deal, but I think at least return value of match_u64()
> should be checked, we don't want to continue with invalid input, right?
Yeah, sure.
> By the way, match_u64() can return -ERANGE, which can provide more
> specific error messge to user.
I'm really not convinced going over 64bit range would be all that difficult
to spot whether the error code is -EINVAL or -ERANGE. There isn't anything
wrong with returning -ERANGE but the fact that that particular function
returns an error code doesn't necessarily mean that it *must* be forwarded.
Imagine that we used sscanf(buf, "%llu", &value) to parse the number
instead. We'd only know whether the parsing would have succeeded or not and
would probably return -EINVAL on failure and the behavior would be just
fine. This does not matter *at all*.
So, idk, I'm not necessarily against it but changing -EINVAL to -ERANGE is
pure churn. Nothing material is being improved by that change.
Thanks.
--
tejun
Hi,
在 2022/12/01 18:08, Tejun Heo 写道:
> On Thu, Dec 01, 2022 at 10:15:53AM +0800, Yu Kuai wrote:
>> Hi,
>>
>> 在 2022/12/01 4:32, Tejun Heo 写道:
>>> On Wed, Nov 30, 2022 at 09:21:49PM +0800, Li Nan wrote:
>>>> From: Yu Kuai <[email protected]>
>>>>
>>>> 1) There are one place that return value of match_u64() is not checked.
>>>> 2) If match_u64() failed, return value is set to -EINVAL despite that
>>>> there are other possible errnos.
>>>
>>> Ditto. Does this matter?
>>>
>>
>> It's not a big deal, but I think at least return value of match_u64()
>> should be checked, we don't want to continue with invalid input, right?
>
> Yeah, sure.
>
>> By the way, match_u64() can return -ERANGE, which can provide more
>> specific error messge to user.
>
> I'm really not convinced going over 64bit range would be all that difficult
> to spot whether the error code is -EINVAL or -ERANGE. There isn't anything
> wrong with returning -ERANGE but the fact that that particular function
> returns an error code doesn't necessarily mean that it *must* be forwarded.
>
> Imagine that we used sscanf(buf, "%llu", &value) to parse the number
> instead. We'd only know whether the parsing would have succeeded or not and
> would probably return -EINVAL on failure and the behavior would be just
> fine. This does not matter *at all*.
>
> So, idk, I'm not necessarily against it but changing -EINVAL to -ERANGE is
> pure churn. Nothing material is being improved by that change.
Thanks for the review and explanation, I'll just keep the addition of
return value checking of the former 2 patches.
Thanks,
Kuai
Hi, Tejun!
在 2022/12/01 4:42, Tejun Heo 写道:
> On Wed, Nov 30, 2022 at 09:21:54PM +0800, Li Nan wrote:
>> T1 T2 T3
>> //delete device
>> del_gendisk
>> bdi_unregister
>> bdi_remove_from_list
>> synchronize_rcu_expedited
>>
>> //rmdir cgroup
>> blkcg_destroy_blkgs
>> blkg_destroy
>> percpu_ref_kill
>> blkg_release
>> call_rcu
>> rq_qos_exit
>> ioc_rqos_exit
>> kfree(ioc)
>> __blkg_release
>> blkg_free
>> blkg_free_workfn
>> pd_free_fn
>> ioc_pd_free
>> spin_lock_irqsave
>> ->ioc is freed
>>
>> Fix the problem by moving the operation on ioc in ioc_pd_free() to
>> ioc_pd_offline(), and just free resource in ioc_pd_free() like iolatency
>> and throttle.
>>
>> Signed-off-by: Li Nan <[email protected]>
>
> I wonder what we really wanna do is pinning ioc while blkgs are still around
> but I think this should work too.
>
I just found that this is not enough, other problems still exist:
t1:
bio_init
bio_associate_blkg
//get blkg->refcnt
......
submit_bio
blk_throtl_bio
// bio is throttled, user thread can exit
t2:
// blkcg online_pin is zero
blkcg_destroy_blkgs
blkg_destroy
ioc_pd_offline
list_del_init(&iocg->active_list)
t3:
ioc_rqos_throttle
blkg_to_iocg
// got the iocg that is offlined
iocg_activate
// activate the iocg again
As a consequence, the kernel can crash due to accessing an unexpected
address. Fortunately, bfq already handles a similar problem by checking
blkg->online in bfq_bio_bfqg(); this problem can be fixed by checking
it in iocg_activate().
BTW, I'm still working on checking if other policies have the same
problem.
Thanks,
Kuai