When a vlan netdev enters the netdevice_event() path, it is passed to
netdevice_event_work_handler() for processing even though it is not a
RoCE netdev. To keep the netdev alive after netdevice_event() returns,
netdevice_queue_work() calls dev_hold() on it. But that does not cover
the real_dev of a vlan netdev: the real_dev can be freed before
netdevice_event_work_handler() is scheduled. This triggers a
use-after-free (UAF) on the real_dev, as follows:
==================================================================
BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
Workqueue: gid-cache-wq netdevice_event_work_handler
Call Trace:
dump_stack_lvl+0xcd/0x134
print_address_description.constprop.0.cold+0x93/0x334
kasan_report.cold+0x83/0xdf
vlan_dev_real_dev+0xf9/0x120
is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
is_eth_port_of_netdev_filter+0x28/0x40
ib_enum_roce_netdev+0x1a3/0x300
ib_enum_all_roce_netdevs+0xc7/0x140
netdevice_event_work_handler+0x9d/0x210
...
Allocated by task 9289:
kasan_save_stack+0x1b/0x40
__kasan_kmalloc+0x9b/0xd0
__kmalloc_node+0x20a/0x330
kvmalloc_node+0x61/0xf0
alloc_netdev_mqs+0x9d/0x1140
rtnl_create_link+0x955/0xb70
__rtnl_newlink+0xe10/0x15b0
rtnl_newlink+0x64/0xa0
...
Freed by task 9288:
kasan_save_stack+0x1b/0x40
kasan_set_track+0x1c/0x30
kasan_set_free_info+0x20/0x30
__kasan_slab_free+0xfc/0x130
slab_free_freelist_hook+0xdd/0x240
kfree+0xe4/0x690
kvfree+0x42/0x50
device_release+0x9f/0x240
kobject_put+0x1c8/0x530
put_device+0x1b/0x30
free_netdev+0x370/0x540
ppp_destroy_interface+0x313/0x3d0
ppp_release+0x1bf/0x240
...
Hold the real_dev of a vlan netdev in netdevice_queue_work() and release
it in netdevice_event_work_handler() to fix the UAF problem.
Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
Reported-by: [email protected]
Signed-off-by: Ziyang Xuan <[email protected]>
---
drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 68197e576433..063dbe72b7c2 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
{
struct netdev_event_work *work =
container_of(_work, struct netdev_event_work, work);
+ struct net_device *real_dev;
unsigned int i;
for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
@@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
work->cmds[i].filter_ndev,
work->cmds[i].cb,
work->cmds[i].ndev);
+ real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
+ if (real_dev)
+ dev_put(real_dev);
+ real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
+ if (real_dev)
+ dev_put(real_dev);
dev_put(work->cmds[i].ndev);
dev_put(work->cmds[i].filter_ndev);
}
@@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
struct net_device *ndev)
{
- unsigned int i;
struct netdev_event_work *ndev_work =
kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+ struct net_device *real_dev;
+ unsigned int i;
if (!ndev_work)
return NOTIFY_DONE;
@@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
ndev_work->cmds[i].filter_ndev = ndev;
dev_hold(ndev_work->cmds[i].ndev);
dev_hold(ndev_work->cmds[i].filter_ndev);
+ real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
+ if (real_dev)
+ dev_hold(real_dev);
+ real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
+ if (real_dev)
+ dev_hold(real_dev);
}
INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
--
2.25.1
> On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
>> When a vlan netdev enter netdevice_event process although it is not a
>> roce netdev, it will be passed to netdevice_event_work_handler() to
>> process. In order to hold the netdev of netdevice_event after
>> netdevice_event() return, call dev_hold() to hold the netdev in
>> netdevice_queue_work(). But that did not consider the real_dev of a vlan
>> netdev, the real_dev can be freed within netdevice_event_work_handler()
>> be scheduled. It would trigger the UAF problem for the real_dev like
>> following:
>>
>> ==================================================================
>> BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
>> Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
>> Workqueue: gid-cache-wq netdevice_event_work_handler
>> Call Trace:
>> dump_stack_lvl+0xcd/0x134
>> print_address_description.constprop.0.cold+0x93/0x334
>> kasan_report.cold+0x83/0xdf
>> vlan_dev_real_dev+0xf9/0x120
>> is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
>> is_eth_port_of_netdev_filter+0x28/0x40
>> ib_enum_roce_netdev+0x1a3/0x300
>> ib_enum_all_roce_netdevs+0xc7/0x140
>> netdevice_event_work_handler+0x9d/0x210
>> ...
>>
>> Allocated by task 9289:
>> kasan_save_stack+0x1b/0x40
>> __kasan_kmalloc+0x9b/0xd0
>> __kmalloc_node+0x20a/0x330
>> kvmalloc_node+0x61/0xf0
>> alloc_netdev_mqs+0x9d/0x1140
>> rtnl_create_link+0x955/0xb70
>> __rtnl_newlink+0xe10/0x15b0
>> rtnl_newlink+0x64/0xa0
>> ...
>>
>> Freed by task 9288:
>> kasan_save_stack+0x1b/0x40
>> kasan_set_track+0x1c/0x30
>> kasan_set_free_info+0x20/0x30
>> __kasan_slab_free+0xfc/0x130
>> slab_free_freelist_hook+0xdd/0x240
>> kfree+0xe4/0x690
>> kvfree+0x42/0x50
>> device_release+0x9f/0x240
>> kobject_put+0x1c8/0x530
>> put_device+0x1b/0x30
>> free_netdev+0x370/0x540
>> ppp_destroy_interface+0x313/0x3d0
>> ppp_release+0x1bf/0x240
>> ...
>>
>> Hold the real_dev for a vlan netdev in netdevice_event_work_handler()
>> to fix the UAF problem.
>>
>> Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
>> Reported-by: [email protected]
>> Signed-off-by: Ziyang Xuan <[email protected]>
>> ---
>> drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
>> 1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
>> index 68197e576433..063dbe72b7c2 100644
>> --- a/drivers/infiniband/core/roce_gid_mgmt.c
>> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
>> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>> {
>> struct netdev_event_work *work =
>> container_of(_work, struct netdev_event_work, work);
>> + struct net_device *real_dev;
>> unsigned int i;
>>
>> for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
>> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>> work->cmds[i].filter_ndev,
>> work->cmds[i].cb,
>> work->cmds[i].ndev);
>> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
>> + if (real_dev)
>> + dev_put(real_dev);
>> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
>> + if (real_dev)
>> + dev_put(real_dev);
>> dev_put(work->cmds[i].ndev);
>> dev_put(work->cmds[i].filter_ndev);
>> }
>> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>> static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>> struct net_device *ndev)
>> {
>> - unsigned int i;
>> struct netdev_event_work *ndev_work =
>> kmalloc(sizeof(*ndev_work), GFP_KERNEL);
>> + struct net_device *real_dev;
>> + unsigned int i;
>>
>> if (!ndev_work)
>> return NOTIFY_DONE;
>> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>> ndev_work->cmds[i].filter_ndev = ndev;
>> dev_hold(ndev_work->cmds[i].ndev);
>> dev_hold(ndev_work->cmds[i].filter_ndev);
>> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
>> + if (real_dev)
>> + dev_hold(real_dev);
>> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
>> + if (real_dev)
>> + dev_hold(real_dev);
>> }
>> INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
>
> Probably, this is the right change, but I don't know well enough that
> part of code. What prevents from "real_dev" to disappear right after
> your call to rdma_vlan_dev_real_dev()?
>
A net_device is not freed until its dev_refcnt drops to one; see
netdev_run_todo() for the details. The dev_refcnt of a vlan net_device's
real_dev reaches one after unregister_netdevice(&real_dev) and
unregister_vlan_dev(&vlan_ndev, ...), but the dev_refcnt of the vlan
net_device itself is still bigger than one because netdevice_queue_work()
holds the vlan net_device. So my solution is to hold the real_dev too in
netdevice_queue_work().
> Thanks
>
>>
>> --
>> 2.25.1
>>
> .
>
On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
> When a vlan netdev enter netdevice_event process although it is not a
> roce netdev, it will be passed to netdevice_event_work_handler() to
> process. In order to hold the netdev of netdevice_event after
> netdevice_event() return, call dev_hold() to hold the netdev in
> netdevice_queue_work(). But that did not consider the real_dev of a vlan
> netdev, the real_dev can be freed within netdevice_event_work_handler()
> be scheduled. It would trigger the UAF problem for the real_dev like
> following:
>
> ==================================================================
> BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
> Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
> Workqueue: gid-cache-wq netdevice_event_work_handler
> Call Trace:
> dump_stack_lvl+0xcd/0x134
> print_address_description.constprop.0.cold+0x93/0x334
> kasan_report.cold+0x83/0xdf
> vlan_dev_real_dev+0xf9/0x120
> is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
> is_eth_port_of_netdev_filter+0x28/0x40
> ib_enum_roce_netdev+0x1a3/0x300
> ib_enum_all_roce_netdevs+0xc7/0x140
> netdevice_event_work_handler+0x9d/0x210
> ...
>
> Allocated by task 9289:
> kasan_save_stack+0x1b/0x40
> __kasan_kmalloc+0x9b/0xd0
> __kmalloc_node+0x20a/0x330
> kvmalloc_node+0x61/0xf0
> alloc_netdev_mqs+0x9d/0x1140
> rtnl_create_link+0x955/0xb70
> __rtnl_newlink+0xe10/0x15b0
> rtnl_newlink+0x64/0xa0
> ...
>
> Freed by task 9288:
> kasan_save_stack+0x1b/0x40
> kasan_set_track+0x1c/0x30
> kasan_set_free_info+0x20/0x30
> __kasan_slab_free+0xfc/0x130
> slab_free_freelist_hook+0xdd/0x240
> kfree+0xe4/0x690
> kvfree+0x42/0x50
> device_release+0x9f/0x240
> kobject_put+0x1c8/0x530
> put_device+0x1b/0x30
> free_netdev+0x370/0x540
> ppp_destroy_interface+0x313/0x3d0
> ppp_release+0x1bf/0x240
> ...
>
> Hold the real_dev for a vlan netdev in netdevice_event_work_handler()
> to fix the UAF problem.
>
> Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
> Reported-by: [email protected]
> Signed-off-by: Ziyang Xuan <[email protected]>
> ---
> drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> index 68197e576433..063dbe72b7c2 100644
> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> {
> struct netdev_event_work *work =
> container_of(_work, struct netdev_event_work, work);
> + struct net_device *real_dev;
> unsigned int i;
>
> for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> work->cmds[i].filter_ndev,
> work->cmds[i].cb,
> work->cmds[i].ndev);
> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
> + if (real_dev)
> + dev_put(real_dev);
> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
> + if (real_dev)
> + dev_put(real_dev);
> dev_put(work->cmds[i].ndev);
> dev_put(work->cmds[i].filter_ndev);
> }
> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> struct net_device *ndev)
> {
> - unsigned int i;
> struct netdev_event_work *ndev_work =
> kmalloc(sizeof(*ndev_work), GFP_KERNEL);
> + struct net_device *real_dev;
> + unsigned int i;
>
> if (!ndev_work)
> return NOTIFY_DONE;
> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> ndev_work->cmds[i].filter_ndev = ndev;
> dev_hold(ndev_work->cmds[i].ndev);
> dev_hold(ndev_work->cmds[i].filter_ndev);
> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> + if (real_dev)
> + dev_hold(real_dev);
> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
> + if (real_dev)
> + dev_hold(real_dev);
> }
> INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
This is probably the right change, but I don't know that part of the
code well enough. What prevents "real_dev" from disappearing right after
your call to rdma_vlan_dev_real_dev()?
Thanks
>
> --
> 2.25.1
>
On Mon, Oct 25, 2021 at 04:37:41PM +0800, Ziyang Xuan (William) wrote:
> > On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
> >> When a vlan netdev enter netdevice_event process although it is not a
> >> roce netdev, it will be passed to netdevice_event_work_handler() to
> >> process. In order to hold the netdev of netdevice_event after
> >> netdevice_event() return, call dev_hold() to hold the netdev in
> >> netdevice_queue_work(). But that did not consider the real_dev of a vlan
> >> netdev, the real_dev can be freed within netdevice_event_work_handler()
> >> be scheduled. It would trigger the UAF problem for the real_dev like
> >> following:
> >>
> >> ==================================================================
> >> BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
> >> Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
> >> Workqueue: gid-cache-wq netdevice_event_work_handler
> >> Call Trace:
> >> dump_stack_lvl+0xcd/0x134
> >> print_address_description.constprop.0.cold+0x93/0x334
> >> kasan_report.cold+0x83/0xdf
> >> vlan_dev_real_dev+0xf9/0x120
> >> is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
> >> is_eth_port_of_netdev_filter+0x28/0x40
> >> ib_enum_roce_netdev+0x1a3/0x300
> >> ib_enum_all_roce_netdevs+0xc7/0x140
> >> netdevice_event_work_handler+0x9d/0x210
> >> ...
> >>
> >> Allocated by task 9289:
> >> kasan_save_stack+0x1b/0x40
> >> __kasan_kmalloc+0x9b/0xd0
> >> __kmalloc_node+0x20a/0x330
> >> kvmalloc_node+0x61/0xf0
> >> alloc_netdev_mqs+0x9d/0x1140
> >> rtnl_create_link+0x955/0xb70
> >> __rtnl_newlink+0xe10/0x15b0
> >> rtnl_newlink+0x64/0xa0
> >> ...
> >>
> >> Freed by task 9288:
> >> kasan_save_stack+0x1b/0x40
> >> kasan_set_track+0x1c/0x30
> >> kasan_set_free_info+0x20/0x30
> >> __kasan_slab_free+0xfc/0x130
> >> slab_free_freelist_hook+0xdd/0x240
> >> kfree+0xe4/0x690
> >> kvfree+0x42/0x50
> >> device_release+0x9f/0x240
> >> kobject_put+0x1c8/0x530
> >> put_device+0x1b/0x30
> >> free_netdev+0x370/0x540
> >> ppp_destroy_interface+0x313/0x3d0
> >> ppp_release+0x1bf/0x240
> >> ...
> >>
> >> Hold the real_dev for a vlan netdev in netdevice_event_work_handler()
> >> to fix the UAF problem.
> >>
> >> Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
> >> Reported-by: [email protected]
> >> Signed-off-by: Ziyang Xuan <[email protected]>
> >> ---
> >> drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
> >> 1 file changed, 15 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> >> index 68197e576433..063dbe72b7c2 100644
> >> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> >> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> >> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >> {
> >> struct netdev_event_work *work =
> >> container_of(_work, struct netdev_event_work, work);
> >> + struct net_device *real_dev;
> >> unsigned int i;
> >>
> >> for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
> >> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >> work->cmds[i].filter_ndev,
> >> work->cmds[i].cb,
> >> work->cmds[i].ndev);
> >> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
> >> + if (real_dev)
> >> + dev_put(real_dev);
> >> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
> >> + if (real_dev)
> >> + dev_put(real_dev);
> >> dev_put(work->cmds[i].ndev);
> >> dev_put(work->cmds[i].filter_ndev);
> >> }
> >> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >> static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >> struct net_device *ndev)
> >> {
> >> - unsigned int i;
> >> struct netdev_event_work *ndev_work =
> >> kmalloc(sizeof(*ndev_work), GFP_KERNEL);
> >> + struct net_device *real_dev;
> >> + unsigned int i;
> >>
> >> if (!ndev_work)
> >> return NOTIFY_DONE;
> >> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >> ndev_work->cmds[i].filter_ndev = ndev;
> >> dev_hold(ndev_work->cmds[i].ndev);
> >> dev_hold(ndev_work->cmds[i].filter_ndev);
> >> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> >> + if (real_dev)
> >> + dev_hold(real_dev);
> >> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
> >> + if (real_dev)
> >> + dev_hold(real_dev);
> >> }
> >> INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
> >
> > Probably, this is the right change, but I don't know well enough that
> > part of code. What prevents from "real_dev" to disappear right after
> > your call to rdma_vlan_dev_real_dev()?
> >
>
> It is known that free the net_device until its dev_refcnt is one. The
> detail realization see netdev_run_todo().The real_dev's dev_refcnt of
> a vlan net_device will reach one after unregister_netdevice(&real_dev)
> and unregister_vlan_dev(&vlan_ndev, ...) but the dev_refcnt of the vlan
> net_device is bigger than one because netdevice_queue_work() will hold
> the vlan net_device. So my solution is hold the real_dev too in
> netdevice_queue_work().
dev_hold(ndev_work->cmds[i].filter_ndev);
+ real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
+ if (real_dev)
<------------ real_dev is released here.
+ dev_hold(real_dev);
>
> > Thanks
> >
> >>
> >> --
> >> 2.25.1
> >>
> > .
> >
On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
> When a vlan netdev enter netdevice_event process although it is not a
> roce netdev, it will be passed to netdevice_event_work_handler() to
> process. In order to hold the netdev of netdevice_event after
> netdevice_event() return, call dev_hold() to hold the netdev in
> netdevice_queue_work(). But that did not consider the real_dev of a vlan
> netdev, the real_dev can be freed within netdevice_event_work_handler()
> be scheduled. It would trigger the UAF problem for the real_dev like
> following:
I think this is a netdev bug. Under rtnl vlan_dev_real_dev() should
return NULL if the vlan device has passed unregister_vlan_dev()
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 55275ef9a31a7c..1106da84e72559 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -126,6 +126,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
/* Get rid of the vlan's reference to real_dev */
dev_put(real_dev);
+ vlan->real_dev = NULL;
}
int vlan_check_real_dev(struct net_device *real_dev,
I'm assuming there is more to it than this, but it is a starting
point.
Jason
>>>> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
>>>> index 68197e576433..063dbe72b7c2 100644
>>>> --- a/drivers/infiniband/core/roce_gid_mgmt.c
>>>> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
>>>> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>>> {
>>>> struct netdev_event_work *work =
>>>> container_of(_work, struct netdev_event_work, work);
>>>> + struct net_device *real_dev;
>>>> unsigned int i;
>>>>
>>>> for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
>>>> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>>> work->cmds[i].filter_ndev,
>>>> work->cmds[i].cb,
>>>> work->cmds[i].ndev);
>>>> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
>>>> + if (real_dev)
>>>> + dev_put(real_dev);
>>>> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
>>>> + if (real_dev)
>>>> + dev_put(real_dev);
>>>> dev_put(work->cmds[i].ndev);
>>>> dev_put(work->cmds[i].filter_ndev);
>>>> }
>>>> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>>> static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>>>> struct net_device *ndev)
>>>> {
>>>> - unsigned int i;
>>>> struct netdev_event_work *ndev_work =
>>>> kmalloc(sizeof(*ndev_work), GFP_KERNEL);
>>>> + struct net_device *real_dev;
>>>> + unsigned int i;
>>>>
>>>> if (!ndev_work)
>>>> return NOTIFY_DONE;
>>>> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>>>> ndev_work->cmds[i].filter_ndev = ndev;
>>>> dev_hold(ndev_work->cmds[i].ndev);
>>>> dev_hold(ndev_work->cmds[i].filter_ndev);
>>>> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
>>>> + if (real_dev)
>>>> + dev_hold(real_dev);
>>>> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
>>>> + if (real_dev)
>>>> + dev_hold(real_dev);
>>>> }
>>>> INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
>>>
>>> Probably, this is the right change, but I don't know well enough that
>>> part of code. What prevents from "real_dev" to disappear right after
>>> your call to rdma_vlan_dev_real_dev()?
>>>
>>
>> It is known that free the net_device until its dev_refcnt is one. The
>> detail realization see netdev_run_todo().The real_dev's dev_refcnt of
>> a vlan net_device will reach one after unregister_netdevice(&real_dev)
>> and unregister_vlan_dev(&vlan_ndev, ...) but the dev_refcnt of the vlan
>> net_device is bigger than one because netdevice_queue_work() will hold
>> the vlan net_device. So my solution is hold the real_dev too in
>> netdevice_queue_work().
>
> dev_hold(ndev_work->cmds[i].filter_ndev);
> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> + if (real_dev)
> <------------ real_dev is released here.
> + dev_hold(real_dev);
At first, I thought the real_dev's dev_refcnt was bigger than one before
the NETDEV_UNREGISTER notifier event of the vlan net_device, because
unregister_vlan_dev() calls dev_put(real_dev) after calling
unregister_netdevice_queue(dev, head). I thought unregister_netdevice_queue()
would issue the NETDEV_UNREGISTER notifier event for the vlan net_device, so
that I could hold the real_dev in the NETDEV_UNREGISTER notifier event
handler netdevice_queue_work().
But I read unregister_vlan_dev() again and found that
unregister_netdevice_queue() in unregister_vlan_dev() just moves the vlan
net_device to a list to be unregistered later. So it is possible that the
real_dev has already been freed when we access it in netdevice_queue_work(),
although the probability is very small.
So the modification needs to be improved, for example by setting
vlan->real_dev = NULL after dev_put(real_dev) in unregister_vlan_dev(), as
proposed by Jason Gunthorpe.
Do you have any other good ideas?
Thank you!
On Tue, Oct 26, 2021 at 11:14:01AM +0800, Ziyang Xuan (William) wrote:
> >>>> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> >>>> index 68197e576433..063dbe72b7c2 100644
> >>>> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> >>>> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> >>>> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>>> {
> >>>> struct netdev_event_work *work =
> >>>> container_of(_work, struct netdev_event_work, work);
> >>>> + struct net_device *real_dev;
> >>>> unsigned int i;
> >>>>
> >>>> for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
> >>>> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>>> work->cmds[i].filter_ndev,
> >>>> work->cmds[i].cb,
> >>>> work->cmds[i].ndev);
> >>>> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
> >>>> + if (real_dev)
> >>>> + dev_put(real_dev);
> >>>> + real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
> >>>> + if (real_dev)
> >>>> + dev_put(real_dev);
> >>>> dev_put(work->cmds[i].ndev);
> >>>> dev_put(work->cmds[i].filter_ndev);
> >>>> }
> >>>> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>>> static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >>>> struct net_device *ndev)
> >>>> {
> >>>> - unsigned int i;
> >>>> struct netdev_event_work *ndev_work =
> >>>> kmalloc(sizeof(*ndev_work), GFP_KERNEL);
> >>>> + struct net_device *real_dev;
> >>>> + unsigned int i;
> >>>>
> >>>> if (!ndev_work)
> >>>> return NOTIFY_DONE;
> >>>> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >>>> ndev_work->cmds[i].filter_ndev = ndev;
> >>>> dev_hold(ndev_work->cmds[i].ndev);
> >>>> dev_hold(ndev_work->cmds[i].filter_ndev);
> >>>> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> >>>> + if (real_dev)
> >>>> + dev_hold(real_dev);
> >>>> + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
> >>>> + if (real_dev)
> >>>> + dev_hold(real_dev);
> >>>> }
> >>>> INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
> >>>
> >>> Probably, this is the right change, but I don't know well enough that
> >>> part of code. What prevents from "real_dev" to disappear right after
> >>> your call to rdma_vlan_dev_real_dev()?
> >>>
> >>
> >> It is known that free the net_device until its dev_refcnt is one. The
> >> detail realization see netdev_run_todo().The real_dev's dev_refcnt of
> >> a vlan net_device will reach one after unregister_netdevice(&real_dev)
> >> and unregister_vlan_dev(&vlan_ndev, ...) but the dev_refcnt of the vlan
> >> net_device is bigger than one because netdevice_queue_work() will hold
> >> the vlan net_device. So my solution is hold the real_dev too in
> >> netdevice_queue_work().
> >
> > dev_hold(ndev_work->cmds[i].filter_ndev);
> > + real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> > + if (real_dev)
> > <------------ real_dev is released here.
> > + dev_hold(real_dev);
>
> At first, I thought the real_dev's dev_refcnt is bigger than one before
> NETDEV_UNREGISTER notifier event of the vlan net_device because it calls
> dev_put(real_dev) after calling unregister_netdevice_queue(dev, head).
> I thought unregister_netdevice_queue() would issue NETDEV_UNREGISTER
> notifier event of the vlan net_device, I can hold the real_dev in
> NETDEV_UNREGISTER notifier event handler netdevice_queue_work().
>
> But I read unregister_vlan_dev() again, found unregister_netdevice_queue()
> in unregister_vlan_dev() just move the vlan net_device to a list to unregister
> later. So it is possible the real_dev has been freed when we access in
> netdevice_queue_work() although the probability is very small.
>
> So the modification need to improve. For example set vlan->real_dev = NULL
> after dev_put(real_dev) in unregister_vlan_dev() proposed by Jason Gunthorpe.
>
> Do you have any other good ideas?
It is hard to tell; this implementation has existed almost from day one.
Thanks
>
> Thank you!