From: Leon Romanovsky <[email protected]>
Devlink reload was implemented as a special command which does _SET_
operation, but doesn't take devlink->lock, while recursive devlink
calls that were part of .reload_up()/.reload_down() sequence took it.
This fragile flow was possible due to holding a big devlink lock
(devlink_mutex), which effectively stopped all devlink activities,
even unrelated to reloaded devlink.
So let's make sure that devlink reload behaves as other commands and
use special nested locking primitives with a depth of 1. Such change
opens us to a venue of removing devlink_mutex completely, while keeping
devlink locking complexity in devlink.c.
Signed-off-by: Leon Romanovsky <[email protected]>
---
net/core/devlink.c | 58 ++++++++++++++++++++++++----------------------
1 file changed, 30 insertions(+), 28 deletions(-)
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2d8abe88c673..e46f8ad43c20 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -8734,7 +8734,6 @@ static const struct genl_small_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_reload,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_PARAM_GET,
@@ -9219,7 +9218,7 @@ int devlink_port_register(struct devlink *devlink,
struct devlink_port *devlink_port,
unsigned int port_index)
{
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
if (devlink_port_index_exists(devlink, port_index)) {
mutex_unlock(&devlink->lock);
return -EEXIST;
@@ -9253,7 +9252,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
list_del(&devlink_port->list);
mutex_unlock(&devlink->lock);
WARN_ON(!list_empty(&devlink_port->reporter_list));
@@ -9501,7 +9500,7 @@ devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
if (!devlink_rate)
return -ENOMEM;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
WARN_ON(devlink_port->devlink_rate);
devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
devlink_rate->devlink = devlink;
@@ -9531,7 +9530,7 @@ void devlink_rate_leaf_destroy(struct devlink_port *devlink_port)
if (!devlink_rate)
return;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL);
if (devlink_rate->parent)
refcount_dec(&devlink_rate->parent->refcnt);
@@ -9557,7 +9556,7 @@ void devlink_rate_nodes_destroy(struct devlink *devlink)
static struct devlink_rate *devlink_rate, *tmp;
const struct devlink_ops *ops = devlink->ops;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
if (!devlink_rate->parent)
continue;
@@ -9656,7 +9655,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
struct devlink_sb *devlink_sb;
int err = 0;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
if (devlink_sb_index_exists(devlink, sb_index)) {
err = -EEXIST;
goto unlock;
@@ -9684,7 +9683,7 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
{
struct devlink_sb *devlink_sb;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
WARN_ON(!devlink_sb);
list_del(&devlink_sb->list);
@@ -9704,7 +9703,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
int devlink_dpipe_headers_register(struct devlink *devlink,
struct devlink_dpipe_headers *dpipe_headers)
{
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
devlink->dpipe_headers = dpipe_headers;
mutex_unlock(&devlink->lock);
return 0;
@@ -9720,7 +9719,7 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
*/
void devlink_dpipe_headers_unregister(struct devlink *devlink)
{
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
devlink->dpipe_headers = NULL;
mutex_unlock(&devlink->lock);
}
@@ -9814,7 +9813,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
{
struct devlink_dpipe_table *table;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
if (!table)
@@ -9856,7 +9855,7 @@ int devlink_resource_register(struct devlink *devlink,
top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (resource) {
err = -EINVAL;
@@ -9919,7 +9918,7 @@ void devlink_resources_unregister(struct devlink *devlink,
resource_list = &devlink->resource_list;
if (!resource)
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
devlink_resources_unregister(devlink, child_resource);
@@ -9946,7 +9945,7 @@ int devlink_resource_size_get(struct devlink *devlink,
struct devlink_resource *resource;
int err = 0;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (!resource) {
err = -EINVAL;
@@ -9975,7 +9974,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink,
struct devlink_dpipe_table *table;
int err = 0;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
if (!table) {
@@ -10006,7 +10005,7 @@ void devlink_resource_occ_get_register(struct devlink *devlink,
{
struct devlink_resource *resource;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (WARN_ON(!resource))
goto out;
@@ -10030,7 +10029,7 @@ void devlink_resource_occ_get_unregister(struct devlink *devlink,
{
struct devlink_resource *resource;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (WARN_ON(!resource))
goto out;
@@ -10278,7 +10277,7 @@ devlink_region_create(struct devlink *devlink,
if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
return ERR_PTR(-EINVAL);
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
if (devlink_region_get_by_name(devlink, ops->name)) {
err = -EEXIST;
@@ -10369,7 +10368,7 @@ void devlink_region_destroy(struct devlink_region *region)
struct devlink *devlink = region->devlink;
struct devlink_snapshot *snapshot, *ts;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
/* Free all snapshots of region */
list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
@@ -10402,7 +10401,7 @@ int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
int err;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
err = __devlink_region_snapshot_id_get(devlink, id);
mutex_unlock(&devlink->lock);
@@ -10422,7 +10421,7 @@ EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
*/
void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
{
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
__devlink_snapshot_id_decrement(devlink, id);
mutex_unlock(&devlink->lock);
}
@@ -10446,7 +10445,7 @@ int devlink_region_snapshot_create(struct devlink_region *region,
struct devlink *devlink = region->devlink;
int err;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
err = __devlink_region_snapshot_create(region, data, snapshot_id);
mutex_unlock(&devlink->lock);
@@ -10831,7 +10830,7 @@ int devlink_traps_register(struct devlink *devlink,
if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
return -EINVAL;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < traps_count; i++) {
const struct devlink_trap *trap = &traps[i];
@@ -10868,7 +10867,7 @@ void devlink_traps_unregister(struct devlink *devlink,
{
int i;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
/* Make sure we do not have any packets in-flight while unregistering
* traps by disabling all of them and waiting for a grace period.
*/
@@ -11049,7 +11048,7 @@ int devlink_trap_groups_register(struct devlink *devlink,
{
int i, err;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < groups_count; i++) {
const struct devlink_trap_group *group = &groups[i];
@@ -11086,7 +11085,7 @@ void devlink_trap_groups_unregister(struct devlink *devlink,
{
int i;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
for (i = groups_count - 1; i >= 0; i--)
devlink_trap_group_unregister(devlink, &groups[i]);
mutex_unlock(&devlink->lock);
@@ -11189,7 +11188,7 @@ devlink_trap_policers_register(struct devlink *devlink,
{
int i, err;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < policers_count; i++) {
const struct devlink_trap_policer *policer = &policers[i];
@@ -11230,7 +11229,7 @@ devlink_trap_policers_unregister(struct devlink *devlink,
{
int i;
- mutex_lock(&devlink->lock);
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
for (i = policers_count - 1; i >= 0; i--)
devlink_trap_policer_unregister(devlink, &policers[i]);
mutex_unlock(&devlink->lock);
@@ -11400,6 +11399,8 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
if (!net_eq(devlink_net(devlink), net))
goto retry;
+ mutex_lock_nested(&devlink->lock, SINGLE_DEPTH_NESTING);
+
WARN_ON(!(devlink->features & DEVLINK_F_RELOAD));
err = devlink_reload(devlink, &init_net,
DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
@@ -11407,6 +11408,7 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
&actions_performed, NULL);
if (err && err != -EOPNOTSUPP)
pr_warn("Failed to reload devlink instance into init_net\n");
+ mutex_unlock(&devlink->lock);
retry:
devlink_put(devlink);
}
--
2.31.1
On Sun, Oct 31, 2021 at 07:35:56PM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <[email protected]>
>
> Devlink reload was implemented as a special command which does _SET_
> operation, but doesn't take devlink->lock, while recursive devlink
> calls that were part of .reload_up()/.reload_down() sequence took it.
>
> This fragile flow was possible due to holding a big devlink lock
> (devlink_mutex), which effectively stopped all devlink activities,
> even unrelated to reloaded devlink.
>
> So let's make sure that devlink reload behaves as other commands and
> use special nested locking primitives with a depth of 1. Such change
> opens us to a venue of removing devlink_mutex completely, while keeping
> devlink locking complexity in devlink.c.
Jakub/Dave, I will be mostly unavailable until later this week, but I
have applied this patch to our queue and can report testing results
tomorrow. I would appreciate it if you could hold off on applying it
until then.
Thanks
Sun, Oct 31, 2021 at 06:35:56PM CET, [email protected] wrote:
>From: Leon Romanovsky <[email protected]>
>
>Devlink reload was implemented as a special command which does _SET_
>operation, but doesn't take devlink->lock, while recursive devlink
>calls that were part of .reload_up()/.reload_down() sequence took it.
>
>This fragile flow was possible due to holding a big devlink lock
>(devlink_mutex), which effectively stopped all devlink activities,
>even unrelated to reloaded devlink.
>
>So let's make sure that devlink reload behaves as other commands and
>use special nested locking primitives with a depth of 1. Such change
>opens us to a venue of removing devlink_mutex completely, while keeping
>devlink locking complexity in devlink.c.
>
>Signed-off-by: Leon Romanovsky <[email protected]>
Looks fine to me.
Reviewed-by: Jiri Pirko <[email protected]>
On Mon, Nov 01, 2021 at 04:03:05PM +0100, Jiri Pirko wrote:
> Sun, Oct 31, 2021 at 06:35:56PM CET, [email protected] wrote:
> >From: Leon Romanovsky <[email protected]>
> >
> >Devlink reload was implemented as a special command which does _SET_
> >operation, but doesn't take devlink->lock, while recursive devlink
> >calls that were part of .reload_up()/.reload_down() sequence took it.
> >
> >This fragile flow was possible due to holding a big devlink lock
> >(devlink_mutex), which effectively stopped all devlink activities,
> >even unrelated to reloaded devlink.
> >
> >So let's make sure that devlink reload behaves as other commands and
> >use special nested locking primitives with a depth of 1. Such change
> >opens us to a venue of removing devlink_mutex completely, while keeping
> >devlink locking complexity in devlink.c.
> >
> >Signed-off-by: Leon Romanovsky <[email protected]>
>
> Looks fine to me.
>
> Reviewed-by: Jiri Pirko <[email protected]>
Traces from mlxsw / netdevsim below:
INFO: task syz-executor.0:569 blocked for more than 143 seconds.
Not tainted 5.15.0-rc7-custom-64786-g16ed1b4acff7 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor.0 state:D stack:24856 pid: 569 ppid: 1 flags:0x00004004
Call Trace:
context_switch kernel/sched/core.c:4940 [inline]
__schedule+0x9da/0x22f0 kernel/sched/core.c:6287
schedule+0xe5/0x280 kernel/sched/core.c:6366
schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:6425
__mutex_lock_common kernel/locking/mutex.c:669 [inline]
__mutex_lock+0xc06/0x1630 kernel/locking/mutex.c:729
mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:743
devlink_port_unregister+0x6a/0x260 net/core/devlink.c:9255
__mlxsw_core_port_fini+0x4c/0x70 drivers/net/ethernet/mellanox/mlxsw/core.c:2787
mlxsw_core_port_fini+0x3c/0x50 drivers/net/ethernet/mellanox/mlxsw/core.c:2817
mlxsw_sp_port_remove+0x3a2/0x500 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:1807
mlxsw_sp_ports_remove drivers/net/ethernet/mellanox/mlxsw/spectrum.c:1870 [inline]
mlxsw_sp_fini+0xec/0x500 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3095
mlxsw_core_bus_device_unregister+0xdf/0x6c0 drivers/net/ethernet/mellanox/mlxsw/core.c:2091
mlxsw_devlink_core_bus_device_reload_down+0x87/0xb0 drivers/net/ethernet/mellanox/mlxsw/core.c:1473
devlink_reload+0x184/0x630 net/core/devlink.c:4040
devlink_nl_cmd_reload+0x612/0x1320 net/core/devlink.c:4161
genl_family_rcv_msg_doit.isra.0+0x253/0x370 net/netlink/genetlink.c:731
genl_family_rcv_msg net/netlink/genetlink.c:775 [inline]
genl_rcv_msg+0x389/0x620 net/netlink/genetlink.c:792
netlink_rcv_skb+0x173/0x480 net/netlink/af_netlink.c:2491
genl_rcv+0x2e/0x40 net/netlink/genetlink.c:803
netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
netlink_unicast+0x5ae/0x7f0 net/netlink/af_netlink.c:1345
netlink_sendmsg+0x8e1/0xe30 net/netlink/af_netlink.c:1916
sock_sendmsg_nosec net/socket.c:704 [inline]
sock_sendmsg net/socket.c:724 [inline]
__sys_sendto+0x2b6/0x410 net/socket.c:2036
__do_sys_sendto net/socket.c:2048 [inline]
__se_sys_sendto net/socket.c:2044 [inline]
__x64_sys_sendto+0xe6/0x1a0 net/socket.c:2044
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0x80 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x41f91a
RSP: 002b:00007fffc5b2f5b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00000000014c4320 RCX: 000000000041f91a
RDX: 0000000000000038 RSI: 00000000014c4370 RDI: 0000000000000004
RBP: 0000000000000001 R08: 00007fffc5b2f5d4 R09: 000000000000000c
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00000000014c4370 R14: 0000000000000004 R15: 0000000000000000
Showing all locks held in the system:
3 locks held by kworker/1:0/20:
#0: ffff88811aff0558 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x28/0x40 kernel/sched/core.c:474
#1: ffffffff85ae9c20 (rcu_read_lock){....}-{1:2}, at: trace_sched_stat_runtime include/trace/events/sched.h:517 [inline]
#1: ffffffff85ae9c20 (rcu_read_lock){....}-{1:2}, at: update_curr+0x1e3/0x7c0 kernel/sched/fair.c:852
#2: ffff88811afdfcc0 (krc.lock){..-.}-{2:2}, at: kfree_rcu_work+0x549/0x10e0 kernel/rcu/tree.c:3288
1 lock held by khungtaskd/26:
#0: ffffffff85ae9c20 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x5f/0x27f kernel/locking/lockdep.c:6448
1 lock held by systemd-journal/95:
1 lock held by in:imklog/410:
#0: ffff88810da2cbc0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xff/0x120 fs/file.c:990
4 locks held by rs:main Q:Reg/411:
5 locks held by syz-executor.0/569:
#0: ffffffff862f26b8 (cb_lock){++++}-{3:3}, at: genl_rcv+0x1f/0x40 net/netlink/genetlink.c:802
#1: ffffffff862f2790 (genl_mutex){+.+.}-{3:3}, at: genl_lock net/netlink/genetlink.c:33 [inline]
#1: ffffffff862f2790 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg+0x43d/0x620 net/netlink/genetlink.c:790
#2: ffffffff862cec70 (devlink_mutex){+.+.}-{3:3}, at: devlink_nl_pre_doit+0x34/0xa10 net/core/devlink.c:575
#3: ffff88810344c278 (&devlink->lock#2){+.+.}-{3:3}, at: devlink_nl_pre_doit+0x4ae/0xa10 net/core/devlink.c:582
#4: ffff88810344c278 (&devlink->lock/1){+.+.}-{3:3}, at: devlink_port_unregister+0x6a/0x260 net/core/devlink.c:9255
INFO: task devlink:1172 blocked for more than 143 seconds.
Not tainted 5.15.0-rc7-custom-64786-g16ed1b4acff7 #536
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:devlink state:D stack:25736 pid: 1172 ppid: 977 flags:0x00004000
Call Trace:
__schedule+0xa14/0x2660
schedule+0xd8/0x270
schedule_preempt_disabled+0x14/0x20
__mutex_lock+0xa1f/0x1320
devlink_port_unregister+0xd2/0x2d0
__nsim_dev_port_del+0x1c8/0x250 [netdevsim]
nsim_dev_reload_destroy+0x1a3/0x2f0 [netdevsim]
nsim_dev_reload_down+0x105/0x1a0 [netdevsim]
devlink_reload+0x52a/0x6a0
devlink_nl_cmd_reload+0x71d/0x1210
genl_family_rcv_msg_doit+0x22a/0x320
genl_rcv_msg+0x341/0x5a0
netlink_rcv_skb+0x14d/0x430
genl_rcv+0x29/0x40
netlink_unicast+0x539/0x7e0
netlink_sendmsg+0x84d/0xd80
__sys_sendto+0x23f/0x350
__x64_sys_sendto+0xe2/0x1b0
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7fd4f9d0314a
RSP: 002b:00007fff18371848 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fd4f9d0314a
RDX: 0000000000000034 RSI: 00000000006f0ad0 RDI: 0000000000000003
RBP: 00000000006f0aa0 R08: 00007fd4f9dd9200 R09: 000000000000000c
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000408d80
R13: 00000000006f0910 R14: 0000000000000000 R15: 0000000000000001
Showing all locks held in the system:
1 lock held by khungtaskd/62:
#0: ffffffff83ca4aa0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x28c
3 locks held by kworker/4:2/144:
#0: ffff888100101548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x8ac/0x16f0
#1: ffffc9000209fdb0 ((work_completion)(&(&nsim_dev->trap_data->trap_report_dw)->work)){+.+.}-{0:0}, at: process_one_work+0x8e0/0x16f0
#2: ffff888123f9d440 (&nsim_dev->port_list_lock){+.+.}-{3:3}, at: nsim_dev_trap_report_work+0x62/0xbc0 [netdevsim]
1 lock held by dmesg/1045:
7 locks held by devlink/1172:
#0: ffffffff8423dbd8 (cb_lock){++++}-{3:3}, at: genl_rcv+0x1a/0x40
#1: ffffffff8423dcb0 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg+0x3f9/0x5a0
#2: ffffffff84222050 (devlink_mutex){+.+.}-{3:3}, at: devlink_nl_pre_doit+0x30/0xa20
#3: ffff888123f9d278 (&devlink->lock#2){+.+.}-{3:3}, at: devlink_nl_pre_doit+0x376/0xa20
#4: ffff888176758d10 (&nsim_bus_dev->nsim_bus_reload_lock){+.+.}-{3:3}, at: nsim_dev_reload_down+0x52/0x1a0 [netdevsim]
#5: ffff888123f9d440 (&nsim_dev->port_list_lock){+.+.}-{3:3}, at: nsim_dev_reload_destroy+0x142/0x2f0 [netdevsim]
#6: ffff888123f9d278 (&devlink->lock/1){+.+.}-{3:3}, at: devlink_port_unregister+0xd2/0x2d0
On Mon, 1 Nov 2021 22:52:19 +0200 Ido Schimmel wrote:
> > >Signed-off-by: Leon Romanovsky <[email protected]>
> >
> > Looks fine to me.
> >
> > Reviewed-by: Jiri Pirko <[email protected]>
>
> Traces from mlxsw / netdevsim below:
Thanks a lot for the testing Ido!
Would you mind giving my RFC a spin as well on your syzbot machinery?
Any input on the three discussion points there?
(1) should we have a "locked" and "unlocked" API or use lock nesting?
(2) should we expose devlink lock so that drivers can use devlink
as a framework for their locking needs?
(3) should we let drivers take refs on the devlink instance?
On Mon, Nov 01, 2021 at 04:11:22PM -0700, Jakub Kicinski wrote:
> On Mon, 1 Nov 2021 22:52:19 +0200 Ido Schimmel wrote:
> > > >Signed-off-by: Leon Romanovsky <[email protected]>
> > >
> > > Looks fine to me.
> > >
> > > Reviewed-by: Jiri Pirko <[email protected]>
> >
> > Traces from mlxsw / netdevsim below:
>
> Thanks a lot for the testing Ido!
>
> Would you mind giving my RFC a spin as well on your syzbot machinery?
Sorry for the delay. I didn't have a lot of time last week.
I tried to apply your set [1] on top of net-next, but I'm getting a
conflict with patch #5. Can you send me (here / privately) a link to a
git tree that has the patches on top of net-next?
TBH, if you ran the netdevsim selftests with a debug config and nothing
exploded, then I don't expect syzkaller to find anything (major).
[1] https://lore.kernel.org/netdev/[email protected]/
>
> Any input on the three discussion points there?
>
> (1) should we have a "locked" and "unlocked" API or use lock nesting?
Judging by the netdevsim conversion, it seems that for the vast majority
of APIs (if not all) we will only have an "unlocked" API. Drivers will
hold the devlink instance lock on probe / remove and devlink itself will
hold the lock when calling into drivers (e.g., on reload, port split).
>
> (2) should we expose devlink lock so that drivers can use devlink
> as a framework for their locking needs?
It is better than dropping locks (e.g., DEVLINK_NL_FLAG_NO_LOCK, which I
expect will go away after the conversion). With the asserts you put in
place, misuses will be caught early.
>
> (3) should we let drivers take refs on the devlink instance?
I think it's fine mainly because I don't expect it to be used by too
many drivers other than netdevsim which is somewhat special. Looking at
the call sites of devlink_get() in netdevsim, it is only called from
places (debugfs and trap workqueue) that shouldn't be present in real
drivers.
The tl;dr is that your approach makes sense to me. I was initially
worried that we will need to propagate a "reload" argument everywhere in
drivers, but you wrote "The expectation is that driver will take the
devlink instance lock on its probe and remove paths", which avoids that.
Thanks for working on that
On Sun, Nov 07, 2021 at 07:16:05PM +0200, Ido Schimmel wrote:
> On Mon, Nov 01, 2021 at 04:11:22PM -0700, Jakub Kicinski wrote:
> > On Mon, 1 Nov 2021 22:52:19 +0200 Ido Schimmel wrote:
> > > > >Signed-off-by: Leon Romanovsky <[email protected]>
> > > >
> > > > Looks fine to me.
> > > >
> > > > Reviewed-by: Jiri Pirko <[email protected]>
> > >
> > > Traces from mlxsw / netdevsim below:
> >
> > Thanks a lot for the testing Ido!
> >
> > Would you mind giving my RFC a spin as well on your syzbot machinery?
<...>
> >
> > (3) should we let drivers take refs on the devlink instance?
>
> I think it's fine mainly because I don't expect it to be used by too
> many drivers other than netdevsim which is somewhat special. Looking at
> the call sites of devlink_get() in netdevsim, it is only called from
> places (debugfs and trap workqueue) that shouldn't be present in real
> drivers.
Sorry, I'm obligated to ask. In which universe is it ok to create new
set of API that no real driver should use?
Thanks
On Sun, 7 Nov 2021 19:54:20 +0200 Leon Romanovsky wrote:
> > > (3) should we let drivers take refs on the devlink instance?
> >
> > I think it's fine mainly because I don't expect it to be used by too
> > many drivers other than netdevsim which is somewhat special. Looking at
> > the call sites of devlink_get() in netdevsim, it is only called from
> > places (debugfs and trap workqueue) that shouldn't be present in real
> > drivers.
>
> Sorry, I'm obligated to ask. In which universe is it ok to create new
> set of API that no real driver should use?
I think it's common sense. We're just exporting something to make our
lives easier somewhere else in the tree. Do you see a way in which
taking refs on devlink can help out-of-tree code?
BTW we can put the symbols in a namespace or under a kconfig, to aid
reviews of drivers using them if you want.
On Mon, Nov 08, 2021 at 08:09:18AM -0800, Jakub Kicinski wrote:
> On Sun, 7 Nov 2021 19:54:20 +0200 Leon Romanovsky wrote:
> > > > (3) should we let drivers take refs on the devlink instance?
> > >
> > > I think it's fine mainly because I don't expect it to be used by too
> > > many drivers other than netdevsim which is somewhat special. Looking at
> > > the call sites of devlink_get() in netdevsim, it is only called from
> > > places (debugfs and trap workqueue) that shouldn't be present in real
> > > drivers.
> >
> > Sorry, I'm obligated to ask. In which universe is it ok to create new
> > set of API that no real driver should use?
>
> I think it's common sense. We're just exporting something to make our
> lives easier somewhere else in the tree. Do you see a way in which
> taking refs on devlink can help out-of-tree code?
I didn't go that far in my thoughts. My main concern is that you are
exposing broken devlink internals in the hope that drivers will do better
locking. I wanted to show that internal locking should be fixed first.
https://lore.kernel.org/netdev/[email protected]/T/#m093f067d0cafcbe6c05ed469bcfd708dd1eb7f36
While this series fixes locking and after all my changes devlink started
to be more secure, that works correctly for simple drivers. However for
net namespace aware drivers it still stays DOA.
As you can see, devlink reload holds pernet_ops_rwsem, which drivers should
take too in order to unregister_netdevice_notifier.
So for me, the difference between netdevsim and real device (mlx5) is
too huge to really invest time into netdevsim-centric API, because it
won't solve any of real world problems.
sudo ip netns add n1
sudo devlink dev reload pci/0000:00:09.0 netns n1
sudo ip netns del n1
[ 463.357081] ======================================================
[ 463.357309] WARNING: possible circular locking dependency detected
[ 463.357452] 5.15.0-rc7+ #286 Not tainted
[ 463.357532] ------------------------------------------------------
[ 463.357668] kworker/u16:1/9 is trying to acquire lock:
[ 463.357777] ffff888011694648 (&devlink->lock){+.+.}-{3:3}, at: devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.358006]
[ 463.358006] but task is already holding lock:
[ 463.358150] ffffffff83602a50 (pernet_ops_rwsem){++++}-{3:3}, at: cleanup_net+0x9c/0x8e0
[ 463.358334]
[ 463.358334] which lock already depends on the new lock.
[ 463.358334]
[ 463.358923]
[ 463.358923] the existing dependency chain (in reverse order) is:
[ 463.359093]
[ 463.359093] -> #3 (pernet_ops_rwsem){++++}-{3:3}:
[ 463.359291] down_write+0x92/0x150
[ 463.359386] unregister_netdevice_notifier+0x1e/0x150
[ 463.359519] mlx5_ib_roce_cleanup+0x23a/0x480 [mlx5_ib]
[ 463.359709] mlx5r_remove+0xb4/0x130 [mlx5_ib]
[ 463.359873] auxiliary_bus_remove+0x52/0x70
[ 463.359997] __device_release_driver+0x334/0x660
[ 463.360110] device_release_driver+0x26/0x40
[ 463.360225] bus_remove_device+0x2a5/0x560
[ 463.360331] device_del+0x489/0xb80
[ 463.360423] mlx5_detach_device+0x14b/0x2c0 [mlx5_core]
[ 463.360677] mlx5_unload_one+0x2d/0xa0 [mlx5_core]
[ 463.360849] mlx5_devlink_reload_down+0x1be/0x360 [mlx5_core]
[ 463.361019] devlink_reload+0x48b/0x610
[ 463.361108] devlink_nl_cmd_reload+0x5c3/0xf90
[ 463.361192] genl_family_rcv_msg_doit+0x1e9/0x2f0
[ 463.361288] genl_rcv_msg+0x27f/0x4a0
[ 463.361378] netlink_rcv_skb+0x11e/0x340
[ 463.361470] genl_rcv+0x24/0x40
[ 463.361538] netlink_unicast+0x433/0x700
[ 463.361627] netlink_sendmsg+0x705/0xbe0
[ 463.361720] sock_sendmsg+0xb0/0xe0
[ 463.361792] __sys_sendto+0x192/0x240
[ 463.361855] __x64_sys_sendto+0xdc/0x1b0
[ 463.361953] do_syscall_64+0x3d/0x90
[ 463.362016] entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 463.362108]
[ 463.362108] -> #2 (mlx5_intf_mutex){+.+.}-{3:3}:
[ 463.362249] __mutex_lock+0x150/0x15c0
[ 463.362340] mlx5_lag_add_mdev+0x36/0x5e0 [mlx5_core]
[ 463.362490] mlx5_load+0x155/0x1c0 [mlx5_core]
[ 463.362615] mlx5_init_one+0x2d5/0x400 [mlx5_core]
[ 463.362751] probe_one+0x430/0x680 [mlx5_core]
[ 463.362874] pci_device_probe+0x2a0/0x4a0
[ 463.362996] really_probe+0x1cc/0xba0
[ 463.363066] __driver_probe_device+0x18f/0x470
[ 463.363147] driver_probe_device+0x49/0x120
[ 463.363258] __driver_attach+0x1ce/0x400
[ 463.363358] bus_for_each_dev+0x11e/0x1a0
[ 463.363462] bus_add_driver+0x309/0x570
[ 463.363558] driver_register+0x20f/0x390
[ 463.363661] value_read+0x62/0x160 [ib_core]
[ 463.363821] do_one_initcall+0xd5/0x400
[ 463.363929] do_init_module+0x1c8/0x760
[ 463.364035] load_module+0x7d9d/0xa4b0
[ 463.364133] __do_sys_finit_module+0x118/0x1a0
[ 463.364232] do_syscall_64+0x3d/0x90
[ 463.364321] entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 463.364422]
[ 463.364422] -> #1 (&dev->intf_state_mutex){+.+.}-{3:3}:
[ 463.364554] __mutex_lock+0x150/0x15c0
[ 463.364658] mlx5_unload_one+0x1e/0xa0 [mlx5_core]
[ 463.364787] mlx5_devlink_reload_down+0x1be/0x360 [mlx5_core]
[ 463.364946] devlink_reload+0x48b/0x610
[ 463.365040] devlink_nl_cmd_reload+0x5c3/0xf90
[ 463.365137] genl_family_rcv_msg_doit+0x1e9/0x2f0
[ 463.365245] genl_rcv_msg+0x27f/0x4a0
[ 463.365310] netlink_rcv_skb+0x11e/0x340
[ 463.365404] genl_rcv+0x24/0x40
[ 463.365477] netlink_unicast+0x433/0x700
[ 463.365576] netlink_sendmsg+0x705/0xbe0
[ 463.365666] sock_sendmsg+0xb0/0xe0
[ 463.365746] __sys_sendto+0x192/0x240
[ 463.365817] __x64_sys_sendto+0xdc/0x1b0
[ 463.365907] do_syscall_64+0x3d/0x90
[ 463.365980] entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 463.366074]
[ 463.366074] -> #0 (&devlink->lock){+.+.}-{3:3}:
[ 463.366209] __lock_acquire+0x2999/0x5a40
[ 463.366322] lock_acquire+0x1a9/0x4a0
[ 463.366401] __mutex_lock+0x150/0x15c0
[ 463.366499] devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.366598] cleanup_net+0x372/0x8e0
[ 463.366674] process_one_work+0x8f5/0x1580
[ 463.366779] worker_thread+0x58d/0x1330
[ 463.366881] kthread+0x379/0x450
[ 463.366952] ret_from_fork+0x1f/0x30
[ 463.367020]
[ 463.367020] other info that might help us debug this:
[ 463.367020]
[ 463.367173] Chain exists of:
[ 463.367173] &devlink->lock --> mlx5_intf_mutex --> pernet_ops_rwsem
[ 463.367173]
[ 463.367360] Possible unsafe locking scenario:
[ 463.367360]
[ 463.367478] CPU0 CPU1
[ 463.367574] ---- ----
[ 463.367663] lock(pernet_ops_rwsem);
[ 463.367748] lock(mlx5_intf_mutex);
[ 463.367874] lock(pernet_ops_rwsem);
[ 463.368027] lock(&devlink->lock);
[ 463.368098]
[ 463.368098] *** DEADLOCK ***
[ 463.368098]
[ 463.368233] 3 locks held by kworker/u16:1/9:
[ 463.368347] #0: ffff888005df8938 ((wq_completion)netns){+.+.}-{0:0}, at: process_one_work+0x80a/0x1580
[ 463.371930] #1: ffff8880059c7db0 (net_cleanup_work){+.+.}-{0:0}, at: process_one_work+0x837/0x1580
[ 463.372376] #2: ffffffff83602a50 (pernet_ops_rwsem){++++}-{3:3}, at: cleanup_net+0x9c/0x8e0
[ 463.372638]
[ 463.372638] stack backtrace:
[ 463.372804] CPU: 3 PID: 9 Comm: kworker/u16:1 Not tainted 5.15.0-rc7+ #286
[ 463.372965] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 463.373258] Workqueue: netns cleanup_net
[ 463.373366] Call Trace:
[ 463.373444] dump_stack_lvl+0x45/0x59
[ 463.373545] check_noncircular+0x268/0x310
[ 463.373639] ? print_circular_bug+0x460/0x460
[ 463.373767] ? mark_lock+0x104/0x2e30
[ 463.373899] ? find_busiest_group+0x1bc0/0x27a0
[ 463.374083] ? alloc_chain_hlocks+0x1e6/0x5a0
[ 463.374254] __lock_acquire+0x2999/0x5a40
[ 463.374344] ? lockdep_hardirqs_on_prepare+0x3e0/0x3e0
[ 463.374480] ? __lock_acquire+0xbec/0x5a40
[ 463.374585] lock_acquire+0x1a9/0x4a0
[ 463.374693] ? devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.374817] ? lock_release+0x6c0/0x6c0
[ 463.374905] ? lock_is_held_type+0x98/0x110
[ 463.374996] ? lock_is_held_type+0x98/0x110
[ 463.375089] __mutex_lock+0x150/0x15c0
[ 463.375178] ? devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.375291] ? lock_downgrade+0x6d0/0x6d0
[ 463.375381] ? devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.375507] ? lock_is_held_type+0x98/0x110
[ 463.375602] ? find_held_lock+0x2d/0x110
[ 463.375698] ? mutex_lock_io_nested+0x1400/0x1400
[ 463.375848] ? lock_release+0x1f9/0x6c0
[ 463.375978] ? devlink_pernet_pre_exit+0x17e/0x2a0
[ 463.376105] ? lock_downgrade+0x6d0/0x6d0
[ 463.376195] ? lock_is_held_type+0x98/0x110
[ 463.376283] ? find_held_lock+0x2d/0x110
[ 463.376379] ? devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.376529] devlink_pernet_pre_exit+0x1b4/0x2a0
[ 463.376675] ? devlink_nl_cmd_reload+0xf90/0xf90
[ 463.376790] ? mark_held_locks+0x9f/0xe0
[ 463.376887] ? lockdep_hardirqs_on_prepare+0x273/0x3e0
[ 463.377001] ? __local_bh_enable_ip+0xa2/0x100
[ 463.377148] cleanup_net+0x372/0x8e0
[ 463.377238] ? unregister_pernet_device+0x70/0x70
[ 463.377353] ? lock_is_held_type+0x98/0x110
[ 463.377467] process_one_work+0x8f5/0x1580
[ 463.377560] ? lock_release+0x6c0/0x6c0
[ 463.377647] ? pwq_dec_nr_in_flight+0x230/0x230
[ 463.377771] ? rwlock_bug.part.0+0x90/0x90
[ 463.377891] worker_thread+0x58d/0x1330
[ 463.377987] ? process_one_work+0x1580/0x1580
[ 463.378102] kthread+0x379/0x450
[ 463.378193] ? _raw_spin_unlock_irq+0x24/0x30
[ 463.378326] ? set_kthread_struct+0x100/0x100
[ 463.378457] ret_from_fork+0x1f/0x30
>
> BTW we can put the symbols in a namespace or under a kconfig, to aid
> reviews of drivers using them if you want.
On Mon, Nov 08, 2021 at 10:16:46AM -0800, Jakub Kicinski wrote:
> On Mon, 8 Nov 2021 19:32:19 +0200 Leon Romanovsky wrote:
> > > I think it's common sense. We're just exporting something to make our
> > > lives easier somewhere else in the tree. Do you see a way in which
> > > taking refs on devlink can help out-of-tree code?
> >
> > I didn't go that far in my thoughts. My main concern is that you are
> > exposing broken devlink internals in the hope that drivers will do better
> > locking. I wanted to show that internal locking should be fixed first.
> >
> > https://lore.kernel.org/netdev/[email protected]/T/#m093f067d0cafcbe6c05ed469bcfd708dd1eb7f36
> >
> > While this series fixes locking and after all my changes devlink started
> > to be more secure, that works correctly for simple drivers.
>
> I prefer my version. I think I asked you to show how the changes make
> drivers simpler, which you failed to do.
Why did I fail? My version requires **zero** changes to the drivers.
Everything works without them changing anything. You can't ask for more.
>
> I already told you how this is going to go, don't expect me to comment
> too much.
>
> > However for net namespace aware drivers it still stays DOA.
> >
> > As you can see, devlink reload holds pernet_ops_rwsem, which drivers should
> > take too in order to unregister_netdevice_notifier.
> >
> > So for me, the difference between netdevsim and real device (mlx5) is
> > too huge to really invest time into netdevsim-centric API, because it
> > won't solve any of real world problems.
>
> Did we not already go over this? Sorry, it feels like you're repeating
> arguments which I replied to before. This is exhausting.
I don't enjoy it either.
>
> nfp will benefit from the simplified locking as well, and so will bnxt,
> although I'm not sure the maintainers will opt for using devlink framework
> due to the downstream requirements.
Exactly why devlink should be fixed first.
Thanks
On Mon, 8 Nov 2021 20:24:37 +0200 Leon Romanovsky wrote:
> > I prefer my version. I think I asked you to show how the changes make
> > drivers simpler, which you failed to do.
>
> Why did I fail? My version requires **zero** changes to the drivers.
> Everything works without them changing anything. You can't ask for more.
For the last time.
"Your version" does require driver changes, but for better or worse
we have already committed them to the tree. All the re-ordering to make
sure devlink is registered last and more work is done at alloc,
remember?
The goal is to make the upstream drivers simpler. You failed to show
how your code does that.
Maybe you don't see the benefit because upstream simplifications are
hard to depend on in out-of-tree drivers?
> > I already told you how this is going to go, don't expect me to comment
> > too much.
> >
> > > However for net namespace aware drivers it still stays DOA.
> > >
> > > As you can see, devlink reload holds pernet_ops_rwsem, which drivers should
> > > take too in order to unregister_netdevice_notifier.
> > >
> > > So for me, the difference between netdevsim and real device (mlx5) is
> > > too huge to really invest time into netdevsim-centric API, because it
> > > won't solve any of real world problems.
> >
> > Did we not already go over this? Sorry, it feels like you're repeating
> > arguments which I replied to before. This is exhausting.
>
> I don't enjoy it either.
>
> > nfp will benefit from the simplified locking as well, and so will bnxt,
> > although I'm not sure the maintainers will opt for using devlink framework
> > due to the downstream requirements.
>
> Exactly why devlink should be fixed first.
If by "fixed first" you mean it needs 5 locks to be added and to remove
any guarantees on sub-object lifetime then no thanks.
On Mon, 8 Nov 2021 19:32:19 +0200 Leon Romanovsky wrote:
> > I think it's common sense. We're just exporting something to make our
> > lives easier somewhere else in the tree. Do you see a way in which
> > taking refs on devlink can help out-of-tree code?
>
> I didn't go that far in my thoughts. My main concern is that you are
> exposing broken devlink internals in the hope that drivers will do better
> locking. I wanted to show that internal locking should be fixed first.
>
> https://lore.kernel.org/netdev/[email protected]/T/#m093f067d0cafcbe6c05ed469bcfd708dd1eb7f36
>
> While this series fixes locking and after all my changes devlink started
> to be more secure, that works correctly for simple drivers.
I prefer my version. I think I asked you to show how the changes make
drivers simpler, which you failed to do.
I already told you how this is going to go, don't expect me to comment
too much.
> However for net namespace aware drivers it still stays DOA.
>
> As you can see, devlink reload holds pernet_ops_rwsem, which drivers should
> take too in order to unregister_netdevice_notifier.
>
> So for me, the difference between netdevsim and real device (mlx5) is
> too huge to really invest time into netdevsim-centric API, because it
> won't solve any of real world problems.
Did we not already go over this? Sorry, it feels like you're repeating
arguments which I replied to before. This is exhausting.
nfp will benefit from the simplified locking as well, and so will bnxt,
although I'm not sure the maintainers will opt for using devlink framework
due to the downstream requirements.
On Mon, Nov 08, 2021 at 10:46:08AM -0800, Jakub Kicinski wrote:
> On Mon, 8 Nov 2021 20:24:37 +0200 Leon Romanovsky wrote:
> > > I prefer my version. I think I asked you to show how the changes make
> > > drivers simpler, which you failed to do.
> >
> > Why did I fail? My version requires **zero** changes to the drivers.
> > Everything works without them changing anything. You can't ask for more.
>
> For the last time.
>
> "Your version" does require driver changes, but for better or worse
> we have already committed them to the tree. All the re-ordering to make
> sure devlink is registered last and more work is done at alloc,
> remember?
It fixed access to devlink before driver is ready. Also it fixed devlink
reload of simple drivers (without net namespaces support). So yes, at
least for now, we have a workaround to devlink reload bugs. We rmmod
mlx5_ib before reload and after. Everything thanks to reordering.
>
> The goal is to make the upstream drivers simpler. You failed to show
> how your code does that.
>
> Maybe you don't see the benefit because upstream simplifications are
> hard to depend on in out-of-tree drivers?
I don't care about out-of-tree drivers, mlx5 is fully upstream.
>
> > > I already told you how this is going to go, don't expect me to comment
> > > too much.
> > >
> > > > However for net namespace aware drivers it still stays DOA.
> > > >
> > > > As you can see, devlink reload holds pernet_ops_rwsem, which drivers should
> > > > take too in order to unregister_netdevice_notifier.
> > > >
> > > > So for me, the difference between netdevsim and real device (mlx5) is
> > > > too huge to really invest time into netdevsim-centric API, because it
> > > > won't solve any of real world problems.
> > >
> > > Did we not already go over this? Sorry, it feels like you're repeating
> > > arguments which I replied to before. This is exhausting.
> >
> > I don't enjoy it either.
> >
> > > nfp will benefit from the simplified locking as well, and so will bnxt,
> > > although I'm not sure the maintainers will opt for using devlink framework
> > > due to the downstream requirements.
> >
> > Exactly why devlink should be fixed first.
>
> If by "fixed first" you mean it needs 5 locks to be added and to remove
> any guarantees on sub-object lifetime then no thanks.
How do you plan to fix pernet_ops_rwsem lock? By exposing devlink state
to the drivers? By providing unlocked version of unregister_netdevice_notifier?
This simple scenario has deadlocks:
sudo ip netns add n1
sudo devlink dev reload pci/0000:00:09.0 netns n1
sudo ip netns del n1
https://lore.kernel.org/netdev/20211108104608.378c106e@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com/T/#m94b5c173f134c7d19daf455e3f6bad5ba6afd90d
On Mon, 8 Nov 2021 21:58:36 +0200 Leon Romanovsky wrote:
> > > > nfp will benefit from the simplified locking as well, and so will bnxt,
> > > > although I'm not sure the maintainers will opt for using devlink framework
> > > > due to the downstream requirements.
> > >
> > > Exactly why devlink should be fixed first.
> >
> > If by "fixed first" you mean it needs 5 locks to be added and to remove
> > any guarantees on sub-object lifetime then no thanks.
>
> How do you plan to fix pernet_ops_rwsem lock? By exposing devlink state
> to the drivers? By providing unlocked version of unregister_netdevice_notifier?
>
> This simple scenario has deadlocks:
> sudo ip netns add n1
> sudo devlink dev reload pci/0000:00:09.0 netns n1
> sudo ip netns del n1
Okay - I'm not sure why you're asking me this. This is not related to
devlink locking as far as I can tell. Neither are you fixing this
problem in your own RFC.
You'd need to tell me more about what the notifier is used for (I see
RoCE in the call trace). I don't understand why you need to re-register
a global (i.e. not per netns) notifier when devlink is switching name
spaces.
On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
> > > > I once sketched out fixing this by removing the need to hold the
> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
> > > > over the devlink reload paths. It seemed like a reasonable step toward
> > > > finer grained locking.
> > >
> > > Seems to me the locking is just a symptom.
> >
> > My fear is this reload during net ns destruction is devlink uAPI now
> > and, yes it may be only a symptom, but the root cause may be unfixable
> > uAPI constraints.
>
> If I'm reading this right it locks up 100% of the time, what is a uAPI
> for? DoS? ;)
>
> Hence my questions about the actual use cases.
Removing namespace support from devlink would solve the crasher. I
certainly didn't feel bold enough to suggest such a thing :)
If no other devlink driver cares about this it is probably the best
idea.
Jason
On Mon, Nov 08, 2021 at 03:31:26PM -0800, Jakub Kicinski wrote:
> On Mon, 8 Nov 2021 21:58:36 +0200 Leon Romanovsky wrote:
> > > > > nfp will benefit from the simplified locking as well, and so will bnxt,
> > > > > although I'm not sure the maintainers will opt for using devlink framework
> > > > > due to the downstream requirements.
> > > >
> > > > Exactly why devlink should be fixed first.
> > >
> > > If by "fixed first" you mean it needs 5 locks to be added and to remove
> > > any guarantees on sub-object lifetime then no thanks.
> >
> > How do you plan to fix pernet_ops_rwsem lock? By exposing devlink state
> > to the drivers? By providing unlocked version of unregister_netdevice_notifier?
> >
> > This simple scenario has deadlocks:
> > sudo ip netns add n1
> > sudo devlink dev reload pci/0000:00:09.0 netns n1
> > sudo ip netns del n1
>
> Okay - I'm not sure why you're asking me this. This is not related to
> devlink locking as far as I can tell. Neither are you fixing this
> problem in your own RFC.
I asked you because you clearly showed me that things that make
sense for me don't make sense for you, and vice versa.
I don't want to do work that will be thrown away.
>
> You'd need to tell me more about what the notifier is used for (I see
> RoCE in the call trace). I don't understand why you need to re-register
> a global (i.e. not per netns) notifier when devlink is switching name
> spaces.
RDMA subsystem supports two net namespace aware scenarios.
We need global netdev_notifier for shared mode. This is legacy mode where
we listen to all namespaces. We must support this mode otherwise we break
whole RDMA world.
See commit below:
de641d74fb00 ("Revert "RDMA/mlx5: Fix devlink deadlock on net namespace deletion"")
Thanks
On Tue, 9 Nov 2021 16:12:33 +0200 Leon Romanovsky wrote:
> > You'd need to tell me more about what the notifier is used for (I see
> > RoCE in the call trace). I don't understand why you need to re-register
> > a global (i.e. not per netns) notifier when devlink is switching name
> > spaces.
>
> RDMA subsystem supports two net namespace aware scenarios.
>
> We need global netdev_notifier for shared mode. This is legacy mode where
> we listen to all namespaces. We must support this mode otherwise we break
> whole RDMA world.
>
> See commit below:
> de641d74fb00 ("Revert "RDMA/mlx5: Fix devlink deadlock on net namespace deletion"")
But why re-reg? To take advantage of clean event replay?
IIUC the problem is that the un-reg is called from the reload_down path.
On Tue, Nov 09, 2021 at 06:17:29AM -0800, Jakub Kicinski wrote:
> On Tue, 9 Nov 2021 16:12:33 +0200 Leon Romanovsky wrote:
> > > You'd need to tell me more about what the notifier is used for (I see
> > > RoCE in the call trace). I don't understand why you need to re-register
> > > a global (i.e. not per netns) notifier when devlink is switching name
> > > spaces.
> >
> > RDMA subsystem supports two net namespace aware scenarios.
> >
> > We need global netdev_notifier for shared mode. This is legacy mode where
> > we listen to all namespaces. We must support this mode otherwise we break
> > whole RDMA world.
> >
> > See commit below:
> > de641d74fb00 ("Revert "RDMA/mlx5: Fix devlink deadlock on net namespace deletion"")
>
> But why re-reg? To take advantage of clean event replay?
>
> IIUC the problem is that the un-reg is called from the reload_down path.
This is how devlink reload was explained to me by Jiri. It is supposed to
unload and load whole driver again. In our case, it unloads mlx5_core,
which destroys all ULPs (VDPA, RDMA, e.t.c) and these ULPs are not aware
of devlink at all. Especially they are not aware of the reason of
devlink reload.
This is why I asked you. It is not related to devlink locking directly,
but indirectly I didn't want to work on anything related to locking
without making sure that devlink.c is correct.
Thanks
On Mon, Nov 08, 2021 at 03:31:26PM -0800, Jakub Kicinski wrote:
> On Mon, 8 Nov 2021 21:58:36 +0200 Leon Romanovsky wrote:
> > > > > nfp will benefit from the simplified locking as well, and so will bnxt,
> > > > > although I'm not sure the maintainers will opt for using devlink framework
> > > > > due to the downstream requirements.
> > > >
> > > > Exactly why devlink should be fixed first.
> > >
> > > If by "fixed first" you mean it needs 5 locks to be added and to remove
> > > any guarantees on sub-object lifetime then no thanks.
> >
> > How do you plan to fix pernet_ops_rwsem lock? By exposing devlink state
> > to the drivers? By providing unlocked version of unregister_netdevice_notifier?
> >
> > This simple scenario has deadlocks:
> > sudo ip netns add n1
> > sudo devlink dev reload pci/0000:00:09.0 netns n1
> > sudo ip netns del n1
>
> Okay - I'm not sure why you're asking me this. This is not related to
> devlink locking as far as I can tell. Neither are you fixing this
> problem in your own RFC.
>
> You'd need to tell me more about what the notifier is used for (I see
> RoCE in the call trace). I don't understand why you need to re-register
> a global (i.e. not per netns) notifier when devlink is switching name
> spaces.
This becomes all entangled in the aux device stuff we did before.
devlink reload is defined, for reasons unrelated to netns, to do a
complete restart of the aux devices below the devlink. This happens
necessarily during actual reconfiguration operations, for instance.
So we have a situation, which seems like bad design, where reload is
also triggered by net namespace change that has nothing to do with
reconfiguring. In this case the per-net-ns becomes a BKL that gets
held across way too much stuff as it recurses down the reload path,
through aux devices, into the driver core and beyond.
When I looked at trying to fix this from the RDMA side I could not
find any remedy that didn't involve some kind of change in netdev
land. The drivers must be able to register/unregister notifiers in
their struct device_driver probe/remove functions.
I once sketched out fixing this by removing the need to hold the
per_net_rwsem just for list iteration, which in turn avoids holding it
over the devlink reload paths. It seemed like a reasonable step toward
finer grained locking.
Jason
On Tue, 9 Nov 2021 16:30:16 +0200 Leon Romanovsky wrote:
> On Tue, Nov 09, 2021 at 06:17:29AM -0800, Jakub Kicinski wrote:
> > On Tue, 9 Nov 2021 16:12:33 +0200 Leon Romanovsky wrote:
> > > RDMA subsystem supports two net namespace aware scenarios.
> > >
> > > We need global netdev_notifier for shared mode. This is legacy mode where
> > > we listen to all namespaces. We must support this mode otherwise we break
> > > whole RDMA world.
> > >
> > > See commit below:
> > > de641d74fb00 ("Revert "RDMA/mlx5: Fix devlink deadlock on net namespace deletion"")
> >
> > But why re-reg? To take advantage of clean event replay?
> >
> > IIUC the problem is that the un-reg is called from the reload_down path.
>
> This is how devlink reload was explained to me by Jiri. It is supposed to
> unload and load whole driver again. In our case, it unloads mlx5_core,
> which destroys all ULPs (VDPA, RDMA, e.t.c) and these ULPs are not aware
> of devlink at all. Especially they are not aware of the reason of
> devlink reload.
Anything registering the global notifier outside of init_net needs
scrutiny. If the devlink instance was moved into namespace A then
paying attention to anything outside namespace A weakens the namespace
abstraction. init_net is legacy/special and it can't disappear so the
problem at hand won't happen (as it is triggered on namespace cleanup).
Maybe you can install the global notifier only if instance is in
init_net? That's my thinking with the scant information at hand.
> This is why I asked you. It is not related to devlink locking directly,
> but indirectly I didn't want to work on anything related to locking
> without making sure that devlink.c is correct.
On Tue, 9 Nov 2021 10:43:58 -0400 Jason Gunthorpe wrote:
> This becomes all entangled in the aux device stuff we did before.
So entangled in fact that neither of you is willing to elucidate
the exact need ;)
> devlink reload is defined, for reasons unrelated to netns, to do a
> complete restart of the aux devices below the devlink. This happens
> necessarily during actual reconfiguration operations, for instance.
>
> So we have a situation, which seems like bad design, where reload is
> also triggered by net namespace change that has nothing to do with
> reconfiguring.
Agreed, it is somewhat uncomfortable that the same callback achieves
two things. As clear as the need for reload-for-reset is (reconfig,
recovery etc.) I'm not as clear on reload for netns.
The main use case for reload for netns is placing a VF in a namespace,
for a container to use. Is that right? I've not seen use cases
requiring the PF to be moved, are there any?
devlink now lives in a networking namespace yet it spans such
namespaces (thru global notifiers). I think we need to define what it
means for devlink to live in a namespace. Is it just about the
configuration / notification channel? Or do we expect proper isolation?
Jiri?
> In this case the per-net-ns becomes a BKL that gets
> held across way too much stuff as it recuses down the reload path,
> through aux devices, into the driver core and beyond.
>
> When I looked at trying to fix this from the RDMA side I could not
> find any remedy that didn't involve some kind of change in netdev
> land. The drivers must be able to register/unregister notifiers in
> their struct device_driver probe/remove functions.
>
> I once sketched out fixing this by removing the need to hold the
> per_net_rwsem just for list iteration, which in turn avoids holding it
> over the devlink reload paths. It seemed like a reasonable step toward
> finer grained locking.
Seems to me the locking is just a symptom.
On Tue, Nov 09, 2021 at 07:07:02AM -0800, Jakub Kicinski wrote:
> On Tue, 9 Nov 2021 10:43:58 -0400 Jason Gunthorpe wrote:
> > This becomes all entangled in the aux device stuff we did before.
>
> So entangled in fact that neither of you is willing to elucidate
> the exact need ;)
I haven't been paying attention to this thread, Leon just asked me to
elaborate on why roce is in these traces.
What I know is mlx5 testing has shown an alarming number of crashers
and bugs connected to devlink and Leon has been grinding them
away. mlx5 is quite heavily invested in devlink and mlx5 really needs
it to work robustly.
> The main use case for reload for netns is placing a VF in a namespace,
> for a container to use. Is that right? I've not seen use cases
> requiring the PF to be moved, are there any?
Sure, it can be useful. I wouldn't call it essential, but it is there.
> > I once sketched out fixing this by removing the need to hold the
> > per_net_rwsem just for list iteration, which in turn avoids holding it
> > over the devlink reload paths. It seemed like a reasonable step toward
> > finer grained locking.
>
> Seems to me the locking is just a symptom.
My fear is this reload during net ns destruction is devlink uAPI now
and, yes it may be only a symptom, but the root cause may be unfixable
uAPI constraints. Jiri, what do you think?
Speaking generally, I greatly prefer devlink move toward a design where
the aux bus operations that must be connected with devlink reload are
not invoked under any global locks at all. Not per_net_rwsem, rtnl, or
devlink BKLs.
We know holding global locks in open-ended contexts, like driver core
device_register/unregister, is a dangerous pattern.
Concretely, now that we are pushing virtualization use cases into
using devlink as a control plane mlx5 has pod/VM launch issuing
devlink steps. Some of this stuff is necessarily slow, like attaching
a roce aux driver takes a while. Holding a BKL while doing that means
all VM launches are serialized.
IMHO these needs demand a fine grained locking approach, not a BKL
design.
Jason
Tue, Nov 09, 2021 at 04:07:02PM CET, [email protected] wrote:
>On Tue, 9 Nov 2021 10:43:58 -0400 Jason Gunthorpe wrote:
>> This becomes all entangled in the aux device stuff we did before.
>
>So entangled in fact that neither of you is willing to elucidate
>the exact need ;)
>
>> devlink reload is defined, for reasons unrelated to netns, to do a
>> complete restart of the aux devices below the devlink. This happens
>> necessarily during actual reconfiguration operations, for instance.
>>
>> So we have a situation, which seems like bad design, where reload is
>> also triggered by net namespace change that has nothing to do with
>> reconfiguring.
>
>Agreed, it is somewhat uncomfortable that the same callback achieves
>two things. As clear as the need for reload-for-reset is (reconfig,
>recovery etc.) I'm not as clear on reload for netns.
>
>The main use case for reload for netns is placing a VF in a namespace,
>for a container to use. Is that right? I've not seen use cases
>requiring the PF to be moved, are there any?
>
>devlink now lives in a networking namespace yet it spans such
>namespaces (thru global notifiers). I think we need to define what it
>means for devlink to live in a namespace. Is it just about the
>configuration / notification channel? Or do we expect proper isolation?
>
>Jiri?
Well honestly the primary motivation was to be able to run smoothly with
syzkaller for which the "configuration / notification channel" is
enough.
By "proper isolation" you mean what exactly?
>
>> In this case the per-net-ns becomes a BKL that gets
>> held across way too much stuff as it recurses down the reload path,
>> through aux devices, into the driver core and beyond.
>>
>> When I looked at trying to fix this from the RDMA side I could not
>> find any remedy that didn't involve some kind of change in netdev
>> land. The drivers must be able to register/unregister notifiers in
>> their struct device_driver probe/remove functions.
>>
>> I once sketched out fixing this by removing the need to hold the
>> per_net_rwsem just for list iteration, which in turn avoids holding it
>> over the devlink reload paths. It seemed like a reasonable step toward
>> finer grained locking.
>
>Seems to me the locking is just a symptom.
On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
> > > I once sketched out fixing this by removing the need to hold the
> > > per_net_rwsem just for list iteration, which in turn avoids holding it
> > > over the devlink reload paths. It seemed like a reasonable step toward
> > > finer grained locking.
> >
> > Seems to me the locking is just a symptom.
>
> My fear is this reload during net ns destruction is devlink uAPI now
> and, yes it may be only a symptom, but the root cause may be unfixable
> uAPI constraints.
If I'm reading this right it locks up 100% of the time, what is a uAPI
for? DoS? ;)
Hence my questions about the actual use cases.
On Tue, 9 Nov 2021 17:15:24 +0100 Jiri Pirko wrote:
> Tue, Nov 09, 2021 at 04:07:02PM CET, [email protected] wrote:
> >On Tue, 9 Nov 2021 10:43:58 -0400 Jason Gunthorpe wrote:
> >> This becomes all entangled in the aux device stuff we did before.
> >
> >So entangled in fact that neither of you is willing to elucidate
> >the exact need ;)
> >
> >> devlink reload is defined, for reasons unrelated to netns, to do a
> >> complete restart of the aux devices below the devlink. This happens
> >> necessarily during actual reconfiguration operations, for instance.
> >>
> >> So we have a situation, which seems like bad design, where reload is
> >> also triggered by net namespace change that has nothing to do with
> >> reconfiguring.
> >
> >Agreed, it is somewhat uncomfortable that the same callback achieves
> >two things. As clear as the need for reload-for-reset is (reconfig,
> >recovery etc.) I'm not as clear on reload for netns.
> >
> >The main use case for reload for netns is placing a VF in a namespace,
> >for a container to use. Is that right? I've not seen use cases
> >requiring the PF to be moved, are there any?
> >
> >devlink now lives in a networking namespace yet it spans such
> >namespaces (thru global notifiers). I think we need to define what it
> >means for devlink to live in a namespace. Is it just about the
> >configuration / notification channel? Or do we expect proper isolation?
> >
> >Jiri?
>
> Well honestly the primary motivation was to be able to run smoothly with
> syzkaller for which the "configuration / notification channel" is
> enough.
Hm. And syzkaller runs in a namespace?
> By "proper isolation" you mean what exactly?
For the devlink instance and all subordinate objects to be entirely
contained to the namespace within which devlink resides, unless
explicitly linked up with or delegated to another namespace.
Tue, Nov 09, 2021 at 05:26:48PM CET, [email protected] wrote:
>On Tue, 9 Nov 2021 17:15:24 +0100 Jiri Pirko wrote:
>> Tue, Nov 09, 2021 at 04:07:02PM CET, [email protected] wrote:
>> >On Tue, 9 Nov 2021 10:43:58 -0400 Jason Gunthorpe wrote:
>> >> This becomes all entangled in the aux device stuff we did before.
>> >
>> >So entangled in fact that neither of you is willing to elucidate
>> >the exact need ;)
>> >
>> >> devlink reload is defined, for reasons unrelated to netns, to do a
>> >> complete restart of the aux devices below the devlink. This happens
>> >> necessarily during actual reconfiguration operations, for instance.
>> >>
>> >> So we have a situation, which seems like bad design, where reload is
>> >> also triggered by net namespace change that has nothing to do with
>> >> reconfiguring.
>> >
>> >Agreed, it is somewhat uncomfortable that the same callback achieves
>> >two things. As clear as the need for reload-for-reset is (reconfig,
>> >recovery etc.) I'm not as clear on reload for netns.
>> >
>> >The main use case for reload for netns is placing a VF in a namespace,
>> >for a container to use. Is that right? I've not seen use cases
>> >requiring the PF to be moved, are there any?
>> >
>> >devlink now lives in a networking namespace yet it spans such
>> >namespaces (thru global notifiers). I think we need to define what it
>> >means for devlink to live in a namespace. Is it just about the
>> >configuration / notification channel? Or do we expect proper isolation?
>> >
>> >Jiri?
>>
>> Well honestly the primary motivation was to be able to run smoothly with
>> syzkaller for which the "configuration / notification channel" is
>> enough.
>
>Hm. And syzkaller runs in a namespace?
Correct.
>
>> By "proper isolation" you mean what exactly?
>
>For the devlink instance and all subordinate objects to be entirely
>contained to the namespace within which devlink resides, unless
>explicitly linked up with or delegated to another namespace.
What makes sense to me and that is actually how the current drivers
should behave (mlxsw, netdevsim are for sure).
Tue, Nov 09, 2021 at 03:12:33PM CET, [email protected] wrote:
>On Mon, Nov 08, 2021 at 03:31:26PM -0800, Jakub Kicinski wrote:
>> On Mon, 8 Nov 2021 21:58:36 +0200 Leon Romanovsky wrote:
>> > > > > nfp will benefit from the simplified locking as well, and so will bnxt,
>> > > > > although I'm not sure the maintainers will opt for using devlink framework
>> > > > > due to the downstream requirements.
>> > > >
>> > > > Exactly why devlink should be fixed first.
>> > >
>> > > If by "fixed first" you mean it needs 5 locks to be added and to remove
>> > > any guarantees on sub-object lifetime then no thanks.
>> >
>> > How do you plan to fix pernet_ops_rwsem lock? By exposing devlink state
>> > to the drivers? By providing unlocked version of unregister_netdevice_notifier?
>> >
>> > This simple scenario has deadlocks:
>> > sudo ip netns add n1
>> > sudo devlink dev reload pci/0000:00:09.0 netns n1
>> > sudo ip netns del n1
>>
>> Okay - I'm not sure why you're asking me this. This is not related to
>> devlink locking as far as I can tell. Neither are you fixing this
>> problem in your own RFC.
>
>I asked you because you clearly showed to me that things that makes
>sense for me, doesn't make sense for you and vice versa.
>
>I don't want to do work that will be thrown away.
>
>>
>> You'd need to tell me more about what the notifier is used for (I see
>> RoCE in the call trace). I don't understand why you need to re-register
>> a global (i.e. not per netns) notifier when devlink is switching name
>> spaces.
>
>RDMA subsystem supports two net namespace aware scenarios.
>
>We need global netdev_notifier for shared mode. This is legacy mode where
>we listen to all namespaces. We must support this mode otherwise we break
>whole RDMA world.
>
>See commit below:
>de641d74fb00 ("Revert "RDMA/mlx5: Fix devlink deadlock on net namespace deletion"")
If it is not possible for whatever reason to have a per-ns notifier, you
probably have to register the global notifier only once in init, and
have some sort of mechanism to ignore the events while you are
in the middle of the re-init. I don't see any other way.
On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
> > > > I once sketched out fixing this by removing the need to hold the
> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
> > > > over the devlink reload paths. It seemed like a reasonable step toward
> > > > finer grained locking.
> > >
> > > Seems to me the locking is just a symptom.
> >
> > My fear is this reload during net ns destruction is devlink uAPI now
> > and, yes it may be only a symptom, but the root cause may be unfixable
> > uAPI constraints.
>
> If I'm reading this right it locks up 100% of the time, what is a uAPI
> for? DoS? ;)
>
> Hence my questions about the actual use cases.
It is important to add that this problem existed even before auxiliary
bus conversion.
Thanks
Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
>On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
>> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
>> > > > I once sketched out fixing this by removing the need to hold the
>> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
>> > > > over the devlink reload paths. It seemed like a reasonable step toward
>> > > > finer grained locking.
>> > >
>> > > Seems to me the locking is just a symptom.
>> >
>> > My fear is this reload during net ns destruction is devlink uAPI now
>> > and, yes it may be only a symptom, but the root cause may be unfixable
>> > uAPI constraints.
>>
>> If I'm reading this right it locks up 100% of the time, what is a uAPI
>> for? DoS? ;)
>>
>> Hence my questions about the actual use cases.
>
>Removing namespace support from devlink would solve the crasher. I
>certainly didn't feel bold enough to suggest such a thing :)
>
>If no other devlink driver cares about this it is probably the best
>idea.
Devlink namespace support is not generic, not related to any driver.
>
>Jason
On Thu, Nov 11, 2021 at 01:05:11PM +0100, Jiri Pirko wrote:
> Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
> >On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
> >> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
> >> > > > I once sketched out fixing this by removing the need to hold the
> >> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
> >> > > > over the devlink reload paths. It seemed like a reasonable step toward
> >> > > > finer grained locking.
> >> > >
> >> > > Seems to me the locking is just a symptom.
> >> >
> >> > My fear is this reload during net ns destruction is devlink uAPI now
> >> > and, yes it may be only a symptom, but the root cause may be unfixable
> >> > uAPI constraints.
> >>
> >> If I'm reading this right it locks up 100% of the time, what is a uAPI
> >> for? DoS? ;)
> >>
> >> Hence my questions about the actual use cases.
> >
> >Removing namespace support from devlink would solve the crasher. I
> >certainly didn't feel bold enough to suggest such a thing :)
> >
> >If no other devlink driver cares about this it is probably the best
> >idea.
>
> Devlink namespace support is not generic, not related to any driver.
What do you mean?
devlink_pernet_pre_exit() calls to devlink reload, which means that only
drivers that support reload care about it. The reload is driver thing.
Thanks
>
> >
> >Jason
Thu, Nov 11, 2021 at 01:17:52PM CET, [email protected] wrote:
>On Thu, Nov 11, 2021 at 01:05:11PM +0100, Jiri Pirko wrote:
>> Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
>> >On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
>> >> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
>> >> > > > I once sketched out fixing this by removing the need to hold the
>> >> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
>> >> > > > over the devlink reload paths. It seemed like a reasonable step toward
>> >> > > > finer grained locking.
>> >> > >
>> >> > > Seems to me the locking is just a symptom.
>> >> >
>> >> > My fear is this reload during net ns destruction is devlink uAPI now
>> >> > and, yes it may be only a symptom, but the root cause may be unfixable
>> >> > uAPI constraints.
>> >>
>> >> If I'm reading this right it locks up 100% of the time, what is a uAPI
>> >> for? DoS? ;)
>> >>
>> >> Hence my questions about the actual use cases.
>> >
>> >Removing namespace support from devlink would solve the crasher. I
>> >certainly didn't feel bold enough to suggest such a thing :)
>> >
>> >If no other devlink driver cares about this it is probably the best
>> >idea.
>>
>> Devlink namespace support is not generic, not related to any driver.
>
>What do you mean?
>
>devlink_pernet_pre_exit() calls to devlink reload, which means that only
>drivers that support reload care about it. The reload is driver thing.
However, Jason was talking about "namespace support removal from
devlink"..
>
>Thanks
>
>>
>> >
>> >Jason
On Fri, Nov 12, 2021 at 08:38:56AM +0100, Jiri Pirko wrote:
> Thu, Nov 11, 2021 at 01:17:52PM CET, [email protected] wrote:
> >On Thu, Nov 11, 2021 at 01:05:11PM +0100, Jiri Pirko wrote:
> >> Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
> >> >On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
> >> >> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
> >> >> > > > I once sketched out fixing this by removing the need to hold the
> >> >> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
> >> >> > > > over the devlink reload paths. It seemed like a reasonable step toward
> >> >> > > > finer grained locking.
> >> >> > >
> >> >> > > Seems to me the locking is just a symptom.
> >> >> >
> >> >> > My fear is this reload during net ns destruction is devlink uAPI now
> >> >> > and, yes it may be only a symptom, but the root cause may be unfixable
> >> >> > uAPI constraints.
> >> >>
> >> >> If I'm reading this right it locks up 100% of the time, what is a uAPI
> >> >> for? DoS? ;)
> >> >>
> >> >> Hence my questions about the actual use cases.
> >> >
> >> >Removing namespace support from devlink would solve the crasher. I
> >> >certainly didn't feel bold enough to suggest such a thing :)
> >> >
> >> >If no other devlink driver cares about this it is probably the best
> >> >idea.
> >>
> >> Devlink namespace support is not generic, not related to any driver.
> >
> >What do you mean?
> >
> >devlink_pernet_pre_exit() calls to devlink reload, which means that only
> >drivers that support reload care about it. The reload is driver thing.
>
> However, Jason was talking about "namespace support removal from
> devlink"..
The code that sparks deadlocks is in devlink_pernet_pre_exit() and
it would be nice to remove it. I just don't know if it is possible to do
that without ripping the whole namespace support out of devlink.
Thanks
>
>
> >
> >Thanks
> >
> >>
> >> >
> >> >Jason
Sun, Nov 14, 2021 at 07:19:02AM CET, [email protected] wrote:
>On Fri, Nov 12, 2021 at 08:38:56AM +0100, Jiri Pirko wrote:
>> Thu, Nov 11, 2021 at 01:17:52PM CET, [email protected] wrote:
>> >On Thu, Nov 11, 2021 at 01:05:11PM +0100, Jiri Pirko wrote:
>> >> Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
>> >> >On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
>> >> >> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
>> >> >> > > > I once sketched out fixing this by removing the need to hold the
>> >> >> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
>> >> >> > > > over the devlink reload paths. It seemed like a reasonable step toward
>> >> >> > > > finer grained locking.
>> >> >> > >
>> >> >> > > Seems to me the locking is just a symptom.
>> >> >> >
>> >> >> > My fear is this reload during net ns destruction is devlink uAPI now
>> >> >> > and, yes it may be only a symptom, but the root cause may be unfixable
>> >> >> > uAPI constraints.
>> >> >>
>> >> >> If I'm reading this right it locks up 100% of the time, what is a uAPI
>> >> >> for? DoS? ;)
>> >> >>
>> >> >> Hence my questions about the actual use cases.
>> >> >
>> >> >Removing namespace support from devlink would solve the crasher. I
>> >> >certainly didn't feel bold enough to suggest such a thing :)
>> >> >
>> >> >If no other devlink driver cares about this it is probably the best
>> >> >idea.
>> >>
>> >> Devlink namespace support is not generic, not related to any driver.
>> >
>> >What do you mean?
>> >
>> >devlink_pernet_pre_exit() calls to devlink reload, which means that only
>> >drivers that support reload care about it. The reload is driver thing.
>>
>> However, Jason was talking about "namespace support removal from
>> devlink"..
>
>The code that sparkles deadlocks is in devlink_pernet_pre_exit() and
>this will be nice to remove. I just don't know if it is possible to do
>without ripping whole namespace support from devlink.
As discussed offline, the non-standard mlx5/IB usage of network
namespaces requires a non-standard mlx5/IB workaround. It does not make any
sense to remove the devlink net namespace support.
>
>Thanks
>
>>
>>
>> >
>> >Thanks
>> >
>> >>
>> >> >
>> >> >Jason
On Mon, Nov 15, 2021 at 12:20:21PM +0100, Jiri Pirko wrote:
> Sun, Nov 14, 2021 at 07:19:02AM CET, [email protected] wrote:
> >On Fri, Nov 12, 2021 at 08:38:56AM +0100, Jiri Pirko wrote:
> >> Thu, Nov 11, 2021 at 01:17:52PM CET, [email protected] wrote:
> >> >On Thu, Nov 11, 2021 at 01:05:11PM +0100, Jiri Pirko wrote:
> >> >> Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
> >> >> >On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
> >> >> >> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
> >> >> >> > > > I once sketched out fixing this by removing the need to hold the
> >> >> >> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
> >> >> >> > > > over the devlink reload paths. It seemed like a reasonable step toward
> >> >> >> > > > finer grained locking.
> >> >> >> > >
> >> >> >> > > Seems to me the locking is just a symptom.
> >> >> >> >
> >> >> >> > My fear is this reload during net ns destruction is devlink uAPI now
> >> >> >> > and, yes it may be only a symptom, but the root cause may be unfixable
> >> >> >> > uAPI constraints.
> >> >> >>
> >> >> >> If I'm reading this right it locks up 100% of the time, what is a uAPI
> >> >> >> for? DoS? ;)
> >> >> >>
> >> >> >> Hence my questions about the actual use cases.
> >> >> >
> >> >> >Removing namespace support from devlink would solve the crasher. I
> >> >> >certainly didn't feel bold enough to suggest such a thing :)
> >> >> >
> >> >> >If no other devlink driver cares about this it is probably the best
> >> >> >idea.
> >> >>
> >> >> Devlink namespace support is not generic, not related to any driver.
> >> >
> >> >What do you mean?
> >> >
> >> >devlink_pernet_pre_exit() calls to devlink reload, which means that only
> >> >drivers that support reload care about it. The reload is driver thing.
> >>
> >> However, Jason was talking about "namespace support removal from
> >> devlink"..
> >
> >The code that sparkles deadlocks is in devlink_pernet_pre_exit() and
> >this will be nice to remove. I just don't know if it is possible to do
> >without ripping whole namespace support from devlink.
>
> As discussed offline, the non-standard mlx5/IB usage of network
> namespaces requires non standard mlx5/IB workaround. Does not make any
> sense to remove the devlink net namespace support removal.
Sorry, I don't agree that registering a net notifier in an aux device
probe function is non-standard or wrong.
This model must be supported sanely somehow in the netdev area and
cannot be worked around in leaf drivers.
Intel ice will have the same problem, as would broadcom if they ever
get their driver modernized.
Jason
Mon, Nov 15, 2021 at 01:53:59PM CET, [email protected] wrote:
>On Mon, Nov 15, 2021 at 12:20:21PM +0100, Jiri Pirko wrote:
>> Sun, Nov 14, 2021 at 07:19:02AM CET, [email protected] wrote:
>> >On Fri, Nov 12, 2021 at 08:38:56AM +0100, Jiri Pirko wrote:
>> >> Thu, Nov 11, 2021 at 01:17:52PM CET, [email protected] wrote:
>> >> >On Thu, Nov 11, 2021 at 01:05:11PM +0100, Jiri Pirko wrote:
>> >> >> Tue, Nov 09, 2021 at 07:24:27PM CET, [email protected] wrote:
>> >> >> >On Tue, Nov 09, 2021 at 08:20:42AM -0800, Jakub Kicinski wrote:
>> >> >> >> On Tue, 9 Nov 2021 11:33:35 -0400 Jason Gunthorpe wrote:
>> >> >> >> > > > I once sketched out fixing this by removing the need to hold the
>> >> >> >> > > > per_net_rwsem just for list iteration, which in turn avoids holding it
>> >> >> >> > > > over the devlink reload paths. It seemed like a reasonable step toward
>> >> >> >> > > > finer grained locking.
>> >> >> >> > >
>> >> >> >> > > Seems to me the locking is just a symptom.
>> >> >> >> >
>> >> >> >> > My fear is this reload during net ns destruction is devlink uAPI now
>> >> >> >> > and, yes it may be only a symptom, but the root cause may be unfixable
>> >> >> >> > uAPI constraints.
>> >> >> >>
>> >> >> >> If I'm reading this right it locks up 100% of the time, what is a uAPI
>> >> >> >> for? DoS? ;)
>> >> >> >>
>> >> >> >> Hence my questions about the actual use cases.
>> >> >> >
>> >> >> >Removing namespace support from devlink would solve the crasher. I
>> >> >> >certainly didn't feel bold enough to suggest such a thing :)
>> >> >> >
>> >> >> >If no other devlink driver cares about this it is probably the best
>> >> >> >idea.
>> >> >>
>> >> >> Devlink namespace support is not generic, not related to any driver.
>> >> >
>> >> >What do you mean?
>> >> >
>> >> >devlink_pernet_pre_exit() calls to devlink reload, which means that only
>> >> >drivers that support reload care about it. The reload is driver thing.
>> >>
>> >> However, Jason was talking about "namespace support removal from
>> >> devlink"..
>> >
>> >The code that sparkles deadlocks is in devlink_pernet_pre_exit() and
>> >this will be nice to remove. I just don't know if it is possible to do
>> >without ripping whole namespace support from devlink.
>>
>> As discussed offline, the non-standard mlx5/IB usage of network
>> namespaces requires non standard mlx5/IB workaround. Does not make any
>> sense to remove the devlink net namespace support removal.
>
>Sorry, I don't agree that registering a net notifier in an aux device
>probe function is non-standard or wrong.
Listening to events which happen in different namespaces and reacting to
them is the non-standard behaviour which I referred to. If you did not
need to do that, you could just use a netns notifier, which would solve your
issue. You know it.
>
>This model must be supported sanely somehow in the netdev area and
>cannot be worked around in leaf drivers.
>
>Intel ice will have the same problem, as would broadcom if they ever
>get their driver modernized.
>
>Jason
On Mon, Nov 15, 2021 at 03:42:58PM +0100, Jiri Pirko wrote:
> >Sorry, I don't agree that registering a net notifier in an aux device
> >probe function is non-standard or wrong.
>
> Listening to events which happen in different namespaces and react to
> them is the non-standard behaviour which I refered to. If you would not
> need to do it, you could just use netns notofier which would solve your
> issue. You know it.
Huh?
It calls the bog standard
register_netdevice_notifier()
Like hundreds of other drivers do from their probe functions
Which does:
int register_netdevice_notifier(struct notifier_block *nb)
{
struct net *net;
int err;
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
And deadlocks because devlink holds the pernet_ops_rwsem when it
triggers reload in some paths.
There is nothing wrong with a driver doing this standard pattern.
There is only one place in the entire kernel calling the per-ns
register_netdevice_notifier_dev_net() and it is buried inside another
part of mlx5 for some reason..
I believe Parav already looked at using that in rdma and it didn't
work for some reason I've forgotten.
It is not that we care about events in different namespaces, it is
that rdma, like everything else, doesn't care about namespaces and
wants events from the netdev no matter where it is located.
Jason
On Mon, 15 Nov 2021 11:09:31 -0400 Jason Gunthorpe wrote:
> On Mon, Nov 15, 2021 at 03:42:58PM +0100, Jiri Pirko wrote:
> > >Sorry, I don't agree that registering a net notifier in an aux device
> > >probe function is non-standard or wrong.
> >
> > Listening to events which happen in different namespaces and react to
> > them is the non-standard behaviour which I refered to. If you would not
> > need to do it, you could just use netns notofier which would solve your
> > issue. You know it.
>
> Huh?
>
> It calls the bog standard
>
> register_netdevice_notifier()
>
> Like hundreds of other drivers do from their probe functions
>
> Which does:
>
> int register_netdevice_notifier(struct notifier_block *nb)
> {
> struct net *net;
> int err;
>
> /* Close race with setup_net() and cleanup_net() */
> down_write(&pernet_ops_rwsem);
>
> And deadlocks because devlink hols the pernet_ops_rwsem when it
> triggers reload in some paths.
>
> There is nothing wrong with a driver doing this standard pattern.
>
> There is only one place in the entire kernel calling the per-ns
> register_netdevice_notifier_dev_net() and it is burred inside another
> part of mlx5 for some reason..
>
> I believe Parav already looked at using that in rdma and it didn't
> work for some reason I've forgotten.
>
> It is not that we care about events in different namespaces, it is
> that rdma, like everything else, doesn't care about namespaces and
> wants events from the netdev no matter where it is located.
devlink now allows drivers to be net ns-aware, and they should
obey if they declare support. Can we add a flag / capability
to devlink and make it an explicit opt-in for drivers who care?
Mon, Nov 15, 2021 at 04:09:31PM CET, [email protected] wrote:
>On Mon, Nov 15, 2021 at 03:42:58PM +0100, Jiri Pirko wrote:
>
>> >Sorry, I don't agree that registering a net notifier in an aux device
>> >probe function is non-standard or wrong.
>>
>> Listening to events which happen in different namespaces and react to
>> them is the non-standard behaviour which I refered to. If you would not
>> need to do it, you could just use netns notofier which would solve your
>> issue. You know it.
>
>Huh?
>
>It calls the bog standard
>
> register_netdevice_notifier()
>
>Like hundreds of other drivers do from their probe functions
>
>Which does:
>
>int register_netdevice_notifier(struct notifier_block *nb)
>{
> struct net *net;
> int err;
>
> /* Close race with setup_net() and cleanup_net() */
> down_write(&pernet_ops_rwsem);
>
>And deadlocks because devlink hols the pernet_ops_rwsem when it
>triggers reload in some paths.
>
>There is nothing wrong with a driver doing this standard pattern.
>
>There is only one place in the entire kernel calling the per-ns
>register_netdevice_notifier_dev_net() and it is burred inside another
>part of mlx5 for some reason..
Yep. I added it there to solve this deadlock.
>
>I believe Parav already looked at using that in rdma and it didn't
>work for some reason I've forgotten.
>
>It is not that we care about events in different namespaces, it is
>that rdma, like everything else, doesn't care about namespaces and
>wants events from the netdev no matter where it is located.
Wait, so there is no notion of net namespaces in rdma? I was under the
impression that rdma supports net namespaces...
>
>Jason
Mon, Nov 15, 2021 at 04:22:06PM CET, [email protected] wrote:
>On Mon, 15 Nov 2021 11:09:31 -0400 Jason Gunthorpe wrote:
>> On Mon, Nov 15, 2021 at 03:42:58PM +0100, Jiri Pirko wrote:
>> > >Sorry, I don't agree that registering a net notifier in an aux device
>> > >probe function is non-standard or wrong.
>> >
>> > Listening to events which happen in different namespaces and react to
>> > them is the non-standard behaviour which I refered to. If you would not
>> > need to do it, you could just use netns notofier which would solve your
>> > issue. You know it.
>>
>> Huh?
>>
>> It calls the bog standard
>>
>> register_netdevice_notifier()
>>
>> Like hundreds of other drivers do from their probe functions
>>
>> Which does:
>>
>> int register_netdevice_notifier(struct notifier_block *nb)
>> {
>> struct net *net;
>> int err;
>>
>> /* Close race with setup_net() and cleanup_net() */
>> down_write(&pernet_ops_rwsem);
>>
>> And deadlocks because devlink hols the pernet_ops_rwsem when it
>> triggers reload in some paths.
>>
>> There is nothing wrong with a driver doing this standard pattern.
>>
>> There is only one place in the entire kernel calling the per-ns
>> register_netdevice_notifier_dev_net() and it is burred inside another
>> part of mlx5 for some reason..
>>
>> I believe Parav already looked at using that in rdma and it didn't
>> work for some reason I've forgotten.
>>
>> It is not that we care about events in different namespaces, it is
>> that rdma, like everything else, doesn't care about namespaces and
>> wants events from the netdev no matter where it is located.
>
>devlink now allows drivers to be net ns-aware, and they should
>obey if they declare support. Can we add a flag / capability
>to devlink and make it an explicit opt-in for drivers who care?
Looks kind of odd to me. It is also not possible for netdevice to say if
it should be net namespace aware or not. The driver should not be the
one to decide this.
On Tue, Nov 16, 2021 at 07:57:09AM +0100, Jiri Pirko wrote:
> >There is only one place in the entire kernel calling the per-ns
> >register_netdevice_notifier_dev_net() and it is burred inside another
> >part of mlx5 for some reason..
>
> Yep. I added it there to solve this deadlock.
I wonder how it can work safely inside a driver, since when are
drivers NS aware?
uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent;
ret = register_netdevice_notifier_dev_net(netdev,
&uplink_priv->bond->nb,
&uplink_priv->bond->nn);
Doesn't that just lose events when the user moves the netdev to another
namespace?
> >I believe Parav already looked at using that in rdma and it didn't
> >work for some reason I've forgotten.
> >
> >It is not that we care about events in different namespaces, it is
> >that rdma, like everything else, doesn't care about namespaces and
> >wants events from the netdev no matter where it is located.
>
> Wait, so there is no notion of netnamespaces in rdma? I was under
> impression rdma supports netnamespaces...
It does, but that doesn't change things, when it is attached to a
netdev it needs events, without any loss, no matter what NS that
netdev is in.
Jason
On Tue, 16 Nov 2021 08:00:10 +0100 Jiri Pirko wrote:
> >> It is not that we care about events in different namespaces, it is
> >> that rdma, like everything else, doesn't care about namespaces and
> >> wants events from the netdev no matter where it is located.
> >
> >devlink now allows drivers to be net ns-aware, and they should
> >obey if they declare support. Can we add a flag / capability
> >to devlink and make it an explicit opt-in for drivers who care?
>
> Looks kind of odd to me. It is also not possible for netdevice to say if
> it should be net namespace aware or not. The driver should not be the
> one to decide this.
NETIF_F_NETNS_LOCAL, but yeah, if it can be fixed that's obviously better.
On Tue, Nov 16, 2021 at 08:44:42AM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 16, 2021 at 07:57:09AM +0100, Jiri Pirko wrote:
>
> > >There is only one place in the entire kernel calling the per-ns
> > >register_netdevice_notifier_dev_net() and it is burred inside another
> > >part of mlx5 for some reason..
> >
> > Yep. I added it there to solve this deadlock.
>
> I wonder how it can work safely inside a driver, since when are
> drivers NS aware?
>
> uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent;
> ret = register_netdevice_notifier_dev_net(netdev,
> &uplink_priv->bond->nb,
> &uplink_priv->bond->nn);
>
> Doesn't that just loose events when the user moves netdev to another
> namespace?
I don't think so, it looks like holding rtnl_lock is enough.
However, we need all events and not NS-specific ones and it maybe solves
the deadlock, but doesn't solve our issue.
BTW, this makes me wonder how commit 554873e51711 ("net: Do not take net_rwsem in __rtnl_link_unregister()")
aligns with the comment near pernet_ops_rwsem and explode usage in other
places.
57 /*
58 * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
59 * init_net_initialized and first_device pointer.
60 * This is internal net namespace object. Please, don't use it
61 * outside.
62 */
Thanks