Hi all,
Today's linux-next merge of the rdma tree got a conflict in:
drivers/infiniband/hw/mlx5/mr.c
between commit:
374012b00457 ("RDMA/mlx5: Fix mkey cache possible deadlock on cleanup")
from the rdma-fixes tree and commit:
57e7071683ef ("RDMA/mlx5: Implement mkeys management via LIFO queue")
from the rdma tree.
I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non-trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging. You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.
--
Cheers,
Stephen Rothwell
diff --cc drivers/infiniband/hw/mlx5/mr.c
index 433f96459246,b0fa2d644973..000000000000
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@@ -1025,21 -998,15 +999,21 @@@ void mlx5_mkey_cache_cleanup(struct mlx
  	if (!dev->cache.wq)
  		return;
  
 -	cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
  	mutex_lock(&dev->cache.rb_lock);
 +	dev->cache.disable = true;
  	for (node = rb_first(root); node; node = rb_next(node)) {
  		ent = rb_entry(node, struct mlx5_cache_ent, node);
- 		xa_lock_irq(&ent->mkeys);
+ 		spin_lock_irq(&ent->mkeys_queue.lock);
  		ent->disabled = true;
- 		xa_unlock_irq(&ent->mkeys);
+ 		spin_unlock_irq(&ent->mkeys_queue.lock);
 -		cancel_delayed_work_sync(&ent->dwork);
  	}
 +	mutex_unlock(&dev->cache.rb_lock);
 +
 +	/*
 +	 * After all entries are disabled and will not reschedule on WQ,
 +	 * flush it and all async commands.
 +	 */
 +	flush_workqueue(dev->cache.wq);
  
  	mlx5_mkey_cache_debugfs_cleanup(dev);
  	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
On Thu, Sep 28, 2023 at 11:38:51AM +1000, Stephen Rothwell wrote:
> Hi all,
>
> Today's linux-next merge of the rdma tree got a conflict in:
>
> drivers/infiniband/hw/mlx5/mr.c
>
> between commit:
>
> 374012b00457 ("RDMA/mlx5: Fix mkey cache possible deadlock on cleanup")
>
> from the rdma-fixes tree and commit:
>
> 57e7071683ef ("RDMA/mlx5: Implement mkeys management via LIFO queue")
>
> from the rdma tree.
>
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging. You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
>
> --
> Cheers,
> Stephen Rothwell
>
> diff --cc drivers/infiniband/hw/mlx5/mr.c
> index 433f96459246,b0fa2d644973..000000000000
> --- a/drivers/infiniband/hw/mlx5/mr.c
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@@ -1025,21 -998,15 +999,21 @@@ void mlx5_mkey_cache_cleanup(struct mlx
>   	if (!dev->cache.wq)
>   		return;
>
>  -	cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
>   	mutex_lock(&dev->cache.rb_lock);
>  +	dev->cache.disable = true;
>   	for (node = rb_first(root); node; node = rb_next(node)) {
>   		ent = rb_entry(node, struct mlx5_cache_ent, node);
> - 		xa_lock_irq(&ent->mkeys);
> + 		spin_lock_irq(&ent->mkeys_queue.lock);
>   		ent->disabled = true;
> - 		xa_unlock_irq(&ent->mkeys);
> + 		spin_unlock_irq(&ent->mkeys_queue.lock);
>  -		cancel_delayed_work_sync(&ent->dwork);
>   	}
>  +	mutex_unlock(&dev->cache.rb_lock);
>  +
>  +	/*
>  +	 * After all entries are disabled and will not reschedule on WQ,
>  +	 * flush it and all async commands.
>  +	 */
>  +	flush_workqueue(dev->cache.wq);
>
>   	mlx5_mkey_cache_debugfs_cleanup(dev);
>   	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
Thanks for the fix.