The agent TID is a 64 bit value split in two dwords. The least
significant dword is the TID running counter. The most significant
dword is the agent number. In the CX-3 shared port model, the mlx4
driver uses the most significant byte of the agent number to store the
slave number, making agent numbers greater and equal to 2^24 (3 bytes)
unusable. The current codebase uses a variable which is incremented
atomically for each new agent number giving too large agent numbers
over time. The IDA set of functions are used instead of the simple
counter approach. This allows re-use of agent numbers. A sysctl
variable is also introduced, to control the max agent number.
The signature of the bug is a MAD layer that stops working and the
console is flooded with messages like:
mlx4_ib: egress mad has non-null tid msb:1 class:4 slave:0
Signed-off-by: Hans Westgaard Ry <[email protected]>
---
drivers/infiniband/core/mad.c | 50 +++++++++++++++++++++++++++++++++++++++++--
1 file changed, 48 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index b28452a55a08..adce6cd5fc41 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -41,6 +41,8 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
+#include <linux/idr.h>
+#include <linux/sysctl.h>
#include <rdma/ib_cache.h>
#include "mad_priv.h"
@@ -57,9 +59,27 @@ module_param_named(send_queue_size, mad_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+/* Sysctl variable to set largest mad agent number */
+static u32 ib_mad_sysctl_min_client_id_max;
+static u32 ib_mad_sysctl_max_client_id_max;
+static u32 ib_mad_sysctl_client_id_max;
+static struct ctl_table_header *ib_mad_sysctl_hdr;
+
+static struct ctl_table ib_mad_sysctl_table[] = {
+ {
+ .procname = "client_id_max",
+ .data = &ib_mad_sysctl_client_id_max,
+ .maxlen = sizeof(ib_mad_sysctl_client_id_max),
+ .mode = 0644,
+ .proc_handler = &proc_douintvec_minmax,
+ .extra1 = &ib_mad_sysctl_min_client_id_max,
+ .extra2 = &ib_mad_sysctl_max_client_id_max,
+ },
+ { }
+};
static struct list_head ib_mad_port_list;
-static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
+DEFINE_IDA(ib_mad_client_ids);
/* Port list lock */
static DEFINE_SPINLOCK(ib_mad_port_list_lock);
@@ -212,6 +232,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
int ret2, qpn;
unsigned long flags;
u8 mgmt_class, vclass;
+ u32 ib_mad_client_id;
/* Validate parameters */
qpn = get_spl_qp_index(qp_type);
@@ -376,8 +397,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error4;
}
+ ib_mad_client_id = ida_simple_get(&ib_mad_client_ids,
+ 0,
+ ib_mad_sysctl_client_id_max,
+ GFP_KERNEL);
+ if (ib_mad_client_id < 0) {
+ pr_err("Couldn't allocate agent tid; errcode: %#x\n",
+ ib_mad_client_id);
+ ret = ERR_PTR(ib_mad_client_id);
+ goto error4;
+ }
+ mad_agent_priv->agent.hi_tid = ib_mad_client_id;
spin_lock_irqsave(&port_priv->reg_lock, flags);
- mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id);
/*
* Make sure MAD registration (if supplied)
@@ -428,6 +459,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
error5:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+ ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
+
error4:
kfree(reg_req);
error3:
@@ -588,6 +621,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
cancel_delayed_work(&mad_agent_priv->timed_work);
spin_lock_irqsave(&port_priv->reg_lock, flags);
+ ida_simple_remove(&ib_mad_client_ids, mad_agent_priv->agent.hi_tid);
remove_mad_reg_req(mad_agent_priv);
list_del(&mad_agent_priv->agent_list);
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
@@ -3341,10 +3375,22 @@ int ib_mad_init(void)
return -EINVAL;
}
+ ib_mad_sysctl_min_client_id_max = 1 << 10;
+ ib_mad_sysctl_max_client_id_max = 1 << 23;
+ ib_mad_sysctl_client_id_max = 1 << 18;
+ ib_mad_sysctl_hdr = register_net_sysctl(&init_net, "net/ibmad",
+ ib_mad_sysctl_table);
+ if (!ib_mad_sysctl_hdr) {
+ pr_err("%s: register_net_sysctl failed\n", __func__);
+ ib_mad_cleanup();
+ return -EINVAL;
+ }
return 0;
}
void ib_mad_cleanup(void)
{
ib_unregister_client(&mad_client);
+ ida_destroy(&ib_mad_client_ids);
+ unregister_net_sysctl_table(ib_mad_sysctl_hdr);
}
--
2.13.6
On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
> The agent TID is a 64 bit value split in two dwords. The least
> significant dword is the TID running counter. The most significant
> dword is the agent number. In the CX-3 shared port model, the mlx4
> driver uses the most significant byte of the agent number to store the
> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
> unusable. The current codebase uses a variable which is incremented
> atomically for each new agent number giving too large agent numbers
> over time. The IDA set of functions are used instead of the simple
> counter approach. This allows re-use of agent numbers. A sysctl
> variable is also introduced, to control the max agent number.
Why don't you simply limit this number per-driver? By default, any
variable is allowed and mlx4_ib will set something else.
What is the advantage of having sysctl?
Thanks
On Tue, May 29, 2018 at 11:54:59AM +0300, Leon Romanovsky wrote:
> On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
> > The agent TID is a 64 bit value split in two dwords. The least
> > significant dword is the TID running counter. The most significant
> > dword is the agent number. In the CX-3 shared port model, the mlx4
> > driver uses the most significant byte of the agent number to store the
> > slave number, making agent numbers greater and equal to 2^24 (3 bytes)
> > unusable. The current codebase uses a variable which is incremented
> > atomically for each new agent number giving too large agent numbers
> > over time. The IDA set of functions are used instead of the simple
> > counter approach. This allows re-use of agent numbers. A sysctl
> > variable is also introduced, to control the max agent number.
>
> Why don't you simply limit this number per-driver? By default, any
> variable is allowed and mlx4_ib will set something else.
>
> What is the advantage of having sysctl?
Anyway, it doesn't pass smoke test.
[ 126.428407] RPC: Unregistered rdma transport module.
[ 126.428513] RPC: Unregistered rdma backchannel transport module.
[ 194.664081] IPv6: ADDRCONF(NETDEV_UP): ib0: link is not ready
[ 209.068702] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070
[ 209.068858] PGD 80000000341cf067 P4D 80000000341cf067 PUD 34188067 PMD 0
[ 209.068941] Oops: 0002 [#1] SMP PTI
[ 209.069006] Modules linked in: netconsole nfsv3 nfs fscache
mlx4_ib(-) mlx4_en mlx4_core devlink ib_ipoib rdma_ucm ib_ucm ib_uverbs
ib_umad rdma_cm ib_cm iw_cm ib_core dm_mirror dm_region_hash dm_log
dm_mod nfsd pcspkr i2c_piix4 auth_rpcgss nfs_acl lockd grace sunrpc
ip_tables ata_generic cirrus drm_kms_helper pata_acpi syscopyarea
sysfillrect sysimgblt fb_sys_fops ttm drm e1000 virtio_console i2c_core
serio_raw ata_piix floppy [last unloaded: mlxfw]
[ 209.069312] CPU: 4 PID: 11048 Comm: modprobe Not tainted 4.17.0-rc7-2018-05-29_11-04-56_Hans_Westgaard_Ry__hans_westga #1
[ 209.069413] Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
[ 209.069486] RIP: 0010:_raw_spin_lock_irqsave+0x1e/0x40
[ 209.069536] RSP: 0018:ffffc90000b4fd70 EFLAGS: 00010046
[ 209.069591] RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffea0004d7ed00
[ 209.069653] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000070
[ 209.069717] RBP: 0000000000000000 R08: ffff88013446fc00 R09: 000000018010000f
[ 209.069778] R10: 0000000000000001 R11: ffff88013446fc00 R12: 0000000000000070
[ 209.069849] R13: 0000000000000202 R14: 0000000000000000 R15: 0000000000000000
[ 209.069915] FS: 00007fc34caf7740(0000) GS:ffff88013fd00000(0000) knlGS:0000000000000000
[ 209.069987] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 209.070043] CR2: 0000000000000070 CR3: 000000008853e000 CR4: 00000000000006e0
[ 209.070128] Call Trace:
[ 209.070189] ib_unregister_mad_agent+0x2d/0x540 [ib_core]
[ 209.070260] ? __slab_free+0x9a/0x2d0
[ 209.070332] ib_agent_port_close+0xad/0xf0 [ib_core]
[ 209.070396] ib_mad_remove_device+0x59/0xb0 [ib_core]
[ 209.070466] ib_unregister_device+0xd4/0x180 [ib_core]
[ 209.070537] mlx4_ib_remove+0x67/0x1f0 [mlx4_ib]
[ 209.070594] mlx4_remove_device+0x93/0xa0 [mlx4_core]
[ 209.070648] mlx4_unregister_interface+0x37/0x90 [mlx4_core]
[ 209.070705] mlx4_ib_cleanup+0xc/0x4db [mlx4_ib]
[ 209.072113] __x64_sys_delete_module+0x15b/0x260
[ 209.073567] ? exit_to_usermode_loop+0x7f/0x95
[ 209.074945] do_syscall_64+0x48/0x100
[ 209.076448] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 209.077799] RIP: 0033:0x7fc34bfe36b7
[ 209.079122] RSP: 002b:00007ffc8ffa98b8 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
[ 209.080500] RAX: ffffffffffffffda RBX: 00000000013455c0 RCX: 00007fc34bfe36b7
[ 209.081875] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 0000000001345628
[ 209.083265] RBP: 0000000000000000 R08: 00007fc34c2a8060 R09: 00007fc34c053a40
[ 209.084655] R10: 00007ffc8ffa9640 R11: 0000000000000206 R12: 0000000000000000
[ 209.086028] R13: 0000000000000001 R14: 0000000001345628 R15: 0000000000000000
[ 209.087416] Code: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00
00 53 9c 58 0f 1f 44 00 00 48 89 c3 fa 66 0f 1f 44 00 00 31 c0 ba 01 00
00 00 <f0> 0f b1 17 85 c0 75 05 48 89 d8 5b c3 89 c6 e8 1e c9 81 ff eb
[ 209.090262] RIP: _raw_spin_lock_irqsave+0x1e/0x40 RSP: ffffc90000b4fd70
[ 209.091720] CR2: 0000000000000070
[ 209.093137] ---[ end trace 7b8a6faa27868861 ]---
[ 209.094546] Kernel panic - not syncing: Fatal exception
[ 209.096910] Kernel Offset: disabled
[ 209.098291] ---[ end Kernel panic - not syncing: Fatal exception ]---
Thanks
>
> Thanks
On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
> The agent TID is a 64 bit value split in two dwords. The least
> significant dword is the TID running counter. The most significant
> dword is the agent number. In the CX-3 shared port model, the mlx4
> driver uses the most significant byte of the agent number to store the
> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
> unusable.
There is no reason for this to be an ida, just do something like
mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id) & mad_agent_priv->ib_dev->tid_mask;
And have the driver set tid_mask to 3 bytes of 0xFF
And no sysctl.
Jason
> On 29 May 2018, at 17:49, Jason Gunthorpe <[email protected]> wrote:
>
> On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
>> The agent TID is a 64 bit value split in two dwords. The least
>> significant dword is the TID running counter. The most significant
>> dword is the agent number. In the CX-3 shared port model, the mlx4
>> driver uses the most significant byte of the agent number to store the
>> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
>> unusable.
>
> There is no reason for this to be an ida, just do something like
>
> mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id) & mad_agent_priv->ib_dev->tid_mask;
>
> And have the driver set tid_mask to 3 bytes of 0xFF
The issue is that some of the mad agents are long-lived, so you will wrap and use the same TID twice.
Thxs, Håkon
>
> And no sysctl.
>
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 29, 2018 at 06:16:14PM +0200, Håkon Bugge wrote:
>
> > On 29 May 2018, at 17:49, Jason Gunthorpe <[email protected]> wrote:
> >
> > On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
> >> The agent TID is a 64 bit value split in two dwords. The least
> >> significant dword is the TID running counter. The most significant
> >> dword is the agent number. In the CX-3 shared port model, the mlx4
> >> driver uses the most significant byte of the agent number to store the
> >> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
> >> unusable.
> >
> > There is no reason for this to be an ida, just do something like
> >
> > mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id) & mad_agent_priv->ib_dev->tid_mask;
> >
> > And have the driver set tid_mask to 3 bytes of 0xFF
>
> The issue is that some of the mad agents are long-lived, so you will
> wrap and use the same TID twice.
We already have that problem, and using ida is problematic because we
need to maximize the time between TID re-use, which ida isn't doing.
Preventing re-use seems like a seperate issue from limiting the range
to be compatible with mlx4.
Jason
> On 29 May 2018, at 18:40, Jason Gunthorpe <[email protected]> wrote:
>
> On Tue, May 29, 2018 at 06:16:14PM +0200, Håkon Bugge wrote:
>>
>>> On 29 May 2018, at 17:49, Jason Gunthorpe <[email protected]> wrote:
>>>
>>> On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
>>>> The agent TID is a 64 bit value split in two dwords. The least
>>>> significant dword is the TID running counter. The most significant
>>>> dword is the agent number. In the CX-3 shared port model, the mlx4
>>>> driver uses the most significant byte of the agent number to store the
>>>> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
>>>> unusable.
>>>
>>> There is no reason for this to be an ida, just do something like
>>>
>>> mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id) & mad_agent_priv->ib_dev->tid_mask;
>>>
>>> And have the driver set tid_mask to 3 bytes of 0xFF
>>
>> The issue is that some of the mad agents are long-lived, so you will
>> wrap and use the same TID twice.
>
> We already have that problem, and using ida is problematic because we
> need to maximize the time between TID re-use, which ida isn't doing.
Initially, I thought that too, but consulted the spec which states:
C13-18.1.1: When initiating a new operation, MADHeader:TransactionID
shall be set to such a value that within that MAD the combination of TID,
SGID, and MgmtClass is different from that of any other currently executing
operation. []
"currently executing operation" that is.
But if timeouts etc. leads to MAD floating around in the fabric, its of course more robust to maximize the time between reuse.
Then idr_alloc_cyclic() can be used.
> Preventing re-use seems like a seperate issue from limiting the range
> to be compatible with mlx4.
Yes. But a v2 of Hans' patch using idr_alloc_cyclic() solves both issues.
Thxs, Håkon
>
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 29 May 2018 10:40:32 -0600
Jason Gunthorpe <[email protected]> wrote:
> On Tue, May 29, 2018 at 06:16:14PM +0200, H?kon Bugge wrote:
> >
> > > On 29 May 2018, at 17:49, Jason Gunthorpe <[email protected]> wrote:
> > >
> > > On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry
> > > wrote:
> > >> The agent TID is a 64 bit value split in two dwords. The least
> > >> significant dword is the TID running counter. The most
> > >> significant dword is the agent number. In the CX-3 shared port
> > >> model, the mlx4 driver uses the most significant byte of the
> > >> agent number to store the slave number, making agent numbers
> > >> greater and equal to 2^24 (3 bytes) unusable.
> > >
> > > There is no reason for this to be an ida, just do something like
> > >
> > > mad_agent_priv->agent.hi_tid =
> > > atomic_inc_return(&ib_mad_client_id) &
> > > mad_agent_priv->ib_dev->tid_mask;
> > >
> > > And have the driver set tid_mask to 3 bytes of 0xFF
> >
> > The issue is that some of the mad agents are long-lived, so you will
> > wrap and use the same TID twice.
>
> We already have that problem, and using ida is problematic because we
> need to maximize the time between TID re-use, which ida isn't doing.
>
> Preventing re-use seems like a seperate issue from limiting the range
> to be compatible with mlx4.
>
Preventing immediate re-use can be accomplished by judicious use of the
start argument (second argument) in the call to ida_simple_get (to
introduce hysteresis into the id allocations).
For example, can do something like:
static atomic_t ib_mad_client_id_min = ATOMIC_INIT(1);
...
ib_mad_client_id = ida_simple_get(&ib_mad_client_ids,
atomic_read(&ib_mad_client_id_min),
ib_mad_sysctl_client_id_max,
GFP_KERNEL);
....
if (!(ib_mad_client_id % 1000) ||
ib_mad_sysctl_client_id_max - ib_mad_client_id <= 1000)
atomic_set(&ib_mad_client_id_min, 1);
else
atomic_set(&ib_mad_client_id_min, ib_mad_client_id + 1);
The above avoids immediate re-use of ids, and only searches for past
freed ids if the last allocated-id is zero mod 1000.
This is just suggestion -- will probably need some variation of the
above to handle what happens over time (i.e., to not depend on the
modulo operation to reset the search start to 1), to properly handle
how we deal with the start value when we are close to the allowed
client_id_max, and also to implement some sort of locking.
-Jack
On Tue, 29 May 2018 12:54:45 +0300
Leon Romanovsky <[email protected]> wrote:
> On Tue, May 29, 2018 at 11:54:59AM +0300, Leon Romanovsky wrote:
> > On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
> > > The agent TID is a 64 bit value split in two dwords. The least
> > > significant dword is the TID running counter. The most significant
> > > dword is the agent number. In the CX-3 shared port model, the mlx4
> > > driver uses the most significant byte of the agent number to
> > > store the slave number, making agent numbers greater and equal to
> > > 2^24 (3 bytes) unusable. The current codebase uses a variable
> > > which is incremented atomically for each new agent number giving
> > > too large agent numbers over time. The IDA set of functions are
> > > used instead of the simple counter approach. This allows re-use
> > > of agent numbers. A sysctl variable is also introduced, to
> > > control the max agent number.
> >
> > Why don't you simply limit this number per-driver? By default, any
> > variable is allowed and mlx4_ib will set something else.
It is much simpler to do things here -- the current allocation scheme
does have a wrap-around problem, so there is a good reason for using a
global allocator residing in the ib core.
> >
> > What is the advantage of having sysctl?
>
> Anyway, it doesn't pass smoke test.
REASON: The start argument in ida_simple_get should be 1, not 0:
+ ib_mad_client_id = ida_simple_get(&ib_mad_client_ids,
+ 1,
+ ib_mad_sysctl_client_id_max,
+ GFP_KERNEL);
The agent ID 0 is interpreted as -no agent-, is used by snoop agents,
and is never allocated.
The first allocated agent id is 1:
static atomic_t ib_mad_client_id_min = ATOMIC_INIT(0);
...
mad_agent_priv->agent.hi_tid =atomic_inc_return(&ib_mad_client_id);
If this is fixed, we do not get the panic.
However, please note Jason's feedback and my suggestion -- this
ida-based interface needs to avoid immediate re-use of freed ids.
-Jack
>
> [ 126.428407] RPC: Unregistered rdma transport module.
> [ 126.428513] RPC: Unregistered rdma backchannel transport module.
> [ 194.664081] IPv6: ADDRCONF(NETDEV_UP): ib0: link is not ready
> [ 209.068702] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000070 [ 209.068858] PGD 80000000341cf067 P4D
> 80000000341cf067 PUD 34188067 PMD 0 [ 209.068941] Oops: 0002 [#1]
> SMP PTI [ 209.069006] Modules linked in: netconsole nfsv3 nfs fscache
> mlx4_ib(-) mlx4_en mlx4_core devlink ib_ipoib rdma_ucm ib_ucm
> ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_core dm_mirror
> dm_region_hash dm_log dm_mod nfsd pcspkr i2c_piix4 auth_rpcgss
> nfs_acl lockd grace sunrpc ip_tables ata_generic cirrus
> drm_kms_helper pata_acpi syscopyarea sysfillrect sysimgblt
> fb_sys_fops ttm drm e1000 virtio_console i2c_core serio_raw ata_piix
> floppy [last unloaded: mlxfw] [ 209.069312] CPU: 4 PID: 11048 Comm:
> modprobe Not tainted
> 4.17.0-rc7-2018-05-29_11-04-56_Hans_Westgaard_Ry__hans_westga #1
> [ 209.069413] Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
> [ 209.069486] RIP: 0010:_raw_spin_lock_irqsave+0x1e/0x40
> [ 209.069536] RSP: 0018:ffffc90000b4fd70 EFLAGS: 00010046
> [ 209.069591] RAX: 0000000000000000 RBX: 0000000000000246 RCX:
> ffffea0004d7ed00 [ 209.069653] RDX: 0000000000000001 RSI:
> 0000000000000000 RDI: 0000000000000070 [ 209.069717] RBP:
> 0000000000000000 R08: ffff88013446fc00 R09: 000000018010000f
> [ 209.069778] R10: 0000000000000001 R11: ffff88013446fc00 R12:
> 0000000000000070 [ 209.069849] R13: 0000000000000202 R14:
> 0000000000000000 R15: 0000000000000000 [ 209.069915] FS:
> 00007fc34caf7740(0000) GS:ffff88013fd00000(0000)
> knlGS:0000000000000000 [ 209.069987] CS: 0010 DS: 0000 ES: 0000
> CR0: 0000000080050033 [ 209.070043] CR2: 0000000000000070 CR3:
> 000000008853e000 CR4: 00000000000006e0 [ 209.070128] Call Trace:
> [ 209.070189] ib_unregister_mad_agent+0x2d/0x540 [ib_core]
> [ 209.070260] ? __slab_free+0x9a/0x2d0 [ 209.070332]
> ib_agent_port_close+0xad/0xf0 [ib_core] [ 209.070396]
> ib_mad_remove_device+0x59/0xb0 [ib_core] [ 209.070466]
> ib_unregister_device+0xd4/0x180 [ib_core] [ 209.070537]
> mlx4_ib_remove+0x67/0x1f0 [mlx4_ib] [ 209.070594]
> mlx4_remove_device+0x93/0xa0 [mlx4_core] [ 209.070648]
> mlx4_unregister_interface+0x37/0x90 [mlx4_core] [ 209.070705]
> mlx4_ib_cleanup+0xc/0x4db [mlx4_ib] [ 209.072113]
> __x64_sys_delete_module+0x15b/0x260 [ 209.073567] ?
> exit_to_usermode_loop+0x7f/0x95 [ 209.074945]
> do_syscall_64+0x48/0x100 [ 209.076448]
> entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 209.077799] RIP:
> 0033:0x7fc34bfe36b7 [ 209.079122] RSP: 002b:00007ffc8ffa98b8 EFLAGS:
> 00000206 ORIG_RAX: 00000000000000b0 [ 209.080500] RAX:
> ffffffffffffffda RBX: 00000000013455c0 RCX: 00007fc34bfe36b7
> [ 209.081875] RDX: 0000000000000000 RSI: 0000000000000800 RDI:
> 0000000001345628 [ 209.083265] RBP: 0000000000000000 R08:
> 00007fc34c2a8060 R09: 00007fc34c053a40 [ 209.084655] R10:
> 00007ffc8ffa9640 R11: 0000000000000206 R12: 0000000000000000
> [ 209.086028] R13: 0000000000000001 R14: 0000000001345628 R15:
> 0000000000000000 [ 209.087416] Code: 66 66 66 66 2e 0f 1f 84 00 00
> 00 00 00 0f 1f 44 00 00 53 9c 58 0f 1f 44 00 00 48 89 c3 fa 66 0f 1f
> 44 00 00 31 c0 ba 01 00 00 00 <f0> 0f b1 17 85 c0 75 05 48 89 d8 5b
> c3 89 c6 e8 1e c9 81 ff eb [ 209.090262] RIP:
> _raw_spin_lock_irqsave+0x1e/0x40 RSP: ffffc90000b4fd70 [ 209.091720]
> CR2: 0000000000000070 [ 209.093137] ---[ end trace 7b8a6faa27868861
> ]--- [ 209.094546] Kernel panic - not syncing: Fatal exception
> [ 209.096910] Kernel Offset: disabled [ 209.098291] ---[ end Kernel
> panic - not syncing: Fatal exception ]---
>
> Thanks
>
> >
> > Thanks
>
>
On 05/30/2018 10:02 AM, jackm wrote:
> On Tue, 29 May 2018 10:40:32 -0600
> Jason Gunthorpe <[email protected]> wrote:
>
>> On Tue, May 29, 2018 at 06:16:14PM +0200, Håkon Bugge wrote:
>>>
>>>> On 29 May 2018, at 17:49, Jason Gunthorpe <[email protected]> wrote:
>>>>
>>>> On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry
>>>> wrote:
>>>>> The agent TID is a 64 bit value split in two dwords. The least
>>>>> significant dword is the TID running counter. The most
>>>>> significant dword is the agent number. In the CX-3 shared port
>>>>> model, the mlx4 driver uses the most significant byte of the
>>>>> agent number to store the slave number, making agent numbers
>>>>> greater and equal to 2^24 (3 bytes) unusable.
>>>> There is no reason for this to be an ida, just do something like
>>>>
>>>> mad_agent_priv->agent.hi_tid =
>>>> atomic_inc_return(&ib_mad_client_id) &
>>>> mad_agent_priv->ib_dev->tid_mask;
>>>>
>>>> And have the driver set tid_mask to 3 bytes of 0xFF
>>> The issue is that some of the mad agents are long-lived, so you will
>>> wrap and use the same TID twice.
>> We already have that problem, and using ida is problematic because we
>> need to maximize the time between TID re-use, which ida isn't doing.
>>
>> Preventing re-use seems like a seperate issue from limiting the range
>> to be compatible with mlx4.
>>
> Preventing immediate re-use can be accomplished by judicious use of the
> start argument (second argument) in the call to ida_simple_get (to
> introduce hysteresis into the id allocations).
>
> For example, can do something like:
>
> static atomic_t ib_mad_client_id_min = ATOMIC_INIT(1);
> ...
> ib_mad_client_id = ida_simple_get(&ib_mad_client_ids,
> atomic_read(&ib_mad_client_id_min),
> ib_mad_sysctl_client_id_max,
> GFP_KERNEL);
> ....
> if (!(ib_mad_client_id % 1000) ||
> ib_mad_sysctl_client_id_max - ib_mad_client_id <= 1000)
> atomic_set(&ib_mad_client_id_min, 1);
> else
> atomic_set(&ib_mad_client_id_min, ib_mad_client_id + 1);
>
> The above avoids immediate re-use of ids, and only searches for past
> freed ids if the last allocated-id is zero mod 1000.
>
> This is just suggestion -- will probably need some variation of the
> above to handle what happens over time (i.e., to not depend on the
> modulo operation to reset the search start to 1), to properly handle
> how we deal with the start value when we are close to the allowed
> client_id_max, and also to implement some sort of locking.
>
> -Jack
>
We came up with this code snippet which we think handles both preventing
immediate re-use and too big/wrapping...
max = mad_agent_priv->ib_dev->tid_max;
start = atomic_inc_return(&start);
retry:
tid = ida_simple_get(&ib_mad_client_ids, start, max, GFP_KERNEL);
if (unlikely(tid == -ENOSPC)) {
spin_lock_irq_save();
tid = ida_simple_get(&ib_mad_client_ids, start, max,
GFP_ATOMIC);
if (unlikely(tid == -ENOSPC)) {
atomic_set(&start, 1);
spin_unlock_irq_restore();
goto retry;
}
spin_unlock_irq_restore();
}
Hans
On Wed, May 30, 2018 at 02:22:56PM +0200, Hans Westgaard Ry wrote:
> We came up with this code snippet which we think handles both preventing
> immediate re-use and too big/wrapping...
Isn't this basically the same as idr_alloc_cyclic ?
Jason
On Wed, May 30, 2018 at 09:32:02AM +0200, Håkon Bugge wrote:
>
>
> > On 29 May 2018, at 18:40, Jason Gunthorpe <[email protected]> wrote:
> >
> > On Tue, May 29, 2018 at 06:16:14PM +0200, Håkon Bugge wrote:
> >>
> >>> On 29 May 2018, at 17:49, Jason Gunthorpe <[email protected]> wrote:
> >>>
> >>> On Tue, May 29, 2018 at 09:38:08AM +0200, Hans Westgaard Ry wrote:
> >>>> The agent TID is a 64 bit value split in two dwords. The least
> >>>> significant dword is the TID running counter. The most significant
> >>>> dword is the agent number. In the CX-3 shared port model, the mlx4
> >>>> driver uses the most significant byte of the agent number to store the
> >>>> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
> >>>> unusable.
> >>>
> >>> There is no reason for this to be an ida, just do something like
> >>>
> >>> mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id) & mad_agent_priv->ib_dev->tid_mask;
> >>>
> >>> And have the driver set tid_mask to 3 bytes of 0xFF
> >>
> >> The issue is that some of the mad agents are long-lived, so you will
> >> wrap and use the same TID twice.
> >
> > We already have that problem, and using ida is problematic because we
> > need to maximize the time between TID re-use, which ida isn't doing.
>
> Initially, I thought that too, but consulted the spec which states:
>
> C13-18.1.1: When initiating a new operation, MADHeader:TransactionID
> shall be set to such a value that within that MAD the combination of TID,
> SGID, and MgmtClass is different from that of any other currently executing
> operation. []
>
> "currently executing operation" that is.
'currently executing operation' encompasses a wide range of behaviors
and it is not just 'completing the MAD at the local node'
You have to exceed all the timeouts before a single node can be
certain that the condition is satisfied.
This is why we just simply maximize the time between allocations, and
encourage the user providing the rest of the tid to do the same.
> > Preventing re-use seems like a seperate issue from limiting the range
> > to be compatible with mlx4.
>
> Yes. But a v2 of Hans' patch using idr_alloc_cyclic() solves both issues.
That might work, but still have to get rid of the sysctl
Jason
> On 30 May 2018, at 17:10, Jason Gunthorpe <[email protected]> wrote:
>
> On Wed, May 30, 2018 at 02:22:56PM +0200, Hans Westgaard Ry wrote:
>
>> We came up with this code snippet which we think handles both preventing
>> immediate re-use and too big/wrapping...
>
> Isn't this basically the same as idr_alloc_cyclic ?
I draw my statement back. The idr_alloc_cyclic() is the family of idr's that associates a pointer with the bit. Hence, each bit is a bit + 64b.
That's why we ended up with Hans' pseudo code.
Thxs, Håkon
>
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, May 30, 2018 at 10:07:16PM +0200, Håkon Bugge wrote:
>
>
> > On 30 May 2018, at 17:10, Jason Gunthorpe <[email protected]> wrote:
> >
> > On Wed, May 30, 2018 at 02:22:56PM +0200, Hans Westgaard Ry wrote:
> >
> >> We came up with this code snippet which we think handles both preventing
> >> immediate re-use and too big/wrapping...
> >
> > Isn't this basically the same as idr_alloc_cyclic ?
>
> I draw my statement back. The idr_alloc_cyclic() is the family of idr's that associates a pointer with the bit. Hence, each bit is a bit + 64b.
>
> That's why we ended up with Hans' pseudo code.
Okay, fair enough.
Is it worth adding a ida_get_new_cyclic for this?
Jason
> On 31 May 2018, at 00:09, Jason Gunthorpe <[email protected]> wrote:
>
> On Wed, May 30, 2018 at 10:07:16PM +0200, Håkon Bugge wrote:
>>
>>
>>> On 30 May 2018, at 17:10, Jason Gunthorpe <[email protected]> wrote:
>>>
>>> On Wed, May 30, 2018 at 02:22:56PM +0200, Hans Westgaard Ry wrote:
>>>
>>>> We came up with this code snippet which we think handles both preventing
>>>> immediate re-use and too big/wrapping...
>>>
>>> Isn't this basically the same as idr_alloc_cyclic ?
>>
>> I draw my statement back. The idr_alloc_cyclic() is the family of idr's that associates a pointer with the bit. Hence, each bit is a bit + 64b.
>>
>> That's why we ended up with Hans' pseudo code.
>
> Okay, fair enough.
>
> Is it worth adding a ida_get_new_cyclic for this?
My initial reaction was "no", but then I found the same cyclic behaviour in drivers/net/ipvlan/ipvlan_main.c
Then in my opinion, it makes sense to create a ida_simple_get_cyclic()
Thxs, Håkon
The agent TID is a 64 bit value split in two dwords. The least
significant dword is the TID running counter. The most significant
dword is the agent number. In the CX-3 shared port model, the mlx4
driver uses the most significant byte of the agent number to store the
slave number, making agent numbers greater and equal to 2^24 (3 bytes)
unusable. The current codebase uses a variable which is incremented
atomically for each new agent number giving too large agent numbers
over time. The IDA set of functions are used instead of the simple
counter approach. This allows re-use of agent numbers.
The signature of the bug is a MAD layer that stops working and the
console is flooded with messages like:
mlx4_ib: egress mad has non-null tid msb:1 class:4 slave:0
Orabug: 25571450
Signed-off-by: Hans Westgaard Ry <[email protected]>
---
drivers/infiniband/core/mad.c | 25 +++++++++++++++++++++----
1 file changed, 21 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index b28452a55a08..c01a2d63ffa2 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
+#include <linux/idr.h>
#include <rdma/ib_cache.h>
#include "mad_priv.h"
@@ -59,8 +60,7 @@ module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
static struct list_head ib_mad_port_list;
-static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
-
+static DEFINE_IDA(ib_mad_client_ids);
/* Port list lock */
static DEFINE_SPINLOCK(ib_mad_port_list_lock);
@@ -212,7 +212,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
int ret2, qpn;
unsigned long flags;
u8 mgmt_class, vclass;
-
+ u32 ib_mad_client_id;
/* Validate parameters */
qpn = get_spl_qp_index(qp_type);
if (qpn == -1) {
@@ -375,9 +375,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
ret = ERR_PTR(ret2);
goto error4;
}
+ ib_mad_client_id = ida_simple_get_cyclic(&ib_mad_client_ids,
+ 1,
+ BIT(24) - 1,
+ GFP_KERNEL);
+ if (ib_mad_client_id < 0) {
+ pr_err("Couldn't allocate agent tid; errcode: %#x\n",
+ ib_mad_client_id);
+ ret = ERR_PTR(ib_mad_client_id);
+ goto error4;
+ }
+ mad_agent_priv->agent.hi_tid = ib_mad_client_id;
spin_lock_irqsave(&port_priv->reg_lock, flags);
- mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id);
/*
* Make sure MAD registration (if supplied)
@@ -428,6 +438,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
error5:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+ ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
+
error4:
kfree(reg_req);
error3:
@@ -576,6 +588,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
{
struct ib_mad_port_private *port_priv;
unsigned long flags;
+ u32 ib_mad_client_id;
/* Note that we could still be handling received MADs */
@@ -587,6 +600,8 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
port_priv = mad_agent_priv->qp_info->port_priv;
cancel_delayed_work(&mad_agent_priv->timed_work);
+ ib_mad_client_id = mad_agent_priv->agent.hi_tid;
+
spin_lock_irqsave(&port_priv->reg_lock, flags);
remove_mad_reg_req(mad_agent_priv);
list_del(&mad_agent_priv->agent_list);
@@ -602,6 +617,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
kfree(mad_agent_priv->reg_req);
kfree(mad_agent_priv);
+ ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
}
static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
@@ -3347,4 +3363,5 @@ int ib_mad_init(void)
void ib_mad_cleanup(void)
{
ib_unregister_client(&mad_client);
+ ida_destroy(&ib_mad_client_ids);
}
--
2.14.3
v2:
Implement ida_get_simple_cyclic and use it when allocating ib_mad_client_ids.
Hans Westgaard Ry (2):
idr: Add ida_simple_get_cyclic
IB/mad: Use ID allocator routines to allocate agent number
drivers/infiniband/core/mad.c | 24 +++++++++++++---
include/linux/idr.h | 8 ++++--
lib/idr.c | 66 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 92 insertions(+), 6 deletions(-)
--
2.14.3
Signed-off-by: Hans Westgaard Ry <[email protected]>
---
include/linux/idr.h | 8 +++++--
lib/idr.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/include/linux/idr.h b/include/linux/idr.h
index e856f4e0ab35..f151ce89124a 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -218,10 +218,12 @@ DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
struct ida {
struct radix_tree_root ida_rt;
+ unsigned int ida_next;
};
-#define IDA_INIT(name) { \
- .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
+#define IDA_INIT(name) { \
+ .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
+ .ida_next = 0, \
}
#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
@@ -232,6 +234,8 @@ void ida_destroy(struct ida *ida);
int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
gfp_t gfp_mask);
+int ida_simple_get_cyclic(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask);
void ida_simple_remove(struct ida *ida, unsigned int id);
static inline void ida_init(struct ida *ida)
diff --git a/lib/idr.c b/lib/idr.c
index 823b813f08f8..3374c3fa4027 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -599,6 +599,72 @@ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
return ret;
}
EXPORT_SYMBOL(ida_simple_get);
+/**
+ * ida_simple_get_cyclic - get a new id.
+ * @ida: the (initialized) ida.
+ * @start: the minimum id (inclusive, < 0x8000000)
+ * @end: the maximum id (exclusive, < 0x8000000 or 0)
+ * @gfp_mask: memory allocation flags
+ *
+ * Allocates an id in the range start <= id < end, or returns -ENOSPC.
+ * On memory allocation failure, returns -ENOMEM.
+ * The search for an unused id will start at the last id allocated and will
+ * wrap around to @start if no free ids are found before reaching @end.
+ *
+ * Compared to ida_get_new_above() this function does its own locking, and
+ * should be used unless there are special requirements.
+ *
+ * Use ida_simple_remove() to get rid of an id.
+ */
+int ida_simple_get_cyclic(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask)
+{
+ int ret, id, next;
+ unsigned int max;
+ unsigned long flags;
+
+ WARN_ON((int)start < 0);
+ WARN_ON((int)end < 0);
+
+ if (end == 0) {
+ max = 0x80000000;
+ } else {
+ WARN_ON(end < start);
+ max = end - 1;
+ }
+
+again:
+ if (!ida_pre_get(ida, gfp_mask))
+ return -ENOMEM;
+
+ spin_lock_irqsave(&simple_ida_lock, flags);
+ next = ida->ida_next;
+ if (next < start || next >= end)
+ next = start;
+
+ ret = ida_get_new_above(ida, next, &id);
+ if (likely(!ret)) {
+ if (unlikely(id >= max)) {
+ ida_remove(ida, id);
+ if (next == start) {
+ ret = -ENOSPC;
+ } else {
+ ida->ida_next = start;
+ spin_unlock_irqrestore(&simple_ida_lock, flags);
+ goto again;
+ }
+ } else {
+ ida->ida_next = id + 1;
+ ret = id;
+ }
+ }
+ spin_unlock_irqrestore(&simple_ida_lock, flags);
+
+ if (unlikely(ret == -EAGAIN))
+ goto again;
+ return ret;
+}
+EXPORT_SYMBOL(ida_simple_get_cyclic);
/**
* ida_simple_remove - remove an allocated id.
--
2.14.3
The agent TID is a 64 bit value split in two dwords. The least
significant dword is the TID running counter. The most significant
dword is the agent number. In the CX-3 shared port model, the mlx4
driver uses the most significant byte of the agent number to store the
slave number, making agent numbers greater and equal to 2^24 (3 bytes)
unusable. The current codebase uses a variable which is incremented
atomically for each new agent number giving too large agent numbers
over time. The IDA set of functions are used instead of the simple
counter approach. This allows re-use of agent numbers.
The signature of the bug is a MAD layer that stops working and the
console is flooded with messages like:
mlx4_ib: egress mad has non-null tid msb:1 class:4 slave:0
Signed-off-by: Hans Westgaard Ry <[email protected]>
---
drivers/infiniband/core/mad.c | 25 +++++++++++++++++++++----
1 file changed, 21 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index b28452a55a08..c01a2d63ffa2 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
+#include <linux/idr.h>
#include <rdma/ib_cache.h>
#include "mad_priv.h"
@@ -59,8 +60,7 @@ module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
static struct list_head ib_mad_port_list;
-static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
-
+static DEFINE_IDA(ib_mad_client_ids);
/* Port list lock */
static DEFINE_SPINLOCK(ib_mad_port_list_lock);
@@ -212,7 +212,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
int ret2, qpn;
unsigned long flags;
u8 mgmt_class, vclass;
-
+ u32 ib_mad_client_id;
/* Validate parameters */
qpn = get_spl_qp_index(qp_type);
if (qpn == -1) {
@@ -375,9 +375,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
ret = ERR_PTR(ret2);
goto error4;
}
+ ib_mad_client_id = ida_simple_get_cyclic(&ib_mad_client_ids,
+ 1,
+ BIT(24) - 1,
+ GFP_KERNEL);
+ if (ib_mad_client_id < 0) {
+ pr_err("Couldn't allocate agent tid; errcode: %#x\n",
+ ib_mad_client_id);
+ ret = ERR_PTR(ib_mad_client_id);
+ goto error4;
+ }
+ mad_agent_priv->agent.hi_tid = ib_mad_client_id;
spin_lock_irqsave(&port_priv->reg_lock, flags);
- mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id);
/*
* Make sure MAD registration (if supplied)
@@ -428,6 +438,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
error5:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+ ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
+
error4:
kfree(reg_req);
error3:
@@ -576,6 +588,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
{
struct ib_mad_port_private *port_priv;
unsigned long flags;
+ u32 ib_mad_client_id;
/* Note that we could still be handling received MADs */
@@ -587,6 +600,8 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
port_priv = mad_agent_priv->qp_info->port_priv;
cancel_delayed_work(&mad_agent_priv->timed_work);
+ ib_mad_client_id = mad_agent_priv->agent.hi_tid;
+
spin_lock_irqsave(&port_priv->reg_lock, flags);
remove_mad_reg_req(mad_agent_priv);
list_del(&mad_agent_priv->agent_list);
@@ -602,6 +617,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
kfree(mad_agent_priv->reg_req);
kfree(mad_agent_priv);
+ ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
}
static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
@@ -3347,4 +3363,5 @@ int ib_mad_init(void)
void ib_mad_cleanup(void)
{
ib_unregister_client(&mad_client);
+ ida_destroy(&ib_mad_client_ids);
}
--
2.14.3
v3:
- Removed Oracle internal bug number information
v2:
- Implement ida_get_simple_cyclic and use it when allocating
ib_mad_client_ids.
Hans Westgaard Ry (2):
idr: Add ida_simple_get_cyclic
IB/mad: Use ID allocator routines to allocate agent number
drivers/infiniband/core/mad.c | 24 +++++++++++++---
include/linux/idr.h | 8 ++++--
lib/idr.c | 66 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 92 insertions(+), 6 deletions(-)
--
2.14.3
Orabug: 25571450
Signed-off-by: Hans Westgaard Ry <[email protected]>
---
include/linux/idr.h | 8 +++++--
lib/idr.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/include/linux/idr.h b/include/linux/idr.h
index e856f4e0ab35..f151ce89124a 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -218,10 +218,12 @@ DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
struct ida {
struct radix_tree_root ida_rt;
+ unsigned int ida_next;
};
-#define IDA_INIT(name) { \
- .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
+#define IDA_INIT(name) { \
+ .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
+ .ida_next = 0, \
}
#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
@@ -232,6 +234,8 @@ void ida_destroy(struct ida *ida);
int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
gfp_t gfp_mask);
+int ida_simple_get_cyclic(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask);
void ida_simple_remove(struct ida *ida, unsigned int id);
static inline void ida_init(struct ida *ida)
diff --git a/lib/idr.c b/lib/idr.c
index 823b813f08f8..3374c3fa4027 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -599,6 +599,72 @@ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
return ret;
}
EXPORT_SYMBOL(ida_simple_get);
+/**
+ * ida_simple_get_cyclic - get a new id.
+ * @ida: the (initialized) ida.
+ * @start: the minimum id (inclusive, < 0x8000000)
+ * @end: the maximum id (exclusive, < 0x8000000 or 0)
+ * @gfp_mask: memory allocation flags
+ *
+ * Allocates an id in the range start <= id < end, or returns -ENOSPC.
+ * On memory allocation failure, returns -ENOMEM.
+ * The search for an unused id will start at the last id allocated and will
+ * wrap around to @start if no free ids are found before reaching @end.
+ *
+ * Compared to ida_get_new_above() this function does its own locking, and
+ * should be used unless there are special requirements.
+ *
+ * Use ida_simple_remove() to get rid of an id.
+ */
+int ida_simple_get_cyclic(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask)
+{
+ int ret, id, next;
+ unsigned int max;
+ unsigned long flags;
+
+ WARN_ON((int)start < 0);
+ WARN_ON((int)end < 0);
+
+ if (end == 0) {
+ max = 0x80000000;
+ } else {
+ WARN_ON(end < start);
+ max = end - 1;
+ }
+
+again:
+ if (!ida_pre_get(ida, gfp_mask))
+ return -ENOMEM;
+
+ spin_lock_irqsave(&simple_ida_lock, flags);
+ next = ida->ida_next;
+ if (next < start || next >= end)
+ next = start;
+
+ ret = ida_get_new_above(ida, next, &id);
+ if (likely(!ret)) {
+ if (unlikely(id >= max)) {
+ ida_remove(ida, id);
+ if (next == start) {
+ ret = -ENOSPC;
+ } else {
+ ida->ida_next = start;
+ spin_unlock_irqrestore(&simple_ida_lock, flags);
+ goto again;
+ }
+ } else {
+ ida->ida_next = id + 1;
+ ret = id;
+ }
+ }
+ spin_unlock_irqrestore(&simple_ida_lock, flags);
+
+ if (unlikely(ret == -EAGAIN))
+ goto again;
+ return ret;
+}
+EXPORT_SYMBOL(ida_simple_get_cyclic);
/**
* ida_simple_remove - remove an allocated id.
--
2.14.3
Why do you need the ID to increment like this? Why can't you just use a unique ID?
> -----Original Message-----
> From: Hans Westgaard Ry [mailto:[email protected]]
> Sent: Thursday, June 7, 2018 7:15 AM
> To: Doug Ledford <[email protected]>; Jason Gunthorpe <[email protected]>;
> Hakon Bugge <[email protected]>; Parav Pandit
> <[email protected]>; Jack Morgenstein <[email protected]>; Pravin
> Shedge <[email protected]>; Matthew Wilcox
> <[email protected]>; Andrew Morton <[email protected]>;
> Jeff Layton <[email protected]>; Wei Wang <[email protected]>; Chris
> Mi <[email protected]>; Eric Biggers <[email protected]>; Rasmus
> Villemoes <[email protected]>; Mel Gorman
> <[email protected]>; [email protected]; linux-
> [email protected]
> Subject: [PATCH v3 2/2] IB/mad: Use ID allocator routines to allocate agent
> number
>
> The agent TID is a 64 bit value split in two dwords. The least
> significant dword is the TID running counter. The most significant
> dword is the agent number. In the CX-3 shared port model, the mlx4
> driver uses the most significant byte of the agent number to store the
> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
> unusable. The current codebase uses a variable which is incremented
> atomically for each new agent number giving too large agent numbers
> over time. The IDA set of functions are used instead of the simple
> counter approach. This allows re-use of agent numbers.
>
> The signature of the bug is a MAD layer that stops working and the
> console is flooded with messages like:
> mlx4_ib: egress mad has non-null tid msb:1 class:4 slave:0
>
> Signed-off-by: Hans Westgaard Ry <[email protected]>
> ---
> drivers/infiniband/core/mad.c | 25 +++++++++++++++++++++----
> 1 file changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index b28452a55a08..c01a2d63ffa2 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -41,6 +41,7 @@
> #include <linux/slab.h>
> #include <linux/module.h>
> #include <linux/security.h>
> +#include <linux/idr.h>
> #include <rdma/ib_cache.h>
>
> #include "mad_priv.h"
> @@ -59,8 +60,7 @@ module_param_named(recv_queue_size,
> mad_recvq_size, int, 0444);
> MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of
> work requests");
>
> static struct list_head ib_mad_port_list;
> -static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
> -
> +static DEFINE_IDA(ib_mad_client_ids);
> /* Port list lock */
> static DEFINE_SPINLOCK(ib_mad_port_list_lock);
>
> @@ -212,7 +212,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct
> ib_device *device,
> int ret2, qpn;
> unsigned long flags;
> u8 mgmt_class, vclass;
> -
> + u32 ib_mad_client_id;
> /* Validate parameters */
> qpn = get_spl_qp_index(qp_type);
> if (qpn == -1) {
> @@ -375,9 +375,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct
> ib_device *device,
> ret = ERR_PTR(ret2);
> goto error4;
> }
> + ib_mad_client_id = ida_simple_get_cyclic(&ib_mad_client_ids,
> + 1,
> + BIT(24) - 1,
> + GFP_KERNEL);
> + if (ib_mad_client_id < 0) {
> + pr_err("Couldn't allocate agent tid; errcode: %#x\n",
> + ib_mad_client_id);
> + ret = ERR_PTR(ib_mad_client_id);
> + goto error4;
> + }
> + mad_agent_priv->agent.hi_tid = ib_mad_client_id;
>
> spin_lock_irqsave(&port_priv->reg_lock, flags);
> - mad_agent_priv->agent.hi_tid =
> atomic_inc_return(&ib_mad_client_id);
>
> /*
> * Make sure MAD registration (if supplied)
> @@ -428,6 +438,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct
> ib_device *device,
> error5:
> spin_unlock_irqrestore(&port_priv->reg_lock, flags);
> ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
> + ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
> +
> error4:
> kfree(reg_req);
> error3:
> @@ -576,6 +588,7 @@ static void unregister_mad_agent(struct
> ib_mad_agent_private *mad_agent_priv)
> {
> struct ib_mad_port_private *port_priv;
> unsigned long flags;
> + u32 ib_mad_client_id;
>
> /* Note that we could still be handling received MADs */
>
> @@ -587,6 +600,8 @@ static void unregister_mad_agent(struct
> ib_mad_agent_private *mad_agent_priv)
> port_priv = mad_agent_priv->qp_info->port_priv;
> cancel_delayed_work(&mad_agent_priv->timed_work);
>
> + ib_mad_client_id = mad_agent_priv->agent.hi_tid;
> +
> spin_lock_irqsave(&port_priv->reg_lock, flags);
> remove_mad_reg_req(mad_agent_priv);
> list_del(&mad_agent_priv->agent_list);
> @@ -602,6 +617,7 @@ static void unregister_mad_agent(struct
> ib_mad_agent_private *mad_agent_priv)
>
> kfree(mad_agent_priv->reg_req);
> kfree(mad_agent_priv);
> + ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
> }
>
> static void unregister_mad_snoop(struct ib_mad_snoop_private
> *mad_snoop_priv)
> @@ -3347,4 +3363,5 @@ int ib_mad_init(void)
> void ib_mad_cleanup(void)
> {
> ib_unregister_client(&mad_client);
> + ida_destroy(&ib_mad_client_ids);
> }
> --
> 2.14.3
> On 7 Jun 2018, at 17:37, Matthew Wilcox <[email protected]> wrote:
>
> Why do you need the ID to increment like this? Why can't you just use a unique ID?
Hans first patch did that, but it was "beaten" up. Turns out, MAD packets can be hiding and lingering in the fabric, hence, when an ID is released after, lets say a timeout, we want to maximize the time until its reuse.
Thxs, Håkon
>
>> -----Original Message-----
>> From: Hans Westgaard Ry [mailto:[email protected]]
>> Sent: Thursday, June 7, 2018 7:15 AM
>> To: Doug Ledford <[email protected]>; Jason Gunthorpe <[email protected]>;
>> Hakon Bugge <[email protected]>; Parav Pandit
>> <[email protected]>; Jack Morgenstein <[email protected]>; Pravin
>> Shedge <[email protected]>; Matthew Wilcox
>> <[email protected]>; Andrew Morton <[email protected]>;
>> Jeff Layton <[email protected]>; Wei Wang <[email protected]>; Chris
>> Mi <[email protected]>; Eric Biggers <[email protected]>; Rasmus
>> Villemoes <[email protected]>; Mel Gorman
>> <[email protected]>; [email protected]; linux-
>> [email protected]
>> Subject: [PATCH v3 2/2] IB/mad: Use ID allocator routines to allocate agent
>> number
>>
>> The agent TID is a 64 bit value split in two dwords. The least
>> significant dword is the TID running counter. The most significant
>> dword is the agent number. In the CX-3 shared port model, the mlx4
>> driver uses the most significant byte of the agent number to store the
>> slave number, making agent numbers greater and equal to 2^24 (3 bytes)
>> unusable. The current codebase uses a variable which is incremented
>> atomically for each new agent number giving too large agent numbers
>> over time. The IDA set of functions are used instead of the simple
>> counter approach. This allows re-use of agent numbers.
>>
>> The signature of the bug is a MAD layer that stops working and the
>> console is flooded with messages like:
>> mlx4_ib: egress mad has non-null tid msb:1 class:4 slave:0
>>
>> Signed-off-by: Hans Westgaard Ry <[email protected]>
>> ---
>> drivers/infiniband/core/mad.c | 25 +++++++++++++++++++++----
>> 1 file changed, 21 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
>> index b28452a55a08..c01a2d63ffa2 100644
>> --- a/drivers/infiniband/core/mad.c
>> +++ b/drivers/infiniband/core/mad.c
>> @@ -41,6 +41,7 @@
>> #include <linux/slab.h>
>> #include <linux/module.h>
>> #include <linux/security.h>
>> +#include <linux/idr.h>
>> #include <rdma/ib_cache.h>
>>
>> #include "mad_priv.h"
>> @@ -59,8 +60,7 @@ module_param_named(recv_queue_size,
>> mad_recvq_size, int, 0444);
>> MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of
>> work requests");
>>
>> static struct list_head ib_mad_port_list;
>> -static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
>> -
>> +static DEFINE_IDA(ib_mad_client_ids);
>> /* Port list lock */
>> static DEFINE_SPINLOCK(ib_mad_port_list_lock);
>>
>> @@ -212,7 +212,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct
>> ib_device *device,
>> int ret2, qpn;
>> unsigned long flags;
>> u8 mgmt_class, vclass;
>> -
>> + u32 ib_mad_client_id;
>> /* Validate parameters */
>> qpn = get_spl_qp_index(qp_type);
>> if (qpn == -1) {
>> @@ -375,9 +375,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct
>> ib_device *device,
>> ret = ERR_PTR(ret2);
>> goto error4;
>> }
>> + ib_mad_client_id = ida_simple_get_cyclic(&ib_mad_client_ids,
>> + 1,
>> + BIT(24) - 1,
>> + GFP_KERNEL);
>> + if (ib_mad_client_id < 0) {
>> + pr_err("Couldn't allocate agent tid; errcode: %#x\n",
>> + ib_mad_client_id);
>> + ret = ERR_PTR(ib_mad_client_id);
>> + goto error4;
>> + }
>> + mad_agent_priv->agent.hi_tid = ib_mad_client_id;
>>
>> spin_lock_irqsave(&port_priv->reg_lock, flags);
>> - mad_agent_priv->agent.hi_tid =
>> atomic_inc_return(&ib_mad_client_id);
>>
>> /*
>> * Make sure MAD registration (if supplied)
>> @@ -428,6 +438,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct
>> ib_device *device,
>> error5:
>> spin_unlock_irqrestore(&port_priv->reg_lock, flags);
>> ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
>> + ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
>> +
>> error4:
>> kfree(reg_req);
>> error3:
>> @@ -576,6 +588,7 @@ static void unregister_mad_agent(struct
>> ib_mad_agent_private *mad_agent_priv)
>> {
>> struct ib_mad_port_private *port_priv;
>> unsigned long flags;
>> + u32 ib_mad_client_id;
>>
>> /* Note that we could still be handling received MADs */
>>
>> @@ -587,6 +600,8 @@ static void unregister_mad_agent(struct
>> ib_mad_agent_private *mad_agent_priv)
>> port_priv = mad_agent_priv->qp_info->port_priv;
>> cancel_delayed_work(&mad_agent_priv->timed_work);
>>
>> + ib_mad_client_id = mad_agent_priv->agent.hi_tid;
>> +
>> spin_lock_irqsave(&port_priv->reg_lock, flags);
>> remove_mad_reg_req(mad_agent_priv);
>> list_del(&mad_agent_priv->agent_list);
>> @@ -602,6 +617,7 @@ static void unregister_mad_agent(struct
>> ib_mad_agent_private *mad_agent_priv)
>>
>> kfree(mad_agent_priv->reg_req);
>> kfree(mad_agent_priv);
>> + ida_simple_remove(&ib_mad_client_ids, ib_mad_client_id);
>> }
>>
>> static void unregister_mad_snoop(struct ib_mad_snoop_private
>> *mad_snoop_priv)
>> @@ -3347,4 +3363,5 @@ int ib_mad_init(void)
>> void ib_mad_cleanup(void)
>> {
>> ib_unregister_client(&mad_client);
>> + ida_destroy(&ib_mad_client_ids);
>> }
>> --
>> 2.14.3
>
> N§ēæėrļyúčØbēXŽķĮ§vØ^)Þš{.nĮ+·Ĩ{ąŲ{ayšĘÚë,jĒfĢĒ·hāzđŪwĨĒļĒ·Ķj:+vĻwčjØmķĸūŦęįzZ+ųÝĒj"ú!
On Thu, Jun 07, 2018 at 01:14:34PM +0200, Hans Westgaard Ry wrote:
> Signed-off-by: Hans Westgaard Ry <[email protected]>
> ---
> include/linux/idr.h | 8 +++++--
> lib/idr.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 72 insertions(+), 2 deletions(-)
If you are going to do this you should convert the other places that
open coded this as well..
Jason