All firmware versions on POWER5 systems have a locking issue in the
HCA-related hCalls that can cause loss of Infiniband connectivity if
allocate and free calls happen in parallel. This may, for example, happen
if two processes are using OpenMPI in parallel.
Circumvent this by serializing all HCA-related hCalls on POWER5.
Signed-off-by: Joachim Fenkes <[email protected]>
---
We tested this patch, especially the autodetection, and it works okay.
Please review and apply for 2.6.24-rc5 - thanks!
drivers/infiniband/hw/ehca/ehca_main.c | 16 ++++++++++++++++
drivers/infiniband/hw/ehca/hcp_if.c | 28 +++++++++++-----------------
2 files changed, 27 insertions(+), 17 deletions(-)
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 90d4334..8f33d06 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -43,6 +43,9 @@
#ifdef CONFIG_PPC_64K_PAGES
#include <linux/slab.h>
#endif
+
+#include <asm/cputable.h>
+
#include "ehca_classes.h"
#include "ehca_iverbs.h"
#include "ehca_mrmw.h"
@@ -66,6 +69,7 @@ int ehca_poll_all_eqs = 1;
int ehca_static_rate = -1;
int ehca_scaling_code = 0;
int ehca_mr_largepage = 1;
+int ehca_lock_hcalls = -1;
module_param_named(open_aqp1, ehca_open_aqp1, int, S_IRUGO);
module_param_named(debug_level, ehca_debug_level, int, S_IRUGO);
@@ -77,6 +81,7 @@ module_param_named(poll_all_eqs, ehca_poll_all_eqs, int, S_IRUGO);
module_param_named(static_rate, ehca_static_rate, int, S_IRUGO);
module_param_named(scaling_code, ehca_scaling_code, int, S_IRUGO);
module_param_named(mr_largepage, ehca_mr_largepage, int, S_IRUGO);
+module_param_named(lock_hcalls, ehca_lock_hcalls, bool, S_IRUGO);
MODULE_PARM_DESC(open_aqp1,
"AQP1 on startup (0: no (default), 1: yes)");
@@ -102,6 +107,9 @@ MODULE_PARM_DESC(scaling_code,
MODULE_PARM_DESC(mr_largepage,
"use large page for MR (0: use PAGE_SIZE (default), "
"1: use large page depending on MR size");
+MODULE_PARM_DESC(lock_hcalls,
+ "serialize all hCalls made by the driver "
+ "(default: autodetect)");
DEFINE_RWLOCK(ehca_qp_idr_lock);
DEFINE_RWLOCK(ehca_cq_idr_lock);
@@ -924,6 +932,14 @@ int __init ehca_module_init(void)
printk(KERN_INFO "eHCA Infiniband Device Driver "
"(Version " HCAD_VERSION ")\n");
+ /* Autodetect hCall locking -- we can't read the firmware version
+ * directly, but we know that starting with POWER6, all firmware
+ * versions are good.
+ */
+ if (ehca_lock_hcalls == -1)
+ ehca_lock_hcalls = !(cur_cpu_spec->cpu_user_features
+ & PPC_FEATURE_ARCH_2_05);
+
ret = ehca_create_comp_pool();
if (ret) {
ehca_gen_err("Cannot create comp pool.");
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
index c16a213..331b5e8 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.c
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -89,6 +89,7 @@
#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
static DEFINE_SPINLOCK(hcall_lock);
+extern int ehca_lock_hcalls;
static u32 get_longbusy_msecs(int longbusy_rc)
{
@@ -120,26 +121,21 @@ static long ehca_plpar_hcall_norets(unsigned long opcode,
unsigned long arg7)
{
long ret;
- int i, sleep_msecs, do_lock;
- unsigned long flags;
+ int i, sleep_msecs;
+ unsigned long flags = 0;
ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
- /* lock H_FREE_RESOURCE(MR) against itself and H_ALLOC_RESOURCE(MR) */
- if ((opcode == H_FREE_RESOURCE) && (arg7 == 5)) {
- arg7 = 0; /* better not upset firmware */
- do_lock = 1;
- }
-
for (i = 0; i < 5; i++) {
- if (do_lock)
+ /* serialize hCalls to work around firmware issue */
+ if (ehca_lock_hcalls)
spin_lock_irqsave(&hcall_lock, flags);
ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
arg5, arg6, arg7);
- if (do_lock)
+ if (ehca_lock_hcalls)
spin_unlock_irqrestore(&hcall_lock, flags);
if (H_IS_LONG_BUSY(ret)) {
@@ -174,24 +170,22 @@ static long ehca_plpar_hcall9(unsigned long opcode,
unsigned long arg9)
{
long ret;
- int i, sleep_msecs, do_lock;
+ int i, sleep_msecs;
unsigned long flags = 0;
ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9);
- /* lock H_ALLOC_RESOURCE(MR) against itself and H_FREE_RESOURCE(MR) */
- do_lock = ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5));
-
for (i = 0; i < 5; i++) {
- if (do_lock)
+ /* serialize hCalls to work around firmware issue */
+ if (ehca_lock_hcalls)
spin_lock_irqsave(&hcall_lock, flags);
ret = plpar_hcall9(opcode, outs,
arg1, arg2, arg3, arg4, arg5,
arg6, arg7, arg8, arg9);
- if (do_lock)
+ if (ehca_lock_hcalls)
spin_unlock_irqrestore(&hcall_lock, flags);
if (H_IS_LONG_BUSY(ret)) {
@@ -821,7 +815,7 @@ u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
adapter_handle.handle, /* r4 */
mr->ipz_mr_handle.handle, /* r5 */
- 0, 0, 0, 0, 5);
+ 0, 0, 0, 0, 0);
}
u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
--
1.5.2
On Thursday 06 December 2007, Joachim Fenkes wrote:
>         printk(KERN_INFO "eHCA Infiniband Device Driver "
>                "(Version " HCAD_VERSION ")\n");
>
> +       /* Autodetect hCall locking -- we can't read the firmware version
> +        * directly, but we know that starting with POWER6, all firmware
> +        * versions are good.
> +        */
> +       if (ehca_lock_hcalls == -1)
> +               ehca_lock_hcalls = !(cur_cpu_spec->cpu_user_features
> +                                    & PPC_FEATURE_ARCH_2_05);
> +
>         ret = ehca_create_comp_pool();
>         if (ret) {
>                 ehca_gen_err("Cannot create comp pool.");
We already talked about this yesterday, but I still feel that checking the
instruction set of the CPU should not be used to determine whether a
specific device driver implementation is used in the hypervisor.
At the very least, I think you should change this to read the hypervisor
version number from the device tree, though the ideal solution would be
to have the absence of this bug encoded in the device node for the ehca
device itself.
Regarding the performance problem, have you checked whether converting all
your spin_lock_irqsave to spin_lock/spin_lock_irq improves your performance
on the older machines? Maybe it's already fast enough that way.
Arnd <><
> > +               ehca_lock_hcalls = !(cur_cpu_spec->cpu_user_features
> > +                                    & PPC_FEATURE_ARCH_2_05);
> We already talked about this yesterday, but I still feel that checking the
> instruction set of the CPU should not be used to determine whether a
> specific device driver implementation is used in the hypervisor.
I had the same reaction... is testing cpu_user_features really the
best way to detect this issue?
I'll hold off applying this for a few days so you guys can decide the
best thing to do. We'll definitely get some fix into 2.6.24 but we
have time to make a good decision.
> Regarding the performance problem, have you checked whether converting all
> your spin_lock_irqsave to spin_lock/spin_lock_irq improves your performance
> on the older machines? Maybe it's already fast enough that way.
It does seem that the only places that the hcall_lock is taken also
use msleep, so they must always be in process context. So you can
safely just use spin_lock(), right?
- R.
On Thursday 06 December 2007, Roland Dreier wrote:
> > Regarding the performance problem, have you checked whether converting all
> > your spin_lock_irqsave to spin_lock/spin_lock_irq improves your performance
> > on the older machines? Maybe it's already fast enough that way.
>
> It does seem that the only places that the hcall_lock is taken also
> use msleep, so they must always be in process context. So you can
> safely just use spin_lock(), right?
I think it needs some more inspection. The msleep in there is only called
for hcalls that return H_IS_LONG_BUSY(). In theory, you can call
ehca_plpar_hcall_norets() from inside an interrupt handler if the
hcall in question never returns long busy.
Arnd <><
Roland Dreier <[email protected]> wrote on 06.12.2007 19:27:09:
> > > +               ehca_lock_hcalls = !(cur_cpu_spec->cpu_user_features
> > > +                                    & PPC_FEATURE_ARCH_2_05);
>
> > We already talked about this yesterday, but I still feel that checking the
> > instruction set of the CPU should not be used to determine whether a
> > specific device driver implementation is used in the hypervisor.
>
> I had the same reaction... is testing cpu_user_features really the
> best way to detect this issue?
I concur it's not nice, but it was the only feasible method we could find
without adding a "bug fixed" feature flag to the partition<->firmware
interface. The firmware version reported in the OFDT is not a reliable
enough source, and even if it were, it would require a lot of string
parsing and matching against tables.
We're taking this to the firmware architects at the moment, but they're
not very fond of the idea of reporting the absence of bugs through
capability flags, as this could quickly lead to the exhaustion of flag
bits. We'll let the discussion stew for a bit, but if we don't get this
flag, we'll have to resort to the CPU features.
> I'll hold off applying this for a few days so you guys can decide the
> best thing to do. We'll definitely get some fix into 2.6.24 but we
> have time to make a good decision.
Right.
> > Regarding the performance problem, have you checked whether converting all
> > your spin_lock_irqsave to spin_lock/spin_lock_irq improves your performance
> > on the older machines? Maybe it's already fast enough that way.
>
> It does seem that the only places that the hcall_lock is taken also
> use msleep, so they must always be in process context. So you can
> safely just use spin_lock(), right?
As Arnd said, there are hCalls that will never return H_LONG_BUSY_*, such
as H_QUERY_PORT and chums, so they will never sleep. The surrounding
functions, though, are not prepared to be called from interrupt context
(GFP_KERNEL comes to mind), so I agree that a simple spin_lock() will
suffice. Thanks, Arnd, for pointing this out.
We'll keep you guys posted on the feature flag discussion. Until then,
have a nice weekend!
Joachim
> I think it needs some more inspection. The msleep in there is only called
> for hcalls that return H_IS_LONG_BUSY(). In theory, you can call
> ehca_plpar_hcall_norets() from inside an interrupt handler if the
> hcall in question never returns long busy.
Fair enough... according to Documentation/infiniband/core_locking.txt,
the only driver methods that cannot sleep are:
create_ah
modify_ah
query_ah
destroy_ah
bind_mw
post_send
post_recv
poll_cq
req_notify_cq
map_phys_fmr
and I don't think ehca does an hcall from any of those. Of course
there might be other driver-internal code paths that I don't know
about. Maybe do a quick audit and then stick might_sleep() in the
hcall functions to catch any mistakes?
- R.
On Monday 10 December 2007 00:22, Roland Dreier wrote:
> Fair enough... according to Documentation/infiniband/core_locking.txt,
> the only driver methods that cannot sleep are:
>
> [...]
> map_phys_fmr
In fact, we do use hCalls there. Our hardware doesn't actually support FMRs,
so we translate a "map FMR" into a "reallocate PMR", which doesn't work
without hCalls. What's more, the hCalls involved (e.g. H_FREE_RESOURCE)
might well return H_LONG_BUSY, so the whole operation might sleep; no way
around it.
How should we deal with this?
Thanks,
Joachim
Hi, guys,
> We're taking this to the firmware architects at the moment, but they're not
> very fond of the idea of reporting the absence of bugs through capability
> flags, as this could quickly lead to the exhaustion of flag bits. We'll let
> the discussion stew for a bit, but if we don't get this flag, we'll have to
> resort to the CPU features.
The architects have spoken, and we're getting a capability flag for this.
I'll repost my patch with new autodetection code that doesn't involve
checking the processor version.
> > > Regarding the performance problem, have you checked whether converting all
> > > your spin_lock_irqsave to spin_lock/spin_lock_irq improves your performance
> > > on the older machines? Maybe it's already fast enough that way.
> >
> > It does seem that the only places that the hcall_lock is taken also
> > use msleep, so they must always be in process context. So you can
> > safely just use spin_lock(), right?
>
> As Arnd said, there are hCalls that will never return H_LONG_BUSY_*, such as
> H_QUERY_PORT and chums, so they will never sleep. The surrounding functions,
> though, are not prepared to be called from interrupt context (GFP_KERNEL comes
> to mind), so I agree that a simple spin_lock() will suffice. Thanks, Arnd, for
> pointing this out.
As I pointed out in my earlier mail, there's still an issue with
map_phys_fmr possibly sleeping. Let's keep the irqsave for the time being
and revisit this part once we find a solution to map_phys_fmr.
Regards,
Joachim
> > map_phys_fmr
>
> In fact, we do use hCalls there. Our hardware doesn't actually support FMRs,
> so we translate a "map FMR" into a "reallocate PMR", which doesn't work
> without hCalls. What's more, the hCalls involved (e.g. H_FREE_RESOURCE)
> might well return H_LONG_BUSY, so the whole operation might sleep; no way
> around it.
It's a big problem. If you cannot implement FMRs in such a way that
you can handle map_phys_fmr being called in a context that
can't sleep, then I think the only option is to remove your FMR
support. It's an optional device feature, so this should be OK
(although the iSER driver currently seems to depend on a device
supporting FMRs, which is probably going to be a problem with iWARP
support in the future anyway).
The fact that consumers can map FMRs from interrupt context, while
holding locks, etc, is pretty fundamental to the use of FMRs so I
don't see any way around the requirement that map_phys_fmr never
sleep.
- R.
Roland Dreier <[email protected]> wrote on 10.12.2007 22:47:37:
> It's a big problem. If you cannot implement FMRs in such a way that
> you can handle map_phys_fmr being called in a context that
> can't sleep, then I think the only option is to remove your FMR
> support.
That's kind of what I feared you would say =)
> It's an optional device feature, so this should be OK
> (although the iSER driver currently seems to depend on a device
> supporting FMRs, which is probably going to be a problem with iWARP
> support in the future anyway).
I don't feel comfortable removing code from the driver that iSER seems
to depend on. Are there plans to fix this in iSER?
In reality, PHYP rarely ever returns H_LONG_BUSY, and we haven't had any
problems with iSER in the field yet. I admit that our FMR code is
dangerous, but I prefer "dangerous but working for the customer" over "not
working for the customer at all".
Maybe we can agree on keeping the status quo until no more ULPs depend on
FMR, then remove FMR from ehca? If so, we'd also let the _irqsave
spinlocks around hCalls stay in place.
Regards,
Joachim
Joachim Fenkes wrote:
> Roland Dreier <[email protected]> wrote on 10.12.2007 22:47:37:
>> It's an optional device feature, so this should be OK
>> (although the iSER driver currently seems to depend on a device
>> supporting FMRs, which is probably going to be a problem with iWARP
>> support in the future anyway).
> I don't feel comfortable removing code from the driver that iSER seems
> to depend on. Are there plans to fix this in iSER?
What is the fix you suggest, to add a device query that tells you for
which verbs the documentation does not apply? or enhance the code of the
map_phys_fmr verb within the ehca driver to return error if called
from non-sleepable context?
Or.
Or Gerlitz <[email protected]> wrote on 12.12.2007 13:14:25:
> Joachim Fenkes wrote:
> > Roland Dreier <[email protected]> wrote on 10.12.2007 22:47:37:
>
> >> It's an optional device feature, so this should be OK
> >> (although the iSER driver currently seems to depend on a device
> >> supporting FMRs, which is probably going to be a problem with iWARP
> >> support in the future anyway).
>
> > I don't feel comfortable removing code from the driver that iSER seems
> > to depend on. Are there plans to fix this in iSER?
>
> What is the fix you suggest, to add a device query that tells you for
> which verbs the documentation does not apply? or enhance the code of the
> map_phys_fmr verb within the ehca driver to return error if called
> from non-sleepable context?
Roland,
what is your suggestion here?
We could implement both versions Or is proposing, but having both
at the same time sounds like overkill.
Christoph R.
> What is the fix you suggest, to add a device query that tells you for
> which verbs the documentation does not apply? or enhance the code of the
> map_phys_fmr verb within the ehca driver to return error if called
> from non-sleepable context?
I think the right fix for iSER would be to make iSER work even for
devices that don't support FMRs. For example cxgb3 doesn't implement
FMRs so if anyone ever updates iSER to work on iWARP and not just IB,
then this is something that has to be tackled anyway. Then ehca could
just get rid of the FMR support it has.
Roland Dreier wrote:
> I think the right fix for iSER would be to make iSER work even for
> devices that don't support FMRs. For example cxgb3 doesn't implement
> FMRs so if anyone ever updates iSER to work on iWARP and not just IB,
> then this is something that has to be tackled anyway. Then ehca could
> just get rid of the FMR support it has.
OK, The iSER design took into account the case of many initiators
running on strong/modern machines talking to possibly lightweight
embedded target for which the processing cost per I/O at the target side
should be minimized, that is at most --one-- RDMA operation should be
issued by the target to serve an I/O request.
For that end, iSER works with one descriptor (called stag in iWARP and
rkey in IB) per I/O direction sent from the initiator to the target and
hence can't work without some sort of FMR implementation.
The current implementation of the open iscsi initiator makes sure to
issue commands in thread (sleepable) context, see iscsi_xmitworker and
references to it in drivers/scsi/libiscsi.c , so this keeps ehca users
safe for the time being.
Or.
On Dec 13, 2007 12:30 AM, Or Gerlitz <[email protected]> wrote:
> Roland Dreier wrote:
> > I think the right fix for iSER would be to make iSER work even for
> > devices that don't support FMRs. For example cxgb3 doesn't implement
> > FMRs so if anyone ever updates iSER to work on iWARP and not just IB,
> > then this is something that has to be tackled anyway. Then ehca could
> > just get rid of the FMR support it has.
>
> OK, The iSER design took into account the case of many initiators
> running on strong/modern machines talking to possibly lightweight
> embedded target for which the processing cost per I/O at the target side
> should be minimized, that is at most --one-- RDMA operation should be
> issued by the target to serve an I/O request.
>
> For that end, iSER works with one descriptor (called stag in iWARP and
> rkey in IB) per I/O direction sent from the initiator to the target and
> hence can't work without some sort of FMR implementation.
>
> The current implementation of the open iscsi initiator makes sure to
> issue commands in thread (sleepable) context, see iscsi_xmitworker and
> references to it in drivers/scsi/libiscsi.c , so this keeps ehca users
> safe for the time being.
>
> Or.
>
I agree, *some* form of FMR support is important for iSER (and probably
for NFS over RDMA as well). Rather than adding a crippled NO FMR
mode it would make more sense to add support for FMR Work Requests.
I'm not certain what, if any, impact that would have on the Power5 problem,
but that's certainly a cleaner path for iWARP.
[email protected] wrote on 13.12.2007 20:22:49:
> On Dec 13, 2007 12:30 AM, Or Gerlitz <[email protected]> wrote:
> > The current implementation of the open iscsi initiator makes sure to
> > issue commands in thread (sleepable) context, see iscsi_xmitworker and
> > references to it in drivers/scsi/libiscsi.c , so this keeps ehca users
> > safe for the time being.
> I agree, *some* form of FMR support is important for iSER (and probably
> for NFS over RDMA as well). Rather than adding a crippled NO FMR
> mode it would make more sense to add support for FMR Work Requests.
> I'm not certain what, if any, impact that would have on the Power5 problem,
> but that's certainly a cleaner path for iWARP.
Well, FMR WRs wouldn't change the eHCA issue -- the driver would have to
make an hCall in any case, and the architecture says that the hCalls used
in this scenario might return H_LONG_BUSY, causing the driver to sleep. No
way around that. Because of this, eHCA's FMRs are actually standard MRs
with a different API.
If, as Or said, the iSCSI initiator issues commands in sleepable context
anyway, nothing would be lost by using standard MRs as a fallback solution
if FMRs aren't available, would it?
J.
> -----Original Message-----
> From: Joachim Fenkes [mailto:[email protected]]
> Sent: Thursday, December 13, 2007 1:00 PM
> To: Caitlin Bestler
> Cc: Arnd Bergmann; [email protected]; OF-General; LKML;
> [email protected]; Or Gerlitz; Roland Dreier; Stefan Roscher
> Subject: Re: [ofa-general] Re: [ewg] Re: [PATCH] IB/ehca: Serialize
> HCA-related hCalls on POWER5
>
> [email protected] wrote on 13.12.2007 20:22:49:
>
> > On Dec 13, 2007 12:30 AM, Or Gerlitz <[email protected]> wrote:
> > > The current implementation of the open iscsi initiator makes sure to
> > > issue commands in thread (sleepable) context, see iscsi_xmitworker and
> > > references to it in drivers/scsi/libiscsi.c , so this keeps ehca users
> > > safe for the time being.
>
> > I agree, *some* form of FMR support is important for iSER (and probably
> > for NFS over RDMA as well). Rather than adding a crippled NO FMR
> > mode it would make more sense to add support for FMR Work Requests.
> > I'm not certain what, if any, impact that would have on the Power5 problem,
> > but that's certainly a cleaner path for iWARP.
>
> Well, FMR WRs wouldn't change the eHCA issue -- the driver would have to
> make an hCall in any case, and the architecture says that the hCalls used
> in this scenario might return H_LONG_BUSY, causing the driver to sleep. No
> way around that. Because of this, eHCA's FMRs are actually standard MRs
> with a different API.
>
> If, as Or said, the iSCSI initiator issues commands in sleepable context
> anyway, nothing would be lost by using standard MRs as a fallback solution
> if FMRs aren't available, would it?
>
To clarify, an FMR Work Request is simply posted to the SendQ like
any other Work Request (of course the QP has to be privileged, or
it will complete in error). An SQ Post should never block.
But yes, if the current iSCSI initiator always does all call-based
FMRs in a sleepable context, then I would agree that any changes can
wait for the first vendor that wants to support FMR Work Requests.
FMR Work Requests can be pipelined, so anyone with hardware that
supported them would have strong motivation to enable the open
iSCSI initiator to take advantage of this.
"Caitlin Bestler" <[email protected]> wrote on 13.12.2007
22:08:34:
> To clarify, an FMR Work Request is simply posted to the SendQ like
> any other Work Request (of course the QP has to be privileged, or
> it will complete in error). An SQ Post should never block.
This would require hardware support, wouldn't it? eHCA2 doesn't have this
kind of support, so FMR WRs are not an option here.
J.
>To clarify, an FMR Work Request is simply posted to the SendQ like
>any other Work Request (of course the QP has to be privileged, or
>it will complete in error). An SQ Post should never block.
FMRs as defined by the IB spec and those created by Mellanox are not the same.
Unfortunately, they share only the name and acronym. Mellanox FMRs use an
API that is more like that of standard MRs.
- Sean