2008-08-03 18:05:28

by James Bottomley

[permalink] [raw]
Subject: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

This patch uses the new pci_lost_interrupt() callback to note the loss
of an interrupt, and if the reason is MSI, to work around the problem.

I used the manufacturer config page for this, because every fusion type
has that one and its loss means that mpt_config() which is interrupt
driven, failed.

James

---
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index d6a0074..2d75c58 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -168,7 +168,7 @@ static int mpt_readScsiDevicePageHeaders(MPT_ADAPTER *ioc, int portnum);
static void mpt_read_ioc_pg_1(MPT_ADAPTER *ioc);
static void mpt_read_ioc_pg_4(MPT_ADAPTER *ioc);
static void mpt_timer_expired(unsigned long data);
-static void mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
+static int mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
static int SendEventNotification(MPT_ADAPTER *ioc, u8 EvSwitch);
static int SendEventAck(MPT_ADAPTER *ioc, EventNotificationReply_t *evnp);
static int mpt_host_page_access_control(MPT_ADAPTER *ioc, u8 access_control_value, int sleepFlag);
@@ -2052,6 +2052,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
int irq_allocated = 0;
u8 *a;

+ retry:
printk(MYIOC_s_INFO_FMT "Initiating %s\n", ioc->name,
reason == MPT_HOSTEVENT_IOC_BRINGUP ? "bringup" : "recovery");

@@ -2268,6 +2269,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
* and we try GetLanConfigPages again...
*/
if ((ret == 0) && (reason == MPT_HOSTEVENT_IOC_BRINGUP)) {
+ int rc;

/*
* Initalize link list for inactive raid volumes.
@@ -2275,6 +2277,22 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
mutex_init(&ioc->raid_data.inactive_list_mutex);
INIT_LIST_HEAD(&ioc->raid_data.inactive_list);

+ /* May fail becuase of IRQ misrouting */
+ rc = mpt_get_manufacturing_pg_0(ioc);
+ if (rc) {
+ if (pci_lost_interrupt(ioc->pcidev) ==
+ PCI_LOST_IRQ_DISABLE_MSI) {
+ free_irq(ioc->pci_irq, ioc);
+ ioc->msi_enable = 0;
+ pci_disable_msi(ioc->pcidev);
+ goto retry;
+ }
+ printk(MYIOC_s_ERR_FMT "Cannot recover IRQ routing\n",
+ ioc->name);
+ return -1;
+ }
+
+
if (ioc->bus_type == SAS) {

/* clear persistency table */
@@ -2326,7 +2344,6 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
}

GetIoUnitPage2(ioc);
- mpt_get_manufacturing_pg_0(ioc);
}

/*
@@ -5697,13 +5714,14 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc)
return;
}

-static void
+static int
mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
{
CONFIGPARMS cfg;
ConfigPageHeader_t hdr;
dma_addr_t buf_dma;
ManufacturingPage0_t *pbuf = NULL;
+ int ret;

memset(&cfg, 0 , sizeof(CONFIGPARMS));
memset(&hdr, 0 , sizeof(ConfigPageHeader_t));
@@ -5714,20 +5732,23 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
cfg.action = MPI_CONFIG_ACTION_PAGE_HEADER;
cfg.timeout = 10;

- if (mpt_config(ioc, &cfg) != 0)
+ ret = mpt_config(ioc, &cfg);
+ if (ret != 0)
goto out;

if (!cfg.cfghdr.hdr->PageLength)
goto out;

cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT;
+ ret = -ENOMEM;
pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma);
if (!pbuf)
goto out;

cfg.physAddr = buf_dma;

- if (mpt_config(ioc, &cfg) != 0)
+ ret = mpt_config(ioc, &cfg);
+ if (ret != 0)
goto out;

memcpy(ioc->board_name, pbuf->BoardName, sizeof(ioc->board_name));
@@ -5738,6 +5759,7 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)

if (pbuf)
pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma);
+ return ret;
}

/*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/


2008-08-03 22:59:58

by Moore, Eric

[permalink] [raw]
Subject: RE: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

Thanks, I will try this out. However I thought I saw lost interrupts occurring randomly, meaning it was not necessarily the first config page access. I'm back in the office on 8/11, I will test it out then and provide feedback.

Eric

________________________________________
From: James Bottomley [[email protected]]
Sent: Sunday, August 03, 2008 12:05 PM
To: linux-scsi; linux-kernel; [email protected]
Cc: Moore, Eric
Subject: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

This patch uses the new pci_lost_interrupt() callback to note the loss
of an interrupt, and if the reason is MSI, to work around the problem.

I used the manufacturer config page for this, because every fusion type
has that one and its loss means that mpt_config() which is interrupt
driven, failed.

James

---
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index d6a0074..2d75c58 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -168,7 +168,7 @@ static int mpt_readScsiDevicePageHeaders(MPT_ADAPTER *ioc, int portnum);
static void mpt_read_ioc_pg_1(MPT_ADAPTER *ioc);
static void mpt_read_ioc_pg_4(MPT_ADAPTER *ioc);
static void mpt_timer_expired(unsigned long data);
-static void mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
+static int mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
static int SendEventNotification(MPT_ADAPTER *ioc, u8 EvSwitch);
static int SendEventAck(MPT_ADAPTER *ioc, EventNotificationReply_t *evnp);
static int mpt_host_page_access_control(MPT_ADAPTER *ioc, u8 access_control_value, int sleepFlag);
@@ -2052,6 +2052,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
int irq_allocated = 0;
u8 *a;

+ retry:
printk(MYIOC_s_INFO_FMT "Initiating %s\n", ioc->name,
reason == MPT_HOSTEVENT_IOC_BRINGUP ? "bringup" : "recovery");

@@ -2268,6 +2269,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
* and we try GetLanConfigPages again...
*/
if ((ret == 0) && (reason == MPT_HOSTEVENT_IOC_BRINGUP)) {
+ int rc;

/*
* Initalize link list for inactive raid volumes.
@@ -2275,6 +2277,22 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
mutex_init(&ioc->raid_data.inactive_list_mutex);
INIT_LIST_HEAD(&ioc->raid_data.inactive_list);

+ /* May fail becuase of IRQ misrouting */
+ rc = mpt_get_manufacturing_pg_0(ioc);
+ if (rc) {
+ if (pci_lost_interrupt(ioc->pcidev) ==
+ PCI_LOST_IRQ_DISABLE_MSI) {
+ free_irq(ioc->pci_irq, ioc);
+ ioc->msi_enable = 0;
+ pci_disable_msi(ioc->pcidev);
+ goto retry;
+ }
+ printk(MYIOC_s_ERR_FMT "Cannot recover IRQ routing\n",
+ ioc->name);
+ return -1;
+ }
+
+
if (ioc->bus_type == SAS) {

/* clear persistency table */
@@ -2326,7 +2344,6 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
}

GetIoUnitPage2(ioc);
- mpt_get_manufacturing_pg_0(ioc);
}

/*
@@ -5697,13 +5714,14 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc)
return;
}

-static void
+static int
mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
{
CONFIGPARMS cfg;
ConfigPageHeader_t hdr;
dma_addr_t buf_dma;
ManufacturingPage0_t *pbuf = NULL;
+ int ret;

memset(&cfg, 0 , sizeof(CONFIGPARMS));
memset(&hdr, 0 , sizeof(ConfigPageHeader_t));
@@ -5714,20 +5732,23 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
cfg.action = MPI_CONFIG_ACTION_PAGE_HEADER;
cfg.timeout = 10;

- if (mpt_config(ioc, &cfg) != 0)
+ ret = mpt_config(ioc, &cfg);
+ if (ret != 0)
goto out;

if (!cfg.cfghdr.hdr->PageLength)
goto out;

cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT;
+ ret = -ENOMEM;
pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma);
if (!pbuf)
goto out;

cfg.physAddr = buf_dma;

- if (mpt_config(ioc, &cfg) != 0)
+ ret = mpt_config(ioc, &cfg);
+ if (ret != 0)
goto out;

memcpy(ioc->board_name, pbuf->BoardName, sizeof(ioc->board_name));
@@ -5738,6 +5759,7 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)

if (pbuf)
pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma);
+ return ret;
}

/*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

2008-08-03 23:08:00

by James Bottomley

[permalink] [raw]
Subject: RE: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

On Sun, 2008-08-03 at 16:59 -0600, Moore, Eric wrote:
> Thanks, I will try this out. However I thought I saw lost interrupts
> occurring randomly, meaning it was not necessarily the first config
> page access. I'm back in the office on 8/11, I will test it out then
> and provide feedback.

Well, this is the usual trouble. I don't have a misbehaving
motherboard, so I artificially induce MSI problems on my platform. For
me, it works, but obviously I'm losing every MSI interrupt.

James

2008-08-04 13:56:38

by David Vrabel

[permalink] [raw]
Subject: Re: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

Moore, Eric wrote:
> Thanks, I will try this out. However I thought I saw lost
> interrupts occurring randomly, meaning it was not necessarily the
> first config page access. I'm back in the office on 8/11, I will
> test it out then and provide feedback.

Is this using MSI on a device without per-vector mask bits? If so, then
this patch may help.

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd

David
--
David Vrabel, Senior Software Engineer, Drivers
CSR, Churchill House, Cambridge Business Park, Tel: +44 (0)1223 692562
Cowley Road, Cambridge, CB4 0WZ http://www.csr.com/

2008-08-05 16:36:50

by James Bottomley

[permalink] [raw]
Subject: Re: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

On Mon, 2008-08-04 at 14:55 +0100, David Vrabel wrote:
> Moore, Eric wrote:
> > Thanks, I will try this out. However I thought I saw lost
> > interrupts occurring randomly, meaning it was not necessarily the
> > first config page access. I'm back in the office on 8/11, I will
> > test it out then and provide feedback.
>
> Is this using MSI on a device without per-vector mask bits? If so, then
> this patch may help.
>
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd

We don't really know what the problem is. MSI interrupts get lost on
older motherboards (the ones most likely to contain a 1030). Why is
anybody's guess although the clever money is on the motherboard bridge
having issues.

James

2008-08-05 17:39:14

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

On Tuesday, August 05, 2008 9:36 am James Bottomley wrote:
> On Mon, 2008-08-04 at 14:55 +0100, David Vrabel wrote:
> > Moore, Eric wrote:
> > > Thanks, I will try this out. However I thought I saw lost
> > > interrupts occurring randomly, meaning it was not necessarily the
> > > first config page access. I'm back in the office on 8/11, I will
> > > test it out then and provide feedback.
> >
> > Is this using MSI on a device without per-vector mask bits? If so, then
> > this patch may help.
> >
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit
> >;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd
>
> We don't really know what the problem is. MSI interrupts get lost on
> older motherboards (the ones most likely to contain a 1030). Why is
> anybody's guess although the clever money is on the motherboard bridge
> having issues.

David just got us to fix an MSI masking bug recently, which could be related.

The issue is that the PCI & interrupt handling code was disabling MSI during
interrupt handling, which could end up causing missed interrupts. When MSI
is disabled, devices can still generate interrupts, but they'll go out the
interrupt line instead, so unless your IRQ handler is also registered with
that IRQ number, you'll probably lose them.

As of the last PCI upstream merge, we work around this issue by never masking
MSI interrupts unless the device supports the MSI mask bit (as opposed to
just the big hammer enable/disable flag).

Jesse

2008-08-05 20:52:52

by James Bottomley

[permalink] [raw]
Subject: Re: [PATCH 2/2] fusion: Implement generic interrupt misroute handling

On Tue, 2008-08-05 at 10:38 -0700, Jesse Barnes wrote:
> On Tuesday, August 05, 2008 9:36 am James Bottomley wrote:
> > On Mon, 2008-08-04 at 14:55 +0100, David Vrabel wrote:
> > > Moore, Eric wrote:
> > > > Thanks, I will try this out. However I thought I saw lost
> > > > interrupts occurring randomly, meaning it was not necessarily the
> > > > first config page access. I'm back in the office on 8/11, I will
> > > > test it out then and provide feedback.
> > >
> > > Is this using MSI on a device without per-vector mask bits? If so, then
> > > this patch may help.
> > >
> > > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit
> > >;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd
> >
> > We don't really know what the problem is. MSI interrupts get lost on
> > older motherboards (the ones most likely to contain a 1030). Why is
> > anybody's guess although the clever money is on the motherboard bridge
> > having issues.
>
> David just got us to fix an MSI masking bug recently, which could be related.
>
> The issue is that the PCI & interrupt handling code was disabling MSI during
> interrupt handling, which could end up causing missed interrupts. When MSI
> is disabled, devices can still generate interrupts, but they'll go out the
> interrupt line instead, so unless your IRQ handler is also registered with
> that IRQ number, you'll probably lose them.
>
> As of the last PCI upstream merge, we work around this issue by never masking
> MSI interrupts unless the device supports the MSI mask bit (as opposed to
> just the big hammer enable/disable flag).

True, but this is orthogonal.

I'm trying to put together a diagnosing tool. Once we know there's a
problem and who has it, then we can try seeing if patches like this fix
it.

James