2006-09-15 23:50:28

by linas

[permalink] [raw]
Subject: [PATCH 0/4]: PowerPC: EEH: Add support for MMIO enabled recovery step


Paul,

Please apply and submit upstream. These patches are not urgent,
and are mostly harmless.

The following set of patches add support for the MMIO-enabled
EEH recovery path, that is, for the path through te PCI error
recovery code where the device driver claims it can recover
on its own, and just wants MMIO enabled.

Actually, the first two patches are misc EEH cleanup,
the last two implement this function.

--linas


2006-09-15 23:55:37

by linas

[permalink] [raw]
Subject: [PATCH 1/4]: PowerPC: EEH: balance pcidev_get/put calls



This patch corrects a pci_dev get/put imbalance that can occur
only in highly unlikely situations (kmalloc failures, pci devices
with overlapping resource addresses). No actual failures seen,
this was spotted during code review.

Signed-off-by: Linas Vepstas <[email protected]>

----
arch/powerpc/platforms/pseries/eeh_cache.c | 17 ++---------------
1 file changed, 2 insertions(+), 15 deletions(-)

Index: linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh_cache.c
===================================================================
--- linux-2.6.18-rc7-git1.orig/arch/powerpc/platforms/pseries/eeh_cache.c 2006-09-14 13:13:57.000000000 -0500
+++ linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh_cache.c 2006-09-14 13:52:34.000000000 -0500
@@ -157,6 +157,7 @@ pci_addr_cache_insert(struct pci_dev *de
if (!piar)
return NULL;

+ pci_dev_get(dev);
piar->addr_lo = alo;
piar->addr_hi = ahi;
piar->pcidev = dev;
@@ -178,7 +179,6 @@ static void __pci_addr_cache_insert_devi
struct device_node *dn;
struct pci_dn *pdn;
int i;
- int inserted = 0;

dn = pci_device_to_OF_node(dev);
if (!dn) {
@@ -197,9 +197,6 @@ static void __pci_addr_cache_insert_devi
return;
}

- /* The cache holds a reference to the device... */
- pci_dev_get(dev);
-
/* Walk resources on this device, poke them into the tree */
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
unsigned long start = pci_resource_start(dev,i);
@@ -212,12 +209,7 @@ static void __pci_addr_cache_insert_devi
if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
continue;
pci_addr_cache_insert(dev, start, end, flags);
- inserted = 1;
}
-
- /* If there was nothing to add, the cache has no reference... */
- if (!inserted)
- pci_dev_put(dev);
}

/**
@@ -240,7 +232,6 @@ void pci_addr_cache_insert_device(struct
static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
{
struct rb_node *n;
- int removed = 0;

restart:
n = rb_first(&pci_io_addr_cache_root.rb_root);
@@ -250,16 +241,12 @@ restart:

if (piar->pcidev == dev) {
rb_erase(n, &pci_io_addr_cache_root.rb_root);
- removed = 1;
+ pci_dev_put(piar->pcidev);
kfree(piar);
goto restart;
}
n = rb_next(n);
}
-
- /* The cache no longer holds its reference to this device... */
- if (removed)
- pci_dev_put(dev);
}

/**

2006-09-15 23:56:38

by linas

[permalink] [raw]
Subject: [PATCH 2/4]: PowerPC: EEH: code comment cleanup


Clean up subroutine documentation; mostly formatting
changes, with some new content.

Signed-off-by: Linas Vepstas <[email protected]>

----
arch/powerpc/platforms/pseries/eeh.c | 19 ++++++++++++++-----
arch/powerpc/platforms/pseries/eeh_driver.c | 27 +++++++++++++++++++++------
2 files changed, 35 insertions(+), 11 deletions(-)

Index: linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh.c
===================================================================
--- linux-2.6.18-rc7-git1.orig/arch/powerpc/platforms/pseries/eeh.c 2006-09-14 14:07:43.000000000 -0500
+++ linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh.c 2006-09-14 14:44:40.000000000 -0500
@@ -449,7 +449,11 @@ EXPORT_SYMBOL(eeh_check_failure);
/* ------------------------------------------------------------- */
/* The code below deals with error recovery */

-/** Return negative value if a permanent error, else return
+/**
+ * eeh_slot_availability - returns error status of slot
+ * @pdn pci device node
+ *
+ * Return negative value if a permanent error, else return
* a number of milliseconds to wait until the PCI slot is
* ready to be used.
*/
@@ -477,8 +481,10 @@ eeh_slot_availability(struct pci_dn *pdn
return -1;
}

-/** rtas_pci_slot_reset raises/lowers the pci #RST line
- * state: 1/0 to raise/lower the #RST
+/**
+ * rtas_pci_slot_reset - raises/lowers the pci #RST line
+ * @pdn pci device node
+ * @state: 1/0 to raise/lower the #RST
*
* Clear the EEH-frozen condition on a slot. This routine
* asserts the PCI #RST line if the 'state' argument is '1',
@@ -518,8 +524,9 @@ rtas_pci_slot_reset(struct pci_dn *pdn,
}
}

-/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
- * dn -- device node to be reset.
+/**
+ * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
+ * @pdn: pci device node to be reset.
*
* Return 0 if success, else a non-zero value.
*/
@@ -582,6 +589,8 @@ rtas_set_slot_reset(struct pci_dn *pdn)

/**
* __restore_bars - Restore the Base Address Registers
+ * @pdn: pci device node
+ *
* Loads the PCI configuration space base address registers,
* the expansion ROM base address, the latency timer, and etc.
* from the saved values in the device node.
Index: linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh_driver.c
===================================================================
--- linux-2.6.18-rc7-git1.orig/arch/powerpc/platforms/pseries/eeh_driver.c 2006-09-14 14:34:12.000000000 -0500
+++ linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh_driver.c 2006-09-14 14:47:21.000000000 -0500
@@ -77,8 +77,12 @@ static int irq_in_use(unsigned int irq)
}

/* ------------------------------------------------------- */
-/** eeh_report_error - report an EEH error to each device,
- * collect up and merge the device responses.
+/**
+ * eeh_report_error - report pci error to each device driver
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
*/

static void eeh_report_error(struct pci_dev *dev, void *userdata)
@@ -108,8 +112,8 @@ static void eeh_report_error(struct pci_
rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
}

-/** eeh_report_reset -- tell this device that the pci slot
- * has been reset.
+/**
+ * eeh_report_reset - tell device that slot has been reset
*/

static void eeh_report_reset(struct pci_dev *dev, void *userdata)
@@ -132,6 +136,10 @@ static void eeh_report_reset(struct pci_
driver->err_handler->slot_reset(dev);
}

+/**
+ * eeh_report_resume - tell device to resume normal operations
+ */
+
static void eeh_report_resume(struct pci_dev *dev, void *userdata)
{
struct pci_driver *driver = dev->driver;
@@ -148,6 +156,13 @@ static void eeh_report_resume(struct pci
driver->err_handler->resume(dev);
}

+/**
+ * eeh_report_failure - tell device driver that device is dead.
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+
static void eeh_report_failure(struct pci_dev *dev, void *userdata)
{
struct pci_driver *driver = dev->driver;
@@ -190,11 +205,11 @@ static void eeh_report_failure(struct pc

/**
* eeh_reset_device() -- perform actual reset of a pci slot
- * Args: bus: pointer to the pci bus structure corresponding
+ * @bus: pointer to the pci bus structure corresponding
* to the isolated slot. A non-null value will
* cause all devices under the bus to be removed
* and then re-added.
- * pe_dn: pointer to a "Partionable Endpoint" device node.
+ * @pe_dn: pointer to a "Partionable Endpoint" device node.
* This is the top-level structure on which pci
* bus resets can be performed.
*/

2006-09-15 23:57:45

by linas

[permalink] [raw]
Subject: [PATCH 3/4]: PowerPC: EEH: enable MMIO/DMA on frozen slot



Add wrapper around the rtas call to enable MMIO or DMA
on a frozen pci slot.

Signed-off-by: Linas Vepstas <[email protected]>

----
arch/powerpc/platforms/pseries/eeh.c | 29 +++++++++++++++++++++++++++++
include/asm-powerpc/ppc-pci.h | 11 +++++++++++
2 files changed, 40 insertions(+)

Index: linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh.c
===================================================================
--- linux-2.6.18-rc7-git1.orig/arch/powerpc/platforms/pseries/eeh.c 2006-09-14 14:44:40.000000000 -0500
+++ linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh.c 2006-09-14 15:22:23.000000000 -0500
@@ -482,6 +482,35 @@ eeh_slot_availability(struct pci_dn *pdn
}

/**
+ * rtas_pci_enable - enable MMIO or DMA transfers for this slot
+ * @pdn pci device node
+ */
+
+int
+rtas_pci_enable(struct pci_dn *pdn, int function)
+{
+ int config_addr;
+ int rc;
+
+ /* Use PE configuration address, if present */
+ config_addr = pdn->eeh_config_addr;
+ if (pdn->eeh_pe_config_addr)
+ config_addr = pdn->eeh_pe_config_addr;
+
+ rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+ config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid),
+ function);
+
+ if (rc)
+ printk(KERN_WARNING "EEH: Cannot enable function %d, err=%d dn=%s\n",
+ function, rc, pdn->node->full_name);
+
+ return rc;
+}
+
+/**
* rtas_pci_slot_reset - raises/lowers the pci #RST line
* @pdn pci device node
* @state: 1/0 to raise/lower the #RST
Index: linux-2.6.18-rc7-git1/include/asm-powerpc/ppc-pci.h
===================================================================
--- linux-2.6.18-rc7-git1.orig/include/asm-powerpc/ppc-pci.h 2006-09-14 14:44:40.000000000 -0500
+++ linux-2.6.18-rc7-git1/include/asm-powerpc/ppc-pci.h 2006-09-14 15:25:14.000000000 -0500
@@ -69,6 +69,17 @@ struct pci_dev *pci_get_device_by_addr(u
void eeh_slot_error_detail (struct pci_dn *pdn, int severity);

/**
+ * rtas_pci_enableo - enable IO transfers for this slot
+ * @pdn: pci device node
+ * @function: either EEH_THAW_MMIO or EEH_THAW_DMA
+ *
+ * Enable I/O transfers to this slot
+ */
+#define EEH_THAW_MMIO 2
+#define EEH_THAW_DMA 3
+int rtas_pci_enable(struct pci_dn *pdn, int function);
+
+/**
* rtas_set_slot_reset -- unfreeze a frozen slot
*
* Clear the EEH-frozen condition on a slot. This routine

2006-09-15 23:59:04

by linas

[permalink] [raw]
Subject: [PATCH 4/4]: PowerPC: EEH: support MMIO enable recovery step


Update to te PowerPC PCI error recovery code.

Add code to enable MMIO if a device driver reports
that it is capable of recovering on its own. One
anticipated use of this having a device driver
enable MMIO so that it can take a register dump,
which might then be followed by the device driver
requesting a full reset.

Signed-off-by: Linas Vepstas <[email protected]>

----
arch/powerpc/platforms/pseries/eeh_driver.c | 81 ++++++++++++++++++++++------
1 file changed, 64 insertions(+), 17 deletions(-)

Index: linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh_driver.c
===================================================================
--- linux-2.6.18-rc7-git1.orig/arch/powerpc/platforms/pseries/eeh_driver.c 2006-09-14 15:17:15.000000000 -0500
+++ linux-2.6.18-rc7-git1/arch/powerpc/platforms/pseries/eeh_driver.c 2006-09-14 17:54:15.000000000 -0500
@@ -100,14 +100,38 @@ static void eeh_report_error(struct pci_
PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
disable_irq_nosync(dev->irq);
}
- if (!driver->err_handler)
- return;
- if (!driver->err_handler->error_detected)
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected)
return;

rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen);
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
- if (*res == PCI_ERS_RESULT_NEED_RESET) return;
+ if (*res == PCI_ERS_RESULT_DISCONNECT &&
+ rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+}
+
+/**
+ * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
+ */
+
+static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
+{
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver = dev->driver;
+
+ // dev->error_state = pci_channel_mmio_enabled;
+
+ if (!driver ||
+ !driver->err_handler ||
+ !driver->err_handler->mmio_enabled)
+ return;
+
+ rc = driver->err_handler->mmio_enabled (dev);
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
if (*res == PCI_ERS_RESULT_DISCONNECT &&
rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
}
@@ -118,6 +142,7 @@ static void eeh_report_error(struct pci_

static void eeh_report_reset(struct pci_dev *dev, void *userdata)
{
+ enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver = dev->driver;
struct device_node *dn = pci_device_to_OF_node(dev);

@@ -128,12 +153,14 @@ static void eeh_report_reset(struct pci_
PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;
enable_irq(dev->irq);
}
- if (!driver->err_handler)
- return;
- if (!driver->err_handler->slot_reset)
+ if (!driver->err_handler ||
+ !driver->err_handler->slot_reset)
return;

- driver->err_handler->slot_reset(dev);
+ rc = driver->err_handler->slot_reset(dev);
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+ if (*res == PCI_ERS_RESULT_DISCONNECT &&
+ rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
}

/**
@@ -362,23 +389,43 @@ struct pci_dn * handle_eeh_events (struc
goto hard_fail;
}

- /* If any device called out for a reset, then reset the slot */
- if (result == PCI_ERS_RESULT_NEED_RESET) {
- rc = eeh_reset_device(frozen_pdn, NULL);
- if (rc)
- goto hard_fail;
- pci_walk_bus(frozen_bus, eeh_report_reset, NULL);
+ /* If all devices reported they can proceed, then re-enable MMIO */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO);
+
+ if (rc) {
+ result = PCI_ERS_RESULT_NEED_RESET;
+ } else {
+ result = PCI_ERS_RESULT_NONE;
+ pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
+ }
}

- /* If all devices reported they can proceed, the re-enable PIO */
+ /* If all devices reported they can proceed, then re-enable DMA */
if (result == PCI_ERS_RESULT_CAN_RECOVER) {
- /* XXX Not supported; we brute-force reset the device */
+ rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA);
+
+ if (rc)
+ result = PCI_ERS_RESULT_NEED_RESET;
+ }
+
+ /* If any device has a hard failure, then shut off everything. */
+ if (result == PCI_ERS_RESULT_DISCONNECT)
+ goto hard_fail;
+
+ /* If any device called out for a reset, then reset the slot */
+ if (result == PCI_ERS_RESULT_NEED_RESET) {
rc = eeh_reset_device(frozen_pdn, NULL);
if (rc)
goto hard_fail;
- pci_walk_bus(frozen_bus, eeh_report_reset, NULL);
+ result = PCI_ERS_RESULT_NONE;
+ pci_walk_bus(frozen_bus, eeh_report_reset, &result);
}

+ /* All devices should claim they have recovered by now. */
+ if (result != PCI_ERS_RESULT_RECOVERED)
+ goto hard_fail;
+
/* Tell all device drivers that they can resume operations */
pci_walk_bus(frozen_bus, eeh_report_resume, NULL);