LinuxLists.cc - [PATCH 0/7] PCI Error Recovery

2005-11-08 23:49:32

Subject: [PATCH 0/7] PCI Error Recovery

Greg,

The following seven patches implement the PCI error reporting and recovery
header and device driver changes as recently discussed, with all requested
changes. These are tested and work well. Please apply.

Signed-off-by: Linas Vepstas <[email protected]>

--linas

2005-11-08 23:54:13

by linas

[permalink] [raw]

Subject: [PATCH 1/7] PCI Error Recovery: header file patch

Please apply.
--------

PCI Error Recovery: header file patch

Various PCI bus errors can be signaled by newer PCI controllers. Recovering
from those errors requires an infrastructure to notify affected device drivers
of the error, and a way of walking through a reset sequence. This patch adds
a set of callbacks to be used by error recovery routines to notify device
drivers of the various stages of recovery.

Signed-off-by: Linas Vepstas <[email protected]>

--
Index: linux-2.6.14-git10/include/linux/pci.h
===================================================================
--- linux-2.6.14-git10.orig/include/linux/pci.h 2005-11-07 17:24:23.048968436 -0600
+++ linux-2.6.14-git10/include/linux/pci.h 2005-11-07 17:42:46.026024245 -0600
@@ -78,6 +78,23 @@
#define PCI_UNKNOWN ((pci_power_t __force) 5)
#define PCI_POWER_ERROR ((pci_power_t __force) -1)

+/** pci_channel_state_t describes connectivity between the CPU and
+ * the PCI device. If some PCI bus between the CPU and the PCI device
+ * has crashed or locked up, that is reflected in this state.
+ */
+typedef int __bitwise pci_channel_state_t;
+
+enum pci_channel_state {
+ /* I/O channel is in normal state */
+ pci_channel_io_normal = (__force pci_channel_state_t) 1,
+
+ /* I/O to channel is blocked */
+ pci_channel_io_frozen = (__force pci_channel_state_t) 2,
+
+ /* PCI card is dead */
+ pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
+};
+
/*
* The pci_dev structure is used to describe PCI devices.
*/
@@ -110,6 +127,7 @@
this is D0-D3, D0 being fully functional,
and D3 being off. */

+ pci_channel_state_t error_state; /* current connectivity state */
struct device dev; /* Generic device interface */

/* device is compatible with these IDs */
@@ -232,6 +250,54 @@
unsigned int use_driver_data:1; /* pci_driver->driver_data is used */
};

+/* ---------------------------------------------------------------- */
+/** PCI Error Recovery System (PCI-ERS). If a PCI device driver provides
+ * a set of callbacks in struct pci_error_handlers, then that device driver
+ * will be notified of PCI bus errors, and will be driven to recovery
+ * when an error occurs.
+ */
+
+typedef int __bitwise pci_ers_result_t;
+
+enum pci_ers_result {
+ /* no result/none/not supported in device driver */
+ PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
+
+ /* Device driver can recover without slot reset */
+ PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
+
+ /* Device driver wants slot to be reset. */
+ PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
+
+ /* Device has completely failed, is unrecoverable */
+ PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4,
+
+ /* Device driver is fully recovered and operational */
+ PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5,
+};
+
+/* PCI bus error event callbacks */
+struct pci_error_handlers
+{
+ /* PCI bus error detected on this device */
+ pci_ers_result_t (*error_detected)(struct pci_dev *dev,
+ enum pci_channel_state error);
+
+ /* MMIO has been re-enabled, but not DMA */
+ pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
+
+ /* PCI Express link has been reset */
+ pci_ers_result_t (*link_reset)(struct pci_dev *dev);
+
+ /* PCI slot has been reset */
+ pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
+
+ /* Device driver may resume normal operations */
+ void (*resume)(struct pci_dev *dev);
+};
+
+/* ---------------------------------------------------------------- */
+
struct module;
struct pci_driver {
struct list_head node;
@@ -245,6 +311,7 @@
int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); /* Enable wake event */
void (*shutdown) (struct pci_dev *dev);

+ struct pci_error_handlers *err_handler;
struct device_driver driver;
struct pci_dynids dynids;
};

2005-11-08 23:56:00

by linas

[permalink] [raw]

Subject: [PATCH 2/7] PCI Error Recovery: IPR SCSI device driver

Please apply.
------

Various PCI bus errors can be signaled by newer PCI controllers. This
patch adds the PCI error recovery callbacks to the IPR SCSI device driver.
The patch has been tested, and appears to work well.

Signed-off-by: Linas Vepstas <[email protected]>
Signed-off-by: Brian King <[email protected]>

--
Index: linux-2.6.14-git10/drivers/scsi/ipr.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/scsi/ipr.c 2005-11-07 17:24:13.000000000 -0600
+++ linux-2.6.14-git10/drivers/scsi/ipr.c 2005-11-07 17:44:35.415656790 -0600
@@ -5328,6 +5328,92 @@
shutdown_type);
}

+/* --------------- PCI Error Recovery infrastructure ----------- */
+/** If the PCI slot is frozen, hold off all i/o
+ * activity; then, as soon as the slot is available again,
+ * initiate an adapter reset.
+ */
+static int ipr_reset_freeze(struct ipr_cmnd *ipr_cmd)
+{
+ /* Disallow new interrupts, avoid loop */
+ ipr_cmd->ioa_cfg->allow_interrupts = 0;
+ list_add_tail(&ipr_cmd->queue, &ipr_cmd->ioa_cfg->pending_q);
+ ipr_cmd->done = ipr_reset_ioa_job;
+ return IPR_RC_JOB_RETURN;
+}
+
+/** ipr_eeh_frozen -- called when the slot has experienced a PCI bus error.
+ * This routine is called to tell us that the PCI bus is down.
+ * Can't do anything here, except put the device driver into a
+ * holding pattern, waiting for the PCI bus to come back.
+ */
+static void ipr_eeh_frozen (struct pci_dev *pdev)
+{
+ unsigned long flags = 0;
+ struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);
+
+ spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
+ _ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_freeze, IPR_SHUTDOWN_NONE);
+ spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
+}
+
+/** ipr_eeh_slot_reset - called when the PCI slot has been reset.
+ *
+ * This routine is called by the PCI error recovery
+ * code after the PCI slot has been reset, just before we
+ * should resume normal operations.
+ */
+static pci_ers_result_t ipr_eeh_slot_reset(struct pci_dev *pdev)
+{
+ unsigned long flags = 0;
+ struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);
+
+ spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
+ _ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_restore_cfg_space,
+ IPR_SHUTDOWN_NONE);
+ spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/** ipr_eeh_perm_failure - called when the PCI bus has permanently
+ * failed. This routine should purge all pending I/O and
+ * shut down the device driver (close and unload).
+ */
+static void ipr_eeh_perm_failure(struct pci_dev *pdev)
+{
+ unsigned long flags = 0;
+ struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);
+
+ spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
+ if (ioa_cfg->sdt_state == WAIT_FOR_DUMP)
+ ioa_cfg->sdt_state = ABORT_DUMP;
+ ioa_cfg->reset_retries = IPR_NUM_RESET_RELOAD_RETRIES;
+ ioa_cfg->in_ioa_bringdown = 1;
+ ipr_initiate_ioa_reset(ioa_cfg, IPR_SHUTDOWN_NONE);
+ spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
+}
+
+static pci_ers_result_t ipr_eeh_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ switch (state) {
+ case pci_channel_io_frozen:
+ ipr_eeh_frozen (pdev);
+ return PCI_ERS_RESULT_NEED_RESET;
+
+ case pci_channel_io_perm_failure:
+ ipr_eeh_perm_failure (pdev);
+ return PCI_ERS_RESULT_DISCONNECT;
+ break;
+ default:
+ break;
+ }
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/* ------------- end of PCI Error Recovery suport ----------- */
+
/**
* ipr_probe_ioa_part2 - Initializes IOAs found in ipr_probe_ioa(..)
* @ioa_cfg: ioa cfg struct
@@ -6065,12 +6151,18 @@
};
MODULE_DEVICE_TABLE(pci, ipr_pci_table);

+static struct pci_error_handlers ipr_err_handler = {
+ .error_detected = ipr_eeh_error_detected,
+ .slot_reset = ipr_eeh_slot_reset,
+};
+
static struct pci_driver ipr_driver = {
.name = IPR_NAME,
.id_table = ipr_pci_table,
.probe = ipr_probe,
.remove = ipr_remove,
.shutdown = ipr_shutdown,
+ .err_handler = &ipr_err_handler,
};

/**

2005-11-08 23:57:28

by linas

[permalink] [raw]

Subject: [PATCH 3/7] PCI Error Recovery: Symbios SCSI device driver

Please apply.

---

Various PCI bus errors can be signaled by newer PCI controllers. This
patch adds the PCI error recovery callbacks to the Symbios SCSI device driver.
The patch has been tested, and appears to work well.

Signed-off-by: Linas Vepstas <[email protected]>

--
Index: linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_glue.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/scsi/sym53c8xx_2/sym_glue.c 2005-10-27 19:02:08.000000000 -0500
+++ linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_glue.c 2005-11-07 17:44:37.766326553 -0600
@@ -686,6 +686,10 @@

if (DEBUG_FLAGS & DEBUG_TINY) printf_debug ("[");

+ /* Avoid spinloop trying to handle interrupts on frozen device */
+ if (np->s.io_state != pci_channel_io_normal)
+ return IRQ_HANDLED;
+
spin_lock_irqsave(np->s.host->host_lock, flags);
sym_interrupt(np);
spin_unlock_irqrestore(np->s.host->host_lock, flags);
@@ -759,6 +763,25 @@
*/
static void sym_eh_timeout(u_long p) { __sym_eh_done((struct scsi_cmnd *)p, 1); }

+static void sym_eeh_timeout(u_long p)
+{
+ struct sym_eh_wait *ep = (struct sym_eh_wait *) p;
+ if (!ep)
+ return;
+ complete(&ep->done);
+}
+
+static void sym_eeh_done(struct sym_eh_wait *ep)
+{
+ if (!ep) /* no waiter registered */
+ return;
+ ep->timed_out = 0; /* clear the pessimistic default set by the waiter */
+ if (!del_timer(&ep->timer)) /* presumably the timer already ran and sym_eeh_timeout() completes */
+ return;
+
+ complete(&ep->done);
+}
+
/*
* Generic method for our eh processing.
* The 'op' argument tells what we have to do.
@@ -799,6 +822,35 @@

/* Try to proceed the operation we have been asked for */
sts = -1;
+
+ /* We may be in an error condition because the PCI bus
+ * went down. In this case, we need to wait until the
+ * PCI bus is reset, the card is reset, and only then
+ * proceed with the scsi error recovery. We'll wait
+ * for 15 seconds for this to happen.
+ */
+#define WAIT_FOR_PCI_RECOVERY 15
+ if (np->s.io_state != pci_channel_io_normal) {
+ struct sym_eh_wait eeh, *eep = &eeh;
+ np->s.io_reset_wait = eep;
+ init_completion(&eep->done);
+ init_timer(&eep->timer);
+ eep->to_do = SYM_EH_DO_WAIT;
+ eep->timer.expires = jiffies + (WAIT_FOR_PCI_RECOVERY*HZ);
+ eep->timer.function = sym_eeh_timeout;
+ eep->timer.data = (u_long)eep;
+ eep->timed_out = 1; /* Be pessimistic for once :) */
+ add_timer(&eep->timer);
+ spin_unlock_irq(np->s.host->host_lock);
+ wait_for_completion(&eep->done);
+ spin_lock_irq(np->s.host->host_lock);
+ if (eep->timed_out) {
+ printk (KERN_ERR "%s: Timed out waiting for PCI reset\n",
+ sym_name(np));
+ }
+ np->s.io_reset_wait = NULL;
+ }
+
switch(op) {
case SYM_EH_ABORT:
sts = sym_abort_scsiio(np, cmd, 1);
@@ -1584,6 +1636,8 @@
np->maxoffs = dev->chip.offset_max;
np->maxburst = dev->chip.burst_max;
np->myaddr = dev->host_id;
+ np->s.io_state = pci_channel_io_normal;
+ np->s.io_reset_wait = NULL;

/*
* Edit its name.
@@ -1916,6 +1970,58 @@
return 1;
}

+/* ------------- PCI Error Recovery infrastructure -------------- */
+/** sym2_io_error_detected() is called when PCI error is detected */
+static pci_ers_result_t sym2_io_error_detected (struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct sym_hcb *np = pci_get_drvdata(pdev);
+
+ np->s.io_state = state;
+ // XXX If slot is permanently frozen, then what?
+ // Should we scsi_remove_host() maybe ??
+
+ /* Request a slot reset. */
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/** sym2_io_slot_reset is called when the PCI bus has been reset.
+ * Restart the card from scratch; give up if it cannot be re-enabled. */
+static pci_ers_result_t sym2_io_slot_reset (struct pci_dev *pdev)
+{
+ struct sym_hcb *np = pci_get_drvdata(pdev);
+
+ printk (KERN_INFO "%s: recovering from a PCI slot reset\n",
+ sym_name(np));
+ if (pci_enable_device(pdev)) {
+ printk (KERN_ERR "%s: device setup failed most egregiously\n",
+ sym_name(np));
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ pci_set_master(pdev);
+ enable_irq (pdev->irq);
+
+ /* Perform host reset only on one instance of the card */
+ if (0 == PCI_FUNC (pdev->devfn))
+ sym_reset_scsi_bus(np, 0);
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/** sym2_io_resume is called when the error recovery driver
+ * tells us that it's OK to resume normal operation.
+ */
+static void sym2_io_resume (struct pci_dev *pdev)
+{
+ struct sym_hcb *np = pci_get_drvdata(pdev);
+
+ /* Perform device startup only once for this card. */
+ if (0 == PCI_FUNC (pdev->devfn))
+ sym_start_up (np, 1);
+
+ np->s.io_state = pci_channel_io_normal;
+ sym_eeh_done (np->s.io_reset_wait);
+}
+
/*
* Driver host template.
*/
@@ -2169,11 +2275,18 @@

MODULE_DEVICE_TABLE(pci, sym2_id_table);

+static struct pci_error_handlers sym2_err_handler = {
+ .error_detected = sym2_io_error_detected,
+ .slot_reset = sym2_io_slot_reset,
+ .resume = sym2_io_resume,
+};
+
static struct pci_driver sym2_driver = {
.name = NAME53C8XX,
.id_table = sym2_id_table,
.probe = sym2_probe,
.remove = __devexit_p(sym2_remove),
+ .err_handler = &sym2_err_handler,
};

static int __init sym2_init(void)
Index: linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_glue.h
===================================================================
--- linux-2.6.14-git10.orig/drivers/scsi/sym53c8xx_2/sym_glue.h 2005-10-27 19:02:08.000000000 -0500
+++ linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_glue.h 2005-11-07 17:44:37.768326272 -0600
@@ -181,6 +181,10 @@
char chip_name[8];
struct pci_dev *device;

+ /* pci bus i/o state; waiter for clearing of i/o state */
+ pci_channel_state_t io_state;
+ struct sym_eh_wait *io_reset_wait;
+
struct Scsi_Host *host;

void __iomem * ioaddr; /* MMIO kernel io address */
Index: linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_hipd.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/scsi/sym53c8xx_2/sym_hipd.c 2005-11-07 17:24:14.000000000 -0600
+++ linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_hipd.c 2005-11-07 17:44:37.813319951 -0600
@@ -2809,6 +2809,7 @@
u_char istat, istatc;
u_char dstat;
u_short sist;
+ u_int icnt;

/*
* interrupt on the fly ?
@@ -2850,6 +2851,7 @@
sist = 0;
dstat = 0;
istatc = istat;
+ icnt = 0;
do {
if (istatc & SIP)
sist |= INW(np, nc_sist);
@@ -2857,6 +2859,19 @@
dstat |= INB(np, nc_dstat);
istatc = INB(np, nc_istat);
istat |= istatc;
+
+ /* Prevent deadlock waiting on a condition that may never clear. */
+ /* XXX this is a temporary kludge; the correct way to detect
+ * a PCI bus error would be to use the io_check interfaces
+ * proposed by Hidetoshi Seto <[email protected]>
+ * Problem with polling like that is the state flag might not
+ * be set.
+ */
+ icnt ++;
+ if (100 < icnt) {
+ if (np->s.device->error_state != pci_channel_io_normal)
+ return;
+ }
} while (istatc & (SIP|DIP));

if (DEBUG_FLAGS & DEBUG_TINY)

2005-11-08 23:59:05

by linas

[permalink] [raw]

Subject: [PATCH 4/7] PCI Error Recovery: e100 network device driver

Please apply.

-----
Various PCI bus errors can be signaled by newer PCI controllers. This
patch adds the PCI error recovery callbacks to the intel ethernet e100
device driver. The patch has been tested, and appears to work well.

Signed-off-by: Linas Vepstas <[email protected]>

--
Index: linux-2.6.14-git10/drivers/net/e100.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/net/e100.c 2005-11-07 17:24:10.000000000 -0600
+++ linux-2.6.14-git10/drivers/net/e100.c 2005-11-07 17:44:42.911603712 -0600
@@ -2465,6 +2465,75 @@
}

+/* ------------------ PCI Error Recovery infrastructure -------------- */
+/** e100_io_error_detected() is called when PCI error is detected */
+static pci_ers_result_t e100_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+
+ /* Same as calling e100_down(netdev_priv(netdev)), but generic */
+ netdev->stop(netdev);
+
+ /* Is a detach needed ?? */
+ // netif_device_detach(netdev);
+
+ /* Request a slot reset. */
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/** e100_io_slot_reset is called after the pci bus has been reset.
+ * Restart the card from scratch. */
+static pci_ers_result_t e100_io_slot_reset(struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct nic *nic = netdev_priv(netdev);
+
+ if(pci_enable_device(pdev)) {
+ printk(KERN_ERR "e100: Cannot re-enable PCI device after reset.\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ pci_set_master(pdev);
+
+ /* Only one device per card can do a reset */
+ if (0 != PCI_FUNC (pdev->devfn))
+ return PCI_ERS_RESULT_RECOVERED;
+
+ e100_hw_reset(nic);
+ e100_phy_init(nic);
+
+ if(e100_hw_init(nic)) {
+ DPRINTK(HW, ERR, "e100_hw_init failed\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/** e100_io_resume is called when the error recovery driver
+ * tells us that it's OK to resume normal operation.
+ */
+static void e100_io_resume(struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct nic *nic = netdev_priv(netdev);
+
+ /* ack any pending wake events, disable PME */
+ pci_enable_wake(pdev, 0, 0);
+
+ netif_device_attach(netdev);
+ if(netif_running(netdev)) {
+ e100_open (netdev);
+ mod_timer(&nic->watchdog, jiffies);
+ }
+}
+
+static struct pci_error_handlers e100_err_handler = {
+ .error_detected = e100_io_error_detected,
+ .slot_reset = e100_io_slot_reset,
+ .resume = e100_io_resume,
+};
+
+
static struct pci_driver e100_driver = {
.name = DRV_NAME,
.id_table = e100_id_table,
@@ -2475,6 +2544,7 @@
.resume = e100_resume,
#endif
.shutdown = e100_shutdown,
+ .err_handler = &e100_err_handler,
};

static int __init e100_init_module(void)

2005-11-09 00:00:46

by linas

[permalink] [raw]

Subject: [PATCH 5/7] PCI Error Recovery: e1000 network device driver

Please apply.

----

Various PCI bus errors can be signaled by newer PCI controllers. This
patch adds the PCI error recovery callbacks to the intel gigabit
ethernet e1000 device driver. The patch has been tested, and appears
to work well.

Signed-off-by: Linas Vepstas <[email protected]>

--
Index: linux-2.6.14-git10/drivers/net/e1000/e1000_main.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/net/e1000/e1000_main.c 2005-11-07 17:24:10.000000000 -0600
+++ linux-2.6.14-git10/drivers/net/e1000/e1000_main.c 2005-11-07 17:44:45.143290190 -0600
@@ -206,6 +206,16 @@
void e1000_rx_schedule(void *data);
#endif

+static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state);
+static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev);
+static void e1000_io_resume(struct pci_dev *pdev);
+
+static struct pci_error_handlers e1000_err_handler = {
+ .error_detected = e1000_io_error_detected,
+ .slot_reset = e1000_io_slot_reset,
+ .resume = e1000_io_resume,
+};
+
/* Exported from other modules */

extern void e1000_check_options(struct e1000_adapter *adapter);
@@ -218,8 +228,9 @@
/* Power Managment Hooks */
#ifdef CONFIG_PM
.suspend = e1000_suspend,
- .resume = e1000_resume
+ .resume = e1000_resume,
#endif
+ .err_handler = &e1000_err_handler,
};

MODULE_AUTHOR("Intel Corporation, <[email protected]>");
@@ -2937,6 +2948,10 @@

#define PHY_IDLE_ERROR_COUNT_MASK 0x00FF

+ /* Prevent stats update while adapter is being reset */
+ if (adapter->link_speed == 0)
+ return;
+
spin_lock_irqsave(&adapter->stats_lock, flags);

/* these counters are modified from e1000_adjust_tbi_stats,
@@ -4358,4 +4373,88 @@
}
#endif

+/* --------------- PCI Error Recovery infrastructure ------------ */
+/** e1000_io_error_detected() is called when PCI error is detected */
+static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct e1000_adapter *adapter = netdev->priv;
+
+ if (netif_running(netdev))
+ e1000_down(adapter);
+
+ /* Request a slot reset. */
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/** e1000_io_slot_reset is called after the PCI bus has been reset.
+ * Restart the card from scratch.
+ * Implementation resembles the first-half of the
+ * e1000_resume routine.
+ */
+static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct e1000_adapter *adapter = netdev->priv;
+
+ if (pci_enable_device(pdev)) {
+ printk(KERN_ERR "e1000: Cannot re-enable PCI device after reset.\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ pci_set_master(pdev);
+
+ pci_enable_wake(pdev, PCI_D3hot, 0);
+ pci_enable_wake(pdev, PCI_D3cold, 0);
+
+ /* Perform card reset only on one instance of the card */
+ if(0 != PCI_FUNC (pdev->devfn))
+ return PCI_ERS_RESULT_RECOVERED;
+
+ e1000_reset(adapter);
+ E1000_WRITE_REG(&adapter->hw, WUS, ~0);
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/** e1000_io_resume is called when the error recovery driver
+ * tells us that it's OK to resume normal operation.
+ * Implementation resembles the second-half of the
+ * e1000_resume routine.
+ */
+static void e1000_io_resume(struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct e1000_adapter *adapter = netdev->priv;
+ uint32_t manc, swsm;
+
+ if(netif_running(netdev)) {
+ if (e1000_up(adapter)) {
+ printk(KERN_ERR "e1000: can't bring device back up after reset\n");
+ return;
+ }
+ }
+
+ netif_device_attach(netdev);
+
+ if(adapter->hw.mac_type >= e1000_82540 &&
+ adapter->hw.media_type == e1000_media_type_copper) {
+ manc = E1000_READ_REG(&adapter->hw, MANC);
+ manc &= ~(E1000_MANC_ARP_EN);
+ E1000_WRITE_REG(&adapter->hw, MANC, manc);
+ }
+
+ switch(adapter->hw.mac_type) {
+ case e1000_82573:
+ swsm = E1000_READ_REG(&adapter->hw, SWSM);
+ E1000_WRITE_REG(&adapter->hw, SWSM,
+ swsm | E1000_SWSM_DRV_LOAD);
+ break;
+ default:
+ break;
+ }
+
+ if(netif_running(netdev))
+ mod_timer(&adapter->watchdog_timer, jiffies);
+}
+
/* e1000_main.c */

2005-11-09 00:02:07

by linas

[permalink] [raw]

Subject: [PATCH 6/7] PCI Error Recovery: ixgb network device driver

Please apply.

----

Various PCI bus errors can be signaled by newer PCI controllers. This
patch adds the PCI error recovery callbacks to the intel ten-gigabit
ethernet ixgb device driver. The patch has been tested, and appears
to work well.

Signed-off-by: Linas Vepstas <[email protected]>

--
Index: linux-2.6.14-git10/drivers/net/ixgb/ixgb_main.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/net/ixgb/ixgb_main.c 2005-11-07 17:24:11.000000000 -0600
+++ linux-2.6.14-git10/drivers/net/ixgb/ixgb_main.c 2005-11-07 17:44:50.380554424 -0600
@@ -132,6 +132,16 @@
static void ixgb_netpoll(struct net_device *dev);
#endif

+static pci_ers_result_t ixgb_io_error_detected (struct pci_dev *pdev, pci_channel_state_t state);
+static pci_ers_result_t ixgb_io_slot_reset (struct pci_dev *pdev);
+static void ixgb_io_resume (struct pci_dev *pdev);
+
+static struct pci_error_handlers ixgb_err_handler = {
+ .error_detected = ixgb_io_error_detected,
+ .slot_reset = ixgb_io_slot_reset,
+ .resume = ixgb_io_resume,
+};
+
/* Exported from other modules */

extern void ixgb_check_options(struct ixgb_adapter *adapter);
@@ -141,6 +151,8 @@
.id_table = ixgb_pci_tbl,
.probe = ixgb_probe,
.remove = __devexit_p(ixgb_remove),
+ .err_handler = &ixgb_err_handler,
+
};

MODULE_AUTHOR("Intel Corporation, <[email protected]>");
@@ -1654,8 +1666,16 @@
unsigned int i;
#endif

+#ifdef XXX_CONFIG_IXGB_EEH_RECOVERY
+ if(unlikely(icr==EEH_IO_ERROR_VALUE(4))) {
+ if (eeh_slot_is_isolated (adapter->pdev))
+ // disable_irq_nosync (adapter->pdev->irq);
+ return IRQ_NONE; /* Not our interrupt */
+ }
+#else
if(unlikely(!icr))
return IRQ_NONE; /* Not our interrupt */
+#endif /* CONFIG_IXGB_EEH_RECOVERY */

if(unlikely(icr & (IXGB_INT_RXSEQ | IXGB_INT_LSC))) {
mod_timer(&adapter->watchdog_timer, jiffies);
@@ -2125,4 +2145,70 @@
}
#endif

+/* -------------- PCI Error Recovery infrastructure ---------------- */
+/** ixgb_io_error_detected() is called when PCI error is detected */
+static pci_ers_result_t ixgb_io_error_detected (struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct ixgb_adapter *adapter = netdev->priv;
+
+ if(netif_running(netdev))
+ ixgb_down(adapter, TRUE);
+
+ /* Request a slot reset. */
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/** ixgb_io_slot_reset is called after the pci bus has been reset.
+ * Restart the card from scratch.
+ * Implementation resembles the first-half of the
+ * ixgb_resume routine.
+ */
+static pci_ers_result_t ixgb_io_slot_reset (struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct ixgb_adapter *adapter = netdev->priv;
+
+ if(pci_enable_device(pdev)) {
+ printk(KERN_ERR "ixgb: Cannot re-enable PCI device after reset.\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ pci_set_master(pdev);
+
+ /* Perform card reset only on one instance of the card */
+ if (0 != PCI_FUNC (pdev->devfn))
+ return PCI_ERS_RESULT_RECOVERED;
+
+ ixgb_reset(adapter);
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/** ixgb_io_resume is called when the error recovery driver
+ * tells us that it's OK to resume normal operation.
+ * Implementation resembles the second-half of the
+ * ixgb_resume routine.
+ */
+static void ixgb_io_resume (struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct ixgb_adapter *adapter = netdev->priv;
+
+ if(netif_running(netdev)) {
+ if(ixgb_up(adapter)) {
+ printk (KERN_ERR "ixgb: can't bring device back up after reset\n");
+ return;
+ }
+ }
+
+ netif_device_attach(netdev);
+ if(netif_running(netdev))
+ mod_timer(&adapter->watchdog_timer, jiffies);
+
+ /* Reading all-ff's from the adapter will completely hose
+ * the counts and statistics. So just clear them out */
+ memset(&adapter->stats, 0, sizeof(struct ixgb_hw_stats));
+ ixgb_update_stats(adapter);
+}
+
/* ixgb_main.c */

2005-11-09 00:03:39

by linas

[permalink] [raw]

Subject: [PATCH 7/7] PCI Error Recovery: CONFIG_PCI_ERROR_RECOVERY wrappers

Please apply.
-----

This OPTIONAL/RFC patch adds ifdef's around the PCI error recovery code in the
various device drivers. This patch is "optional" in that it's a little bit
messy, but it does solve a little problem.

-- The good news: this gives some users (e.g. embedded systems) the option
of not compiling in this code, thus making their device drivers a tiny
bit smaller.

-- The bad news: This also clutters up the drivers with extraneous markup
and the config process with yet another config.

I don't know if this patch is worth it. It's up to you ... :-)

Signed-off-by: Linas Vepstas <[email protected]>

Index: linux-2.6.14-git10/drivers/scsi/ipr.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/scsi/ipr.c 2005-11-07 17:44:35.415656790 -0600
+++ linux-2.6.14-git10/drivers/scsi/ipr.c 2005-11-07 17:44:56.315720610 -0600
@@ -5329,6 +5329,8 @@
}

/* --------------- PCI Error Recovery infrastructure ----------- */
+#ifdef CONFIG_PCI_ERROR_RECOVERY
+
/** If the PCI slot is frozen, hold off all i/o
* activity; then, as soon as the slot is available again,
* initiate an adapter reset.
@@ -5412,6 +5414,7 @@
return PCI_ERS_RESULT_NEED_RESET;
}

+#endif /* CONFIG_PCI_ERROR_RECOVERY */
/* ------------- end of PCI Error Recovery suport ----------- */

/**
@@ -6151,10 +6154,12 @@
};
MODULE_DEVICE_TABLE(pci, ipr_pci_table);

+#ifdef CONFIG_PCI_ERROR_RECOVERY
static struct pci_error_handlers ipr_err_handler = {
.error_detected = ipr_eeh_error_detected,
.slot_reset = ipr_eeh_slot_reset,
};
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

static struct pci_driver ipr_driver = {
.name = IPR_NAME,
@@ -6162,7 +6167,9 @@
.probe = ipr_probe,
.remove = ipr_remove,
.shutdown = ipr_shutdown,
+#ifdef CONFIG_PCI_ERROR_RECOVERY
.err_handler = &ipr_err_handler,
+#endif /* CONFIG_PCI_ERROR_RECOVERY */
};

/**
Index: linux-2.6.14-git10/drivers/pci/Kconfig
===================================================================
--- linux-2.6.14-git10.orig/drivers/pci/Kconfig 2005-10-27 19:02:08.000000000 -0500
+++ linux-2.6.14-git10/drivers/pci/Kconfig 2005-11-07 17:44:56.327718924 -0600
@@ -13,6 +13,21 @@

If you don't know what to do here, say N.

+config PCI_ERROR_RECOVERY
+ bool "PCI Error Recovery support"
+ depends on PCI
+ depends on PPC_PSERIES
+ default y
+ help
+ PCI Error Recovery is a mechanism by which crashed/hung
+ PCI adapters are automatically detected and rebooted without
+ otherwise disturbing the operation of the system. Support
+ for this recovery requires special PCI bridge chips (some
+ PCI-E chips may have this support) as well as support in
+ the device drivers (not all device drivers can handle this).
+
+ When in doubt, say Y.
+
config PCI_LEGACY_PROC
bool "Legacy /proc/pci interface"
depends on PCI
Index: linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_glue.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/scsi/sym53c8xx_2/sym_glue.c 2005-11-07 17:44:37.766326553 -0600
+++ linux-2.6.14-git10/drivers/scsi/sym53c8xx_2/sym_glue.c 2005-11-07 17:44:56.332718222 -0600
@@ -763,6 +763,7 @@
*/
static void sym_eh_timeout(u_long p) { __sym_eh_done((struct scsi_cmnd *)p, 1); }

+#ifdef CONFIG_PCI_ERROR_RECOVERY
static void sym_eeh_timeout(u_long p)
{
struct sym_eh_wait *ep = (struct sym_eh_wait *) p;
@@ -781,6 +782,7 @@

complete(&ep->done);
}
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

/*
* Generic method for our eh processing.
@@ -823,6 +825,7 @@
/* Try to proceed the operation we have been asked for */
sts = -1;

+#ifdef CONFIG_PCI_ERROR_RECOVERY
/* We may be in an error condition because the PCI bus
* went down. In this case, we need to wait until the
* PCI bus is reset, the card is reset, and only then
@@ -850,6 +853,7 @@
}
np->s.io_reset_wait = NULL;
}
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

switch(op) {
case SYM_EH_ABORT:
@@ -1971,6 +1975,7 @@
}

/* ------------- PCI Error Recovery infrastructure -------------- */
+#ifdef CONFIG_PCI_ERROR_RECOVERY
/** sym2_io_error_detected() is called when PCI error is detected */
static pci_ers_result_t sym2_io_error_detected (struct pci_dev *pdev, pci_channel_state_t state)
{
@@ -2021,6 +2026,7 @@
np->s.io_state = pci_channel_io_normal;
sym_eeh_done (np->s.io_reset_wait);
}
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

/*
* Driver host template.
@@ -2275,18 +2281,22 @@

MODULE_DEVICE_TABLE(pci, sym2_id_table);

+#ifdef CONFIG_PCI_ERROR_RECOVERY
static struct pci_error_handlers sym2_err_handler = {
.error_detected = sym2_io_error_detected,
.slot_reset = sym2_io_slot_reset,
.resume = sym2_io_resume,
};
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

static struct pci_driver sym2_driver = {
.name = NAME53C8XX,
.id_table = sym2_id_table,
.probe = sym2_probe,
.remove = __devexit_p(sym2_remove),
+#ifdef CONFIG_PCI_ERROR_RECOVERY
.err_handler = &sym2_err_handler,
+#endif /* CONFIG_PCI_ERROR_RECOVERY */
};

static int __init sym2_init(void)
Index: linux-2.6.14-git10/drivers/net/e100.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/net/e100.c 2005-11-07 17:44:42.911603712 -0600
+++ linux-2.6.14-git10/drivers/net/e100.c 2005-11-07 17:44:56.337717520 -0600
@@ -2466,6 +2466,7 @@

/* ------------------ PCI Error Recovery infrastructure -------------- */
+#ifdef CONFIG_PCI_ERROR_RECOVERY
/** e100_io_error_detected() is called when PCI error is detected */
static pci_ers_result_t e100_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
@@ -2532,6 +2533,7 @@
.slot_reset = e100_io_slot_reset,
.resume = e100_io_resume,
};
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

static struct pci_driver e100_driver = {
@@ -2544,7 +2546,9 @@
.resume = e100_resume,
#endif
.shutdown = e100_shutdown,
+#ifdef CONFIG_PCI_ERROR_RECOVERY
.err_handler = &e100_err_handler,
+#endif /* CONFIG_PCI_ERROR_RECOVERY */
};

static int __init e100_init_module(void)
Index: linux-2.6.14-git10/drivers/net/e1000/e1000_main.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/net/e1000/e1000_main.c 2005-11-07 17:44:45.143290190 -0600
+++ linux-2.6.14-git10/drivers/net/e1000/e1000_main.c 2005-11-07 17:44:56.344716537 -0600
@@ -206,6 +206,7 @@
void e1000_rx_schedule(void *data);
#endif

+#ifdef CONFIG_PCI_ERROR_RECOVERY
static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state);
static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev);
static void e1000_io_resume(struct pci_dev *pdev);
@@ -215,6 +216,7 @@
.slot_reset = e1000_io_slot_reset,
.resume = e1000_io_resume,
};
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

/* Exported from other modules */

@@ -230,7 +232,9 @@
.suspend = e1000_suspend,
.resume = e1000_resume,
#endif
+#ifdef CONFIG_PCI_ERROR_RECOVERY
.err_handler = &e1000_err_handler,
+#endif /* CONFIG_PCI_ERROR_RECOVERY */
};

MODULE_AUTHOR("Intel Corporation, <[email protected]>");
@@ -4374,6 +4378,7 @@
#endif

/* --------------- PCI Error Recovery infrastructure ------------ */
+#ifdef CONFIG_PCI_ERROR_RECOVERY
/** e1000_io_error_detected() is called when PCI error is detected */
static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
@@ -4456,5 +4461,6 @@
if(netif_running(netdev))
mod_timer(&adapter->watchdog_timer, jiffies);
}
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

/* e1000_main.c */
Index: linux-2.6.14-git10/drivers/net/ixgb/ixgb_main.c
===================================================================
--- linux-2.6.14-git10.orig/drivers/net/ixgb/ixgb_main.c 2005-11-07 17:44:50.380554424 -0600
+++ linux-2.6.14-git10/drivers/net/ixgb/ixgb_main.c 2005-11-07 17:44:56.350715694 -0600
@@ -132,6 +132,7 @@
static void ixgb_netpoll(struct net_device *dev);
#endif

+#ifdef CONFIG_PCI_ERROR_RECOVERY
static pci_ers_result_t ixgb_io_error_detected (struct pci_dev *pdev, pci_channel_state_t state);
static pci_ers_result_t ixgb_io_slot_reset (struct pci_dev *pdev);
static void ixgb_io_resume (struct pci_dev *pdev);
@@ -141,6 +142,7 @@
.slot_reset = ixgb_io_slot_reset,
.resume = ixgb_io_resume,
};
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

/* Exported from other modules */

@@ -151,8 +153,9 @@
.id_table = ixgb_pci_tbl,
.probe = ixgb_probe,
.remove = __devexit_p(ixgb_remove),
+#ifdef CONFIG_PCI_ERROR_RECOVERY
.err_handler = &ixgb_err_handler,
-
+#endif /* CONFIG_PCI_ERROR_RECOVERY */
};

MODULE_AUTHOR("Intel Corporation, <[email protected]>");
@@ -2146,6 +2149,7 @@
#endif

/* -------------- PCI Error Recovery infrastructure ---------------- */
+#ifdef CONFIG_PCI_ERROR_RECOVERY
/** ixgb_io_error_detected() is called when PCI error is detected */
static pci_ers_result_t ixgb_io_error_detected (struct pci_dev *pdev, pci_channel_state_t state)
{
@@ -2210,5 +2214,6 @@
memset(&adapter->stats, 0, sizeof(struct ixgb_hw_stats));
ixgb_update_stats(adapter);
}
+#endif /* CONFIG_PCI_ERROR_RECOVERY */

/* ixgb_main.c */

2005-11-09 00:12:07

by Stephen Hemminger

[permalink] [raw]

Subject: Re: [PATCH 1/7] PCI Error Recovery: header file patch

>
> +/** The pci_channel state describes connectivity between the CPU and
> + * the pci device. If some PCI bus between here and the pci device
> + * has crashed or locked up, this info is reflected here.
> + */
> +typedef int __bitwise pci_channel_state_t;

Bit operations should be on unsigned not signed value.

--
Stephen Hemminger <[email protected]>
OSDL http://developer.osdl.org/~shemminger

2005-11-14 22:49:25

by Greg KH

[permalink] [raw]

Subject: Re: [PATCH 1/7] PCI Error Recovery: header file patch

On Tue, Nov 08, 2005 at 04:11:58PM -0800, Stephen Hemminger wrote:
>
> >
> > +/** The pci_channel state describes connectivity between the CPU and
> > + * the pci device. If some PCI bus between here and the pci device
> > + * has crashed or locked up, this info is reflected here.
> > + */
> > +typedef int __bitwise pci_channel_state_t;
>
> Bit operations should be on unsigned not signed value.

Agreed. I'll wait for Linas to respin these.

thanks,

greg k-h

2005-11-15 16:57:09

by linas

[permalink] [raw]

Subject: Re: [PATCH 0/7] PCI Error Recovery

On Tue, Nov 08, 2005 at 05:49:11PM -0600, linas was heard to remark:
>
> Following seven patches implement the PCI error reporting and recovery
> header and device driver changes as recently discussed, w/all requested
> changes & etc. These are tested and work well. Please apply.

These patches don't seem to be in either linux-2.6.15-rc1-git2 or linux-2.6.15-mm2

Is there something else I need to do, besides nag?

--linas

2005-11-15 17:02:48

by Greg KH

[permalink] [raw]

Subject: Re: [PATCH 0/7] PCI Error Recovery

On Mon, Nov 14, 2005 at 03:47:03PM -0600, linas wrote:
> On Tue, Nov 08, 2005 at 05:49:11PM -0600, linas was heard to remark:
> >
> > Following seven patches implement the PCI error reporting and recovery
> > header and device driver changes as recently discussed, w/all requested
> > changes & etc. These are tested and work well. Please apply.
>
> These patches don't seem to be in either linux-2.6.15-rc1-git2 or linux-2.6.15-mm2
>
> Is there something else I need to do, besides nag?

Address the issue that was brought up on lkml with them?

thanks,

greg k-h

2005-11-15 17:59:38

by linas

[permalink] [raw]

Subject: Re: [PATCH 0/7] PCI Error Recovery

On Tue, Nov 15, 2005 at 08:49:01AM -0800, Greg KH was heard to remark:
> On Mon, Nov 14, 2005 at 03:47:03PM -0600, linas wrote:
> > On Tue, Nov 08, 2005 at 05:49:11PM -0600, linas was heard to remark:
> > >
> > > Following seven patches implement the PCI error reporting and recovery
> > > header and device driver changes as recently discussed, w/all requested
> > > changes & etc. These are tested and work well. Please apply.
> >
> > These patches don't seem to be in either linux-2.6.15-rc1-git2 or linux-2.6.15-mm2
> >
> > Is there something else I need to do, besides nag?
>
> Address the issue that was brought up on lkml with them?

? I'm sorry, I'm crawling the archives, and can't find any threads
that haven't already been addressed in the final patchset.

--linas

2005-11-15 21:56:17

by Paul Mackerras

[permalink] [raw]

Subject: Re: [PATCH 0/7] PCI Error Recovery

linas writes:

> ? I'm sorry, I'm crawling the archives, and can't find any threads
> that haven't already been addressed in the final patchset.

I think someone wanted you to make the bitwise thing an unsigned int
rather than an int. I don't remember any other changes being
requested, if someone did want something, hopefully they'll chime in
and remind us. :)

Paul.

2005-11-16 00:54:32

by linas

[permalink] [raw]

Subject: Re: [PATCH 0/7] PCI Error Recovery

On Wed, Nov 16, 2005 at 08:56:12AM +1100, Paul Mackerras was heard to remark:
> linas writes:
>
> > ? I'm sorry, I'm crawling the archives, and can't find any threads
> > that haven't already been addressed in the final patchset.
>
> I think someone wanted you to make the bitwise thing an unsigned int
> rather than an int.

Oh right. I replied off-list. Teach me to go off list.

--linas

2005-11-16 23:10:59

by linas

[permalink] [raw]

Subject: Re: [PATCH 1/7] PCI Error Recovery: header file patch

Greg, Please apply. This has been modified to use unsigned ints
per discussion.

--linas

--------

PCI Error Recovery: header file patch

Various PCI bus errors can be signaled by newer PCI controllers. Recovering
from those errors requires an infrastructure to notify affected device drivers
of the error, and a way of walking through a reset sequence. This patch adds
a set of callbacks to be used by error recovery routines to notify device
drivers of the various stages of recovery.

Signed-off-by: Linas Vepstas <[email protected]>

--
Index: linux-2.6.14-git10/include/linux/pci.h
===================================================================
--- linux-2.6.14-git10.orig/include/linux/pci.h 2005-11-07 17:24:23.048968436 -0600
+++ linux-2.6.14-git10/include/linux/pci.h 2005-11-07 17:42:46.026024245 -0600
@@ -78,6 +78,23 @@
#define PCI_UNKNOWN ((pci_power_t __force) 5)
#define PCI_POWER_ERROR ((pci_power_t __force) -1)

+/** The pci_channel state describes connectivity between the CPU and
+ * the pci device. If some PCI bus between here and the pci device
+ * has crashed or locked up, this info is reflected here.
+ */
+typedef unsigned int __bitwise pci_channel_state_t;
+
+enum pci_channel_state {
+ /* I/O channel is in normal state */
+ pci_channel_io_normal = (__force pci_channel_state_t) 1,
+
+ /* I/O to channel is blocked */
+ pci_channel_io_frozen = (__force pci_channel_state_t) 2,
+
+ /* PCI card is dead */
+ pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
+};
+
/*
* The pci_dev structure is used to describe PCI devices.
*/
@@ -110,6 +127,7 @@
this is D0-D3, D0 being fully functional,
and D3 being off. */

+ pci_channel_state_t error_state; /* current connectivity state */
struct device dev; /* Generic device interface */

/* device is compatible with these IDs */
@@ -232,6 +250,54 @@
unsigned int use_driver_data:1; /* pci_driver->driver_data is used */
};

+/* ---------------------------------------------------------------- */
+/** PCI Error Recovery System (PCI-ERS). If a PCI device driver provides
+ * a set of callbacks in struct pci_error_handlers, then that device driver
+ * will be notified of PCI bus errors, and will be driven to recovery
+ * when an error occurs.
+ */
+
+typedef unsigned int __bitwise pci_ers_result_t;
+
+enum pci_ers_result {
+ /* no result/none/not supported in device driver */
+ PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
+
+ /* Device driver can recover without slot reset */
+ PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
+
+ /* Device driver wants slot to be reset. */
+ PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
+
+ /* Device has completely failed, is unrecoverable */
+ PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4,
+
+ /* Device driver is fully recovered and operational */
+ PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5,
+};
+
+/* PCI bus error event callbacks */
+struct pci_error_handlers
+{
+ /* PCI bus error detected on this device */
+ pci_ers_result_t (*error_detected)(struct pci_dev *dev,
+ enum pci_channel_state error);
+
+ /* MMIO has been re-enabled, but not DMA */
+ pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
+
+ /* PCI Express link has been reset */
+ pci_ers_result_t (*link_reset)(struct pci_dev *dev);
+
+ /* PCI slot has been reset */
+ pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
+
+ /* Device driver may resume normal operations */
+ void (*resume)(struct pci_dev *dev);
+};
+
+/* ---------------------------------------------------------------- */
+
struct module;
struct pci_driver {
struct list_head node;
@@ -245,6 +311,7 @@
int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); /* Enable wake event */
void (*shutdown) (struct pci_dev *dev);

+ struct pci_error_handlers *err_handler;
struct device_driver driver;
struct pci_dynids dynids;
};
_______________________________________________

2005-11-17 23:59:49

by Greg KH

[permalink] [raw]

Subject: Re: [PATCH 1/7] PCI Error Recovery: header file patch

On Wed, Nov 16, 2005 at 05:10:41PM -0600, linas wrote:
>
> Greg, Please apply. This has been modified to use unsigned ints
> per discussion.

Ok, I've added this one now, and dropped the previous two I had. Can
you bounce me the other 6 patches in the series, I dropped them from my
inbox a while ago.

thanks,

greg k-h