2007-05-14 23:37:47

by linas

[permalink] [raw]
Subject: [PATCH 1/2] s2io: add PCI error recovery support


This patch adds PCI error recovery support to the
s2io 10-Gigabit ethernet device driver. Third revision,
blocks interrupts and the watchdog.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <[email protected]>
Acked-by: Ramkrishna Vepa <[email protected]>
Cc: Raghavendra Koushik <[email protected]>
Cc: Wen Xiong <[email protected]>

----

Jeff, please apply for 2.6.22

--linas


drivers/net/s2io.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++---
drivers/net/s2io.h | 5 ++
2 files changed, 116 insertions(+), 5 deletions(-)

Index: linux-2.6.22-rc1/drivers/net/s2io.c
===================================================================
--- linux-2.6.22-rc1.orig/drivers/net/s2io.c 2007-05-14 17:05:03.000000000 -0500
+++ linux-2.6.22-rc1/drivers/net/s2io.c 2007-05-14 17:23:45.000000000 -0500
@@ -480,11 +480,18 @@ static struct pci_device_id s2io_tbl[] _

MODULE_DEVICE_TABLE(pci, s2io_tbl);

+static struct pci_error_handlers s2io_err_handler = {
+ .error_detected = s2io_io_error_detected,
+ .slot_reset = s2io_io_slot_reset,
+ .resume = s2io_io_resume,
+};
+
static struct pci_driver s2io_driver = {
.name = "S2IO",
.id_table = s2io_tbl,
.probe = s2io_init_nic,
.remove = __devexit_p(s2io_rem_nic),
+ .err_handler = &s2io_err_handler,
};

/* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -2700,6 +2707,9 @@ static void s2io_netpoll(struct net_devi
u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
int i;

+ if (pci_channel_offline(nic->pdev))
+ return;
+
disable_irq(dev->irq);

atomic_inc(&nic->isr_cnt);
@@ -3225,6 +3235,8 @@ static void alarm_intr_handler(struct s2
int i;
if (atomic_read(&nic->card_state) == CARD_DOWN)
return;
+ if (pci_channel_offline(nic->pdev))
+ return;
nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
/* Handling the XPAK counters update */
if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count < 72000) {
@@ -4324,6 +4336,10 @@ static irqreturn_t s2io_isr(int irq, voi
struct mac_info *mac_control;
struct config_param *config;

+ /* Pretend we handled any irq's from a disconnected card */
+ if (pci_channel_offline(sp->pdev))
+ return IRQ_NONE;
+
atomic_inc(&sp->isr_cnt);
mac_control = &sp->mac_control;
config = &sp->config;
@@ -6579,7 +6595,7 @@ static void s2io_rem_isr(struct s2io_nic
} while(cnt < 5);
}

-static void s2io_card_down(struct s2io_nic * sp)
+static void do_s2io_card_down(struct s2io_nic * sp, int do_io)
{
int cnt = 0;
struct XENA_dev_config __iomem *bar0 = sp->bar0;
@@ -6594,7 +6610,8 @@ static void s2io_card_down(struct s2io_n
atomic_set(&sp->card_state, CARD_DOWN);

/* disable Tx and Rx traffic on the NIC */
- stop_nic(sp);
+ if (do_io)
+ stop_nic(sp);

s2io_rem_isr(sp);

@@ -6602,7 +6619,7 @@ static void s2io_card_down(struct s2io_n
tasklet_kill(&sp->task);

/* Check if the device is Quiescent and then Reset the NIC */
- do {
+ while(do_io) {
/* As per the HW requirement we need to replenish the
* receive buffer to avoid the ring bump. Since there is
* no intention of processing the Rx frame at this pointwe are
@@ -6627,8 +6644,9 @@ static void s2io_card_down(struct s2io_n
(unsigned long long) val64);
break;
}
- } while (1);
- s2io_reset(sp);
+ }
+ if (do_io)
+ s2io_reset(sp);

spin_lock_irqsave(&sp->tx_lock, flags);
/* Free all Tx buffers */
@@ -6643,6 +6661,11 @@ static void s2io_card_down(struct s2io_n
clear_bit(0, &(sp->link_state));
}

+static void s2io_card_down(struct s2io_nic * sp)
+{
+ do_s2io_card_down(sp, 1);
+}
+
static int s2io_card_up(struct s2io_nic * sp)
{
int i, ret = 0;
@@ -8020,3 +8043,86 @@ static void lro_append_pkt(struct s2io_n
sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
return;
}
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct s2io_nic *sp = netdev->priv;
+
+ netif_device_detach(netdev);
+
+ if (netif_running(netdev)) {
+ /* Bring down the card, while avoiding PCI I/O */
+ do_s2io_card_down(sp, 0);
+ sp->device_close_flag = TRUE; /* Device is shut down. */
+ }
+ pci_disable_device(pdev);
+
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ * At this point, the card has exprienced a hard reset,
+ * followed by fixups by BIOS, and has its config space
+ * set up identically to what it was at cold boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct s2io_nic *sp = netdev->priv;
+
+ if (pci_enable_device(pdev)) {
+ printk(KERN_ERR "s2io: "
+ "Cannot re-enable PCI device after reset.\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ pci_set_master(pdev);
+ s2io_reset(sp);
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells
+ * us that its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev)
+{
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct s2io_nic *sp = netdev->priv;
+
+ if (netif_running(netdev)) {
+ if (s2io_card_up(sp)) {
+ printk(KERN_ERR "s2io: "
+ "Can't bring device back up after reset.\n");
+ return;
+ }
+
+ if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) {
+ s2io_card_down(sp);
+ printk(KERN_ERR "s2io: "
+ "Can't resetore mac addr after reset.\n");
+ return;
+ }
+ }
+
+ netif_device_attach(netdev);
+ netif_wake_queue(netdev);
+}
Index: linux-2.6.22-rc1/drivers/net/s2io.h
===================================================================
--- linux-2.6.22-rc1.orig/drivers/net/s2io.h 2007-05-14 17:05:03.000000000 -0500
+++ linux-2.6.22-rc1/drivers/net/s2io.h 2007-05-14 17:23:45.000000000 -0500
@@ -1052,6 +1052,11 @@ static void lro_append_pkt(struct s2io_n
struct sk_buff *skb, u32 tcp_len);
static int rts_ds_steer(struct s2io_nic *nic, u8 ds_codepoint, u8 ring);

+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
+static void s2io_io_resume(struct pci_dev *pdev);
+
#define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
#define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
#define s2io_offload_type(skb) skb_shinfo(skb)->gso_type


2007-05-14 23:41:57

by linas

[permalink] [raw]
Subject: Re: [PATCH 2/2] s2io: add PCI error recovery support


s2io cleanup suggestions, per discussion on mailing lists.

Signed-off-by: Linas Vepstas <[email protected]>

----
drivers/net/s2io.c | 2 --
drivers/net/s2io.h | 1 -
2 files changed, 3 deletions(-)

Index: linux-2.6.22-rc1/drivers/net/s2io.c
===================================================================
--- linux-2.6.22-rc1.orig/drivers/net/s2io.c 2007-05-14 17:23:45.000000000 -0500
+++ linux-2.6.22-rc1/drivers/net/s2io.c 2007-05-14 17:24:03.000000000 -0500
@@ -3980,7 +3980,6 @@ static int s2io_close(struct net_device
/* Reset card, kill tasklet and free Tx and Rx buffers. */
s2io_card_down(sp);

- sp->device_close_flag = TRUE; /* Device is shut down. */
return 0;
}

@@ -8063,7 +8062,6 @@ static pci_ers_result_t s2io_io_error_de
if (netif_running(netdev)) {
/* Bring down the card, while avoiding PCI I/O */
do_s2io_card_down(sp, 0);
- sp->device_close_flag = TRUE; /* Device is shut down. */
}
pci_disable_device(pdev);

Index: linux-2.6.22-rc1/drivers/net/s2io.h
===================================================================
--- linux-2.6.22-rc1.orig/drivers/net/s2io.h 2007-05-14 17:23:45.000000000 -0500
+++ linux-2.6.22-rc1/drivers/net/s2io.h 2007-05-14 17:24:03.000000000 -0500
@@ -794,7 +794,6 @@ struct s2io_nic {

struct net_device_stats stats;
int high_dma_flag;
- int device_close_flag;
int device_enabled_once;

char name[60];

2007-05-14 23:43:19

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2] s2io: add PCI error recovery support


I failed to cc some of the people on the cc list ... so am resending.
--linas

On Mon, May 14, 2007 at 06:37:30PM -0500, Linas Vepstas wrote:
>
> This patch adds PCI error recovery support to the
> s2io 10-Gigabit ethernet device driver. Third revision,
> blocks interrupts and the watchdog.
>
> Tested, seems to work well.
>
> Signed-off-by: Linas Vepstas <[email protected]>
> Acked-by: Ramkrishna Vepa <[email protected]>
> Cc: Raghavendra Koushik <[email protected]>
> Cc: Wen Xiong <[email protected]>
>
> ----
>
> Jeff, please apply for 2.6.22
>
> --linas
>
>
> drivers/net/s2io.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++---
> drivers/net/s2io.h | 5 ++
> 2 files changed, 116 insertions(+), 5 deletions(-)
>
> Index: linux-2.6.22-rc1/drivers/net/s2io.c
> ===================================================================
> --- linux-2.6.22-rc1.orig/drivers/net/s2io.c 2007-05-14 17:05:03.000000000 -0500
> +++ linux-2.6.22-rc1/drivers/net/s2io.c 2007-05-14 17:23:45.000000000 -0500
> @@ -480,11 +480,18 @@ static struct pci_device_id s2io_tbl[] _
>
> MODULE_DEVICE_TABLE(pci, s2io_tbl);
>
> +static struct pci_error_handlers s2io_err_handler = {
> + .error_detected = s2io_io_error_detected,
> + .slot_reset = s2io_io_slot_reset,
> + .resume = s2io_io_resume,
> +};
> +
> static struct pci_driver s2io_driver = {
> .name = "S2IO",
> .id_table = s2io_tbl,
> .probe = s2io_init_nic,
> .remove = __devexit_p(s2io_rem_nic),
> + .err_handler = &s2io_err_handler,
> };
>
> /* A simplifier macro used both by init and free shared_mem Fns(). */
> @@ -2700,6 +2707,9 @@ static void s2io_netpoll(struct net_devi
> u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
> int i;
>
> + if (pci_channel_offline(nic->pdev))
> + return;
> +
> disable_irq(dev->irq);
>
> atomic_inc(&nic->isr_cnt);
> @@ -3225,6 +3235,8 @@ static void alarm_intr_handler(struct s2
> int i;
> if (atomic_read(&nic->card_state) == CARD_DOWN)
> return;
> + if (pci_channel_offline(nic->pdev))
> + return;
> nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
> /* Handling the XPAK counters update */
> if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count < 72000) {
> @@ -4324,6 +4336,10 @@ static irqreturn_t s2io_isr(int irq, voi
> struct mac_info *mac_control;
> struct config_param *config;
>
> + /* Pretend we handled any irq's from a disconnected card */
> + if (pci_channel_offline(sp->pdev))
> + return IRQ_NONE;
> +
> atomic_inc(&sp->isr_cnt);
> mac_control = &sp->mac_control;
> config = &sp->config;
> @@ -6579,7 +6595,7 @@ static void s2io_rem_isr(struct s2io_nic
> } while(cnt < 5);
> }
>
> -static void s2io_card_down(struct s2io_nic * sp)
> +static void do_s2io_card_down(struct s2io_nic * sp, int do_io)
> {
> int cnt = 0;
> struct XENA_dev_config __iomem *bar0 = sp->bar0;
> @@ -6594,7 +6610,8 @@ static void s2io_card_down(struct s2io_n
> atomic_set(&sp->card_state, CARD_DOWN);
>
> /* disable Tx and Rx traffic on the NIC */
> - stop_nic(sp);
> + if (do_io)
> + stop_nic(sp);
>
> s2io_rem_isr(sp);
>
> @@ -6602,7 +6619,7 @@ static void s2io_card_down(struct s2io_n
> tasklet_kill(&sp->task);
>
> /* Check if the device is Quiescent and then Reset the NIC */
> - do {
> + while(do_io) {
> /* As per the HW requirement we need to replenish the
> * receive buffer to avoid the ring bump. Since there is
> * no intention of processing the Rx frame at this pointwe are
> @@ -6627,8 +6644,9 @@ static void s2io_card_down(struct s2io_n
> (unsigned long long) val64);
> break;
> }
> - } while (1);
> - s2io_reset(sp);
> + }
> + if (do_io)
> + s2io_reset(sp);
>
> spin_lock_irqsave(&sp->tx_lock, flags);
> /* Free all Tx buffers */
> @@ -6643,6 +6661,11 @@ static void s2io_card_down(struct s2io_n
> clear_bit(0, &(sp->link_state));
> }
>
> +static void s2io_card_down(struct s2io_nic * sp)
> +{
> + do_s2io_card_down(sp, 1);
> +}
> +
> static int s2io_card_up(struct s2io_nic * sp)
> {
> int i, ret = 0;
> @@ -8020,3 +8043,86 @@ static void lro_append_pkt(struct s2io_n
> sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
> return;
> }
> +
> +/**
> + * s2io_io_error_detected - called when PCI error is detected
> + * @pdev: Pointer to PCI device
> + * @state: The current pci conneection state
> + *
> + * This function is called after a PCI bus error affecting
> + * this device has been detected.
> + */
> +static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
> + pci_channel_state_t state)
> +{
> + struct net_device *netdev = pci_get_drvdata(pdev);
> + struct s2io_nic *sp = netdev->priv;
> +
> + netif_device_detach(netdev);
> +
> + if (netif_running(netdev)) {
> + /* Bring down the card, while avoiding PCI I/O */
> + do_s2io_card_down(sp, 0);
> + sp->device_close_flag = TRUE; /* Device is shut down. */
> + }
> + pci_disable_device(pdev);
> +
> + return PCI_ERS_RESULT_NEED_RESET;
> +}
> +
> +/**
> + * s2io_io_slot_reset - called after the pci bus has been reset.
> + * @pdev: Pointer to PCI device
> + *
> + * Restart the card from scratch, as if from a cold-boot.
> + * At this point, the card has exprienced a hard reset,
> + * followed by fixups by BIOS, and has its config space
> + * set up identically to what it was at cold boot.
> + */
> +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev)
> +{
> + struct net_device *netdev = pci_get_drvdata(pdev);
> + struct s2io_nic *sp = netdev->priv;
> +
> + if (pci_enable_device(pdev)) {
> + printk(KERN_ERR "s2io: "
> + "Cannot re-enable PCI device after reset.\n");
> + return PCI_ERS_RESULT_DISCONNECT;
> + }
> +
> + pci_set_master(pdev);
> + s2io_reset(sp);
> +
> + return PCI_ERS_RESULT_RECOVERED;
> +}
> +
> +/**
> + * s2io_io_resume - called when traffic can start flowing again.
> + * @pdev: Pointer to PCI device
> + *
> + * This callback is called when the error recovery driver tells
> + * us that its OK to resume normal operation.
> + */
> +static void s2io_io_resume(struct pci_dev *pdev)
> +{
> + struct net_device *netdev = pci_get_drvdata(pdev);
> + struct s2io_nic *sp = netdev->priv;
> +
> + if (netif_running(netdev)) {
> + if (s2io_card_up(sp)) {
> + printk(KERN_ERR "s2io: "
> + "Can't bring device back up after reset.\n");
> + return;
> + }
> +
> + if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) {
> + s2io_card_down(sp);
> + printk(KERN_ERR "s2io: "
> + "Can't resetore mac addr after reset.\n");
> + return;
> + }
> + }
> +
> + netif_device_attach(netdev);
> + netif_wake_queue(netdev);
> +}
> Index: linux-2.6.22-rc1/drivers/net/s2io.h
> ===================================================================
> --- linux-2.6.22-rc1.orig/drivers/net/s2io.h 2007-05-14 17:05:03.000000000 -0500
> +++ linux-2.6.22-rc1/drivers/net/s2io.h 2007-05-14 17:23:45.000000000 -0500
> @@ -1052,6 +1052,11 @@ static void lro_append_pkt(struct s2io_n
> struct sk_buff *skb, u32 tcp_len);
> static int rts_ds_steer(struct s2io_nic *nic, u8 ds_codepoint, u8 ring);
>
> +static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
> + pci_channel_state_t state);
> +static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev);
> +static void s2io_io_resume(struct pci_dev *pdev);
> +
> #define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size
> #define s2io_udp_mss(skb) skb_shinfo(skb)->gso_size
> #define s2io_offload_type(skb) skb_shinfo(skb)->gso_type

2007-05-18 07:21:21

by Sivakumar Subramani

[permalink] [raw]
Subject: RE: [PATCH 1/2] s2io: add PCI error recovery support

Hi,

Fix looks good. I have couple of comments,

1) Return from s2io_updt_stats function if the PCI bus is offline
(pci_channel_offline).
if (pci_channel_offline(pdev))
return;

2) No Need to call netif_wake_queue() in s2io_io_resume as
netif_device_attach() will take care of calling it.

3) We need to return from s2io_msix_ring_handle(),
s2io_msix_fifo_handle(), s2io_msi_handle() if the PCI channel is
offline. Some thing similary to the below condition that is added in the
s2io_isr()

-4117,6 +4129,10 @@ static irqreturn_t s2io_isr(int irq, voi
struct mac_info *mac_control;
struct config_param *config;

+ /* Pretend we handled any irq's from a disconnected card */
+ if (pci_channel_offline(sp->pdev))
+ return IRQ_NONE;

Thanks,
~Siva

-----Original Message-----
From: [email protected] [mailto:[email protected]]
On Behalf Of Linas Vepstas
Sent: Tuesday, May 15, 2007 5:08 AM
To: Jeff Garzik
Cc: [email protected]; [email protected];
[email protected]; [email protected]
Subject: [PATCH 1/2] s2io: add PCI error recovery support


This patch adds PCI error recovery support to the s2io 10-Gigabit
ethernet device driver. Third revision, blocks interrupts and the
watchdog.

Tested, seems to work well.

Signed-off-by: Linas Vepstas <[email protected]>
Acked-by: Ramkrishna Vepa <[email protected]>
Cc: Raghavendra Koushik <[email protected]>
Cc: Wen Xiong <[email protected]>

----

Jeff, please apply for 2.6.22

--linas


drivers/net/s2io.c | 116
++++++++++++++++++++++++++++++++++++++++++++++++++---
drivers/net/s2io.h | 5 ++
2 files changed, 116 insertions(+), 5 deletions(-)

Index: linux-2.6.22-rc1/drivers/net/s2io.c
===================================================================
--- linux-2.6.22-rc1.orig/drivers/net/s2io.c 2007-05-14
17:05:03.000000000 -0500
+++ linux-2.6.22-rc1/drivers/net/s2io.c 2007-05-14 17:23:45.000000000
-0500
@@ -480,11 +480,18 @@ static struct pci_device_id s2io_tbl[] _

MODULE_DEVICE_TABLE(pci, s2io_tbl);

+static struct pci_error_handlers s2io_err_handler = {
+ .error_detected = s2io_io_error_detected,
+ .slot_reset = s2io_io_slot_reset,
+ .resume = s2io_io_resume,
+};
+
static struct pci_driver s2io_driver = {
.name = "S2IO",
.id_table = s2io_tbl,
.probe = s2io_init_nic,
.remove = __devexit_p(s2io_rem_nic),
+ .err_handler = &s2io_err_handler,
};

/* A simplifier macro used both by init and free shared_mem Fns(). */
@@ -2700,6 +2707,9 @@ static void s2io_netpoll(struct net_devi
u64 val64 = 0xFFFFFFFFFFFFFFFFULL;
int i;

+ if (pci_channel_offline(nic->pdev))
+ return;
+
disable_irq(dev->irq);

atomic_inc(&nic->isr_cnt);
@@ -3225,6 +3235,8 @@ static void alarm_intr_handler(struct s2
int i;
if (atomic_read(&nic->card_state) == CARD_DOWN)
return;
+ if (pci_channel_offline(nic->pdev))
+ return;
nic->mac_control.stats_info->sw_stat.ring_full_cnt = 0;
/* Handling the XPAK counters update */
if(nic->mac_control.stats_info->xpak_stat.xpak_timer_count <
72000) { @@ -4324,6 +4336,10 @@ static irqreturn_t s2io_isr(int irq, voi
struct mac_info *mac_control;
struct config_param *config;

+ /* Pretend we handled any irq's from a disconnected card */
+ if (pci_channel_offline(sp->pdev))
+ return IRQ_NONE;
+
atomic_inc(&sp->isr_cnt);
mac_control = &sp->mac_control;
config = &sp->config;
@@ -6579,7 +6595,7 @@ static void s2io_rem_isr(struct s2io_nic
} while(cnt < 5);
}

-static void s2io_card_down(struct s2io_nic * sp)
+static void do_s2io_card_down(struct s2io_nic * sp, int do_io)
{
int cnt = 0;
struct XENA_dev_config __iomem *bar0 = sp->bar0; @@ -6594,7
+6610,8 @@ static void s2io_card_down(struct s2io_n
atomic_set(&sp->card_state, CARD_DOWN);

/* disable Tx and Rx traffic on the NIC */
- stop_nic(sp);
+ if (do_io)
+ stop_nic(sp);

s2io_rem_isr(sp);

@@ -6602,7 +6619,7 @@ static void s2io_card_down(struct s2io_n
tasklet_kill(&sp->task);

/* Check if the device is Quiescent and then Reset the NIC */
- do {
+ while(do_io) {
/* As per the HW requirement we need to replenish the
* receive buffer to avoid the ring bump. Since there is
* no intention of processing the Rx frame at this
pointwe are @@ -6627,8 +6644,9 @@ static void s2io_card_down(struct
s2io_n
(unsigned long long) val64);
break;
}
- } while (1);
- s2io_reset(sp);
+ }
+ if (do_io)
+ s2io_reset(sp);

spin_lock_irqsave(&sp->tx_lock, flags);
/* Free all Tx buffers */
@@ -6643,6 +6661,11 @@ static void s2io_card_down(struct s2io_n
clear_bit(0, &(sp->link_state));
}

+static void s2io_card_down(struct s2io_nic * sp) {
+ do_s2io_card_down(sp, 1);
+}
+
static int s2io_card_up(struct s2io_nic * sp) {
int i, ret = 0;
@@ -8020,3 +8043,86 @@ static void lro_append_pkt(struct s2io_n
sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
return;
}
+
+/**
+ * s2io_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci conneection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t
+state) {
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct s2io_nic *sp = netdev->priv;
+
+ netif_device_detach(netdev);
+
+ if (netif_running(netdev)) {
+ /* Bring down the card, while avoiding PCI I/O */
+ do_s2io_card_down(sp, 0);
+ sp->device_close_flag = TRUE; /* Device is shut down.
*/
+ }
+ pci_disable_device(pdev);
+
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * s2io_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ * At this point, the card has exprienced a hard reset,
+ * followed by fixups by BIOS, and has its config space
+ * set up identically to what it was at cold boot.
+ */
+static pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev) {
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct s2io_nic *sp = netdev->priv;
+
+ if (pci_enable_device(pdev)) {
+ printk(KERN_ERR "s2io: "
+ "Cannot re-enable PCI device after reset.\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ pci_set_master(pdev);
+ s2io_reset(sp);
+
+ return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * s2io_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells
+ * us that its OK to resume normal operation.
+ */
+static void s2io_io_resume(struct pci_dev *pdev) {
+ struct net_device *netdev = pci_get_drvdata(pdev);
+ struct s2io_nic *sp = netdev->priv;
+
+ if (netif_running(netdev)) {
+ if (s2io_card_up(sp)) {
+ printk(KERN_ERR "s2io: "
+ "Can't bring device back up after
reset.\n");
+ return;
+ }
+
+ if (s2io_set_mac_addr(netdev, netdev->dev_addr) ==
FAILURE) {
+ s2io_card_down(sp);
+ printk(KERN_ERR "s2io: "
+ "Can't resetore mac addr after
reset.\n");
+ return;
+ }
+ }
+
+ netif_device_attach(netdev);
+ netif_wake_queue(netdev);
+}
Index: linux-2.6.22-rc1/drivers/net/s2io.h
===================================================================
--- linux-2.6.22-rc1.orig/drivers/net/s2io.h 2007-05-14
17:05:03.000000000 -0500
+++ linux-2.6.22-rc1/drivers/net/s2io.h 2007-05-14 17:23:45.000000000
-0500
@@ -1052,6 +1052,11 @@ static void lro_append_pkt(struct s2io_n
struct sk_buff *skb, u32 tcp_len); static
int rts_ds_steer(struct s2io_nic *nic, u8 ds_codepoint, u8 ring);

+static pci_ers_result_t s2io_io_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t
state); static
+pci_ers_result_t s2io_io_slot_reset(struct pci_dev *pdev); static void
+s2io_io_resume(struct pci_dev *pdev);
+
#define s2io_tcp_mss(skb) skb_shinfo(skb)->gso_size #define
s2io_udp_mss(skb) skb_shinfo(skb)->gso_size #define
s2io_offload_type(skb) skb_shinfo(skb)->gso_type

2007-05-21 17:08:20

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2] s2io: add PCI error recovery support

On Fri, May 18, 2007 at 03:06:53AM -0400, Sivakumar Subramani wrote:
> Hi,
>
> Fix looks good. I have couple of comments,
>
> 1) Return from s2io_updt_stats function if the PCI bus is offline
> (pci_channel_offline).
> if (pci_channel_offline(pdev))
> return;

OK, missed that one.

> 2) No Need to call netif_wake_queue() in s2io_io_resume as
> netif_device_attach() will take care of calling it.

OK.

> 3) We need to return from s2io_msix_ring_handle(),
> s2io_msix_fifo_handle(), s2io_msi_handle() if the PCI channel is
> offline. Some thing similary to the below condition that is added in the
> s2io_isr()

I was thinking that these wouldn't be needed, as MSI should stop
flowing along with everything else. However, there is a small race
window where the MSI was delivered on the PCI bus, then the error was
detected, and then the kernel delivers the MSI to device driver.
The if statements should close that gap.

I'll send an updated patch shortly.

--linas