2005-09-01 05:42:56

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

This patch implements ia64-specific IOCHK interfaces that enable
PCI drivers to detect errors and make their error handling easier.

Please refer to the archives if you need more background, e.g. http://lwn.net/Articles/139240/

Thanks,
H.Seto

Signed-off-by: Hidetoshi Seto <[email protected]>

---
arch/ia64/Kconfig | 13 +++
arch/ia64/kernel/mca.c | 34 ++++++++
arch/ia64/kernel/mca_asm.S | 32 ++++++++
arch/ia64/kernel/mca_drv.c | 85 ++++++++++++++++++++++
arch/ia64/lib/Makefile | 1
arch/ia64/lib/iomap_check.c | 168 ++++++++++++++++++++++++++++++++++++++++++++
include/asm-ia64/io.h | 139 ++++++++++++++++++++++++++++++++++++
7 files changed, 472 insertions(+)

Index: linux-2.6.13/arch/ia64/lib/Makefile
===================================================================
--- linux-2.6.13.orig/arch/ia64/lib/Makefile
+++ linux-2.6.13/arch/ia64/lib/Makefile
@@ -16,6 +16,7 @@ lib-$(CONFIG_MCKINLEY) += copy_page_mck.
lib-$(CONFIG_PERFMON) += carta_random.o
lib-$(CONFIG_MD_RAID5) += xor.o
lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+lib-$(CONFIG_IOMAP_CHECK) += iomap_check.o

AFLAGS___divdi3.o =
AFLAGS___udivdi3.o = -DUNSIGNED
Index: linux-2.6.13/arch/ia64/Kconfig
===================================================================
--- linux-2.6.13.orig/arch/ia64/Kconfig
+++ linux-2.6.13/arch/ia64/Kconfig
@@ -399,6 +399,19 @@ config PCI_DOMAINS
bool
default PCI

+config IOMAP_CHECK
+ bool "Support iochk interfaces for IO error detection."
+ depends on PCI && EXPERIMENTAL
+ ---help---
+ Saying Y provides iochk infrastructure for "RAS-aware" drivers
+ to detect and recover from some IO errors, which is strongly
+ required by some very-high-reliability systems.
+ The implementation of this infrastructure is highly dependent on
+ the arch, bus system, chipset and so on.
+ Currently, very few drivers or architectures implement this support.
+
+ If you don't know what to do here, say N.
+
source "drivers/pci/Kconfig"

source "drivers/pci/hotplug/Kconfig"
Index: linux-2.6.13/arch/ia64/lib/iomap_check.c
===================================================================
--- /dev/null
+++ linux-2.6.13/arch/ia64/lib/iomap_check.c
@@ -0,0 +1,168 @@
+/*
+ * File: iomap_check.c
+ * Purpose: Implement the IA64 specific iomap recovery interfaces
+ */
+
+#include <linux/pci.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+void iochk_init(void);
+void iochk_clear(iocookie *cookie, struct pci_dev *dev);
+int iochk_read(iocookie *cookie);
+
+struct list_head iochk_devices;
+DEFINE_RWLOCK(iochk_lock); /* all works are excluded on this lock */
+
+static struct pci_dev *search_host_bridge(struct pci_dev *dev);
+static int have_error(struct pci_dev *dev);
+
+void notify_bridge_error(struct pci_dev *bridge);
+void clear_bridge_error(struct pci_dev *bridge);
+void save_bridge_error(void);
+
+void iochk_init(void)
+{
+ /* setup */
+ INIT_LIST_HEAD(&iochk_devices);
+}
+
+void iochk_clear(iocookie *cookie, struct pci_dev *dev)
+{
+ unsigned long flag;
+
+ INIT_LIST_HEAD(&(cookie->list));
+
+ cookie->dev = dev;
+ cookie->host = search_host_bridge(dev);
+
+ write_lock_irqsave(&iochk_lock, flag);
+ if (cookie->host && have_error(cookie->host)) {
+ /* someone under my bridge causes error... */
+ notify_bridge_error(cookie->host);
+ clear_bridge_error(cookie->host);
+ }
+ list_add(&cookie->list, &iochk_devices);
+ write_unlock_irqrestore(&iochk_lock, flag);
+
+ cookie->error = 0;
+}
+
+int iochk_read(iocookie *cookie)
+{
+ unsigned long flag;
+ int ret = 0;
+
+ write_lock_irqsave(&iochk_lock, flag);
+ if ( cookie->error || have_error(cookie->dev)
+ || (cookie->host && have_error(cookie->host)) )
+ ret = 1;
+ list_del(&cookie->list);
+ write_unlock_irqrestore(&iochk_lock, flag);
+
+ return ret;
+}
+
+struct pci_dev *search_host_bridge(struct pci_dev *dev)
+{
+ struct pci_bus *pbus;
+
+ /* there is no bridge */
+ if (!dev->bus->self)
+ return NULL;
+
+ /* find root bus bridge */
+ for (pbus = dev->bus; pbus->parent && pbus->parent->self;
+ pbus = pbus->parent);
+
+ return pbus->self;
+}
+
+static int have_error(struct pci_dev *dev)
+{
+ u16 status;
+
+ /* check status */
+ switch (dev->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL: /* 0 */
+ pci_read_config_word(dev, PCI_STATUS, &status);
+ break;
+ case PCI_HEADER_TYPE_BRIDGE: /* 1 */
+ pci_read_config_word(dev, PCI_SEC_STATUS, &status);
+ break;
+ case PCI_HEADER_TYPE_CARDBUS: /* 2 */
+ return 0; /* FIX ME */
+ default:
+ BUG();
+ }
+
+ if ( (status & PCI_STATUS_REC_TARGET_ABORT)
+ || (status & PCI_STATUS_REC_MASTER_ABORT)
+ || (status & PCI_STATUS_DETECTED_PARITY) )
+ return 1;
+
+ return 0;
+}
+
+void notify_bridge_error(struct pci_dev *bridge)
+{
+ iocookie *cookie;
+
+ if (list_empty(&iochk_devices))
+ return;
+
+ /* notify error to all transactions using this host bridge */
+ if (!bridge) {
+ /* global notify, ex. MCA */
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ cookie->error = 1;
+ }
+ } else {
+ /* local notify, ex. Parity, Abort etc. */
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ if (cookie->host == bridge)
+ cookie->error = 1;
+ }
+ }
+}
+
+void clear_bridge_error(struct pci_dev *bridge)
+{
+ u16 status = ( PCI_STATUS_REC_TARGET_ABORT
+ | PCI_STATUS_REC_MASTER_ABORT
+ | PCI_STATUS_DETECTED_PARITY );
+
+ /* clear bridge status */
+ switch (bridge->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL: /* 0 */
+ pci_write_config_word(bridge, PCI_STATUS, status);
+ break;
+ case PCI_HEADER_TYPE_BRIDGE: /* 1 */
+ pci_write_config_word(bridge, PCI_SEC_STATUS, status);
+ break;
+ case PCI_HEADER_TYPE_CARDBUS: /* 2 */
+ default:
+ BUG();
+ }
+}
+
+void save_bridge_error(void)
+{
+ iocookie *cookie;
+
+ if (list_empty(&iochk_devices))
+ return;
+
+ /* mark devices if its root bus bridge have errors */
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ if (cookie->error)
+ continue;
+ if (have_error(cookie->host))
+ notify_bridge_error(cookie->host);
+ }
+}
+
+EXPORT_SYMBOL(iochk_lock);
+EXPORT_SYMBOL(iochk_read);
+EXPORT_SYMBOL(iochk_clear);
+EXPORT_SYMBOL(iochk_devices); /* for MCA driver */
Index: linux-2.6.13/include/asm-ia64/io.h
===================================================================
--- linux-2.6.13.orig/include/asm-ia64/io.h
+++ linux-2.6.13/include/asm-ia64/io.h
@@ -70,6 +70,26 @@ extern unsigned int num_io_spaces;
#include <asm/machvec.h>
#include <asm/page.h>
#include <asm/system.h>
+
+#ifdef CONFIG_IOMAP_CHECK
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/* ia64 iocookie */
+typedef struct {
+ struct list_head list;
+ struct pci_dev *dev; /* target device */
+ struct pci_dev *host; /* hosting bridge */
+ unsigned long error; /* error flag */
+} iocookie;
+
+extern rwlock_t iochk_lock; /* see arch/ia64/lib/iomap_check.c */
+
+/* Enable ia64 iochk - See arch/ia64/lib/iomap_check.c */
+#define HAVE_ARCH_IOMAP_CHECK
+
+#endif /* CONFIG_IOMAP_CHECK */
+
#include <asm-generic/iomap.h>

/*
@@ -164,14 +184,23 @@ __ia64_mk_io_addr (unsigned long port)
* during optimization, which is why we use "volatile" pointers.
*/

+#ifdef CONFIG_IOMAP_CHECK
+
+extern void ia64_mca_barrier(unsigned long value);
+
static inline unsigned int
___ia64_inb (unsigned long port)
{
volatile unsigned char *addr = __ia64_mk_io_addr(port);
unsigned char ret;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
ret = *addr;
__ia64_mf_a();
+ ia64_mca_barrier(ret);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
return ret;
}

@@ -180,9 +209,14 @@ ___ia64_inw (unsigned long port)
{
volatile unsigned short *addr = __ia64_mk_io_addr(port);
unsigned short ret;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
ret = *addr;
__ia64_mf_a();
+ ia64_mca_barrier(ret);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
return ret;
}

@@ -191,12 +225,54 @@ ___ia64_inl (unsigned long port)
{
volatile unsigned int *addr = __ia64_mk_io_addr(port);
unsigned int ret;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
ret = *addr;
__ia64_mf_a();
+ ia64_mca_barrier(ret);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
return ret;
}

+#else /* CONFIG_IOMAP_CHECK */
+
+static inline unsigned int
+___ia64_inb (unsigned long port)
+{
+ volatile unsigned char *addr = __ia64_mk_io_addr(port);
+ unsigned char ret;
+
+ ret = *addr;
+ __ia64_mf_a();
+ return ret;
+}
+
+static inline unsigned int
+___ia64_inw (unsigned long port)
+{
+ volatile unsigned short *addr = __ia64_mk_io_addr(port);
+ unsigned short ret;
+
+ ret = *addr;
+ __ia64_mf_a();
+ return ret;
+}
+
+static inline unsigned int
+___ia64_inl (unsigned long port)
+{
+ volatile unsigned int *addr = __ia64_mk_io_addr(port);
+ unsigned int ret;
+
+ ret = *addr;
+ __ia64_mf_a();
+ return ret;
+}
+
+#endif /* CONFIG_IOMAP_CHECK */
+
static inline void
___ia64_outb (unsigned char val, unsigned long port)
{
@@ -313,6 +389,67 @@ __outsl (unsigned long port, const void
* a good idea). Writes are ok though for all existing ia64 platforms (and
* hopefully it'll stay that way).
*/
+
+#ifdef CONFIG_IOMAP_CHECK
+
+static inline unsigned char
+___ia64_readb (const volatile void __iomem *addr)
+{
+ unsigned char val;
+ unsigned long flags;
+
+ read_lock_irqsave(&iochk_lock,flags);
+ val = *(volatile unsigned char __force *)addr;
+ ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
+ return val;
+}
+
+static inline unsigned short
+___ia64_readw (const volatile void __iomem *addr)
+{
+ unsigned short val;
+ unsigned long flags;
+
+ read_lock_irqsave(&iochk_lock,flags);
+ val = *(volatile unsigned short __force *)addr;
+ ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
+ return val;
+}
+
+static inline unsigned int
+___ia64_readl (const volatile void __iomem *addr)
+{
+ unsigned int val;
+ unsigned long flags;
+
+ read_lock_irqsave(&iochk_lock,flags);
+ val = *(volatile unsigned int __force *) addr;
+ ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
+ return val;
+}
+
+static inline unsigned long
+___ia64_readq (const volatile void __iomem *addr)
+{
+ unsigned long val;
+ unsigned long flags;
+
+ read_lock_irqsave(&iochk_lock,flags);
+ val = *(volatile unsigned long __force *) addr;
+ ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);
+
+ return val;
+}
+
+#else /* CONFIG_IOMAP_CHECK */
+
static inline unsigned char
___ia64_readb (const volatile void __iomem *addr)
{
@@ -337,6 +474,8 @@ ___ia64_readq (const volatile void __iom
return *(volatile unsigned long __force *) addr;
}

+#endif /* CONFIG_IOMAP_CHECK */
+
static inline void
__writeb (unsigned char val, volatile void __iomem *addr)
{
Index: linux-2.6.13/arch/ia64/kernel/mca.c
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca.c
+++ linux-2.6.13/arch/ia64/kernel/mca.c
@@ -77,6 +77,13 @@
#include <asm/irq.h>
#include <asm/hw_irq.h>

+#ifdef CONFIG_IOMAP_CHECK
+#include <linux/pci.h>
+extern void notify_bridge_error(struct pci_dev *bridge);
+extern void save_bridge_error(void);
+extern rwlock_t iochk_lock;
+#endif
+
#if defined(IA64_MCA_DEBUG_INFO)
# define IA64_MCA_DEBUG(fmt...) printk(fmt)
#else
@@ -283,11 +290,30 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
__FUNCTION__, cpe_irq, smp_processor_id());

+#ifndef CONFIG_IOMAP_CHECK
+
/* SAL spec states this should run w/ interrupts enabled */
local_irq_enable();

/* Get the CPE error record and log it */
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+#else
+ /*
+ * Because SAL_GET_STATE_INFO for CPE might clear bridge states
+ * in process of gathering error information from the system,
+ * we should check the states before clearing it.
+ * While OS and SAL are handling bridge status, we have to protect
+ * the states from changing by any other I/Os running simultaneously,
+ * so this should be handled w/ lock and interrupts disabled.
+ */
+ write_lock(&iochk_lock);
+ save_bridge_error();
+ ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+ write_unlock(&iochk_lock);
+
+ /* The rest can go w/ interrupts enabled as usual */
+ local_irq_enable();
+#endif

spin_lock(&cpe_history_lock);
if (!cpe_poll_enabled && cpe_vector >= 0) {
@@ -893,6 +919,14 @@ ia64_mca_ucmc_handler(void)
sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
rh->severity = sal_log_severity_corrected;
ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
+
+#ifdef CONFIG_IOMAP_CHECK
+ /*
+ * SAL already reads and clears error bits on bridge registers,
+ * so we should have all running transactions to retry.
+ */
+ notify_bridge_error(0);
+#endif
}
/*
* Wakeup all the processors which are spinning in the rendezvous
Index: linux-2.6.13/arch/ia64/kernel/mca_asm.S
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca_asm.S
+++ linux-2.6.13/arch/ia64/kernel/mca_asm.S
@@ -16,6 +16,9 @@
// 04/11/12 Russ Anderson <[email protected]>
// Added per cpu MCA/INIT stack save areas.
//
+// 05/08/08 Hidetoshi Seto <[email protected]>
+// Added ia64_mca_barrier.
+//
#include <linux/config.h>
#include <linux/threads.h>

@@ -944,3 +947,32 @@ END(ia64_monarch_init_handler)
GLOBAL_ENTRY(ia64_slave_init_handler)
1: br.sptk 1b
END(ia64_slave_init_handler)
+
+//EndInitHandlers//////////////////////////////////////////////////////////////
+
+#ifdef CONFIG_IOMAP_CHECK
+
+//
+// ia64_mca_barrier:
+//
+// Some I/O bridges may poison the data read, instead of signaling a BERR.
+// The consumption of poisoned data triggers an MCA, which tells us the
+// polluted address.
+// Note that the read operation by itself does not consume the bad data,
+// so we have to do something with it before it is too late.
+//
+// Calling this function forces a consumption of the passed value since
+// the compiler will have to copy it from whatever register it was in to
+// an "out" register to pass to the function.
+// To avoid possible optimization by compiler, and to make doubly sure,
+// this assembly clearly consumes the value by moving it to r8.
+//
+// In this way, the value is guaranteed to be secure, not poisoned, and sane.
+//
+
+GLOBAL_ENTRY(ia64_mca_barrier)
+ mov r8=r32
+ br.ret.sptk.many rp
+END(ia64_mca_barrier)
+
+#endif
Index: linux-2.6.13/arch/ia64/kernel/mca_drv.c
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca_drv.c
+++ linux-2.6.13/arch/ia64/kernel/mca_drv.c
@@ -35,6 +35,12 @@

#include "mca_drv.h"

+#ifdef CONFIG_IOMAP_CHECK
+#include <linux/pci.h>
+#include <asm/io.h>
+extern struct list_head iochk_devices;
+#endif
+
/* max size of SAL error record (default) */
static int sal_rec_max = 10000;

@@ -377,6 +383,79 @@ is_mca_global(peidx_table_t *peidx, pal_
return MCA_IS_GLOBAL;
}

+#ifdef CONFIG_IOMAP_CHECK
+
+/**
+ * get_target_identifier - get address of target_identifier
+ * @peidx: pointer of index of processor error section
+ *
+ * Return value:
+ * addr if valid / 0 if not valid
+ */
+static u64 get_target_identifier(peidx_table_t *peidx)
+{
+ sal_log_mod_error_info_t *smei;
+
+ smei = peidx_bus_check(peidx, 0);
+ if (smei->valid.target_identifier)
+ return (smei->target_identifier);
+ return 0;
+}
+
+/**
+ * offending_addr_in_check - Check if the addr is in checking resource.
+ * @addr: address offending this MCA
+ *
+ * Return value:
+ * 1 if in / 0 if out
+ */
+static int offending_addr_in_check(u64 addr)
+{
+ int i;
+ struct pci_dev *tdev;
+ iocookie *cookie;
+
+ if (list_empty(&iochk_devices))
+ return 0;
+
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ tdev = cookie->dev;
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ if (tdev->resource[i].start <= addr
+ && addr <= tdev->resource[i].end)
+ return 1;
+ if ((tdev->resource[i].flags
+ & (PCI_BASE_ADDRESS_SPACE|PCI_BASE_ADDRESS_MEM_TYPE_MASK))
+ == (PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64))
+ i++;
+ }
+ }
+ return 0;
+}
+
+/**
+ * iochk_error_recovery - Check if MCA occur on transaction in iochk.
+ * @peidx: pointer of index of processor error section
+ *
+ * Return value:
+ * 1 if the error can be caught in the driver / 0 if not
+ */
+static int iochk_error_recovery(peidx_table_t *peidx)
+{
+ u64 addr;
+
+ addr = get_target_identifier(peidx);
+ if (!addr)
+ return 0;
+
+ if (offending_addr_in_check(addr))
+ return 1;
+
+ return 0;
+}
+
+#endif /* CONFIG_IOMAP_CHECK */
+
/**
* recover_from_read_error - Try to recover the errors which type are "read"s.
* @slidx: pointer of index of SAL error record
@@ -445,6 +524,12 @@ recover_from_read_error(slidx_table_t *s

}

+#ifdef CONFIG_IOMAP_CHECK
+ /* Are there any RAS-aware drivers? */
+ if (iochk_error_recovery(peidx))
+ return 1;
+#endif
+
return 0;
}



2005-09-01 22:46:05

by Brent Casavant

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

On Thu, 1 Sep 2005, Hidetoshi Seto wrote:

> Index: linux-2.6.13/include/asm-ia64/io.h
> ===================================================================
> --- linux-2.6.13.orig/include/asm-ia64/io.h
> +++ linux-2.6.13/include/asm-ia64/io.h
> @@ -70,6 +70,26 @@ extern unsigned int num_io_spaces;
> #include <asm/machvec.h>
> #include <asm/page.h>
> #include <asm/system.h>
> +
> +#ifdef CONFIG_IOMAP_CHECK
> +#include <linux/list.h>
> +#include <linux/spinlock.h>
> +
> +/* ia64 iocookie */
> +typedef struct {
> + struct list_head list;
> + struct pci_dev *dev; /* target device */
> + struct pci_dev *host; /* hosting bridge */
> + unsigned long error; /* error flag */
> +} iocookie;
> +
> +extern rwlock_t iochk_lock; /* see arch/ia64/lib/iomap_check.c */
> +
> +/* Enable ia64 iochk - See arch/ia64/lib/iomap_check.c */
> +#define HAVE_ARCH_IOMAP_CHECK
> +
> +#endif /* CONFIG_IOMAP_CHECK */
> +
> #include <asm-generic/iomap.h>
>
> /*
> @@ -164,14 +184,23 @@ __ia64_mk_io_addr (unsigned long port)
> * during optimization, which is why we use "volatile" pointers.
> */
>
> +#ifdef CONFIG_IOMAP_CHECK
> +
> +extern void ia64_mca_barrier(unsigned long value);
> +
> static inline unsigned int
> ___ia64_inb (unsigned long port)
> {
> volatile unsigned char *addr = __ia64_mk_io_addr(port);
> unsigned char ret;
> + unsigned long flags;
>
> + read_lock_irqsave(&iochk_lock,flags);
> ret = *addr;
> __ia64_mf_a();
> + ia64_mca_barrier(ret);
> + read_unlock_irqrestore(&iochk_lock,flags);
> +
> return ret;
> }
>
> @@ -180,9 +209,14 @@ ___ia64_inw (unsigned long port)
> {
> volatile unsigned short *addr = __ia64_mk_io_addr(port);
> unsigned short ret;
> + unsigned long flags;
>
> + read_lock_irqsave(&iochk_lock,flags);
> ret = *addr;
> __ia64_mf_a();
> + ia64_mca_barrier(ret);
> + read_unlock_irqrestore(&iochk_lock,flags);
> +
> return ret;
> }
>
> @@ -191,12 +225,54 @@ ___ia64_inl (unsigned long port)
> {
> volatile unsigned int *addr = __ia64_mk_io_addr(port);
> unsigned int ret;
> + unsigned long flags;
>
> + read_lock_irqsave(&iochk_lock,flags);
> ret = *addr;
> __ia64_mf_a();
> + ia64_mca_barrier(ret);
> + read_unlock_irqrestore(&iochk_lock,flags);
> +
> return ret;
> }

I am extremely concerned about the performance implications of this
implementation. These changes have several deleterious effects on I/O
performance.

The first is serialization of all I/O reads and writes. This will
be a severe problem on systems with large numbers of PCI buses, the
very type of system that stands the most to gain in reliability from
these efforts. At a minimum any locking should be done on a per-bus
basis.

The second is the raw performance penalty from acquiring and dropping
a lock with every read and write. This will be a substantial amount
of activity for any I/O-intensive system, heck even for moderate I/O
levels.

The third is lock contention for this single lock -- I would fully expect
many dozens of processors to be performing I/O at any given time on
systems of interest, causing this to be a heavily contended lock.
This will be even more severe on NUMA systems, as the lock cacheline
bounces across the memory fabric. A per-bus lock would again be much
more appropriate.

The final problem is that these performance penalties are paid even
by drivers which are not IOCHK aware, which for the time being is
all of them. A reasonable solution must pay these penalties only
for drivers which are IOCHK aware. Reinstating the readX_check()
interface is the obvious solution here. It's simply too heavy a
performance burden to pay when almost no drivers currently benefit
from it.

Otherwise, I also wonder if you have any plans to handle similar
errors experienced under device-initiated DMA, or asynchronous I/O.
It's not clear that there's sufficient infrastructure in the current
patches to adequately handle those situations.

Thank you,
Brent Casavant

--
Brent Casavant If you had nothing to fear,
[email protected] how then could you be brave?
Silicon Graphics, Inc. -- Queen Dama, Source Wars

2005-09-02 10:32:36

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH 2.6.13 1/2] IOCHK interface for I/O error handling/detecting (for ia64)

This patch implements ia64-specific IOCHK interfaces that enable
PCI drivers to detect errors and make their error handling easier.

Please refer to the archives if you need more background, e.g. http://lwn.net/Articles/139240/

This is the former part of original patch, MCA related codes which enable
drivers to catch errors like parity errors.
Originally such a PCI-bus parity error becomes a fatal MCA and results in
the system going down shortly afterward. But this infrastructure allows
drivers to stop the OS from shutting down if the error was on the device's
resource, and to turn over the OS's responsibility to the RAS-aware drivers.

I'd like to merge this part into 2.6.13-rc1 even if the latter half isn't
accepted. This former half functions without the latter, and helps
realize effective recovery from MCA.

Tony, could you apply this part to your tree?

Thanks,
H.Seto

Signed-off-by: Hidetoshi Seto <[email protected]>

---

arch/ia64/Kconfig | 13 +++
arch/ia64/kernel/mca.c | 13 +++
arch/ia64/kernel/mca_asm.S | 32 +++++++++
arch/ia64/kernel/mca_drv.c | 85 ++++++++++++++++++++++++
arch/ia64/lib/Makefile | 1
arch/ia64/lib/iomap_check.c | 150 ++++++++++++++++++++++++++++++++++++++++++++
include/asm-ia64/io.h | 115 +++++++++++++++++++++++++++++++++
7 files changed, 409 insertions(+)

Index: linux-2.6.13/arch/ia64/lib/Makefile
===================================================================
--- linux-2.6.13.orig/arch/ia64/lib/Makefile
+++ linux-2.6.13/arch/ia64/lib/Makefile
@@ -16,6 +16,7 @@ lib-$(CONFIG_MCKINLEY) += copy_page_mck.
lib-$(CONFIG_PERFMON) += carta_random.o
lib-$(CONFIG_MD_RAID5) += xor.o
lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+lib-$(CONFIG_IOMAP_CHECK) += iomap_check.o

AFLAGS___divdi3.o =
AFLAGS___udivdi3.o = -DUNSIGNED
Index: linux-2.6.13/arch/ia64/Kconfig
===================================================================
--- linux-2.6.13.orig/arch/ia64/Kconfig
+++ linux-2.6.13/arch/ia64/Kconfig
@@ -399,6 +399,19 @@ config PCI_DOMAINS
bool
default PCI

+config IOMAP_CHECK
+ bool "Support iochk interfaces for IO error detection."
+ depends on PCI && EXPERIMENTAL
+ ---help---
+ Saying Y provides iochk infrastructure for "RAS-aware" drivers
+ to detect and recover from some IO errors, which is strongly
+ required by some very-high-reliability systems.
+ The implementation of this infrastructure is highly dependent on
+ the arch, bus system, chipset and so on.
+ Currently, very few drivers or architectures implement this support.
+
+ If you don't know what to do here, say N.
+
source "drivers/pci/Kconfig"

source "drivers/pci/hotplug/Kconfig"
Index: linux-2.6.13/arch/ia64/lib/iomap_check.c
===================================================================
--- /dev/null
+++ linux-2.6.13/arch/ia64/lib/iomap_check.c
@@ -0,0 +1,150 @@
+/*
+ * File: iomap_check.c
+ * Purpose: Implement the IA64 specific iomap recovery interfaces
+ */
+
+#include <linux/pci.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+void iochk_init(void);
+void iochk_clear(iocookie *cookie, struct pci_dev *dev);
+int iochk_read(iocookie *cookie);
+
+struct list_head iochk_devices;
+DEFINE_SPINLOCK(iochk_lock); /* all works are excluded on this lock */
+
+static struct pci_dev *search_host_bridge(struct pci_dev *dev);
+static int have_error(struct pci_dev *dev);
+
+void notify_bridge_error(struct pci_dev *bridge);
+void clear_bridge_error(struct pci_dev *bridge);
+
+void iochk_init(void)
+{
+ /* setup */
+ INIT_LIST_HEAD(&iochk_devices);
+}
+
+void iochk_clear(iocookie *cookie, struct pci_dev *dev)
+{
+ unsigned long flag;
+
+ INIT_LIST_HEAD(&(cookie->list));
+
+ cookie->dev = dev;
+ cookie->host = search_host_bridge(dev);
+
+ spin_lock_irqsave(&iochk_lock, flag);
+ if (cookie->host && have_error(cookie->host)) {
+ /* someone under my bridge causes error... */
+ notify_bridge_error(cookie->host);
+ clear_bridge_error(cookie->host);
+ }
+ list_add(&cookie->list, &iochk_devices);
+ spin_unlock_irqrestore(&iochk_lock, flag);
+
+ cookie->error = 0;
+}
+
+int iochk_read(iocookie *cookie)
+{
+ unsigned long flag;
+ int ret = 0;
+
+ spin_lock_irqsave(&iochk_lock, flag);
+ if ( cookie->error || have_error(cookie->dev)
+ || (cookie->host && have_error(cookie->host)) )
+ ret = 1;
+ list_del(&cookie->list);
+ spin_unlock_irqrestore(&iochk_lock, flag);
+
+ return ret;
+}
+
+struct pci_dev *search_host_bridge(struct pci_dev *dev)
+{
+ struct pci_bus *pbus;
+
+ /* there is no bridge */
+ if (!dev->bus->self)
+ return NULL;
+
+ /* find root bus bridge */
+ for (pbus = dev->bus; pbus->parent && pbus->parent->self;
+ pbus = pbus->parent);
+
+ return pbus->self;
+}
+
+static int have_error(struct pci_dev *dev)
+{
+ u16 status;
+
+ /* check status */
+ switch (dev->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL: /* 0 */
+ pci_read_config_word(dev, PCI_STATUS, &status);
+ break;
+ case PCI_HEADER_TYPE_BRIDGE: /* 1 */
+ pci_read_config_word(dev, PCI_SEC_STATUS, &status);
+ break;
+ case PCI_HEADER_TYPE_CARDBUS: /* 2 */
+ return 0; /* FIX ME */
+ default:
+ BUG();
+ }
+
+ if ( (status & PCI_STATUS_REC_TARGET_ABORT)
+ || (status & PCI_STATUS_REC_MASTER_ABORT)
+ || (status & PCI_STATUS_DETECTED_PARITY) )
+ return 1;
+
+ return 0;
+}
+
+void notify_bridge_error(struct pci_dev *bridge)
+{
+ iocookie *cookie;
+
+ if (list_empty(&iochk_devices))
+ return;
+
+ /* notify error to all transactions using this host bridge */
+ if (!bridge) {
+ /* global notify, ex. MCA */
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ cookie->error = 1;
+ }
+ } else {
+ /* local notify, ex. Parity, Abort etc. */
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ if (cookie->host == bridge)
+ cookie->error = 1;
+ }
+ }
+}
+
+void clear_bridge_error(struct pci_dev *bridge)
+{
+ u16 status = ( PCI_STATUS_REC_TARGET_ABORT
+ | PCI_STATUS_REC_MASTER_ABORT
+ | PCI_STATUS_DETECTED_PARITY );
+
+ /* clear bridge status */
+ switch (bridge->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL: /* 0 */
+ pci_write_config_word(bridge, PCI_STATUS, status);
+ break;
+ case PCI_HEADER_TYPE_BRIDGE: /* 1 */
+ pci_write_config_word(bridge, PCI_SEC_STATUS, status);
+ break;
+ case PCI_HEADER_TYPE_CARDBUS: /* 2 */
+ default:
+ BUG();
+ }
+}
+
+EXPORT_SYMBOL(iochk_read);
+EXPORT_SYMBOL(iochk_clear);
+EXPORT_SYMBOL(iochk_devices); /* for MCA driver */
Index: linux-2.6.13/include/asm-ia64/io.h
===================================================================
--- linux-2.6.13.orig/include/asm-ia64/io.h
+++ linux-2.6.13/include/asm-ia64/io.h
@@ -70,6 +70,23 @@ extern unsigned int num_io_spaces;
#include <asm/machvec.h>
#include <asm/page.h>
#include <asm/system.h>
+
+#ifdef CONFIG_IOMAP_CHECK
+#include <linux/list.h>
+
+/* ia64 iocookie */
+typedef struct {
+ struct list_head list;
+ struct pci_dev *dev; /* target device */
+ struct pci_dev *host; /* hosting bridge */
+ unsigned long error; /* error flag */
+} iocookie;
+
+/* Enable ia64 iochk - See arch/ia64/lib/iomap_check.c */
+#define HAVE_ARCH_IOMAP_CHECK
+
+#endif /* CONFIG_IOMAP_CHECK */
+
#include <asm-generic/iomap.h>

/*
@@ -164,6 +181,10 @@ __ia64_mk_io_addr (unsigned long port)
* during optimization, which is why we use "volatile" pointers.
*/

+#ifdef CONFIG_IOMAP_CHECK
+
+extern void ia64_mca_barrier(unsigned long value);
+
static inline unsigned int
___ia64_inb (unsigned long port)
{
@@ -172,6 +193,8 @@ ___ia64_inb (unsigned long port)

ret = *addr;
__ia64_mf_a();
+ ia64_mca_barrier(ret);
+
return ret;
}

@@ -183,6 +206,8 @@ ___ia64_inw (unsigned long port)

ret = *addr;
__ia64_mf_a();
+ ia64_mca_barrier(ret);
+
return ret;
}

@@ -194,9 +219,48 @@ ___ia64_inl (unsigned long port)

ret = *addr;
__ia64_mf_a();
+ ia64_mca_barrier(ret);
+
return ret;
}

+#else /* CONFIG_IOMAP_CHECK */
+
+static inline unsigned int
+___ia64_inb (unsigned long port)
+{
+ volatile unsigned char *addr = __ia64_mk_io_addr(port);
+ unsigned char ret;
+
+ ret = *addr;
+ __ia64_mf_a();
+ return ret;
+}
+
+static inline unsigned int
+___ia64_inw (unsigned long port)
+{
+ volatile unsigned short *addr = __ia64_mk_io_addr(port);
+ unsigned short ret;
+
+ ret = *addr;
+ __ia64_mf_a();
+ return ret;
+}
+
+static inline unsigned int
+___ia64_inl (unsigned long port)
+{
+ volatile unsigned int *addr = __ia64_mk_io_addr(port);
+ unsigned int ret;
+
+ ret = *addr;
+ __ia64_mf_a();
+ return ret;
+}
+
+#endif /* CONFIG_IOMAP_CHECK */
+
static inline void
___ia64_outb (unsigned char val, unsigned long port)
{
@@ -313,6 +377,55 @@ __outsl (unsigned long port, const void
* a good idea). Writes are ok though for all existing ia64 platforms (and
* hopefully it'll stay that way).
*/
+
+#ifdef CONFIG_IOMAP_CHECK
+
+static inline unsigned char
+___ia64_readb (const volatile void __iomem *addr)
+{
+ unsigned char val;
+
+ val = *(volatile unsigned char __force *)addr;
+ ia64_mca_barrier(val);
+
+ return val;
+}
+
+static inline unsigned short
+___ia64_readw (const volatile void __iomem *addr)
+{
+ unsigned short val;
+
+ val = *(volatile unsigned short __force *)addr;
+ ia64_mca_barrier(val);
+
+ return val;
+}
+
+static inline unsigned int
+___ia64_readl (const volatile void __iomem *addr)
+{
+ unsigned int val;
+
+ val = *(volatile unsigned int __force *) addr;
+ ia64_mca_barrier(val);
+
+ return val;
+}
+
+static inline unsigned long
+___ia64_readq (const volatile void __iomem *addr)
+{
+ unsigned long val;
+
+ val = *(volatile unsigned long __force *) addr;
+ ia64_mca_barrier(val);
+
+ return val;
+}
+
+#else /* CONFIG_IOMAP_CHECK */
+
static inline unsigned char
___ia64_readb (const volatile void __iomem *addr)
{
@@ -337,6 +450,8 @@ ___ia64_readq (const volatile void __iom
return *(volatile unsigned long __force *) addr;
}

+#endif /* CONFIG_IOMAP_CHECK */
+
static inline void
__writeb (unsigned char val, volatile void __iomem *addr)
{
Index: linux-2.6.13/arch/ia64/kernel/mca.c
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca.c
+++ linux-2.6.13/arch/ia64/kernel/mca.c
@@ -77,6 +77,11 @@
#include <asm/irq.h>
#include <asm/hw_irq.h>

+#ifdef CONFIG_IOMAP_CHECK
+#include <linux/pci.h>
+extern void notify_bridge_error(struct pci_dev *bridge);
+#endif
+
#if defined(IA64_MCA_DEBUG_INFO)
# define IA64_MCA_DEBUG(fmt...) printk(fmt)
#else
@@ -893,6 +898,14 @@ ia64_mca_ucmc_handler(void)
sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
rh->severity = sal_log_severity_corrected;
ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
+
+#ifdef CONFIG_IOMAP_CHECK
+ /*
+ * SAL already reads and clears error bits on bridge registers,
+ * so we should have all running transactions to retry.
+ */
+ notify_bridge_error(0);
+#endif
}
/*
* Wakeup all the processors which are spinning in the rendezvous
Index: linux-2.6.13/arch/ia64/kernel/mca_asm.S
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca_asm.S
+++ linux-2.6.13/arch/ia64/kernel/mca_asm.S
@@ -16,6 +16,9 @@
// 04/11/12 Russ Anderson <[email protected]>
// Added per cpu MCA/INIT stack save areas.
//
+// 05/08/08 Hidetoshi Seto <[email protected]>
+// Added ia64_mca_barrier.
+//
#include <linux/config.h>
#include <linux/threads.h>

@@ -944,3 +947,32 @@ END(ia64_monarch_init_handler)
GLOBAL_ENTRY(ia64_slave_init_handler)
1: br.sptk 1b
END(ia64_slave_init_handler)
+
+//EndInitHandlers//////////////////////////////////////////////////////////////
+
+#ifdef CONFIG_IOMAP_CHECK
+
+//
+// ia64_mca_barrier:
+//
+// Some I/O bridges may poison the data read, instead of signaling a BERR.
+// The consumption of poisoned data triggers an MCA, which tells us the
+// polluted address.
+// Note that the read operation by itself does not consume the bad data,
+// so we have to do something with it before it is too late.
+//
+// Calling this function forces a consumption of the passed value since
+// the compiler will have to copy it from whatever register it was in to
+// an "out" register to pass to the function.
+// To avoid possible optimization by compiler, and to make doubly sure,
+// this assembly clearly consumes the value by moving it to r8.
+//
+// In this way, the value is guaranteed secure, not poisoned, and sane.
+//
+
+GLOBAL_ENTRY(ia64_mca_barrier)
+ mov r8=r32
+ br.ret.sptk.many rp
+END(ia64_mca_barrier)
+
+#endif
Index: linux-2.6.13/arch/ia64/kernel/mca_drv.c
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca_drv.c
+++ linux-2.6.13/arch/ia64/kernel/mca_drv.c
@@ -35,6 +35,12 @@

#include "mca_drv.h"

+#ifdef CONFIG_IOMAP_CHECK
+#include <linux/pci.h>
+#include <asm/io.h>
+extern struct list_head iochk_devices;
+#endif
+
/* max size of SAL error record (default) */
static int sal_rec_max = 10000;

@@ -377,6 +383,79 @@ is_mca_global(peidx_table_t *peidx, pal_
return MCA_IS_GLOBAL;
}

+#ifdef CONFIG_IOMAP_CHECK
+
+/**
+ * get_target_identifier - get address of target_identifier
+ * @peidx: pointer of index of processor error section
+ *
+ * Return value:
+ * addr if valid / 0 if not valid
+ */
+static u64 get_target_identifier(peidx_table_t *peidx)
+{
+ sal_log_mod_error_info_t *smei;
+
+ smei = peidx_bus_check(peidx, 0);
+ if (smei->valid.target_identifier)
+ return (smei->target_identifier);
+ return 0;
+}
+
+/**
+ * offending_addr_in_check - Check if the addr is in checking resource.
+ * @addr: address offending this MCA
+ *
+ * Return value:
+ * 1 if in / 0 if out
+ */
+static int offending_addr_in_check(u64 addr)
+{
+ int i;
+ struct pci_dev *tdev;
+ iocookie *cookie;
+
+ if (list_empty(&iochk_devices))
+ return 0;
+
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ tdev = cookie->dev;
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ if (tdev->resource[i].start <= addr
+ && addr <= tdev->resource[i].end)
+ return 1;
+ if ((tdev->resource[i].flags
+ & (PCI_BASE_ADDRESS_SPACE|PCI_BASE_ADDRESS_MEM_TYPE_MASK))
+ == (PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64))
+ i++;
+ }
+ }
+ return 0;
+}
+
+/**
+ * iochk_error_recovery - Check if the MCA occurred on a transaction in iochk.
+ * @peidx: pointer of index of processor error section
+ *
+ * Return value:
+ *	1 if the error could be caught in the driver / 0 if not
+ */
+static int iochk_error_recovery(peidx_table_t *peidx)
+{
+ u64 addr;
+
+ addr = get_target_identifier(peidx);
+ if (!addr)
+ return 0;
+
+ if (offending_addr_in_check(addr))
+ return 1;
+
+ return 0;
+}
+
+#endif /* CONFIG_IOMAP_CHECK */
+
/**
* recover_from_read_error - Try to recover the errors which type are "read"s.
* @slidx: pointer of index of SAL error record
@@ -445,6 +524,12 @@ recover_from_read_error(slidx_table_t *s

}

+#ifdef CONFIG_IOMAP_CHECK
+ /* Are there any RAS-aware drivers? */
+ if (iochk_error_recovery(peidx))
+ return 1;
+#endif
+
return 0;
}



2005-09-02 10:33:13

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH 2.6.13 2/2] IOCHK interface for I/O error handling/detecting (for ia64)

This patch implements ia64-specific IOCHK interfaces that enable
PCI drivers to detect error and make their error handling easier.

Please refer archives if you need, e.g. http://lwn.net/Articles/139240/

This is the latter part of the original patch: CPE and SAL call related
code which prevents the host bridge status from being cleared unexpectedly.
Because SAL_GET_STATE_INFO for CPE might clear bridge states in the
process of gathering error information from the system, we should
check the states before clearing them.
While the OS and SAL are handling bridge status, we have to protect
the states from being changed by any other I/Os running simultaneously.
Conversely, we have to protect the status from being cleared
by such a SAL call before it is checked.

As pointed out by Brent, the implementation of this latter half wouldn't
be enough to use in huge boxes. Even so, this code still has
some benefit depending on the situation.

Comments, to this paranoia part, are welcomed.

Thanks,
H.Seto

Signed-off-by: Hidetoshi Seto <[email protected]>

---

arch/ia64/kernel/mca.c | 21 +++++++++++++++++++++
arch/ia64/lib/iomap_check.c | 28 +++++++++++++++++++++++-----
include/asm-ia64/io.h | 24 ++++++++++++++++++++++++
3 files changed, 68 insertions(+), 5 deletions(-)

Index: linux-2.6.13/arch/ia64/lib/iomap_check.c
===================================================================
--- linux-2.6.13.orig/arch/ia64/lib/iomap_check.c
+++ linux-2.6.13/arch/ia64/lib/iomap_check.c
@@ -12,13 +12,14 @@ void iochk_clear(iocookie *cookie, struc
int iochk_read(iocookie *cookie);

struct list_head iochk_devices;
-DEFINE_SPINLOCK(iochk_lock); /* all works are excluded on this lock */
+DEFINE_RWLOCK(iochk_lock); /* all works are excluded on this lock */

static struct pci_dev *search_host_bridge(struct pci_dev *dev);
static int have_error(struct pci_dev *dev);

void notify_bridge_error(struct pci_dev *bridge);
void clear_bridge_error(struct pci_dev *bridge);
+void save_bridge_error(void);

void iochk_init(void)
{
@@ -35,14 +36,14 @@ void iochk_clear(iocookie *cookie, struc
cookie->dev = dev;
cookie->host = search_host_bridge(dev);

- spin_lock_irqsave(&iochk_lock, flag);
+ write_lock_irqsave(&iochk_lock, flag);
if (cookie->host && have_error(cookie->host)) {
/* someone under my bridge causes error... */
notify_bridge_error(cookie->host);
clear_bridge_error(cookie->host);
}
list_add(&cookie->list, &iochk_devices);
- spin_unlock_irqrestore(&iochk_lock, flag);
+ write_unlock_irqrestore(&iochk_lock, flag);

cookie->error = 0;
}
@@ -52,12 +53,12 @@ int iochk_read(iocookie *cookie)
unsigned long flag;
int ret = 0;

- spin_lock_irqsave(&iochk_lock, flag);
+ write_lock_irqsave(&iochk_lock, flag);
if ( cookie->error || have_error(cookie->dev)
|| (cookie->host && have_error(cookie->host)) )
ret = 1;
list_del(&cookie->list);
- spin_unlock_irqrestore(&iochk_lock, flag);
+ write_unlock_irqrestore(&iochk_lock, flag);

return ret;
}
@@ -145,6 +146,23 @@ void clear_bridge_error(struct pci_dev *
}
}

+void save_bridge_error(void)
+{
+ iocookie *cookie;
+
+ if (list_empty(&iochk_devices))
+ return;
+
+	/* mark devices if their root bus bridge has errors */
+ list_for_each_entry(cookie, &iochk_devices, list) {
+ if (cookie->error)
+ continue;
+ if (have_error(cookie->host))
+ notify_bridge_error(cookie->host);
+ }
+}
+
+EXPORT_SYMBOL(iochk_lock);
EXPORT_SYMBOL(iochk_read);
EXPORT_SYMBOL(iochk_clear);
EXPORT_SYMBOL(iochk_devices); /* for MCA driver */
Index: linux-2.6.13/arch/ia64/kernel/mca.c
===================================================================
--- linux-2.6.13.orig/arch/ia64/kernel/mca.c
+++ linux-2.6.13/arch/ia64/kernel/mca.c
@@ -80,6 +80,8 @@
#ifdef CONFIG_IOMAP_CHECK
#include <linux/pci.h>
extern void notify_bridge_error(struct pci_dev *bridge);
+extern void save_bridge_error(void);
+extern rwlock_t iochk_lock;
#endif

#if defined(IA64_MCA_DEBUG_INFO)
@@ -288,11 +290,30 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
__FUNCTION__, cpe_irq, smp_processor_id());

+#ifndef CONFIG_IOMAP_CHECK
+
/* SAL spec states this should run w/ interrupts enabled */
local_irq_enable();

/* Get the CPE error record and log it */
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+#else
+ /*
+ * Because SAL_GET_STATE_INFO for CPE might clear bridge states
+ * in process of gathering error information from the system,
+	 * we should check the states before clearing them.
+	 * While OS and SAL are handling bridge status, we have to protect
+	 * the states from being changed by any other I/Os running simultaneously,
+ * so this should be handled w/ lock and interrupts disabled.
+ */
+ write_lock(&iochk_lock);
+ save_bridge_error();
+ ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+ write_unlock(&iochk_lock);
+
+	/* The rest can go w/ interrupts enabled as usual */
+ local_irq_enable();
+#endif

spin_lock(&cpe_history_lock);
if (!cpe_poll_enabled && cpe_vector >= 0) {
Index: linux-2.6.13/include/asm-ia64/io.h
===================================================================
--- linux-2.6.13.orig/include/asm-ia64/io.h
+++ linux-2.6.13/include/asm-ia64/io.h
@@ -73,6 +73,7 @@ extern unsigned int num_io_spaces;

#ifdef CONFIG_IOMAP_CHECK
#include <linux/list.h>
+#include <linux/spinlock.h>

/* ia64 iocookie */
typedef struct {
@@ -82,6 +83,8 @@ typedef struct {
unsigned long error; /* error flag */
} iocookie;

+extern rwlock_t iochk_lock; /* see arch/ia64/lib/iomap_check.c */
+
/* Enable ia64 iochk - See arch/ia64/lib/iomap_check.c */
#define HAVE_ARCH_IOMAP_CHECK

@@ -190,10 +193,13 @@ ___ia64_inb (unsigned long port)
{
volatile unsigned char *addr = __ia64_mk_io_addr(port);
unsigned char ret;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
ret = *addr;
__ia64_mf_a();
ia64_mca_barrier(ret);
+ read_unlock_irqrestore(&iochk_lock,flags);

return ret;
}
@@ -203,10 +209,13 @@ ___ia64_inw (unsigned long port)
{
volatile unsigned short *addr = __ia64_mk_io_addr(port);
unsigned short ret;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
ret = *addr;
__ia64_mf_a();
ia64_mca_barrier(ret);
+ read_unlock_irqrestore(&iochk_lock,flags);

return ret;
}
@@ -216,10 +225,13 @@ ___ia64_inl (unsigned long port)
{
volatile unsigned int *addr = __ia64_mk_io_addr(port);
unsigned int ret;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
ret = *addr;
__ia64_mf_a();
ia64_mca_barrier(ret);
+ read_unlock_irqrestore(&iochk_lock,flags);

return ret;
}
@@ -384,9 +396,12 @@ static inline unsigned char
___ia64_readb (const volatile void __iomem *addr)
{
unsigned char val;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
val = *(volatile unsigned char __force *)addr;
ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);

return val;
}
@@ -395,9 +410,12 @@ static inline unsigned short
___ia64_readw (const volatile void __iomem *addr)
{
unsigned short val;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
val = *(volatile unsigned short __force *)addr;
ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);

return val;
}
@@ -406,9 +424,12 @@ static inline unsigned int
___ia64_readl (const volatile void __iomem *addr)
{
unsigned int val;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
val = *(volatile unsigned int __force *) addr;
ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);

return val;
}
@@ -417,9 +438,12 @@ static inline unsigned long
___ia64_readq (const volatile void __iomem *addr)
{
unsigned long val;
+ unsigned long flags;

+ read_lock_irqsave(&iochk_lock,flags);
val = *(volatile unsigned long __force *) addr;
ia64_mca_barrier(val);
+ read_unlock_irqrestore(&iochk_lock,flags);

return val;
}


2005-09-02 10:32:24

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

Thank you for your comment, Brent.

Brent Casavant wrote:
> On Thu, 1 Sep 2005, Hidetoshi Seto wrote:
>> static inline unsigned int
>> ___ia64_inb (unsigned long port)
>> {
>> volatile unsigned char *addr = __ia64_mk_io_addr(port);
>> unsigned char ret;
>>+ unsigned long flags;
>>
>>+ read_lock_irqsave(&iochk_lock,flags);
>> ret = *addr;
>> __ia64_mf_a();
>>+ ia64_mca_barrier(ret);
>>+ read_unlock_irqrestore(&iochk_lock,flags);
>>+
>> return ret;
>> }
>
> I am extremely concerned about the performance implications of this
> implementation. These changes have several deleterious effects on I/O
> performance.

Always there is a trade-off between security and performance.
However, I know these are kinds of interfaces for paranoia.

It would help us if I divide this patch into 2 parts, MCA related part
and CPE related part. I'd appreciate it if you could find why I wrote
such crazy rwlock in the latter part and if you could help me with your
good sight. I'll divide them, please wait my next mails.

> The first is serialization of all I/O reads and writes. This will
> be a severe problem on systems with large numbers of PCI buses, the
> very type of system that stands the most to gain in reliability from
> these efforts. At a minimum any locking should be done on a per-bus
> basis.

Yes, there is a room for improvement about the lock granularity.
Maybe it should be done not on a per-bus but per-host, I think.

> The second is the raw performance penalty from acquiring and dropping
> a lock with every read and write. This will be a substantial amount
> of activity for any I/O-intensive system, heck even for moderate I/O
> levels.

Yes, but improbably some of paranoias accepts such unpleasant without
complaining... We could complain about the performance of RAID-5 disks.

> The third is lock contention for this single lock -- I would fully expect
> many dozens of processors to be performing I/O at any given time on
> systems of interest, causing this to be a heavily contended lock.
> This will be even more severe on NUMA systems, as the lock cacheline
> bounces across the memory fabric. A per-bus lock would again be much
> more appropriate.

Yes. This implementation (at least the rwlock) wouldn't fit to such
monster boxes. However the goal of this interface is definitely in
such hi-end world, so possible improvement should be taken in near
future.

> The final problem is that these performance penalties are paid even
> by drivers which are not IOCHK aware, which for the time being is
> all of them. A reasonable solution must pay these penalties only
> for drivers which are IOCHK aware. Reinstating the readX_check()
> interface is the obvious solution here. It's simply too heavy a
> performance burden to pay when almost no drivers currently benefit
> from it.

Mixing aware and non-aware would make both unhappy.
At least we need to make efforts to separate them and locate them under
different host.
* readX_check(): That was kicked out by Linus.

> Otherwise, I also wonder if you have any plans to handle similar
> errors experienced under device-initiated DMA, or asynchronous I/O.
> It's not clear that there's sufficient infrastructure in the current
> patches to adequately handle those situations.
>
> Thank you,
> Brent Casavant

Every improvements could be.

Requiring data integrity on device-initiated DMA or asynchronous I/O
isn't wrong thing. But I don't think that all errors should be handled
in one infrastructure. There is an another approach to PCI error
handling, asynchronous recovery which Linas Vepstas (IBM) working on,
so maybe both would be required to handle various situation.

Thanks,
H.Seto

2005-09-02 16:44:41

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

On Thu, Sep 01, 2005 at 05:45:54PM -0500, Brent Casavant wrote:
...
> The first is serialization of all I/O reads and writes. This will
> be a severe problem on systems with large numbers of PCI buses, the
> very type of system that stands the most to gain in reliability from
> these efforts. At a minimum any locking should be done on a per-bus
> basis.

The lock could be per "error domain" - that would require some
arch specific support though to define the scope of the "error domain".

> The second is the raw performance penalty from acquiring and dropping
> a lock with every read and write. This will be a substantial amount
> of activity for any I/O-intensive system, heck even for moderate I/O
> levels.

Sorry - I think this is BS.

Please run mmio_test on your box and share the results.
mmio_test is available here:
svn co http://svn.gnumonks.org/trunk/mmio_test/

Then we can talk about the cost of spinlocks vs cost of MMIO access.

thanks,
grant

2005-09-02 18:16:19

by David Mosberger-Tang

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

On 9/2/05, Grant Grundler <[email protected]> wrote:
> On Thu, Sep 01, 2005 at 05:45:54PM -0500, Brent Casavant wrote:
> ...
> > The first is serialization of all I/O reads and writes. This will
> > be a severe problem on systems with large numbers of PCI buses, the
> > very type of system that stands the most to gain in reliability from
> > these efforts. At a minimum any locking should be done on a per-bus
> > basis.
>
> The lock could be per "error domain" - that would require some
> arch specific support though to define the scope of the "error domain".

I do not think the basic inX/outX and readX/writeX operations should
involve spinlocks. That would be really nasty if an MCA/INIT handler
had to call them, for example...

> > The second is the raw performance penalty from acquiring and dropping
> > a lock with every read and write. This will be a substantial amount
> > of activity for any I/O-intensive system, heck even for moderate I/O
> > levels.
>
> Sorry - I think this is BS.
>
> Please run mmio_test on your box and share the results.
> mmio_test is available here:
> svn co http://svn.gnumonks.org/trunk/mmio_test/

Reads are slow, sure, but writes are not (or should not).

--david
--
Mosberger Consulting LLC, voice/fax: 510-744-9372,
http://www.mosberger-consulting.com/
35706 Runckel Lane, Fremont, CA 94536

2005-09-02 18:24:56

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

On Thu, Sep 01, 2005 at 05:45:54PM -0500, Brent Casavant wrote:
> I am extremely concerned about the performance implications of this
> implementation. These changes have several deleterious effects on I/O
> performance.

I agree. I think the iochk patches should be abandoned and the feature
reimplemented on top of the asynchronous PCI error notification patches
from Linas Vepstas.

2005-09-02 22:20:07

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

On Fri, Sep 02, 2005 at 11:16:10AM -0700, david mosberger wrote:
> > Sorry - I think this is BS.
> >
> > Please run mmio_test on your box and share the results.
> > mmio_test is available here:
> > svn co http://svn.gnumonks.org/trunk/mmio_test/
>
> Reads are slow, sure, but writes are not (or should not).

Sure, MMIO writes are generally posted. But those aren't always "free".
At some point, I expect MMIO reads typically will flush those writes
and thus stall until 2 (or more) PCI bus transactions complete.

ISTR locking around MMIO writes was necessary
to enforce synchronization of the error with the driver.
ISTR this synchronization was necessary. Was that wrong?

Complicating the MMIO perf picture are fabrics connecting the NUMA cells
which do NOT enforce MMIO ordering (e.g. Altix).
In that case, arch code will sometimes need to enforce the write ordering
by flushing MMIO writes before a driver releases a spinlock or other
syncronization primitive. This was discussed before and is archived in
the dialog between Jesse Barns and myself in late 2004 (IIRC).

In any case, mmio_test currently only tests MMIO read perf.
I need to think about how we might also test MMIO write perf.
Ie how much more expensive is MMIO read when it follows an MMIO write.


grant

2005-09-03 07:44:11

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH 2.6.13 1/2] IOCHK interface for I/O error handling/detecting (for ia64)

Oh my,

Hidetoshi Seto wrote:
> I'd like to merge this part into 2.6.13-rc1 even if the latter half isn't

This is typo, should be 2.6.14-rc1. :-p

Thanks,
H.Seto

2005-09-03 09:37:22

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH 2.6.13] IOCHK interface for I/O error handling/detecting (for ia64)

Matthew Wilcox wrote:
> On Thu, Sep 01, 2005 at 05:45:54PM -0500, Brent Casavant wrote:
>
>>I am extremely concerned about the performance implications of this
>>implementation. These changes have several deleterious effects on I/O
>>performance.
>
>
> I agree. I think the iochk patches should be abandoned and the feature
> reimplemented on top of the asynchronous PCI error notification patches
> from Linas Vepstas.

Do you mean that all architecture especially other than PPC64 already
have enough synchronization point or enough infrastructure to invoke
those notifiers effectively? I don't think so.

As far as I know, PPC64 is enveloped by a favorable situation.
At least in situation of readX and inX on PPC64, they already have
a error check point, and the EEH technology and the firmware make their
error recovery easier.
Because PPC64 firmware (or hardware? - I'm not sure) automatically detects
errors, isolates erroneous device and returns "(type)~0" to kernel,
readX/inX doesn't need to do any expensive thing unless it get "(type)~0."
Therefore PPC64 can have a nice chance to invoke notifiers by extending
the codes in the error check point.
It is clear that doing same thing on other architecture will be quite
painful and expensive.

Why don't you think that it would be useful if the error notifier
could be invoked from iochk_read()? I believe the iochk patches
will help implementing PCI error notification on not only IA64 but
also i386 and so on.
Or do you mean we would be happy if we all have a PPC64 box?


Thanks,
H.Seto