These patches reset PCIe devices at boot time to address DMA problem on
kdump with iommu. When "reset_devices" is specified, a hot reset is
triggered on each PCIe root port and downstream port to reset its
downstream endpoint.
Background:
A kdump problem about DMA has been discussed for a long time. That is,
when a kernel is switched to the kdump kernel DMA derived from first
kernel affects second kernel. Recently this problem surfaces when iommu
is used for PCI passthrough on KVM guest. In the case of the machine I
use, when intel_iommu=on is specified, DMAR error is detected in kdump
kernel and PCI SERR is also detected. Finally kdump fails because some
devices does not work correctly.
The root cause is that ongoing DMA from first kernel causes DMAR fault
because page table of DMAR is initialized while kdump kernel is booting
up. Therefore to address this problem DMA needs to be stopped before
DMAR is initialized at kdump kernel boot time. By these patches, PCIe
devices are reset by hot reset and its DMA is stopped when reset_devices
is specified. One problem of this solution is that the monitor blacks
out when VGA controller is reset. So this patch does not reset the port
whose child endpoint is VGA device.
What I tried:
- Clearing bus master bit and INTx disable bit at boot time
This did not solve this problem. I still got DMAR error on devices.
- Resetting devices in fixup_final(v1 patch)
DMAR error disappeared, but sometimes PCI SERR was detected.
This is well explained here.
https://lkml.org/lkml/2012/9/9/245
This PCI SERR seems to be related to interrupt remapping.
- Clearing bus master in setup_arch() and resetting devices in fixup_final
Neither DMAR error nor PCI SERR occurred. But on certain machine
kdump kernel hung up when resetting devices. It seems to be a
problem specific to the platform.
- Resetting devices in setup_arch() (v2 patch)
This solution solves all problems I found so far.
v3:
Move alloc_bootmem and free_bootmem to early_reset_pcie_devices so that
they are called only once.
v2:
Reset devices in setup_arch() because reset need to be done before
interrupt remapping is initialized.
https://lkml.org/lkml/2012/10/2/54
v1:
Add fixup_final quirk to reset PCIe devices
https://lkml.org/lkml/2012/8/3/160
Thanks,
Takao Indoh
This patch resets PCIe devices at boot time by hot reset when
"reset_devices" is specified.
Signed-off-by: Takao Indoh <[email protected]>
---
arch/x86/include/asm/pci-direct.h | 1
arch/x86/kernel/setup.c | 3
arch/x86/pci/early.c | 299 ++++++++++++++++++++++++++++
drivers/pci/pci.c | 18 -
include/linux/pci.h | 18 +
init/main.c | 4
6 files changed, 323 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
index b1e7a45..de30db2 100644
--- a/arch/x86/include/asm/pci-direct.h
+++ b/arch/x86/include/asm/pci-direct.h
@@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
extern unsigned int pci_early_dump_regs;
extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
extern void early_dump_pci_devices(void);
+extern void early_reset_pcie_devices(void);
#endif /* _ASM_X86_PCI_DIRECT_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f4b9b80..24b011c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -988,6 +988,9 @@ void __init setup_arch(char **cmdline_p)
generic_apic_probe();
early_quirks();
+#ifdef CONFIG_PCI
+ early_reset_pcie_devices();
+#endif
/*
* Read APIC and some other early information from ACPI tables.
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index d1067d5..584f16b 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -1,5 +1,6 @@
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <linux/bootmem.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
#include <asm/pci_x86.h>
@@ -109,3 +110,301 @@ void early_dump_pci_devices(void)
}
}
}
+
+struct save_config {
+ u32 pci[16];
+ u16 pcie[PCI_EXP_SAVE_REGS];
+};
+
+struct devinfo {
+ int pcie_pos; /* position of PCI Express capability */
+ int pcie_flags; /* PCI_EXP_FLAGS */
+ struct save_config *save;
+};
+
+static struct save_config *save_cfg;
+static void __init pci_udelay(int loops)
+{
+ while (loops--) {
+ /* Approximately 1 us */
+ native_io_delay();
+ }
+}
+
+/* Derived from drivers/pci/pci.c */
+#define PCI_FIND_CAP_TTL 48
+static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func,
+ u8 pos, int cap, int *ttl)
+{
+ u8 id;
+
+ while ((*ttl)--) {
+ pos = read_pci_config_byte(bus, slot, func, pos);
+ if (pos < 0x40)
+ break;
+ pos &= ~3;
+ id = read_pci_config_byte(bus, slot, func,
+ pos + PCI_CAP_LIST_ID);
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+ pos += PCI_CAP_LIST_NEXT;
+ }
+ return 0;
+}
+
+static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int cap)
+{
+ int ttl = PCI_FIND_CAP_TTL;
+
+ return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl);
+}
+
+static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func,
+ u8 hdr_type)
+{
+ u16 status;
+
+ status = read_pci_config_16(bus, slot, func, PCI_STATUS);
+ if (!(status & PCI_STATUS_CAP_LIST))
+ return 0;
+
+ switch (hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL:
+ case PCI_HEADER_TYPE_BRIDGE:
+ return PCI_CAPABILITY_LIST;
+ case PCI_HEADER_TYPE_CARDBUS:
+ return PCI_CB_CAPABILITY_LIST;
+ default:
+ return 0;
+ }
+
+ return 0;
+}
+
+static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int cap)
+{
+ int pos;
+ u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
+
+ pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f);
+ if (pos)
+ pos = __pci_find_next_cap(bus, slot, func, pos, cap);
+
+ return pos;
+}
+
+static void __init do_reset(u8 bus, u8 slot, u8 func)
+{
+ u16 ctrl;
+
+ printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
+
+ /* Assert Secondary Bus Reset */
+ ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
+ ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+ write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
+
+ pci_udelay(5000);
+
+ /* De-assert Secondary Bus Reset */
+ ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+ write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
+
+ pci_udelay(500000);
+}
+
+static void __init save_state(unsigned bus, unsigned slot, unsigned func,
+ struct devinfo *info)
+{
+ int i;
+ int pcie, flags, pcie_type;
+ struct save_config *save;
+
+ pcie = info->pcie_pos;
+ flags = info->pcie_flags;
+ pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
+ save = info->save;
+
+ printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func);
+
+ for (i = 0; i < 16; i++)
+ save->pci[i] = read_pci_config(bus, slot, func, i * 4);
+ i = 0;
+ if (pcie_cap_has_devctl(pcie_type, flags))
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_DEVCTL);
+ if (pcie_cap_has_lnkctl(pcie_type, flags))
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_LNKCTL);
+ if (pcie_cap_has_sltctl(pcie_type, flags))
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_SLTCTL);
+ if (pcie_cap_has_rtctl(pcie_type, flags))
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_RTCTL);
+
+ if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_DEVCTL2);
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_LNKCTL2);
+ save->pcie[i++] = read_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_SLTCTL2);
+ }
+}
+
+static void __init restore_state(unsigned bus, unsigned slot, unsigned func,
+ struct devinfo *info)
+{
+ int i = 0;
+ int pcie, flags, pcie_type;
+ struct save_config *save;
+
+ pcie = info->pcie_pos;
+ flags = info->pcie_flags;
+ pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
+ save = info->save;
+
+ printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n",
+ bus, slot, func);
+
+ if (pcie_cap_has_devctl(pcie_type, flags))
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_DEVCTL, save->pcie[i++]);
+ if (pcie_cap_has_lnkctl(pcie_type, flags))
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_LNKCTL, save->pcie[i++]);
+ if (pcie_cap_has_sltctl(pcie_type, flags))
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_SLTCTL, save->pcie[i++]);
+ if (pcie_cap_has_rtctl(pcie_type, flags))
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_RTCTL, save->pcie[i++]);
+
+ if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_DEVCTL2, save->pcie[i++]);
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_LNKCTL2, save->pcie[i++]);
+ write_pci_config_16(bus, slot, func,
+ pcie + PCI_EXP_SLTCTL2, save->pcie[i++]);
+ }
+
+ for (i = 15; i >= 0; i--)
+ write_pci_config(bus, slot, func, i * 4, save->pci[i]);
+}
+
+static void __init reset_pcie_device(unsigned bus, unsigned slot, unsigned func)
+{
+ int f, count;
+ int pcie, pcie_type;
+ u8 type;
+ u16 vendor, flags;
+ u32 class;
+ int secondary;
+ struct devinfo child[8];
+
+ pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP);
+ if (!pcie)
+ return;
+
+ flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS);
+ pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
+ if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) &&
+ (pcie_type != PCI_EXP_TYPE_DOWNSTREAM))
+ return;
+
+ type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
+ if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
+ return;
+ secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS);
+ memset(child, 0, sizeof(child));
+ for (count = 0, f = 0; f < 8; f++) {
+ vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID);
+ if (vendor == 0xffff)
+ continue;
+
+ pcie = early_pci_find_capability(secondary, 0, f,
+ PCI_CAP_ID_EXP);
+ if (!pcie)
+ continue;
+
+ flags = read_pci_config_16(secondary, 0, f,
+ pcie + PCI_EXP_FLAGS);
+ pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
+ if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) ||
+ (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE))
+ /* Don't reset switch, bridge */
+ return;
+
+ class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION);
+ if ((class >> 24) == PCI_BASE_CLASS_DISPLAY)
+ /* Don't reset VGA device */
+ return;
+
+ count++;
+ child[f].pcie_pos = pcie;
+ child[f].pcie_flags = flags;
+ child[f].save = save_cfg + f;
+ }
+
+ if (!count)
+ return;
+
+ /* save */
+ for (f = 0; f < 8; f++)
+ if (child[f].pcie_pos)
+ save_state(secondary, 0, f, &child[f]);
+
+ do_reset(bus, slot, func);
+
+ /* restore */
+ for (f = 0; f < 8; f++)
+ if (child[f].pcie_pos)
+ restore_state(secondary, 0, f, &child[f]);
+}
+
+void __init early_reset_pcie_devices(void)
+{
+ unsigned bus, slot, func;
+ int size;
+
+ if (!early_pci_allowed() || !reset_devices)
+ return;
+
+ /* alloc space to save config */
+ size = sizeof(struct save_config)*8;
+ save_cfg = (struct save_config *)alloc_bootmem(size);
+ if (save_cfg == NULL) {
+ printk(KERN_ERR "reset_pcie: alloc_bootmem failed\n");
+ return;
+ }
+
+ for (bus = 0; bus < 256; bus++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u16 vendor;
+ u8 type;
+ vendor = read_pci_config_16(bus, slot, func,
+ PCI_VENDOR_ID);
+
+ if (vendor == 0xffff)
+ continue;
+
+ reset_pcie_device(bus, slot, func);
+
+ if (func == 0) {
+ type = read_pci_config_byte(bus, slot,
+ func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+ }
+
+ free_bootmem(__pa(save_cfg), size);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index ab4bf5a..a7a4125 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -852,24 +852,6 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state)
EXPORT_SYMBOL(pci_choose_state);
-#define PCI_EXP_SAVE_REGS 7
-
-#define pcie_cap_has_devctl(type, flags) 1
-#define pcie_cap_has_lnkctl(type, flags) \
- ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
- (type == PCI_EXP_TYPE_ROOT_PORT || \
- type == PCI_EXP_TYPE_ENDPOINT || \
- type == PCI_EXP_TYPE_LEG_END))
-#define pcie_cap_has_sltctl(type, flags) \
- ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
- ((type == PCI_EXP_TYPE_ROOT_PORT) || \
- (type == PCI_EXP_TYPE_DOWNSTREAM && \
- (flags & PCI_EXP_FLAGS_SLOT))))
-#define pcie_cap_has_rtctl(type, flags) \
- ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
- (type == PCI_EXP_TYPE_ROOT_PORT || \
- type == PCI_EXP_TYPE_RC_EC))
-
static struct pci_cap_saved_state *pci_find_saved_cap(
struct pci_dev *pci_dev, char cap)
{
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 5faa831..8e10401 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1790,5 +1790,23 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
*/
struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
+#define PCI_EXP_SAVE_REGS 7
+
+#define pcie_cap_has_devctl(type, flags) 1
+#define pcie_cap_has_lnkctl(type, flags) \
+ ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
+ (type == PCI_EXP_TYPE_ROOT_PORT || \
+ type == PCI_EXP_TYPE_ENDPOINT || \
+ type == PCI_EXP_TYPE_LEG_END))
+#define pcie_cap_has_sltctl(type, flags) \
+ ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
+ ((type == PCI_EXP_TYPE_ROOT_PORT) || \
+ (type == PCI_EXP_TYPE_DOWNSTREAM && \
+ (flags & PCI_EXP_FLAGS_SLOT))))
+#define pcie_cap_has_rtctl(type, flags) \
+ ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
+ (type == PCI_EXP_TYPE_ROOT_PORT || \
+ type == PCI_EXP_TYPE_RC_EC))
+
#endif /* __KERNEL__ */
#endif /* LINUX_PCI_H */
diff --git a/init/main.c b/init/main.c
index b286730..ebaf067 100644
--- a/init/main.c
+++ b/init/main.c
@@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices);
static int __init set_reset_devices(char *str)
{
reset_devices = 1;
- return 1;
+ return 0;
}
-__setup("reset_devices", set_reset_devices);
+early_param("reset_devices", set_reset_devices);
static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
This patch enables INTx if MSI is disabled in pcibios_enable_device().
In normal case interrupt disable bit in command register is 0b on boot
time, but in case of kdump, this bit may be 1b. It causes problems of
some drivers. At leaset I confirmed mptsas driver does not work in such
a case. This patch fix this problem.
Signed-off-by: Takao Indoh <[email protected]>
---
arch/x86/pci/common.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 720e973..2bb7ecc 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -615,8 +615,10 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
if ((err = pci_enable_resources(dev, mask)) < 0)
return err;
- if (!pci_dev_msi_enabled(dev))
+ if (!pci_dev_msi_enabled(dev)) {
+ pci_intx(dev, true);
return pcibios_enable_irq(dev);
+ }
return 0;
}
Please see comments inline:
On Wed, 2012-10-10 at 16:51 +0900, Takao Indoh wrote:
> This patch resets PCIe devices at boot time by hot reset when
> "reset_devices" is specified.
>
>
> Signed-off-by: Takao Indoh <[email protected]>
> ---
> arch/x86/include/asm/pci-direct.h | 1
> arch/x86/kernel/setup.c | 3
> arch/x86/pci/early.c | 299 ++++++++++++++++++++++++++++
> drivers/pci/pci.c | 18 -
> include/linux/pci.h | 18 +
> init/main.c | 4
> 6 files changed, 323 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
> index b1e7a45..de30db2 100644
> --- a/arch/x86/include/asm/pci-direct.h
> +++ b/arch/x86/include/asm/pci-direct.h
> @@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
> extern unsigned int pci_early_dump_regs;
> extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
> extern void early_dump_pci_devices(void);
> +extern void early_reset_pcie_devices(void);
> #endif /* _ASM_X86_PCI_DIRECT_H */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index f4b9b80..24b011c 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -988,6 +988,9 @@ void __init setup_arch(char **cmdline_p)
> generic_apic_probe();
>
> early_quirks();
> +#ifdef CONFIG_PCI
> + early_reset_pcie_devices();
> +#endif
>
> /*
> * Read APIC and some other early information from ACPI tables.
> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
> index d1067d5..584f16b 100644
> --- a/arch/x86/pci/early.c
> +++ b/arch/x86/pci/early.c
> @@ -1,5 +1,6 @@
> #include <linux/kernel.h>
> #include <linux/pci.h>
> +#include <linux/bootmem.h>
> #include <asm/pci-direct.h>
> #include <asm/io.h>
> #include <asm/pci_x86.h>
> @@ -109,3 +110,301 @@ void early_dump_pci_devices(void)
> }
> }
> }
> +
> +struct save_config {
> + u32 pci[16];
> + u16 pcie[PCI_EXP_SAVE_REGS];
> +};
> +
> +struct devinfo {
> + int pcie_pos; /* position of PCI Express capability */
> + int pcie_flags; /* PCI_EXP_FLAGS */
> + struct save_config *save;
> +};
> +
> +static struct save_config *save_cfg;
> +static void __init pci_udelay(int loops)
> +{
> + while (loops--) {
> + /* Approximately 1 us */
> + native_io_delay();
> + }
> +}
> +
> +/* Derived from drivers/pci/pci.c */
> +#define PCI_FIND_CAP_TTL 48
> +static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func,
> + u8 pos, int cap, int *ttl)
> +{
> + u8 id;
> +
> + while ((*ttl)--) {
> + pos = read_pci_config_byte(bus, slot, func, pos);
> + if (pos < 0x40)
> + break;
> + pos &= ~3;
> + id = read_pci_config_byte(bus, slot, func,
> + pos + PCI_CAP_LIST_ID);
> + if (id == 0xff)
> + break;
> + if (id == cap)
> + return pos;
> + pos += PCI_CAP_LIST_NEXT;
> + }
> + return 0;
> +}
> +
> +static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int cap)
> +{
> + int ttl = PCI_FIND_CAP_TTL;
> +
> + return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl);
> +}
> +
> +static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func,
> + u8 hdr_type)
> +{
> + u16 status;
> +
> + status = read_pci_config_16(bus, slot, func, PCI_STATUS);
> + if (!(status & PCI_STATUS_CAP_LIST))
> + return 0;
> +
> + switch (hdr_type) {
> + case PCI_HEADER_TYPE_NORMAL:
> + case PCI_HEADER_TYPE_BRIDGE:
> + return PCI_CAPABILITY_LIST;
> + case PCI_HEADER_TYPE_CARDBUS:
> + return PCI_CB_CAPABILITY_LIST;
> + default:
> + return 0;
> + }
> +
> + return 0;
> +}
> +
> +static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int cap)
> +{
> + int pos;
> + u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> +
> + pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f);
> + if (pos)
> + pos = __pci_find_next_cap(bus, slot, func, pos, cap);
> +
> + return pos;
> +}
> +
> +static void __init do_reset(u8 bus, u8 slot, u8 func)
> +{
> + u16 ctrl;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
> +
> + /* Assert Secondary Bus Reset */
> + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
> + ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> + pci_udelay(5000);
> +
> + /* De-assert Secondary Bus Reset */
> + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> + pci_udelay(500000);
This is 0.5 second. This will add up quickly on larger servers with
multiple busses. Is 0.5 second required by the spec?
aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and
then waits another 200 ms after de-asserting it. Still long, but less
than half of the delay in above code..
> +}
> +
> +static void __init save_state(unsigned bus, unsigned slot, unsigned func,
> + struct devinfo *info)
> +{
> + int i;
> + int pcie, flags, pcie_type;
> + struct save_config *save;
> +
> + pcie = info->pcie_pos;
> + flags = info->pcie_flags;
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + save = info->save;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func);
> +
> + for (i = 0; i < 16; i++)
> + save->pci[i] = read_pci_config(bus, slot, func, i * 4);
> + i = 0;
> + if (pcie_cap_has_devctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL);
> + if (pcie_cap_has_lnkctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL);
> + if (pcie_cap_has_sltctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL);
> + if (pcie_cap_has_rtctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_RTCTL);
> +
> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL2);
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL2);
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL2);
> + }
> +}
> +
> +static void __init restore_state(unsigned bus, unsigned slot, unsigned func,
> + struct devinfo *info)
> +{
> + int i = 0;
> + int pcie, flags, pcie_type;
> + struct save_config *save;
> +
> + pcie = info->pcie_pos;
> + flags = info->pcie_flags;
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + save = info->save;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n",
> + bus, slot, func);
> +
> + if (pcie_cap_has_devctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL, save->pcie[i++]);
> + if (pcie_cap_has_lnkctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL, save->pcie[i++]);
> + if (pcie_cap_has_sltctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL, save->pcie[i++]);
> + if (pcie_cap_has_rtctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_RTCTL, save->pcie[i++]);
> +
> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL2, save->pcie[i++]);
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL2, save->pcie[i++]);
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL2, save->pcie[i++]);
> + }
> +
> + for (i = 15; i >= 0; i--)
> + write_pci_config(bus, slot, func, i * 4, save->pci[i]);
> +}
> +
> +static void __init reset_pcie_device(unsigned bus, unsigned slot, unsigned func)
> +{
> + int f, count;
> + int pcie, pcie_type;
> + u8 type;
> + u16 vendor, flags;
> + u32 class;
> + int secondary;
> + struct devinfo child[8];
> +
> + pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP);
> + if (!pcie)
> + return;
> +
> + flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS);
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) &&
> + (pcie_type != PCI_EXP_TYPE_DOWNSTREAM))
> + return;
> +
> + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> + if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
> + return;
> + secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS);
> + memset(child, 0, sizeof(child));
> + for (count = 0, f = 0; f < 8; f++) {
Can we use a constant instead of "8" in the loop here? There are a few
other places in kernel code with very similar loops enumerating over PCI
functions that again use "8" instead of a constant like
PCI_MAX_FUNCTIONS. I would suggest we use a constant at least in the new
code.
> + vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID);
> + if (vendor == 0xffff)
> + continue;
> +
> + pcie = early_pci_find_capability(secondary, 0, f,
> + PCI_CAP_ID_EXP);
> + if (!pcie)
> + continue;
> +
> + flags = read_pci_config_16(secondary, 0, f,
> + pcie + PCI_EXP_FLAGS);
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) ||
> + (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE))
> + /* Don't reset switch, bridge */
> + return;
> +
> + class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION);
> + if ((class >> 24) == PCI_BASE_CLASS_DISPLAY)
> + /* Don't reset VGA device */
> + return;
> +
> + count++;
> + child[f].pcie_pos = pcie;
> + child[f].pcie_flags = flags;
> + child[f].save = save_cfg + f;
> + }
> +
> + if (!count)
> + return;
> +
> + /* save */
> + for (f = 0; f < 8; f++)
> + if (child[f].pcie_pos)
> + save_state(secondary, 0, f, &child[f]);
> +
> + do_reset(bus, slot, func);
> +
> + /* restore */
> + for (f = 0; f < 8; f++)
> + if (child[f].pcie_pos)
> + restore_state(secondary, 0, f, &child[f]);
> +}
> +
> +void __init early_reset_pcie_devices(void)
> +{
> + unsigned bus, slot, func;
> + int size;
> +
> + if (!early_pci_allowed() || !reset_devices)
> + return;
> +
> + /* alloc space to save config */
> + size = sizeof(struct save_config)*8;
Use a constant instead of "8", please.
> + save_cfg = (struct save_config *)alloc_bootmem(size);
> + if (save_cfg == NULL) {
> + printk(KERN_ERR "reset_pcie: alloc_bootmem failed\n");
> + return;
> + }
> +
> + for (bus = 0; bus < 256; bus++) {
> + for (slot = 0; slot < 32; slot++) {
> + for (func = 0; func < 8; func++) {
> + u16 vendor;
> + u8 type;
> + vendor = read_pci_config_16(bus, slot, func,
> + PCI_VENDOR_ID);
> +
> + if (vendor == 0xffff)
> + continue;
> +
> + reset_pcie_device(bus, slot, func);
> +
> + if (func == 0) {
> + type = read_pci_config_byte(bus, slot,
> + func,
> + PCI_HEADER_TYPE);
> + if (!(type & 0x80))
> + break;
> + }
> + }
> + }
> + }
> +
> + free_bootmem(__pa(save_cfg), size);
> +}
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index ab4bf5a..a7a4125 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -852,24 +852,6 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state)
>
> EXPORT_SYMBOL(pci_choose_state);
>
> -#define PCI_EXP_SAVE_REGS 7
> -
> -#define pcie_cap_has_devctl(type, flags) 1
> -#define pcie_cap_has_lnkctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - (type == PCI_EXP_TYPE_ROOT_PORT || \
> - type == PCI_EXP_TYPE_ENDPOINT || \
> - type == PCI_EXP_TYPE_LEG_END))
> -#define pcie_cap_has_sltctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - ((type == PCI_EXP_TYPE_ROOT_PORT) || \
> - (type == PCI_EXP_TYPE_DOWNSTREAM && \
> - (flags & PCI_EXP_FLAGS_SLOT))))
> -#define pcie_cap_has_rtctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - (type == PCI_EXP_TYPE_ROOT_PORT || \
> - type == PCI_EXP_TYPE_RC_EC))
> -
> static struct pci_cap_saved_state *pci_find_saved_cap(
> struct pci_dev *pci_dev, char cap)
> {
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 5faa831..8e10401 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1790,5 +1790,23 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
> */
> struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
>
> +#define PCI_EXP_SAVE_REGS 7
> +
> +#define pcie_cap_has_devctl(type, flags) 1
> +#define pcie_cap_has_lnkctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + (type == PCI_EXP_TYPE_ROOT_PORT || \
> + type == PCI_EXP_TYPE_ENDPOINT || \
> + type == PCI_EXP_TYPE_LEG_END))
> +#define pcie_cap_has_sltctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + ((type == PCI_EXP_TYPE_ROOT_PORT) || \
> + (type == PCI_EXP_TYPE_DOWNSTREAM && \
> + (flags & PCI_EXP_FLAGS_SLOT))))
> +#define pcie_cap_has_rtctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + (type == PCI_EXP_TYPE_ROOT_PORT || \
> + type == PCI_EXP_TYPE_RC_EC))
> +
> #endif /* __KERNEL__ */
> #endif /* LINUX_PCI_H */
> diff --git a/init/main.c b/init/main.c
> index b286730..ebaf067 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices);
> static int __init set_reset_devices(char *str)
> {
> reset_devices = 1;
> - return 1;
> + return 0;
> }
>
> -__setup("reset_devices", set_reset_devices);
> +early_param("reset_devices", set_reset_devices);
>
> static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
> const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
>
>
> _______________________________________________
> kexec mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/kexec
We have been seeing problems with kexec/kdump kernel for quite some time
that are related to I/O devices not being quiesced before kexec. I had
added code to clear Bus Master bit to help stop runaway DMAs which
helped many cases, but obviously not all. If resetting downstream ports
helps stop runaway I/O from PCIe devices, I am all for this approach.
This patch still doesn't do anything for old PCI devices though.
--
Khalid
(2012/10/11 5:08), Khalid Aziz wrote:
> Please see comments inline:
>
> On Wed, 2012-10-10 at 16:51 +0900, Takao Indoh wrote:
>> This patch resets PCIe devices at boot time by hot reset when
>> "reset_devices" is specified.
>>
>>
>> Signed-off-by: Takao Indoh <[email protected]>
>> ---
>> arch/x86/include/asm/pci-direct.h | 1
>> arch/x86/kernel/setup.c | 3
>> arch/x86/pci/early.c | 299 ++++++++++++++++++++++++++++
>> drivers/pci/pci.c | 18 -
>> include/linux/pci.h | 18 +
>> init/main.c | 4
>> 6 files changed, 323 insertions(+), 20 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
>> index b1e7a45..de30db2 100644
>> --- a/arch/x86/include/asm/pci-direct.h
>> +++ b/arch/x86/include/asm/pci-direct.h
>> @@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
>> extern unsigned int pci_early_dump_regs;
>> extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
>> extern void early_dump_pci_devices(void);
>> +extern void early_reset_pcie_devices(void);
>> #endif /* _ASM_X86_PCI_DIRECT_H */
>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>> index f4b9b80..24b011c 100644
>> --- a/arch/x86/kernel/setup.c
>> +++ b/arch/x86/kernel/setup.c
>> @@ -988,6 +988,9 @@ void __init setup_arch(char **cmdline_p)
>> generic_apic_probe();
>>
>> early_quirks();
>> +#ifdef CONFIG_PCI
>> + early_reset_pcie_devices();
>> +#endif
>>
>> /*
>> * Read APIC and some other early information from ACPI tables.
>> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
>> index d1067d5..584f16b 100644
>> --- a/arch/x86/pci/early.c
>> +++ b/arch/x86/pci/early.c
>> @@ -1,5 +1,6 @@
>> #include <linux/kernel.h>
>> #include <linux/pci.h>
>> +#include <linux/bootmem.h>
>> #include <asm/pci-direct.h>
>> #include <asm/io.h>
>> #include <asm/pci_x86.h>
>> @@ -109,3 +110,301 @@ void early_dump_pci_devices(void)
>> }
>> }
>> }
>> +
>> +struct save_config {
>> + u32 pci[16];
>> + u16 pcie[PCI_EXP_SAVE_REGS];
>> +};
>> +
>> +struct devinfo {
>> + int pcie_pos; /* position of PCI Express capability */
>> + int pcie_flags; /* PCI_EXP_FLAGS */
>> + struct save_config *save;
>> +};
>> +
>> +static struct save_config *save_cfg;
>> +static void __init pci_udelay(int loops)
>> +{
>> + while (loops--) {
>> + /* Approximately 1 us */
>> + native_io_delay();
>> + }
>> +}
>> +
>> +/* Derived from drivers/pci/pci.c */
>> +#define PCI_FIND_CAP_TTL 48
>> +static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func,
>> + u8 pos, int cap, int *ttl)
>> +{
>> + u8 id;
>> +
>> + while ((*ttl)--) {
>> + pos = read_pci_config_byte(bus, slot, func, pos);
>> + if (pos < 0x40)
>> + break;
>> + pos &= ~3;
>> + id = read_pci_config_byte(bus, slot, func,
>> + pos + PCI_CAP_LIST_ID);
>> + if (id == 0xff)
>> + break;
>> + if (id == cap)
>> + return pos;
>> + pos += PCI_CAP_LIST_NEXT;
>> + }
>> + return 0;
>> +}
>> +
>> +static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int cap)
>> +{
>> + int ttl = PCI_FIND_CAP_TTL;
>> +
>> + return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl);
>> +}
>> +
>> +static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func,
>> + u8 hdr_type)
>> +{
>> + u16 status;
>> +
>> + status = read_pci_config_16(bus, slot, func, PCI_STATUS);
>> + if (!(status & PCI_STATUS_CAP_LIST))
>> + return 0;
>> +
>> + switch (hdr_type) {
>> + case PCI_HEADER_TYPE_NORMAL:
>> + case PCI_HEADER_TYPE_BRIDGE:
>> + return PCI_CAPABILITY_LIST;
>> + case PCI_HEADER_TYPE_CARDBUS:
>> + return PCI_CB_CAPABILITY_LIST;
>> + default:
>> + return 0;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int cap)
>> +{
>> + int pos;
>> + u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
>> +
>> + pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f);
>> + if (pos)
>> + pos = __pci_find_next_cap(bus, slot, func, pos, cap);
>> +
>> + return pos;
>> +}
>> +
>> +static void __init do_reset(u8 bus, u8 slot, u8 func)
>> +{
>> + u16 ctrl;
>> +
>> + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
>> +
>> + /* Assert Secondary Bus Reset */
>> + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
>> + ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
>> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
>> +
>> + pci_udelay(5000);
>> +
>> + /* De-assert Secondary Bus Reset */
>> + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
>> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
>> +
>> + pci_udelay(500000);
>
> This is 0.5 second. This will add up quickly on larger servers with
> multiple busses. Is 0.5 second required by the spec?
> aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and
> then waits another 200 ms after de-asserting it. Still long, but less
> than half of the delay in above code..
According to PCIe spec, they should be more than 1ms and 100ms. The reason
why they're 2ms and 200ms in aer_do_secondary_bus_reset() is that it's a
kind of safety margin, I think.
At first these delay were 2ms and 200ms as well as
aer_do_secondary_bus_reset(), but I found a problem that on certain
machine it was not enough. I think this problem occurs because
native_io_delay() is not precise. Therefore I extended them to have more
margin.
If this delay should be shortened, I have two solutions.
1)
Call early_reset_pcie_devices() after calibrate_delay() so that we can
use precise mdelay. Personally I don't want to do this because I think
DMA should be stopped asap.
2)
Make it tuneable. For exmple, improve "reset_devices"
reset_devices=reset_delay=500
or something like that. As default, 200ms is enough as far as I tested.
But if it is not enough, we can adjust delay time using this.
Any other idea?
>> +}
>> +
>> +static void __init save_state(unsigned bus, unsigned slot, unsigned func,
>> + struct devinfo *info)
>> +{
>> + int i;
>> + int pcie, flags, pcie_type;
>> + struct save_config *save;
>> +
>> + pcie = info->pcie_pos;
>> + flags = info->pcie_flags;
>> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
>> + save = info->save;
>> +
>> + printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func);
>> +
>> + for (i = 0; i < 16; i++)
>> + save->pci[i] = read_pci_config(bus, slot, func, i * 4);
>> + i = 0;
>> + if (pcie_cap_has_devctl(pcie_type, flags))
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_DEVCTL);
>> + if (pcie_cap_has_lnkctl(pcie_type, flags))
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_LNKCTL);
>> + if (pcie_cap_has_sltctl(pcie_type, flags))
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_SLTCTL);
>> + if (pcie_cap_has_rtctl(pcie_type, flags))
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_RTCTL);
>> +
>> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_DEVCTL2);
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_LNKCTL2);
>> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_SLTCTL2);
>> + }
>> +}
>> +
>> +static void __init restore_state(unsigned bus, unsigned slot, unsigned func,
>> + struct devinfo *info)
>> +{
>> + int i = 0;
>> + int pcie, flags, pcie_type;
>> + struct save_config *save;
>> +
>> + pcie = info->pcie_pos;
>> + flags = info->pcie_flags;
>> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
>> + save = info->save;
>> +
>> + printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n",
>> + bus, slot, func);
>> +
>> + if (pcie_cap_has_devctl(pcie_type, flags))
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_DEVCTL, save->pcie[i++]);
>> + if (pcie_cap_has_lnkctl(pcie_type, flags))
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_LNKCTL, save->pcie[i++]);
>> + if (pcie_cap_has_sltctl(pcie_type, flags))
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_SLTCTL, save->pcie[i++]);
>> + if (pcie_cap_has_rtctl(pcie_type, flags))
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_RTCTL, save->pcie[i++]);
>> +
>> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_DEVCTL2, save->pcie[i++]);
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_LNKCTL2, save->pcie[i++]);
>> + write_pci_config_16(bus, slot, func,
>> + pcie + PCI_EXP_SLTCTL2, save->pcie[i++]);
>> + }
>> +
>> + for (i = 15; i >= 0; i--)
>> + write_pci_config(bus, slot, func, i * 4, save->pci[i]);
>> +}
>> +
>> +static void __init reset_pcie_device(unsigned bus, unsigned slot, unsigned func)
>> +{
>> + int f, count;
>> + int pcie, pcie_type;
>> + u8 type;
>> + u16 vendor, flags;
>> + u32 class;
>> + int secondary;
>> + struct devinfo child[8];
>> +
>> + pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP);
>> + if (!pcie)
>> + return;
>> +
>> + flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS);
>> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
>> + if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) &&
>> + (pcie_type != PCI_EXP_TYPE_DOWNSTREAM))
>> + return;
>> +
>> + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
>> + if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
>> + return;
>> + secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS);
>> + memset(child, 0, sizeof(child));
>> + for (count = 0, f = 0; f < 8; f++) {
>
> Can we use a constant instead of "8" in the loop here? There are a few
> other places in kernel code with very similar loops enumerating over PCI
> functions that again use "8" instead of a constant like
> PCI_MAX_FUNCTIONS. I would suggest we use a constant at least in the new
> code.
Ok, I'll fix it.
>> + vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID);
>> + if (vendor == 0xffff)
>> + continue;
>> +
>> + pcie = early_pci_find_capability(secondary, 0, f,
>> + PCI_CAP_ID_EXP);
>> + if (!pcie)
>> + continue;
>> +
>> + flags = read_pci_config_16(secondary, 0, f,
>> + pcie + PCI_EXP_FLAGS);
>> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
>> + if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) ||
>> + (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE))
>> + /* Don't reset switch, bridge */
>> + return;
>> +
>> + class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION);
>> + if ((class >> 24) == PCI_BASE_CLASS_DISPLAY)
>> + /* Don't reset VGA device */
>> + return;
>> +
>> + count++;
>> + child[f].pcie_pos = pcie;
>> + child[f].pcie_flags = flags;
>> + child[f].save = save_cfg + f;
>> + }
>> +
>> + if (!count)
>> + return;
>> +
>> + /* save */
>> + for (f = 0; f < 8; f++)
>> + if (child[f].pcie_pos)
>> + save_state(secondary, 0, f, &child[f]);
>> +
>> + do_reset(bus, slot, func);
>> +
>> + /* restore */
>> + for (f = 0; f < 8; f++)
>> + if (child[f].pcie_pos)
>> + restore_state(secondary, 0, f, &child[f]);
>> +}
>> +
>> +void __init early_reset_pcie_devices(void)
>> +{
>> + unsigned bus, slot, func;
>> + int size;
>> +
>> + if (!early_pci_allowed() || !reset_devices)
>> + return;
>> +
>> + /* alloc space to save config */
>> + size = sizeof(struct save_config)*8;
>
> Use a constant instead of "8", please.
Will do.
>
>> + save_cfg = (struct save_config *)alloc_bootmem(size);
>> + if (save_cfg == NULL) {
>> + printk(KERN_ERR "reset_pcie: alloc_bootmem failed\n");
>> + return;
>> + }
>> +
>> + for (bus = 0; bus < 256; bus++) {
>> + for (slot = 0; slot < 32; slot++) {
>> + for (func = 0; func < 8; func++) {
>> + u16 vendor;
>> + u8 type;
>> + vendor = read_pci_config_16(bus, slot, func,
>> + PCI_VENDOR_ID);
>> +
>> + if (vendor == 0xffff)
>> + continue;
>> +
>> + reset_pcie_device(bus, slot, func);
>> +
>> + if (func == 0) {
>> + type = read_pci_config_byte(bus, slot,
>> + func,
>> + PCI_HEADER_TYPE);
>> + if (!(type & 0x80))
>> + break;
>> + }
>> + }
>> + }
>> + }
>> +
>> + free_bootmem(__pa(save_cfg), size);
>> +}
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index ab4bf5a..a7a4125 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -852,24 +852,6 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state)
>>
>> EXPORT_SYMBOL(pci_choose_state);
>>
>> -#define PCI_EXP_SAVE_REGS 7
>> -
>> -#define pcie_cap_has_devctl(type, flags) 1
>> -#define pcie_cap_has_lnkctl(type, flags) \
>> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
>> - (type == PCI_EXP_TYPE_ROOT_PORT || \
>> - type == PCI_EXP_TYPE_ENDPOINT || \
>> - type == PCI_EXP_TYPE_LEG_END))
>> -#define pcie_cap_has_sltctl(type, flags) \
>> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
>> - ((type == PCI_EXP_TYPE_ROOT_PORT) || \
>> - (type == PCI_EXP_TYPE_DOWNSTREAM && \
>> - (flags & PCI_EXP_FLAGS_SLOT))))
>> -#define pcie_cap_has_rtctl(type, flags) \
>> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
>> - (type == PCI_EXP_TYPE_ROOT_PORT || \
>> - type == PCI_EXP_TYPE_RC_EC))
>> -
>> static struct pci_cap_saved_state *pci_find_saved_cap(
>> struct pci_dev *pci_dev, char cap)
>> {
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 5faa831..8e10401 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1790,5 +1790,23 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
>> */
>> struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
>>
>> +#define PCI_EXP_SAVE_REGS 7
>> +
>> +#define pcie_cap_has_devctl(type, flags) 1
>> +#define pcie_cap_has_lnkctl(type, flags) \
>> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
>> + (type == PCI_EXP_TYPE_ROOT_PORT || \
>> + type == PCI_EXP_TYPE_ENDPOINT || \
>> + type == PCI_EXP_TYPE_LEG_END))
>> +#define pcie_cap_has_sltctl(type, flags) \
>> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
>> + ((type == PCI_EXP_TYPE_ROOT_PORT) || \
>> + (type == PCI_EXP_TYPE_DOWNSTREAM && \
>> + (flags & PCI_EXP_FLAGS_SLOT))))
>> +#define pcie_cap_has_rtctl(type, flags) \
>> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
>> + (type == PCI_EXP_TYPE_ROOT_PORT || \
>> + type == PCI_EXP_TYPE_RC_EC))
>> +
>> #endif /* __KERNEL__ */
>> #endif /* LINUX_PCI_H */
>> diff --git a/init/main.c b/init/main.c
>> index b286730..ebaf067 100644
>> --- a/init/main.c
>> +++ b/init/main.c
>> @@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices);
>> static int __init set_reset_devices(char *str)
>> {
>> reset_devices = 1;
>> - return 1;
>> + return 0;
>> }
>>
>> -__setup("reset_devices", set_reset_devices);
>> +early_param("reset_devices", set_reset_devices);
>>
>> static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
>> const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
>>
>>
>> _______________________________________________
>> kexec mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/kexec
>
> We have been seeing problems with kexec/kdump kernel for quite some time
> that are related to I/O devices not being quiesced before kexec. I had
> added code to clear Bus Master bit to help stop runaway DMAs which
> helped many cases, but obviously not all. If resetting downstream ports
> helps stop runaway I/O from PCIe devices, I am all for this approach.
> This patch still doesn't do anything for old PCI devices though.
This patch does not reset PCI devices because of VGA problem. As I wrote
in patch description, the monitor blacks out when VGA controller is
reset. So when VGA device is found we need to skip it. In the case of
PCIe, we can decide whether we do bus reset or not on each device, but
we cannot in the case of PCI.
Thanks,
Takao Indoh
On Thu, 2012-10-11 at 15:16 +0900, Takao Indoh wrote:
> (2012/10/11 5:08), Khalid Aziz wrote:
.....
> >> +static void __init do_reset(u8 bus, u8 slot, u8 func)
> >> +{
> >> + u16 ctrl;
> >> +
> >> + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
> >> +
> >> + /* Assert Secondary Bus Reset */
> >> + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
> >> + ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
> >> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> >> +
> >> + pci_udelay(5000);
> >> +
> >> + /* De-assert Secondary Bus Reset */
> >> + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
> >> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> >> +
> >> + pci_udelay(500000);
> >
> > This is 0.5 second. This will add up quickly on larger servers with
> > multiple busses. Is 0.5 second required by the spec?
> > aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and
> > then waits another 200 ms after de-asserting it. Still long, but less
> > than half of the delay in above code..
>
> According to PCIe spec, they should be more than 1ms and 100ms. The reason
> why they're 2ms and 200ms in aer_do_secondary_bus_reset() is that it's a
> kind of safety margin, I think.
>
> At first these delay were 2ms and 200ms as well as
> aer_do_secondary_bus_reset(), but I found a problem that on certain
> machine it was not enough. I think this problem occurs because
> native_io_delay() is not precise. Therefore I extended them to have more
> margin.
>
> If this delay should be shortened, I have two solutions.
>
> 1)
> Call early_reset_pcie_devices() after calibrate_delay() so that we can
> use precise mdelay. Personally I don't want to do this because I think
> DMA should be stopped asap.
>
> 2)
> Make it tuneable. For exmple, improve "reset_devices"
> reset_devices=reset_delay=500
> or something like that. As default, 200ms is enough as far as I tested.
> But if it is not enough, we can adjust delay time using this.
>
> Any other idea?
>
I don't like the idea of asking end user to determine how many msec of
delay they would need on their system for a reset. If we have to go that
route, I would rather have a default of 200 msec and then add a kernel
option like "reset_devices=reset_delay=long" which changes the delay to
500 msec.
Here is another idea. The code you currently have does:
for (each device) {
save config registers
reset
wait for 500 ms
restore config registers
}
You need to wait for 500 ms because you can not access config registers
until reset is complete. So how about this - why not just save config
registers, reset each device and then wait only once for 500 ms before
restoring config registers on all devices. Here is what this will look
like:
for (each device) {
save config registers
reset
}
wait 500 ms
for (each device) {
restore config registers
}
This is obviously more complex and requires you to allocate more space
for saving state, but it has the benefits of minimizing boot up delay
and making the delay constant irrespective of number of switches/bridges
on the system.
Does this sound reasonable?
> >
> > We have been seeing problems with kexec/kdump kernel for quite some time
> > that are related to I/O devices not being quiesced before kexec. I had
> > added code to clear Bus Master bit to help stop runaway DMAs which
> > helped many cases, but obviously not all. If resetting downstream ports
> > helps stop runaway I/O from PCIe devices, I am all for this approach.
> > This patch still doesn't do anything for old PCI devices though.
>
> This patch does not reset PCI devices because of VGA problem. As I wrote
> in patch description, the monitor blacks out when VGA controller is
> reset. So when VGA device is found we need to skip it. In the case of
> PCIe, we can decide whether we do bus reset or not on each device, but
> we cannot in the case of PCI.
OK. My concern is there are plenty of older servers out there that
potentially have the problem with kexec/kdump failing often and we are
not solving the problem for them. If we can only solve it for PCIe for
now, I am ok with starting there.
--
Khalid
(2012/10/12 2:28), Khalid Aziz wrote:
> On Thu, 2012-10-11 at 15:16 +0900, Takao Indoh wrote:
>> (2012/10/11 5:08), Khalid Aziz wrote:
> .....
>>>> +static void __init do_reset(u8 bus, u8 slot, u8 func)
>>>> +{
>>>> + u16 ctrl;
>>>> +
>>>> + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
>>>> +
>>>> + /* Assert Secondary Bus Reset */
>>>> + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
>>>> + ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
>>>> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
>>>> +
>>>> + pci_udelay(5000);
>>>> +
>>>> + /* De-assert Secondary Bus Reset */
>>>> + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
>>>> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
>>>> +
>>>> + pci_udelay(500000);
>>>
>>> This is 0.5 second. This will add up quickly on larger servers with
>>> multiple busses. Is 0.5 second required by the spec?
>>> aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and
>>> then waits another 200 ms after de-asserting it. Still long, but less
>>> than half of the delay in above code..
>>
>> According to PCIe spec, they should be more than 1ms and 100ms. The reason
>> why they're 2ms and 200ms in aer_do_secondary_bus_reset() is that it's a
>> kind of safety margin, I think.
>>
>> At first these delay were 2ms and 200ms as well as
>> aer_do_secondary_bus_reset(), but I found a problem that on certain
>> machine it was not enough. I think this problem occurs because
>> native_io_delay() is not precise. Therefore I extended them to have more
>> margin.
>>
>> If this delay should be shortened, I have two solutions.
>>
>> 1)
>> Call early_reset_pcie_devices() after calibrate_delay() so that we can
>> use precise mdelay. Personally I don't want to do this because I think
>> DMA should be stopped asap.
>>
>> 2)
>> Make it tuneable. For exmple, improve "reset_devices"
>> reset_devices=reset_delay=500
>> or something like that. As default, 200ms is enough as far as I tested.
>> But if it is not enough, we can adjust delay time using this.
>>
>> Any other idea?
>>
>
> I don't like the idea of asking end user to determine how many msec of
> delay they would need on their system for a reset. If we have to go that
> route, I would rather have a default of 200 msec and then add a kernel
> option like "reset_devices=reset_delay=long" which changes the delay to
> 500 msec.
>
> Here is another idea. The code you currently have does:
>
> for (each device) {
> save config registers
> reset
> wait for 500 ms
> restore config registers
> }
>
> You need to wait for 500 ms because you can not access config registers
> until reset is complete. So how about this - why not just save config
> registers, reset each device and then wait only once for 500 ms before
> restoring config registers on all devices. Here is what this will look
> like:
>
> for (each device) {
> save config registers
> reset
> }
> wait 500 ms
> for (each device) {
> restore config registers
> }
>
> This is obviously more complex and requires you to allocate more space
> for saving state, but it has the benefits of minimizing boot up delay
> and making the delay constant irrespective of number of switches/bridges
> on the system.
>
> Does this sound reasonable?
Yep, that looks good idea. I'll update my patch with this idea.
Thanks,
Takao Indoh
>
>
>>>
>>> We have been seeing problems with kexec/kdump kernel for quite some time
>>> that are related to I/O devices not being quiesced before kexec. I had
>>> added code to clear Bus Master bit to help stop runaway DMAs which
>>> helped many cases, but obviously not all. If resetting downstream ports
>>> helps stop runaway I/O from PCIe devices, I am all for this approach.
>>> This patch still doesn't do anything for old PCI devices though.
>>
>> This patch does not reset PCI devices because of VGA problem. As I wrote
>> in patch description, the monitor blacks out when VGA controller is
>> reset. So when VGA device is found we need to skip it. In the case of
>> PCIe, we can decide whether we do bus reset or not on each device, but
>> we cannot in the case of PCI.
>
> OK. My concern is there are plenty of older servers out there that
> potentially have the problem with kexec/kdump failing often and we are
> not solving the problem for them. If we can only solve it for PCIe for
> now, I am ok with starting there.
>
> --
> Khalid
>
>
>
>