2009-10-05 04:55:40

by Yinghai Lu

Subject: [PATCH] x86/pci: intel bus root res with IOH reading -v2


For Intel systems with multiple IOHs, we can read the peer root bus resources
from the IOH's PCI config space, and no longer need to trust _CRS for the root buses.

v2: addressed review feedback from Ingo

Signed-off-by: Yinghai Lu <[email protected]>

---
arch/x86/pci/Makefile | 1
arch/x86/pci/amd_bus.c | 45 +++++++++--------------
arch/x86/pci/bus_numa.h | 26 +++++++++++++
arch/x86/pci/intel_bus.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 135 insertions(+), 27 deletions(-)

Index: linux-2.6/arch/x86/pci/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/pci/Makefile
+++ linux-2.6/arch/x86/pci/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o

obj-y += common.o early.o
obj-y += amd_bus.o
+obj-$(CONFIG_X86_64) += intel_bus.o
Index: linux-2.6/arch/x86/pci/intel_bus.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/pci/intel_bus.c
@@ -0,0 +1,90 @@
+/*
+ * to read io range from IOH pci conf, need to do it after mmconfig is there
+ */
+
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/pci_x86.h>
+
+#include "bus_numa.h"
+
+static inline void print_ioh_resources(struct pci_root_info *info)
+{
+ int res_num;
+ int busnum;
+ int i;
+
+ printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n",
+ info->bus_min, info->bus_max);
+ res_num = info->res_num;
+ busnum = info->bus_min;
+ for (i = 0; i < res_num; i++) {
+ struct resource *res;
+
+ res = &info->res[i];
+ printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n",
+ busnum, i,
+ (res->flags & IORESOURCE_IO) ? "io port" :
+ "mmio",
+ res->start, res->end);
+ }
+}
+
+#define IOH_LIO 0x108
+#define IOH_LMMIOL 0x10c
+#define IOH_LMMIOH 0x110
+#define IOH_LMMIOH_BASEU 0x114
+#define IOH_LMMIOH_LIMITU 0x118
+#define IOH_LCFGBUS 0x11c
+
+static void __devinit pci_root_bus_res(struct pci_dev *dev)
+{
+ u16 word;
+ u32 dword;
+ struct pci_root_info *info;
+ u16 io_base, io_end;
+ u32 mmiol_base, mmiol_end;
+ u64 mmioh_base, mmioh_end;
+ int bus_base, bus_end;
+
+ if (pci_root_num >= PCI_ROOT_NR) {
+ printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n");
+ return;
+ }
+
+ info = &pci_root_info[pci_root_num];
+ pci_root_num++;
+
+ pci_read_config_word(dev, IOH_LCFGBUS, &word);
+ bus_base = (word & 0xff);
+ bus_end = (word & 0xff00) >> 8;
+ sprintf(info->name, "PCI Bus #%02x", bus_base);
+ info->bus_min = bus_base;
+ info->bus_max = bus_end;
+
+ pci_read_config_word(dev, IOH_LIO, &word);
+ io_base = (word & 0xf0) << (12 - 4);
+ io_end = (word & 0xf000) | 0xfff;
+ update_res(info, io_base, io_end, IORESOURCE_IO, 0);
+
+ pci_read_config_dword(dev, IOH_LMMIOL, &dword);
+ mmiol_base = (dword & 0xff00) << (24 - 8);
+ mmiol_end = (dword & 0xff000000) | 0xffffff;
+ update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0);
+
+ pci_read_config_dword(dev, IOH_LMMIOH, &dword);
+ mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10);
+ mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff);
+ pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword);
+ mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
+ pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
+ mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
+ update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
+
+ print_ioh_resources(info);
+}
+
+/* intel IOH */
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res);
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -10,6 +10,8 @@
#include <linux/cpumask.h>
#endif

+#include "bus_numa.h"
+
/*
* This discovers the pcibus <-> node mapping on AMD K8.
* also get peer root bus resource for io,mmio
@@ -17,25 +19,9 @@

#ifdef CONFIG_X86_64

-/*
- * sub bus (transparent) will use entres from 3 to store extra from root,
- * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
- */
-#define RES_NUM 16
-struct pci_root_info {
- char name[12];
- unsigned int res_num;
- struct resource res[RES_NUM];
- int bus_min;
- int bus_max;
- int node;
- int link;
-};
-
-/* 4 at this time, it may become to 32 */
-#define PCI_ROOT_NR 4
-static int pci_root_num;
-static struct pci_root_info pci_root_info[PCI_ROOT_NR];
+int pci_root_num;
+struct pci_root_info pci_root_info[PCI_ROOT_NR];
+static int found_all_numa_early;

void x86_pci_root_bus_res_quirks(struct pci_bus *b)
{
@@ -48,8 +34,11 @@ void x86_pci_root_bus_res_quirks(struct
b->resource[1] != &iomem_resource)
return;

- /* if only one root bus, don't need to anything */
- if (pci_root_num < 2)
+ if (!pci_root_num)
+ return;
+
+ /* for amd, if only one root bus, don't need to do anything */
+ if (pci_root_num < 2 && found_all_numa_early)
return;

for (i = 0; i < pci_root_num; i++) {
@@ -130,12 +119,15 @@ static void __init update_range(struct r
}
}

-static void __init update_res(struct pci_root_info *info, size_t start,
+void __init update_res(struct pci_root_info *info, size_t start,
size_t end, unsigned long flags, int merge)
{
int i;
struct resource *res;

+ if (start > end)
+ return;
+
if (!merge)
goto addit;

@@ -230,7 +222,6 @@ static int __init early_fill_mp_bus_info
int j;
unsigned uninitialized_var(bus);
unsigned uninitialized_var(slot);
- int found;
int node;
int link;
int def_node;
@@ -247,7 +238,7 @@ static int __init early_fill_mp_bus_info
if (!early_pci_allowed())
return -1;

- found = 0;
+ found_all_numa_early = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
u32 id;
u16 device;
@@ -261,12 +252,12 @@ static int __init early_fill_mp_bus_info
device = (id>>16) & 0xffff;
if (pci_probes[i].vendor == vendor &&
pci_probes[i].device == device) {
- found = 1;
+ found_all_numa_early = 1;
break;
}
}

- if (!found)
+ if (!found_all_numa_early)
return 0;

pci_root_num = 0;
@@ -490,7 +481,7 @@ static int __init early_fill_mp_bus_info
info = &pci_root_info[i];
res_num = info->res_num;
busnum = info->bus_min;
- printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n",
+ printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
info->bus_min, info->bus_max, info->node, info->link);
for (j = 0; j < res_num; j++) {
res = &info->res[j];
Index: linux-2.6/arch/x86/pci/bus_numa.h
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/pci/bus_numa.h
@@ -0,0 +1,26 @@
+#ifdef CONFIG_X86_64
+
+/*
+ * sub bus (transparent) will use entres from 3 to store extra from
+ * root, so need to make sure we have enought slot there, Should we
+ * increase PCI_BUS_NUM_RESOURCES?
+ */
+#define RES_NUM 16
+struct pci_root_info {
+ char name[12];
+ unsigned int res_num;
+ struct resource res[RES_NUM];
+ int bus_min;
+ int bus_max;
+ int node;
+ int link;
+};
+
+/* 4 at this time, it may become to 32 */
+#define PCI_ROOT_NR 4
+extern int pci_root_num;
+extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
+
+extern void update_res(struct pci_root_info *info, size_t start,
+ size_t end, unsigned long flags, int merge);
+#endif


2009-10-06 16:53:39

by Jesse Barnes

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

On Sun, 04 Oct 2009 21:54:24 -0700
Yinghai Lu <[email protected]> wrote:

>
> for intel system with multi IOH. we could read peer root resource
> from PCI conf, and don't trust _CRS again for root bus
>
> v2: address change request from Ingo
>
> Signed-off-by: Yinghai Lu <[email protected]>

Applied to my linux-next branch, thanks.

--
Jesse Barnes, Intel Open Source Technology Center

2009-10-06 17:34:55

by Bjorn Helgaas

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

On Sunday 04 October 2009 10:54:24 pm Yinghai Lu wrote:
> for intel system with multi IOH. we could read peer root resource from PCI conf,
> and don't trust _CRS again for root bus

Ugh. Are we going to end up with amd_bus.c, intel_bus.c, nvidia_bus.c,
broadcom_bus.c, serverworks_bus.c, etc.?

This is basically a simple PCI host bridge driver, but it's completely
separate from the ACPI pci_root.c driver, and it completely ignores
useful things like multiple domain (_SEG) support and address translation
(_TRA) support. These are going to be important issues for large x86
machines.

I think this is leading toward an architectural mess. Yes, we have
issues with _CRS for root bridges. But ACPI does give us a generic
framework powerful enough to handle everything you're doing here. In
my opinion, we should fix the implementation issues with that framework
rather than adding platform-specific setup code every time we trip
over something.
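
For context, a minimal sketch of the generic path being described here (my own
illustration, using the acpi_walk_resources()/_CRS interfaces of that era; names
and error handling are simplified, and this is not code from pci_root.c itself):

#include <linux/kernel.h>
#include <linux/acpi.h>

static acpi_status dump_crs_window(struct acpi_resource *res, void *data)
{
	struct acpi_resource_address64 addr;

	/* ignore descriptors that are not 16/32/64-bit address ranges */
	if (ACPI_FAILURE(acpi_resource_to_address64(res, &addr)))
		return AE_OK;

	if (addr.resource_type == ACPI_MEMORY_RANGE ||
	    addr.resource_type == ACPI_IO_RANGE)
		printk(KERN_DEBUG "_CRS window: %s [%#llx-%#llx]\n",
		       addr.resource_type == ACPI_IO_RANGE ? "io" : "mem",
		       (unsigned long long)addr.minimum,
		       (unsigned long long)(addr.minimum + addr.address_length - 1));

	return AE_OK;
}

static void dump_root_bridge_crs(acpi_handle handle)
{
	/* walk the host bridge's _CRS; each descriptor is passed to the callback */
	acpi_walk_resources(handle, METHOD_NAME__CRS, dump_crs_window, NULL);
}

A walk like this reports the bridge's I/O and MMIO windows (including translation
offsets from the descriptors) without any chipset-specific register layout.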

I expect that will mean some quirks in pci_root.c, and maybe even some
code similar to pci_root_bus_res() to validate or override what we learn
from _CRS. But we ought to try for some conceptual integrity in the
design by putting all the host bridge driver code together.

What is the specific problem solved by this patch? Does "pci=use_crs"
address any of that problem? (I know "pci=use_crs" breaks some machines,
and I know it's unacceptable to require users to use it. But I want to
understand whether the concept is related, and whether you've tripped
over a BIOS defect or a Linux pci_root.c defect.)

Bjorn


2009-10-06 17:52:36

by Yinghai Lu

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

Bjorn Helgaas wrote:
> On Sunday 04 October 2009 10:54:24 pm Yinghai Lu wrote:
>> for intel system with multi IOH. we could read peer root resource from PCI conf,
>> and don't trust _CRS again for root bus
>
> Ugh. Are we going to end up with amd_bus.c, intel_bus.c, nvidia_bus.c,
> broadcom_bus.c, serverworks_bus.c, etc.?
only needed when you have multiple IOHs ...
>
> This is basically a simple PCI host bridge driver, but it's completely
> separate from the ACPI pci_root.c driver, and it completely ignores
> useful things like multiple domain (_SEG) support and address translation
> (_TRA) support. These are going to be important issues for large x86
> machines.
>
> I think this is leading toward an architectural mess. Yes, we have
> issues with _CRS for root bridges. But ACPI does give us a generic
> framework powerful enough to handle everything you're doing here. In
> my opinion, we should fix the implementation issues with that framework
> rather than adding platform-specific setup code every time we trip
> over something.

Again, we should trust the HW configuration more than the BIOS.

>
> I expect that will mean some quirks in pci_root.c, and maybe even some
> code similar to pci_root_bus_res() to validate or override what we learn
> from _CRS. But we ought to try for some conceptual integrity in the
> design by putting all the putting all the host bridge driver code together.
>
> What is the specific problem solved by this patch? Does "pci=use_crs"
> address any of that problem? (I know "pci=use_crs" breaks some machines,
> and I know it's unacceptable to require users to use it. But I want to
> understand whether the concept is related, and whether you've tripped
> over a BIOS defect or a Linux pci_root.c defect.)

The BIOS doesn't allocate resources to some PCI devices when there are too many devices,
so the kernel needs to allocate resources (32-bit MMIO, 64-bit MMIO) to those devices itself.
Currently the only known firmware that can allocate 64-bit MMIO (with correct settings) is LinuxBIOS.

It also helps the OS fend off ranges that the BIOS wrongly allocated across the boundary
between different peer root buses.

_CRS doesn't report any 64-bit MMIO range, even though the HW does have it set.
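
As an illustration of what that hardware-visible information looks like, here is a
small sketch (not code from this thread; it just mirrors the decoding the patch above
does, and the register values are made up) of how the 64-bit MMIO window is
reconstructed from the IOH LMMIOH/LMMIOH_BASEU/LMMIOH_LIMITU registers:

#include <stdint.h>
#include <stdio.h>

static void decode_mmioh(uint32_t lmmioh, uint32_t baseu, uint32_t limitu,
			 uint64_t *base, uint64_t *limit)
{
	/* LMMIOH bits 15:10 hold base[31:26]; bits 31:26 hold limit[31:26] */
	*base  = ((uint64_t)(lmmioh & 0xfc00)) << (26 - 10);
	*limit = ((uint64_t)(lmmioh & 0xfc000000)) | 0x3ffffff;
	/* bits 18:0 of BASEU/LIMITU supply bits 50:32 of base/limit */
	*base  |= ((uint64_t)(baseu  & 0x7ffff)) << 32;
	*limit |= ((uint64_t)(limitu & 0x7ffff)) << 32;
}

int main(void)
{
	uint64_t base, limit;

	/* register values below are invented purely for illustration */
	decode_mmioh(0xfc000400, 0x00000003, 0x00000003, &base, &limit);
	printf("MMIOH window: [%#llx, %#llx]\n",
	       (unsigned long long)base, (unsigned long long)limit);
	return 0;
}

So even when _CRS stays silent about the 64-bit window, the base/limit pair is
sitting right there in the IOH config registers.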

YH

2009-10-06 18:58:35

by Bjorn Helgaas

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

On Tuesday 06 October 2009 11:51:22 am Yinghai Lu wrote:
> Bjorn Helgaas wrote:
> > On Sunday 04 October 2009 10:54:24 pm Yinghai Lu wrote:
> >> for intel system with multi IOH. we could read peer root resource from PCI conf,
> >> and don't trust _CRS again for root bus
> >
> > Ugh. Are we going to end up with amd_bus.c, intel_bus.c, nvidia_bus.c,
> > broadcom_bus.c, serverworks_bus.c, etc.?
> only needed when you have muti ...

I think that translates to "yes, we will need all those bits as soon
as those vendors support larger systems with multiple IOHs." And I
think that's the wrong answer.

> > This is basically a simple PCI host bridge driver, but it's completely
> > separate from the ACPI pci_root.c driver, and it completely ignores
> > useful things like multiple domain (_SEG) support and address translation
> > (_TRA) support. These are going to be important issues for large x86
> > machines.
> >
> > I think this is leading toward an architectural mess. Yes, we have
> > issues with _CRS for root bridges. But ACPI does give us a generic
> > framework powerful enough to handle everything you're doing here. In
> > my opinion, we should fix the implementation issues with that framework
> > rather than adding platform-specific setup code every time we trip
> > over something.
>
> again we should trust HW conf than BIOS.

Certainly there's a tradeoff between a generic driver that relies on
the BIOS, and a platform-specific driver that uses only the hardware.
The first leaves us vulnerable to BIOS bugs, but the second leads to
a plethora of drivers that require updates as hardware changes.

For example, pci_root.c already supports multiple PCI domains and
address translation. Where is that support in intel_bus.c and
amd_bus.c?

> > I expect that will mean some quirks in pci_root.c, and maybe even some
> > code similar to pci_root_bus_res() to validate or override what we learn
> > from _CRS. But we ought to try for some conceptual integrity in the
> > design by putting all the putting all the host bridge driver code together.
> >
> > What is the specific problem solved by this patch? Does "pci=use_crs"
> > address any of that problem? (I know "pci=use_crs" breaks some machines,
> > and I know it's unacceptable to require users to use it. But I want to
> > understand whether the concept is related, and whether you've tripped
> > over a BIOS defect or a Linux pci_root.c defect.)
>
> BIOS doesn't allocate resource to some pci devices when too many devices. and need kernel to allocate resource ( 32bit mmio, 64 mmio)
> to those devices.
> current only known fw that can allocate mmio 64 ( with correct setting) is LinuxBIOS.
>
> also could help os to fend off some range that is wrongly allocated by BIOS that is cross the boundary between different peer root bus.
>
> _CRS doesn't report any MMIO 64 range, even HW does have that set.

This sounds like a plain-vanilla BIOS defect. What prevents us from
fixing the BIOS (if the platform hasn't shipped) or adding some sort of
kernel quirk to make the generic driver work? We don't need to throw
it away completely.

But to be fair, I guess I'm criticizing this patch based on my vision
of where pci_root.c *should* be, not where it is today. Today, it
ignores _CRS on x86, so even with a correct BIOS, you'd have to use
"pci=use_crs". That needs to get fixed somehow.

I want to work on getting it fixed rather than adding platform-specific
workarounds like this patch. I think this patch sweeps the issue under
the rug and adds code that will be difficult to remove later.

Bjorn

2009-10-12 20:35:53

by Ingo Molnar

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2


* Bjorn Helgaas <[email protected]> wrote:

> On Tuesday 06 October 2009 11:51:22 am Yinghai Lu wrote:
> > Bjorn Helgaas wrote:
> > > On Sunday 04 October 2009 10:54:24 pm Yinghai Lu wrote:
> > >> for intel system with multi IOH. we could read peer root resource from PCI conf,
> > >> and don't trust _CRS again for root bus
> > >
> > > Ugh. Are we going to end up with amd_bus.c, intel_bus.c, nvidia_bus.c,
> > > broadcom_bus.c, serverworks_bus.c, etc.?
> > only needed when you have muti ...
>
> I think that translates to "yes, we will need all those bits as soon
> as those vendors support larger systems with multiple IOHs." And I
> think that's the wrong answer.

Why is having cleanly separated per-vendor information/drivers wrong? I
think it's the right answer in most cases. _Especially_ when the other
option is to 'rely on the firmware'.

> > again we should trust HW conf than BIOS.
>
> Certainly there's a tradeoff between a generic driver that relies on
> the BIOS, and a platform-specific driver that uses only the hardware.
> The first leaves us vulnerable to BIOS bugs, but the second leads to a
> plethora of drivers that require updates as hardware changes.

We do that quite well in Linux - it's one of our main strengths.

Why should we have to rely on correct firmware? Why is it wrong to know
about the hw's structure, to query the hardware and ignore the firmware
if the hardware state tells us otherwise?

Ingo

2009-10-12 20:50:18

by Jesse Barnes

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

On Mon, 12 Oct 2009 22:34:53 +0200
Ingo Molnar <[email protected]> wrote:

>
> * Bjorn Helgaas <[email protected]> wrote:
>
> > On Tuesday 06 October 2009 11:51:22 am Yinghai Lu wrote:
> > > Bjorn Helgaas wrote:
> > > > On Sunday 04 October 2009 10:54:24 pm Yinghai Lu wrote:
> > > >> for intel system with multi IOH. we could read peer root
> > > >> resource from PCI conf, and don't trust _CRS again for root bus
> > > >
> > > > Ugh. Are we going to end up with amd_bus.c, intel_bus.c,
> > > > nvidia_bus.c, broadcom_bus.c, serverworks_bus.c, etc.?
> > > only needed when you have muti ...
> >
> > I think that translates to "yes, we will need all those bits as
> > soon as those vendors support larger systems with multiple IOHs."
> > And I think that's the wrong answer.
>
> Why is having cleanly separated per vendor information/driver wrong?
> I think it's the right answer in most cases. _Especially_ when the
> other option is to 'rely on the firmware'.
>
> > > again we should trust HW conf than BIOS.
> >
> > Certainly there's a tradeoff between a generic driver that relies
> > on the BIOS, and a platform-specific driver that uses only the
> > hardware. The first leaves us vulnerable to BIOS bugs, but the
> > second leads to a plethora of drivers that require updates as
> > hardware changes.
>
> We do that quite well in Linux - it's one of our main strengths.
>
> Why should we have to rely on correct firmware? Why is it wrong to
> know about the hw's structure, to query the hardware and ignore the
> firmware if the hardware state tells us otherwise?

Right, I think the approaches are complementary. We want the hw info,
for cases where the fw is broken or there's no support for a given
configuration, but we also want the fw info for cases where we don't
yet have hw info (e.g. the bridge driver isn't updated) or it isn't
complete.

--
Jesse Barnes, Intel Open Source Technology Center

2009-10-12 21:52:25

by H. Peter Anvin

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

On 10/12/2009 02:45 PM, Bjorn Helgaas wrote:
> For the patch in question, we don't even have a root cause for the bug
> (or at least, I couldn't decipher it from the changelog). There's a
> reference to _CRS being wrong, but we don't currently use _CRS for
> x86 host bridges.
>
> But in general, my objection is that even if BIOS provides perfectly
> valid information about host bridge apertures, the the fact that Linux
> ignores that information means we have to add this sort of vendor-
> specific code every time we trip over something. And we're tripping
> over things quite often.
>
> Windows consumes this _CRS information, so while I grant there are
> certainly BIOS bugs there, I think most of the bugs are actually in
> Linux.

I think the right policy for most if not all things should be "use the
BIOS information unless we know better."

-hpa

2009-10-12 22:15:49

by Bjorn Helgaas

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

[Resending to linux-kernel and linux-pci because I fat-fingered
something and introduced HTML, so the lists bounced it.]

On Monday 12 October 2009 02:34:53 pm Ingo Molnar wrote:
>
> * Bjorn Helgaas <[email protected]> wrote:
>
> > On Tuesday 06 October 2009 11:51:22 am Yinghai Lu wrote:
> > > Bjorn Helgaas wrote:
> > > > On Sunday 04 October 2009 10:54:24 pm Yinghai Lu wrote:
> > > >> for intel system with multi IOH. we could read peer root resource from PCI conf,
> > > >> and don't trust _CRS again for root bus
> > > >
> > > > Ugh. Are we going to end up with amd_bus.c, intel_bus.c, nvidia_bus.c,
> > > > broadcom_bus.c, serverworks_bus.c, etc.?
> > > only needed when you have muti ...
> >
> > I think that translates to "yes, we will need all those bits as soon
> > as those vendors support larger systems with multiple IOHs." And I
> > think that's the wrong answer.
>
> Why is having cleanly separated per vendor information/driver wrong? I
> think it's the right answer in most cases. _Especially_ when the other
> option is to 'rely on the firmware'.

For the patch in question, we don't even have a root cause for the bug
(or at least, I couldn't decipher it from the changelog). There's a
reference to _CRS being wrong, but we don't currently use _CRS for
x86 host bridges.

But in general, my objection is that even if BIOS provides perfectly
valid information about host bridge apertures, the fact that Linux
ignores that information means we have to add this sort of vendor-
specific code every time we trip over something. And we're tripping
over things quite often.

Windows consumes this _CRS information, so while I grant there are
certainly BIOS bugs there, I think most of the bugs are actually in
Linux.

> > > again we should trust HW conf than BIOS.
> >
> > Certainly there's a tradeoff between a generic driver that relies on
> > the BIOS, and a platform-specific driver that uses only the hardware.
> > The first leaves us vulnerable to BIOS bugs, but the second leads to a
> > plethora of drivers that require updates as hardware changes.
>
> We do that quite well in Linux - it's one of our main strengths.
>
> Why should we have to rely on correct firmware? Why is it wrong to know
> about the hw's structure, to query the hardware and ignore the firmware
> if the hardware state tells us otherwise?

If we need to learn something useful about the HW that the generic
description isn't expressive enough to tell us, that's fine. But
that's not the case here -- all we need is the very simple description
of the bridge apertures. I object to the idea that "new machine X
comes out, it has a correct ACPI description of the host bridges,
let's go write some X-specific code so Linux can run on it."

I *like* the idea of "new machine X comes out, oops, the BIOS is
broken, let's write a quirk to work around the BIOS defect so Linux
can run on it." But I think this is actually less common than many
people think, simply because most machines are tested with Windows,
and that shakes out the most egregious BIOS bugs.

Bjorn

2009-10-13 07:04:08

by Ingo Molnar

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2


* H. Peter Anvin <[email protected]> wrote:

> On 10/12/2009 02:45 PM, Bjorn Helgaas wrote:
>
> > For the patch in question, we don't even have a root cause for the
> > bug (or at least, I couldn't decipher it from the changelog).
> > There's a reference to _CRS being wrong, but we don't currently use
> > _CRS for x86 host bridges.
> >
> > But in general, my objection is that even if BIOS provides perfectly
> > valid information about host bridge apertures, the the fact that
> > Linux ignores that information means we have to add this sort of
> > vendor- specific code every time we trip over something. And we're
> > tripping over things quite often.
> >
> > Windows consumes this _CRS information, so while I grant there are
> > certainly BIOS bugs there, I think most of the bugs are actually in
> > Linux.
>
> I think the right policy for most if not all things should be "use the
> BIOS information unless we know better."

My argument is that if we know how to access the hardware and know how
to interpret its state then that is _ALWAYS_ higher quality information
than anything the BIOS tells us.

Basically the BIOS is a second-level fall-back, and the less we have to
use this fall-back in Linux, the better. In many cases we have no other
info but BIOS info - that puts all our eggs into the BIOS basket and we
have to live with that.

And reducing that dependency on the BIOS is what Yinghai's patch is
doing in essence - it adds a higher-grade source of information: reading
the PCI config space directly.
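
As a sketch of what "reading the PCI config space directly" means here (my own
illustration, not from the patch): the classic type-1 mechanism through I/O ports
0xCF8/0xCFC. Note this only reaches the first 256 bytes of config space; the IOH
registers the patch reads live above 0xff, which is why the patch's comment says it
has to run after mmconfig is set up.

#include <stdint.h>
#include <sys/io.h>		/* outl()/inl(); needs iopl(3) and root */

static uint32_t pci_conf1_read32(uint8_t bus, uint8_t dev, uint8_t fn, uint8_t reg)
{
	/* type-1 address: bus[23:16], device[15:11], function[10:8], register[7:2] */
	uint32_t addr = 0x80000000u | ((uint32_t)bus << 16) |
			((uint32_t)dev << 11) | ((uint32_t)fn << 8) | (reg & 0xfc);

	outl(addr, 0xcf8);	/* select the config address */
	return inl(0xcfc);	/* read the selected dword */
}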

(I make no argument about the specific correctness of the patch - I
just raised the principle, and I think we should move in this direction
in general.)

Thanks,

Ingo

2009-10-19 20:17:16

by Bjorn Helgaas

Subject: Re: [PATCH] x86/pci: intel bus root res with IOH reading -v2

On Tuesday 06 October 2009 11:51:22 am Yinghai Lu wrote:
> > What is the specific problem solved by this patch? Does "pci=use_crs"
> > address any of that problem? (I know "pci=use_crs" breaks some machines,
> > and I know it's unacceptable to require users to use it. But I want to
> > understand whether the concept is related, and whether you've tripped
> > over a BIOS defect or a Linux pci_root.c defect.)
>
> BIOS doesn't allocate resource to some pci devices when too many devices. and need kernel to allocate resource ( 32bit mmio, 64 mmio)
> to those devices.
> current only known fw that can allocate mmio 64 ( with correct setting) is LinuxBIOS.
>
> also could help os to fend off some range that is wrongly allocated by BIOS that is cross the boundary between different peer root bus.
>
> _CRS doesn't report any MMIO 64 range, even HW does have that set.

This discussion got derailed into "BIOS bad, Linux good" before I
could learn more about the specific problem you're solving.

Can you tell us what machine this fixes? Can you include logs, e.g.,
dmesg/lspci/iomem, that show the problem, and corresponding ones
with your patch applied that show the fix?

Has the machine been released? Does Windows work on it?

I'd also like to know more about the "range that is wrongly allocated
by BIOS that is cross the boundary between different peer root bus."
Does this mean the BIOS programmed the host bridges wrong, or does it
mean it reported something invalid in the host bridge _CRS, or something
else?

Thanks,
Bjorn