2007-08-01 15:59:24

by Andy Whitcroft

[permalink] [raw]
Subject: Re: - pci-device-ensure-sysdata-initialised-v3.patch removed from -mm tree

On Thu, Jul 26, 2007 at 11:24:45PM -0700, [email protected] wrote:
>
> The patch titled
> pci device ensure sysdata initialised v3
> has been removed from the -mm tree. Its filename was
> pci-device-ensure-sysdata-initialised-v3.patch
>
> This patch was dropped because Yinghai Lu's stuff hits on the same code

I still get panics with 2.6.23-rc1-mm2 so I have rebased this patch
on top of that. I seem to have to trot this one out on a regular
basis and it seems to live its life in -mm. It is clear that there
are broken call sites here and something needs to be done to them to
avoid this. Ideas for alternative solutions or can we get something
like this merged?

-apw

=== 8< ===
pci device ensure sysdata initialised v4

We have been seeing panic's on NUMA systems in pci_call_probe() in
2.6.19-rc1-mm1 and later. This is related to the changes introduced
in the commit below:

[x86, PCI] Switch pci_bus::sysdata from NUMA node integer to a pointer
0a247a58fc3e2ecfc17654301033e8b8d08df2a2

In this change the sysdata has changed from directly representing
a value (the node number in NUMA) to a pointer to a structure.
However, it seems that we do not always initialise this sysdata
before we probe the device.

Prior to the changes above the node was defaulted to 'NULL'
allocating the devices to node 0 unconditionally. This patch adds
a default sysdata entry (pci_default_sysdata), this is then used
where 'NULL' was used previously. pci_default_sysdata defaults
the node to unknown (-1). This is a more accurate assignment,
mirroring the value returned where no topology support is provided
and no locality information is available.

There are only two uses of this value in the affected architectures
(x86, x86_64) and generic code:

1) in x86_64, dma_alloc_pages() looks up the node in order to
allocate node local memory. Here if the node is invalid we
will default to the first online node. Behaviour here should
be unchanged.
2) in generic, pci_call_probe() looks up the node in order to
restrict execution of the probe on the card local node, to
favor node local allocation. Where this is unknown previously
we would force execution (and thereby allocation) to node 0,
this is arguably wrong and using -1 releases this restriction.

In an ideal world we should be supplying a sysdata for the
appropriate node where it is known. Where it is not known defaulting
to -1 seems a better course, and would help us where node 0 is
short of memory.

Signed-off-by: Andy Whitcroft <[email protected]>
---
arch/i386/pci/common.c | 2 ++
arch/i386/pci/fixup.c | 8 +++++---
arch/i386/pci/numa.c | 8 +++++---
arch/i386/pci/visws.c | 4 ++--
include/asm-i386/pci.h | 1 +
include/asm-x86_64/pci.h | 1 +
6 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index d188e10..ed1e986 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -27,6 +27,8 @@ unsigned long pirq_table_addr;
struct pci_bus *pci_root_bus;
struct pci_raw_ops *raw_pci_ops;

+struct pci_sysdata pci_default_sysdata = { .node = -1 };
+
static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
{
return raw_pci_ops->read(0, bus->number, devfn, where, size, value);
diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c
index e7306db..80365ac 100644
--- a/arch/i386/pci/fixup.c
+++ b/arch/i386/pci/fixup.c
@@ -25,9 +25,11 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
pci_read_config_byte(d, reg++, &subb);
DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
if (busno)
- pci_scan_bus(busno, &pci_root_ops, NULL); /* Bus A */
+ pci_scan_bus(busno, &pci_root_ops,
+ &pci_default_sysdata); /* Bus A */
if (suba < subb)
- pci_scan_bus(suba+1, &pci_root_ops, NULL); /* Bus B */
+ pci_scan_bus(suba+1, &pci_root_ops,
+ &pci_default_sysdata); /* Bus B */
}
pcibios_last_bus = -1;
}
@@ -42,7 +44,7 @@ static void __devinit pci_fixup_i450gx(struct pci_dev *d)
u8 busno;
pci_read_config_byte(d, 0x4a, &busno);
printk(KERN_INFO "PCI: i440KX/GX host bridge %s: secondary bus %02x\n", pci_name(d), busno);
- pci_scan_bus(busno, &pci_root_ops, NULL);
+ pci_scan_bus(busno, &pci_root_ops, &pci_default_sysdata);
pcibios_last_bus = -1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx);
diff --git a/arch/i386/pci/numa.c b/arch/i386/pci/numa.c
index adbe17a..8d2fd6c 100644
--- a/arch/i386/pci/numa.c
+++ b/arch/i386/pci/numa.c
@@ -97,9 +97,11 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
pci_read_config_byte(d, reg++, &subb);
DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
if (busno)
- pci_scan_bus(QUADLOCAL2BUS(quad,busno), &pci_root_ops, NULL); /* Bus A */
+ pci_scan_bus(QUADLOCAL2BUS(quad,busno), &pci_root_ops,
+ &pci_default_sysdata); /* Bus A */
if (suba < subb)
- pci_scan_bus(QUADLOCAL2BUS(quad,suba+1), &pci_root_ops, NULL); /* Bus B */
+ pci_scan_bus(QUADLOCAL2BUS(quad,suba+1), &pci_root_ops,
+ &pci_default_sysdata); /* Bus B */
}
pcibios_last_bus = -1;
}
@@ -124,7 +126,7 @@ static int __init pci_numa_init(void)
printk("Scanning PCI bus %d for quad %d\n",
QUADLOCAL2BUS(quad,0), quad);
pci_scan_bus(QUADLOCAL2BUS(quad,0),
- &pci_root_ops, NULL);
+ &pci_root_ops, &pci_default_sysdata);
}
return 0;
}
diff --git a/arch/i386/pci/visws.c b/arch/i386/pci/visws.c
index f1b486d..2539371 100644
--- a/arch/i386/pci/visws.c
+++ b/arch/i386/pci/visws.c
@@ -101,8 +101,8 @@ static int __init pcibios_init(void)
"bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0);

raw_pci_ops = &pci_direct_conf1;
- pci_scan_bus(pci_bus0, &pci_root_ops, NULL);
- pci_scan_bus(pci_bus1, &pci_root_ops, NULL);
+ pci_scan_bus(pci_bus0, &pci_root_ops, &pci_default_sysdata);
+ pci_scan_bus(pci_bus1, &pci_root_ops, &pci_default_sysdata);
pci_fixup_irqs(visws_swizzle, visws_map_irq);
pcibios_resource_survey();
return 0;
diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
index d790343..7003604 100644
--- a/include/asm-i386/pci.h
+++ b/include/asm-i386/pci.h
@@ -7,6 +7,7 @@
struct pci_sysdata {
int node; /* NUMA node */
};
+extern struct pci_sysdata pci_default_sysdata;

#include <linux/mm.h> /* for struct page */

diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 88926eb..2c2b092 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -9,6 +9,7 @@ struct pci_sysdata {
int node; /* NUMA node */
void* iommu; /* IOMMU private data */
};
+extern struct pci_sysdata pci_default_sysdata;

#ifdef CONFIG_CALGARY_IOMMU
static inline void* pci_iommu(struct pci_bus *bus)


2007-08-01 16:52:22

by Yinghai Lu

[permalink] [raw]
Subject: Re: - pci-device-ensure-sysdata-initialised-v3.patch removed from -mm tree

Andy Whitcroft wrote:
> On Thu, Jul 26, 2007 at 11:24:45PM -0700, [email protected] wrote:
>> The patch titled
>> pci device ensure sysdata initialised v3
>> has been removed from the -mm tree. Its filename was
>> pci-device-ensure-sysdata-initialised-v3.patch
>>
>> This patch was dropped because Yinghai Lu's stuff hits on the same code
>
> I still get panics with 2.6.23-rc1-mm2 so I have rebased this patch
> on top of that. I seem to have to trot this one out on a regular
> basis and it seems to live its life in -mm. It is clear that there
> are broken call sites here and something needs to be done to them to
> avoid this. Ideas for alternative solutions or can we get something
> like this merged?
>
> -apw
>
> === 8< ===
> pci device ensure sysdata initialised v4
>
> We have been seeing panic's on NUMA systems in pci_call_probe() in
> 2.6.19-rc1-mm1 and later. This is related to the changes introduced
> in the commit below:
>
> [x86, PCI] Switch pci_bus::sysdata from NUMA node integer to a pointer
> 0a247a58fc3e2ecfc17654301033e8b8d08df2a2
>
> In this change the sysdata has changed from directly representing
> a value (the node number in NUMA) to a pointer to a structure.
> However, it seems that we do not always initialise this sysdata
> before we probe the device.
>
> Prior to the changes above the node was defaulted to 'NULL'
> allocating the devices to node 0 unconditionally. This patch adds
> a default sysdata entry (pci_default_sysdata), this is then used
> where 'NULL' was used previously. pci_default_sysdata defaults
> the node to unknown (-1). This is a more accurate assignment,
> mirroring the value returned where no topology support is provided
> and no locality information is available.
>
> There are only two uses of this value in the affected architectures
> (x86, x86_64) and generic code:
>
> 1) in x86_64, dma_alloc_pages() looks up the node in order to
> allocate node local memory. Here if the node is invalid we
> will default to the first online node. Behaviour here should
> be unchanged.
> 2) in generic, pci_call_probe() looks up the node in order to
> restrict execution of the probe on the card local node, to
> favor node local allocation. Where this is unknown previously
> we would force execution (and thereby allocation) to node 0,
> this is arguably wrong and using -1 releases this restriction.
>
> In an ideal world we should be supplying a sysdata for the
> appropriate node where it is known. Where it is not known defaulting
> to -1 seems a better course, and would help us where node 0 is
> short of memory.
>
> Signed-off-by: Andy Whitcroft <[email protected]>
> ---
> arch/i386/pci/common.c | 2 ++
> arch/i386/pci/fixup.c | 8 +++++---
> arch/i386/pci/numa.c | 8 +++++---
> arch/i386/pci/visws.c | 4 ++--
> include/asm-i386/pci.h | 1 +
> include/asm-x86_64/pci.h | 1 +
> 6 files changed, 16 insertions(+), 8 deletions(-)
> diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
> index d188e10..ed1e986 100644
> --- a/arch/i386/pci/common.c
> +++ b/arch/i386/pci/common.c
> @@ -27,6 +27,8 @@ unsigned long pirq_table_addr;
> struct pci_bus *pci_root_bus;
> struct pci_raw_ops *raw_pci_ops;
>
> +struct pci_sysdata pci_default_sysdata = { .node = -1 };
> +
> static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
> {
> return raw_pci_ops->read(0, bus->number, devfn, where, size, value);
> diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c
> index e7306db..80365ac 100644
> --- a/arch/i386/pci/fixup.c
> +++ b/arch/i386/pci/fixup.c
> @@ -25,9 +25,11 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
> pci_read_config_byte(d, reg++, &subb);
> DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
> if (busno)
> - pci_scan_bus(busno, &pci_root_ops, NULL); /* Bus A */
> + pci_scan_bus(busno, &pci_root_ops,
> + &pci_default_sysdata); /* Bus A */
> if (suba < subb)
> - pci_scan_bus(suba+1, &pci_root_ops, NULL); /* Bus B */
> + pci_scan_bus(suba+1, &pci_root_ops,
> + &pci_default_sysdata); /* Bus B */
> }
> pcibios_last_bus = -1;
> }
> @@ -42,7 +44,7 @@ static void __devinit pci_fixup_i450gx(struct pci_dev *d)
> u8 busno;
> pci_read_config_byte(d, 0x4a, &busno);
> printk(KERN_INFO "PCI: i440KX/GX host bridge %s: secondary bus %02x\n", pci_name(d), busno);
> - pci_scan_bus(busno, &pci_root_ops, NULL);
> + pci_scan_bus(busno, &pci_root_ops, &pci_default_sysdata);
> pcibios_last_bus = -1;
> }
> DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx);
> diff --git a/arch/i386/pci/numa.c b/arch/i386/pci/numa.c
> index adbe17a..8d2fd6c 100644
> --- a/arch/i386/pci/numa.c
> +++ b/arch/i386/pci/numa.c
> @@ -97,9 +97,11 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
> pci_read_config_byte(d, reg++, &subb);
> DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
> if (busno)
> - pci_scan_bus(QUADLOCAL2BUS(quad,busno), &pci_root_ops, NULL); /* Bus A */
> + pci_scan_bus(QUADLOCAL2BUS(quad,busno), &pci_root_ops,
> + &pci_default_sysdata); /* Bus A */
> if (suba < subb)
> - pci_scan_bus(QUADLOCAL2BUS(quad,suba+1), &pci_root_ops, NULL); /* Bus B */
> + pci_scan_bus(QUADLOCAL2BUS(quad,suba+1), &pci_root_ops,
> + &pci_default_sysdata); /* Bus B */
> }
> pcibios_last_bus = -1;
> }
> @@ -124,7 +126,7 @@ static int __init pci_numa_init(void)
> printk("Scanning PCI bus %d for quad %d\n",
> QUADLOCAL2BUS(quad,0), quad);
> pci_scan_bus(QUADLOCAL2BUS(quad,0),
> - &pci_root_ops, NULL);
> + &pci_root_ops, &pci_default_sysdata);
> }
> return 0;
> }
> diff --git a/arch/i386/pci/visws.c b/arch/i386/pci/visws.c
> index f1b486d..2539371 100644
> --- a/arch/i386/pci/visws.c
> +++ b/arch/i386/pci/visws.c
> @@ -101,8 +101,8 @@ static int __init pcibios_init(void)
> "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0);
>
> raw_pci_ops = &pci_direct_conf1;
> - pci_scan_bus(pci_bus0, &pci_root_ops, NULL);
> - pci_scan_bus(pci_bus1, &pci_root_ops, NULL);
> + pci_scan_bus(pci_bus0, &pci_root_ops, &pci_default_sysdata);
> + pci_scan_bus(pci_bus1, &pci_root_ops, &pci_default_sysdata);
> pci_fixup_irqs(visws_swizzle, visws_map_irq);
> pcibios_resource_survey();
> return 0;
> diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
> index d790343..7003604 100644
> --- a/include/asm-i386/pci.h
> +++ b/include/asm-i386/pci.h
> @@ -7,6 +7,7 @@
> struct pci_sysdata {
> int node; /* NUMA node */
> };
> +extern struct pci_sysdata pci_default_sysdata;
>
> #include <linux/mm.h> /* for struct page */
>
> diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
> index 88926eb..2c2b092 100644
> --- a/include/asm-x86_64/pci.h
> +++ b/include/asm-x86_64/pci.h
> @@ -9,6 +9,7 @@ struct pci_sysdata {
> int node; /* NUMA node */
> void* iommu; /* IOMMU private data */
> };
> +extern struct pci_sysdata pci_default_sysdata;
>
> #ifdef CONFIG_CALGARY_IOMMU
> static inline void* pci_iommu(struct pci_bus *bus)

Acked-by: Yinghai Lu <[email protected]>

anyway i still think we should put .node and .iommu in pci_bus directly.

YH