2009-07-10 17:44:29

by Jesse Barnes

[permalink] [raw]
Subject: [PATCH] x86/PCI: initialize PCI bus node numbers early

The current mp_bus_to_node array is initialized only by AMD specific
code, since AMD platforms have registers that can be used for
determining mode numbers. On new Intel platforms it's necessary to
initialize this array as well though, otherwise all PCI node numbers
will be 0, when in fact they should be -1 (indicating that I/O isn't
tied to any particular node).

So move the mp_bus_to_node code into the common PCI code, and
initialize it early with a default value of -1. This may be overridden
later by arch code (e.g. the AMD code).

With this change, PCI consistent memory and other node specific
allocations (e.g. skbuff allocs) should occur on the "current" node.
If, for performance reasons, applications want to be bound to specific
nodes, they should open their devices only after being pinned to the
CPU where they'll run, for maximum locality.

Any thoughts here Yinghai or Jesse?


include/asm/pci.h | 2 +
kernel/setup.c | 2 +
pci/amd_bus.c | 61 +-----------------------------------------
pci/common.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 83 insertions(+), 59 deletions(-)

Thanks,
--
Jesse Barnes, Intel Open Source Technology Center

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 927958d..c74bbd3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -80,8 +80,10 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
*strat = PCI_DMA_BURST_INFINITY;
*strategy_parameter = ~0UL;
}
+extern void pci_bus_to_node_init(void);
#else
static inline void early_quirks(void) { }
+static inline void pci_bus_to_node_init(void) { }
#endif

extern void pci_iommu_alloc(void);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de2cab1..3b788f4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -972,6 +972,8 @@ void __init setup_arch(char **cmdline_p)

early_quirks();

+ pci_bus_to_node_init();
+
/*
* Read APIC and some other early information from ACPI tables.
*/
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10d..10fa176 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
* also get peer root bus resource for io,mmio
*/

-#ifdef CONFIG_NUMA
-
-#define BUS_NR 256
-
-#ifdef CONFIG_X86_64
-
-static int mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node = -1;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return node;
-
- node = mp_bus_to_node[busnum];
-
- /*
- * let numa_node_id to decide it later in dma_alloc_pages
- * if there is no ram on that node
- */
- if (node != -1 && !node_online(node))
- node = -1;
-
- return node;
-}
-
-#else /* CONFIG_X86_32 */
-
-static unsigned char mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = (unsigned char) node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return 0;
- node = mp_bus_to_node[busnum];
- return node;
-}
-
-#endif /* CONFIG_X86_32 */
-
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_X86_64

/*
@@ -303,7 +246,7 @@ static int __init early_fill_mp_bus_info(void)

#ifdef CONFIG_NUMA
for (i = 0; i < BUS_NR; i++)
- mp_bus_to_node[i] = -1;
+ set_mp_bus_to_node(i, -1);
#endif

if (!early_pci_allowed())
@@ -346,7 +289,7 @@ static int __init early_fill_mp_bus_info(void)
node = (reg >> 4) & 0x07;
#ifdef CONFIG_NUMA
for (j = min_bus; j <= max_bus; j++)
- mp_bus_to_node[j] = (unsigned char) node;
+ set_mp_bus_to_node(j, node);
#endif
link = (reg >> 8) & 0x03;

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b62..27a9dd6 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,80 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
{
return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
}
+
+/*
+ * NUMA info for PCI busses
+ *
+ * Early arch code is responsible for filling in reasonable values here.
+ * A node id of "-1" means "use current node". In other words, if a bus
+ * has a -1 node id, it's not tightly coupled to any particular chunk
+ * of memory (as is the case on some Nehalem systems).
+ */
+#ifdef CONFIG_NUMA
+
+#define BUS_NR 256
+
+#ifdef CONFIG_X86_64
+
+static int mp_bus_to_node[BUS_NR];
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node = -1;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return node;
+
+ node = mp_bus_to_node[busnum];
+
+ /*
+ * let numa_node_id to decide it later in dma_alloc_pages
+ * if there is no ram on that node
+ */
+ if (node != -1 && !node_online(node))
+ node = -1;
+
+ return node;
+}
+
+#else /* CONFIG_X86_32 */
+
+static unsigned char mp_bus_to_node[BUS_NR];
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = (unsigned char) node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return 0;
+ node = mp_bus_to_node[busnum];
+ return node;
+}
+
+#endif /* CONFIG_X86_32 */
+
+void __init pci_bus_to_node_init(void)
+{
+ int i;
+
+ /*
+ * Default to "no node" for each bus, and let later code update
+ * it if need be (e.g. amd_postcore_init)
+ */
+ for (i = 0; i < BUS_NR; i++)
+ set_mp_bus_to_node(i, -1);
+}
+
+#endif /* CONFIG_NUMA */


2009-07-10 18:33:49

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

Oops, here's one that actually builds & links.

--
Jesse Barnes, Intel Open Source Technology Center

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 927958d..d30bbfa 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -84,6 +84,12 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
static inline void early_quirks(void) { }
#endif

+#if defined (CONFIG_PCI) && defined(CONFIG_NUMA)
+extern void pci_bus_to_node_init(void);
+#else
+static inline void pci_bus_to_node_init(void) { }
+#endif
+
extern void pci_iommu_alloc(void);

/* MSI arch hook */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de2cab1..3b788f4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -972,6 +972,8 @@ void __init setup_arch(char **cmdline_p)

early_quirks();

+ pci_bus_to_node_init();
+
/*
* Read APIC and some other early information from ACPI tables.
*/
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10d..572ee97 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
* also get peer root bus resource for io,mmio
*/

-#ifdef CONFIG_NUMA
-
-#define BUS_NR 256
-
-#ifdef CONFIG_X86_64
-
-static int mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node = -1;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return node;
-
- node = mp_bus_to_node[busnum];
-
- /*
- * let numa_node_id to decide it later in dma_alloc_pages
- * if there is no ram on that node
- */
- if (node != -1 && !node_online(node))
- node = -1;
-
- return node;
-}
-
-#else /* CONFIG_X86_32 */
-
-static unsigned char mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = (unsigned char) node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return 0;
- node = mp_bus_to_node[busnum];
- return node;
-}
-
-#endif /* CONFIG_X86_32 */
-
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_X86_64

/*
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void)
u64 val;
u32 address;

-#ifdef CONFIG_NUMA
- for (i = 0; i < BUS_NR; i++)
- mp_bus_to_node[i] = -1;
-#endif
-
if (!early_pci_allowed())
return -1;

@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void)
node = (reg >> 4) & 0x07;
#ifdef CONFIG_NUMA
for (j = min_bus; j <= max_bus; j++)
- mp_bus_to_node[j] = (unsigned char) node;
+ set_mp_bus_to_node(j, node);
#endif
link = (reg >> 8) & 0x03;

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b62..27a9dd6 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,80 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
{
return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
}
+
+/*
+ * NUMA info for PCI busses
+ *
+ * Early arch code is responsible for filling in reasonable values here.
+ * A node id of "-1" means "use current node". In other words, if a bus
+ * has a -1 node id, it's not tightly coupled to any particular chunk
+ * of memory (as is the case on some Nehalem systems).
+ */
+#ifdef CONFIG_NUMA
+
+#define BUS_NR 256
+
+#ifdef CONFIG_X86_64
+
+static int mp_bus_to_node[BUS_NR];
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node = -1;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return node;
+
+ node = mp_bus_to_node[busnum];
+
+ /*
+ * let numa_node_id to decide it later in dma_alloc_pages
+ * if there is no ram on that node
+ */
+ if (node != -1 && !node_online(node))
+ node = -1;
+
+ return node;
+}
+
+#else /* CONFIG_X86_32 */
+
+static unsigned char mp_bus_to_node[BUS_NR];
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = (unsigned char) node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return 0;
+ node = mp_bus_to_node[busnum];
+ return node;
+}
+
+#endif /* CONFIG_X86_32 */
+
+void __init pci_bus_to_node_init(void)
+{
+ int i;
+
+ /*
+ * Default to "no node" for each bus, and let later code update
+ * it if need be (e.g. amd_postcore_init)
+ */
+ for (i = 0; i < BUS_NR; i++)
+ set_mp_bus_to_node(i, -1);
+}
+
+#endif /* CONFIG_NUMA */

2009-07-10 20:11:25

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

Jesse Barnes wrote:
> The current mp_bus_to_node array is initialized only by AMD specific
> code, since AMD platforms have registers that can be used for
> determining mode numbers. On new Intel platforms it's necessary to
> initialize this array as well though, otherwise all PCI node numbers
> will be 0, when in fact they should be -1 (indicating that I/O isn't
> tied to any particular node).
>
> So move the mp_bus_to_node code into the common PCI code, and
> initialize it early with a default value of -1. This may be overridden
> later by arch code (e.g. the AMD code).
>
> With this change, PCI consistent memory and other node specific
> allocations (e.g. skbuff allocs) should occur on the "current" node.
> If, for performance reasons, applications want to be bound to specific
> nodes, they should open their devices only after being pinned to the
> CPU where they'll run, for maximum locality.
>
> Any thoughts here Yinghai or Jesse?
>
>
> include/asm/pci.h | 2 +
> kernel/setup.c | 2 +
> pci/amd_bus.c | 61 +-----------------------------------------
> pci/common.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 83 insertions(+), 59 deletions(-)
>
> Thanks,

good to me.

YH

2009-07-10 20:18:35

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

Jesse Barnes wrote:
> The current mp_bus_to_node array is initialized only by AMD specific
> code, since AMD platforms have registers that can be used for
> determining mode numbers. On new Intel platforms it's necessary to
> initialize this array as well though, otherwise all PCI node numbers
> will be 0, when in fact they should be -1 (indicating that I/O isn't
> tied to any particular node).
>
> So move the mp_bus_to_node code into the common PCI code, and
> initialize it early with a default value of -1. This may be overridden
> later by arch code (e.g. the AMD code).
>
> With this change, PCI consistent memory and other node specific
> allocations (e.g. skbuff allocs) should occur on the "current" node.
> If, for performance reasons, applications want to be bound to specific
> nodes, they should open their devices only after being pinned to the
> CPU where they'll run, for maximum locality.
>
> Any thoughts here Yinghai or Jesse?
>
>
> include/asm/pci.h | 2 +
> kernel/setup.c | 2 +
> pci/amd_bus.c | 61 +-----------------------------------------
> pci/common.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 83 insertions(+), 59 deletions(-)
>
> Thanks,

could use

static int mp_bus_to_node[BUS_NR] = {
[0 ... BUS_NR - 1] = -1
};

so we avoid to add pci_bus_to_node_init()

YH

2009-07-10 20:19:52

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Fri, 10 Jul 2009 13:10:49 -0700
Yinghai Lu <[email protected]> wrote:

> Jesse Barnes wrote:
> > The current mp_bus_to_node array is initialized only by AMD specific
> > code, since AMD platforms have registers that can be used for
> > determining mode numbers. On new Intel platforms it's necessary to
> > initialize this array as well though, otherwise all PCI node numbers
> > will be 0, when in fact they should be -1 (indicating that I/O isn't
> > tied to any particular node).
> >
> > So move the mp_bus_to_node code into the common PCI code, and
> > initialize it early with a default value of -1. This may be
> > overridden later by arch code (e.g. the AMD code).
> >
> > With this change, PCI consistent memory and other node specific
> > allocations (e.g. skbuff allocs) should occur on the "current" node.
> > If, for performance reasons, applications want to be bound to
> > specific nodes, they should open their devices only after being
> > pinned to the CPU where they'll run, for maximum locality.
> >
> > Any thoughts here Yinghai or Jesse?
> >
> >
> > include/asm/pci.h | 2 +
> > kernel/setup.c | 2 +
> > pci/amd_bus.c | 61 +-----------------------------------------
> > pci/common.c | 77
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files
> > changed, 83 insertions(+), 59 deletions(-)
> >
> > Thanks,
>
> good to me.

Any chance you could test it on an AMD box for me? I don't have any
here...

Thanks,
--
Jesse Barnes, Intel Open Source Technology Center

2009-07-10 20:22:59

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Fri, 10 Jul 2009 13:18:06 -0700
Yinghai Lu <[email protected]> wrote:

> Jesse Barnes wrote:
> > The current mp_bus_to_node array is initialized only by AMD specific
> > code, since AMD platforms have registers that can be used for
> > determining mode numbers. On new Intel platforms it's necessary to
> > initialize this array as well though, otherwise all PCI node numbers
> > will be 0, when in fact they should be -1 (indicating that I/O isn't
> > tied to any particular node).
> >
> > So move the mp_bus_to_node code into the common PCI code, and
> > initialize it early with a default value of -1. This may be
> > overridden later by arch code (e.g. the AMD code).
> >
> > With this change, PCI consistent memory and other node specific
> > allocations (e.g. skbuff allocs) should occur on the "current" node.
> > If, for performance reasons, applications want to be bound to
> > specific nodes, they should open their devices only after being
> > pinned to the CPU where they'll run, for maximum locality.
> >
> > Any thoughts here Yinghai or Jesse?
> >
> >
> > include/asm/pci.h | 2 +
> > kernel/setup.c | 2 +
> > pci/amd_bus.c | 61 +-----------------------------------------
> > pci/common.c | 77
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files
> > changed, 83 insertions(+), 59 deletions(-)
> >
> > Thanks,
>
> could use
>
> static int mp_bus_to_node[BUS_NR] = {
> [0 ... BUS_NR - 1] = -1
> };
>
> so we avoid to add pci_bus_to_node_init()

Ah yeah that would clean things up a bit... Thanks.

--
Jesse Barnes, Intel Open Source Technology Center

2009-07-10 21:07:10

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Fri, 10 Jul 2009 13:22:49 -0700
Jesse Barnes <[email protected]> wrote:

> On Fri, 10 Jul 2009 13:18:06 -0700
> Yinghai Lu <[email protected]> wrote:
>
> > Jesse Barnes wrote:
> > > The current mp_bus_to_node array is initialized only by AMD
> > > specific code, since AMD platforms have registers that can be
> > > used for determining mode numbers. On new Intel platforms it's
> > > necessary to initialize this array as well though, otherwise all
> > > PCI node numbers will be 0, when in fact they should be -1
> > > (indicating that I/O isn't tied to any particular node).
> > >
> > > So move the mp_bus_to_node code into the common PCI code, and
> > > initialize it early with a default value of -1. This may be
> > > overridden later by arch code (e.g. the AMD code).
> > >
> > > With this change, PCI consistent memory and other node specific
> > > allocations (e.g. skbuff allocs) should occur on the "current"
> > > node. If, for performance reasons, applications want to be bound
> > > to specific nodes, they should open their devices only after being
> > > pinned to the CPU where they'll run, for maximum locality.
> > >
> > > Any thoughts here Yinghai or Jesse?
> > >
> > >
> > > include/asm/pci.h | 2 +
> > > kernel/setup.c | 2 +
> > > pci/amd_bus.c | 61
> > > +----------------------------------------- pci/common.c |
> > > 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files
> > > changed, 83 insertions(+), 59 deletions(-)
> > >
> > > Thanks,
> >
> > could use
> >
> > static int mp_bus_to_node[BUS_NR] = {
> > [0 ... BUS_NR - 1] = -1
> > };
> >
> > so we avoid to add pci_bus_to_node_init()
>
> Ah yeah that would clean things up a bit... Thanks.
>

So something like this...

--

>From 2b51fba93f7b2dabf453a74923a9a217611ebc1a Mon Sep 17 00:00:00 2001
From: Jesse Barnes <[email protected]>
Date: Fri, 10 Jul 2009 14:04:30 -0700
Subject: [PATCH] x86/PCI: initialize PCI bus node numbers early

The current mp_bus_to_node array is initialized only by AMD specific
code, since AMD platforms have registers that can be used for
determining mode numbers. On new Intel platforms it's necessary to
initialize this array as well though, otherwise all PCI node numbers
will be 0, when in fact they should be -1 (indicating that I/O isn't
tied to any particular node).

So move the mp_bus_to_node code into the common PCI code, and
initialize it early with a default value of -1. This may be overridden
later by arch code (e.g. the AMD code).

With this change, PCI consistent memory and other node specific
allocations (e.g. skbuff allocs) should occur on the "current" node.
If, for performance reasons, applications want to be bound to specific
nodes, they should open their devices only after being pinned to the
CPU where they'll run, for maximum locality.

Acked-by: Yinghai Lu <[email protected]>
Tested-by: Jesse Brandeburg <[email protected]>
Signed-off-by: Jesse Barnes <[email protected]>
---
arch/x86/pci/amd_bus.c | 64 +-------------------------------------------
arch/x86/pci/common.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 70 insertions(+), 63 deletions(-)

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10d..572ee97 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
* also get peer root bus resource for io,mmio
*/

-#ifdef CONFIG_NUMA
-
-#define BUS_NR 256
-
-#ifdef CONFIG_X86_64
-
-static int mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node = -1;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return node;
-
- node = mp_bus_to_node[busnum];
-
- /*
- * let numa_node_id to decide it later in dma_alloc_pages
- * if there is no ram on that node
- */
- if (node != -1 && !node_online(node))
- node = -1;
-
- return node;
-}
-
-#else /* CONFIG_X86_32 */
-
-static unsigned char mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = (unsigned char) node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return 0;
- node = mp_bus_to_node[busnum];
- return node;
-}
-
-#endif /* CONFIG_X86_32 */
-
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_X86_64

/*
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void)
u64 val;
u32 address;

-#ifdef CONFIG_NUMA
- for (i = 0; i < BUS_NR; i++)
- mp_bus_to_node[i] = -1;
-#endif
-
if (!early_pci_allowed())
return -1;

@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void)
node = (reg >> 4) & 0x07;
#ifdef CONFIG_NUMA
for (j = min_bus; j <= max_bus; j++)
- mp_bus_to_node[j] = (unsigned char) node;
+ set_mp_bus_to_node(j, node);
#endif
link = (reg >> 8) & 0x03;

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b62..8ce1ce1 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
{
return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
}
+
+/*
+ * NUMA info for PCI busses
+ *
+ * Early arch code is responsible for filling in reasonable values here.
+ * A node id of "-1" means "use current node". In other words, if a bus
+ * has a -1 node id, it's not tightly coupled to any particular chunk
+ * of memory (as is the case on some Nehalem systems).
+ */
+#ifdef CONFIG_NUMA
+
+#define BUS_NR 256
+
+#ifdef CONFIG_X86_64
+
+static int mp_bus_to_node[BUS_NR] = {
+ [0 ... BUS_NR - 1] = -1;
+};
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node = -1;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return node;
+
+ node = mp_bus_to_node[busnum];
+
+ /*
+ * let numa_node_id to decide it later in dma_alloc_pages
+ * if there is no ram on that node
+ */
+ if (node != -1 && !node_online(node))
+ node = -1;
+
+ return node;
+}
+
+#else /* CONFIG_X86_32 */
+
+static unsigned char mp_bus_to_node[BUS_NR] = {
+ [0 ... BUS_NR - 1] = -1;
+};
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = (unsigned char) node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return 0;
+ node = mp_bus_to_node[busnum];
+ return node;
+}
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* CONFIG_NUMA */
--
1.6.0.4



--
Jesse Barnes, Intel Open Source Technology Center

2009-07-14 07:41:32

by Jesse Brandeburg

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Fri, Jul 10, 2009 at 2:06 PM, Jesse Barnes<[email protected]> wrote:
> From 2b51fba93f7b2dabf453a74923a9a217611ebc1a Mon Sep 17 00:00:00 2001
> From: Jesse Barnes <[email protected]>
> Date: Fri, 10 Jul 2009 14:04:30 -0700
> Subject: [PATCH] x86/PCI: initialize PCI bus node numbers early
>
> The current mp_bus_to_node array is initialized only by AMD specific
> code, since AMD platforms have registers that can be used for
> determining mode numbers. ?On new Intel platforms it's necessary to
> initialize this array as well though, otherwise all PCI node numbers
> will be 0, when in fact they should be -1 (indicating that I/O isn't
> tied to any particular node).
>
> So move the mp_bus_to_node code into the common PCI code, and
> initialize it early with a default value of -1. ?This may be overridden
> later by arch code (e.g. the AMD code).
>
> With this change, PCI consistent memory and other node specific
> allocations (e.g. skbuff allocs) should occur on the "current" node.
> If, for performance reasons, applications want to be bound to specific
> nodes, they should open their devices only after being pinned to the
> CPU where they'll run, for maximum locality.
>
> Acked-by: Yinghai Lu <[email protected]>
> Tested-by: Jesse Brandeburg <[email protected]>
> Signed-off-by: Jesse Barnes <[email protected]>

I can confirm this works, aside from the MSI-X interrupt migration
instability (panics) that I believe are unrelated since they happen
without this patch.

I also see a pretty nice performance boost by running with this change
on a 5520 motherboard, with an 82599 10GbE forwarding packets, esp
with interrupt affinity set correctly.

I'd like to see this applied if at all possible, I think it is really
hampering I/O traffic performance due to limiting all network (among
others) memory allocation to one of the two numa nodes.

2009-07-14 15:47:36

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Tue, 14 Jul 2009 00:41:30 -0700
Jesse Brandeburg <[email protected]> wrote:

> On Fri, Jul 10, 2009 at 2:06 PM, Jesse
> Barnes<[email protected]> wrote:
> > From 2b51fba93f7b2dabf453a74923a9a217611ebc1a Mon Sep 17 00:00:00
> > 2001 From: Jesse Barnes <[email protected]>
> > Date: Fri, 10 Jul 2009 14:04:30 -0700
> > Subject: [PATCH] x86/PCI: initialize PCI bus node numbers early
> >
> > The current mp_bus_to_node array is initialized only by AMD specific
> > code, since AMD platforms have registers that can be used for
> > determining mode numbers.  On new Intel platforms it's necessary to
> > initialize this array as well though, otherwise all PCI node numbers
> > will be 0, when in fact they should be -1 (indicating that I/O isn't
> > tied to any particular node).
> >
> > So move the mp_bus_to_node code into the common PCI code, and
> > initialize it early with a default value of -1.  This may be
> > overridden later by arch code (e.g. the AMD code).
> >
> > With this change, PCI consistent memory and other node specific
> > allocations (e.g. skbuff allocs) should occur on the "current" node.
> > If, for performance reasons, applications want to be bound to
> > specific nodes, they should open their devices only after being
> > pinned to the CPU where they'll run, for maximum locality.
> >
> > Acked-by: Yinghai Lu <[email protected]>
> > Tested-by: Jesse Brandeburg <[email protected]>
> > Signed-off-by: Jesse Barnes <[email protected]>
>
> I can confirm this works, aside from the MSI-X interrupt migration
> instability (panics) that I believe are unrelated since they happen
> without this patch.
>
> I also see a pretty nice performance boost by running with this change
> on a 5520 motherboard, with an 82599 10GbE forwarding packets, esp
> with interrupt affinity set correctly.
>
> I'd like to see this applied if at all possible, I think it is really
> hampering I/O traffic performance due to limiting all network (among
> others) memory allocation to one of the two numa nodes.

Ok, thanks for testing. I've pushed it to my linux-next branch.

--
Jesse Barnes, Intel Open Source Technology Center

2009-09-01 13:54:28

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early


* Jesse Barnes <[email protected]> wrote:

> The current mp_bus_to_node array is initialized only by AMD specific
> code, since AMD platforms have registers that can be used for
> determining mode numbers. On new Intel platforms it's necessary to
> initialize this array as well though, otherwise all PCI node numbers
> will be 0, when in fact they should be -1 (indicating that I/O isn't
> tied to any particular node).
>
> So move the mp_bus_to_node code into the common PCI code, and
> initialize it early with a default value of -1. This may be overridden
> later by arch code (e.g. the AMD code).
>
> With this change, PCI consistent memory and other node specific
> allocations (e.g. skbuff allocs) should occur on the "current" node.
> If, for performance reasons, applications want to be bound to specific
> nodes, they should open their devices only after being pinned to the
> CPU where they'll run, for maximum locality.
>
> Any thoughts here Yinghai or Jesse?
>
>
> include/asm/pci.h | 2 +
> kernel/setup.c | 2 +
> pci/amd_bus.c | 61 +-----------------------------------------
> pci/common.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 83 insertions(+), 59 deletions(-)

FYI, this commit:

acccaba: x86/PCI: initialize PCI bus node numbers early

caused a boot crash in -tip testing:

calling amd_init+0x0/0x16 @ 1
pata_amd 0000:00:06.0: version 0.4.1
pata_amd 0000:00:06.0: setting latency timer to 64
BUG: unable to handle kernel NULL pointer dereference at 00000e44
IP: [<c10b7db4>] __alloc_pages_nodemask+0x44/0x4e0
*pdpt = 00000000024e8001 *pde = 0000000000000000
Oops: 0000 [#1] PREEMPT SMP
last sysfs file:
kthreadd_task: f68a8440

Pid: 1, comm: swapper Not tainted (2.6.31-rc2-00263-gacccaba-dirty #9102) System Product Name
EIP: 0060:[<c10b7db4>] EFLAGS: 00010246 CPU: 0
EIP is at __alloc_pages_nodemask+0x44/0x4e0
EAX: 00000000 EBX: 000080d0 ECX: 00000e40 EDX: 00000e40
ESI: f5ff923c EDI: 003fffff EBP: f689ed9c ESP: f689ed44
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
Process swapper (pid: 1, ti=f689e000 task=f68a8000 task.ti=f689e000)
Stack:
00331788 00000e40 00000000 f72c1f20 00000000 c1582968 f689ed88 c10dc4a9
<0> 00000001 ffffffff 000000d0 c2421788 f5832940 f689ed98 00000000 c2421788
<0> 00000202 f689edbc c10dda24 000000ff f5ff923c 000080d0 f689edcc c102707b
Call Trace:
[<c1582968>] ? dmam_alloc_coherent+0x28/0x120
[<c10dc4a9>] ? __slab_alloc+0x449/0x4d0
[<c10dda24>] ? __kmalloc_track_caller+0xc4/0x190
[<c102707b>] ? dma_generic_alloc_coherent+0x9b/0x160
[<c1026fe0>] ? dma_generic_alloc_coherent+0x0/0x160
[<c1582a25>] ? dmam_alloc_coherent+0xe5/0x120
[<c16d45a3>] ? ata_port_start+0x23/0x40
[<c16e3e24>] ? ata_sff_port_start+0x14/0x30
[<c16d4435>] ? ata_host_start+0xf5/0x1d0
[<c16e33aa>] ? ata_pci_sff_activate_host+0x2a/0x1d0
[<c16e52b0>] ? ata_sff_interrupt+0x0/0xe0
[<c16e3a32>] ? ata_pci_sff_init_one+0xe2/0x110
[<c16f4210>] ? amd_init_one+0xc0/0x1d0
[<c1444773>] ? local_pci_probe+0x13/0x20
[<c1444a01>] ? pci_device_probe+0x81/0xd0
[<c157efb4>] ? driver_probe_device+0x84/0x240
[<c157f1e9>] ? __driver_attach+0x79/0x80
[<c157e430>] ? bus_for_each_dev+0x50/0x90
[<c157edd9>] ? driver_attach+0x19/0x20
[<c157f170>] ? __driver_attach+0x0/0x80
[<c157eb37>] ? bus_add_driver+0x217/0x330
[<c1444920>] ? pci_device_remove+0x0/0x40
[<c1444980>] ? pci_device_probe+0x0/0xd0
[<c157f4a7>] ? driver_register+0x77/0x170
[<c246a75f>] ? amd_init+0x0/0x16
[<c246a75f>] ? amd_init+0x0/0x16
[<c1444e3a>] ? __pci_register_driver+0x3a/0xa0
[<c246a75f>] ? amd_init+0x0/0x16
[<c246a773>] ? amd_init+0x14/0x16
[<c1001039>] ? do_one_initcall+0x29/0x130
[<c1090f35>] ? init_irq_proc+0x65/0x80
[<c24335e6>] ? kernel_init+0x11e/0x176
[<c24334c8>] ? kernel_init+0x0/0x176
[<c1023057>] ? kernel_thread_helper+0x7/0x10
Code: 08 00 3d 08 00 08 00 0f 84 da 00 00 00 8b 3d c0 36 42 c2 85 ff 0f 84 8c 00 00 00 c7 45 b8 00 00 00 00 8b 3d b4 36 42 c2 8b 55 ac <8b> 72 04 85 f6 0f 84 a1 00 00 00 89 d9 83 e1 0f 01 c9 be 21 01
EIP: [<c10b7db4>] __alloc_pages_nodemask+0x44/0x4e0 SS:ESP 0068:f689ed44
CR2: 0000000000000e44
---[ end trace bdfad5a2d2b496c5 ]---

via the x86/mrst topic tree. Config attached.

Ingo


Attachments:
(No filename) (4.14 kB)
config (75.71 kB)
Download all attachments

2009-09-01 16:56:25

by Yinghai Lu

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

Ingo Molnar wrote:
> * Jesse Barnes <[email protected]> wrote:
>
>> The current mp_bus_to_node array is initialized only by AMD specific
>> code, since AMD platforms have registers that can be used for
>> determining mode numbers. On new Intel platforms it's necessary to
>> initialize this array as well though, otherwise all PCI node numbers
>> will be 0, when in fact they should be -1 (indicating that I/O isn't
>> tied to any particular node).
>>
>> So move the mp_bus_to_node code into the common PCI code, and
>> initialize it early with a default value of -1. This may be overridden
>> later by arch code (e.g. the AMD code).
>>
>> With this change, PCI consistent memory and other node specific
>> allocations (e.g. skbuff allocs) should occur on the "current" node.
>> If, for performance reasons, applications want to be bound to specific
>> nodes, they should open their devices only after being pinned to the
>> CPU where they'll run, for maximum locality.
>>
>> Any thoughts here Yinghai or Jesse?
>>
>>
>> include/asm/pci.h | 2 +
>> kernel/setup.c | 2 +
>> pci/amd_bus.c | 61 +-----------------------------------------
>> pci/common.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 83 insertions(+), 59 deletions(-)
>
> FYI, this commit:
>
> acccaba: x86/PCI: initialize PCI bus node numbers early
>
> caused a boot crash in -tip testing:
>

looks like the patch change default 0 in 32bit to -1 ...

YH

2009-09-01 17:10:13

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Tue, 01 Sep 2009 09:55:34 -0700
Yinghai Lu <[email protected]> wrote:

> Ingo Molnar wrote:
> > * Jesse Barnes <[email protected]> wrote:
> >
> >> The current mp_bus_to_node array is initialized only by AMD
> >> specific code, since AMD platforms have registers that can be used
> >> for determining mode numbers. On new Intel platforms it's
> >> necessary to initialize this array as well though, otherwise all
> >> PCI node numbers will be 0, when in fact they should be -1
> >> (indicating that I/O isn't tied to any particular node).
> >>
> >> So move the mp_bus_to_node code into the common PCI code, and
> >> initialize it early with a default value of -1. This may be
> >> overridden later by arch code (e.g. the AMD code).
> >>
> >> With this change, PCI consistent memory and other node specific
> >> allocations (e.g. skbuff allocs) should occur on the "current"
> >> node. If, for performance reasons, applications want to be bound
> >> to specific nodes, they should open their devices only after being
> >> pinned to the CPU where they'll run, for maximum locality.
> >>
> >> Any thoughts here Yinghai or Jesse?
> >>
> >>
> >> include/asm/pci.h | 2 +
> >> kernel/setup.c | 2 +
> >> pci/amd_bus.c | 61
> >> +----------------------------------------- pci/common.c |
> >> 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files
> >> changed, 83 insertions(+), 59 deletions(-)
> >
> > FYI, this commit:
> >
> > acccaba: x86/PCI: initialize PCI bus node numbers early
> >
> > caused a boot crash in -tip testing:
> >
>
> looks like the patch change default 0 in 32bit to -1 ...

Yeah, that's the idea. We want the node to be "don't care" on
platforms that don't have a specific bridge to node mapping. Not sure
why it fails in the mrst tree though..

Looks like we're taking an amd specific path in the backtrace, maybe
with the reshuffling of things we're calling one of the amd routines
before the real node numbers have been set up and they can't handle -1?

--
Jesse Barnes, Intel Open Source Technology Center

2009-09-01 22:29:18

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Tue, 1 Sep 2009 15:53:58 +0200
Ingo Molnar <[email protected]> wrote:
> acccaba: x86/PCI: initialize PCI bus node numbers early
>
> caused a boot crash in -tip testing:
>
> calling amd_init+0x0/0x16 @ 1
> pata_amd 0000:00:06.0: version 0.4.1
> pata_amd 0000:00:06.0: setting latency timer to 64
> BUG: unable to handle kernel NULL pointer dereference at 00000e44

The 32 bit part looked a bit off, (unsigned char)-1 == 255. Could
definitely cause issues ("any node" is supposed to be a real -1).

Can you give this patch a try? (Note this code is a copy & paste of
the original AMD code, it could stand some cleanup.)

Thanks,
Jesse

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index eb6eb61..ffcb516 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -640,7 +640,7 @@ int get_mp_bus_to_node(int busnum)

#else /* CONFIG_X86_32 */

-static unsigned char mp_bus_to_node[BUS_NR] = {
+static int mp_bus_to_node[BUS_NR] = {
[0 ... BUS_NR - 1] = -1
};

2009-09-15 00:45:24

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] x86/PCI: initialize PCI bus node numbers early

On Tue, 01 Sep 2009 09:55:34 -0700
Yinghai Lu <[email protected]> wrote:

> Ingo Molnar wrote:
> > * Jesse Barnes <[email protected]> wrote:
> >
> >> The current mp_bus_to_node array is initialized only by AMD
> >> specific code, since AMD platforms have registers that can be used
> >> for determining mode numbers. On new Intel platforms it's
> >> necessary to initialize this array as well though, otherwise all
> >> PCI node numbers will be 0, when in fact they should be -1
> >> (indicating that I/O isn't tied to any particular node).
> >>
> >> So move the mp_bus_to_node code into the common PCI code, and
> >> initialize it early with a default value of -1. This may be
> >> overridden later by arch code (e.g. the AMD code).
> >>
> >> With this change, PCI consistent memory and other node specific
> >> allocations (e.g. skbuff allocs) should occur on the "current"
> >> node. If, for performance reasons, applications want to be bound
> >> to specific nodes, they should open their devices only after being
> >> pinned to the CPU where they'll run, for maximum locality.
> >>
> >> Any thoughts here Yinghai or Jesse?
> >>
> >>
> >> include/asm/pci.h | 2 +
> >> kernel/setup.c | 2 +
> >> pci/amd_bus.c | 61
> >> +----------------------------------------- pci/common.c |
> >> 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files
> >> changed, 83 insertions(+), 59 deletions(-)
> >
> > FYI, this commit:
> >
> > acccaba: x86/PCI: initialize PCI bus node numbers early
> >
> > caused a boot crash in -tip testing:
> >
>
> looks like the patch change default 0 in 32bit to -1 ...

Ingo or Yinghai, did you get a chance to try the patch I sent? Maybe
you've already integrated it...

Thanks,
--
Jesse Barnes, Intel Open Source Technology Center