2014-11-02 08:10:50

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH v3 1/5] Numachip: Fix 16-bit APIC ID truncation

Prevent 16-bit APIC IDs being truncated by using correct mask. This fixes
booting large systems, where the wrong core would receive the startup and
init IPIs, causing hanging.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 81d70ba..bd083c0 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x)
unsigned int id;

rdmsrl(MSR_FAM10H_NODE_ID, value);
- id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);

return id;
}
--
1.9.1


2014-11-02 08:10:58

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH v3 3/5] Numachip: Add safe is-present function

Add safe function to check if Numachip is detected, to be used elsewhere.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/numachip/numachip.h | 9 +++++++++
arch/x86/kernel/apic/apic_numachip.c | 9 +++++++--
2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
index 1c6f7f6..3e1f4f9 100644
--- a/arch/x86/include/asm/numachip/numachip.h
+++ b/arch/x86/include/asm/numachip/numachip.h
@@ -16,4 +16,13 @@

extern int __init pci_numachip_init(void);

+#ifdef CONFIG_X86_NUMACHIP
+extern bool is_numachip_system(void);
+#else
+static inline bool is_numachip_system(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index bd083c0..c965b69 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -30,7 +30,7 @@
#include <asm/apic_flat_64.h>
#include <asm/pgtable.h>

-static int numachip_system __read_mostly;
+static bool numachip_system __read_mostly;

static const struct apic apic_numachip __refconst;

@@ -173,11 +173,16 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
}
}

+bool is_numachip_system(void)
+{
+ return numachip_system;
+}
+
static int __init numachip_system_init(void)
{
unsigned int val;

- if (!numachip_system)
+ if (!is_numachip_system())
return 0;

x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
--
1.9.1

2014-11-02 08:11:10

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH v3 5/5] Use 2GB memory block size on large x86-64 systems

On larger x86-64 systems, use a 2GB memory block size to reduce sysfs
entry creation time by 16x. Large is defined as 64GB or more memory.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/mm/init_64.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4cb8763..6002e80 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -52,7 +52,6 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
-#include <asm/uv/uv.h>
#include <asm/setup.h>

#include "mm_internal.h"
@@ -1247,9 +1246,9 @@ static unsigned long probe_memory_block_size(void)
/* start from 2g */
unsigned long bz = 1UL<<31;

-#ifdef CONFIG_X86_UV
- if (is_uv_system()) {
- printk(KERN_INFO "UV: memory block size 2GB\n");
+#ifdef CONFIG_X86_64
+ if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+ pr_info("Using 2GB memory block size for large-memory system\n");
return 2UL * 1024 * 1024 * 1024;
}
#endif

2014-11-02 08:11:08

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH v3 4/5] Numachip: APIC driver cleanups

Drop printing that serves no purpose, as it's printing fixed or known
values, and mark constant structure appropriately.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 22 +++-------------------
arch/x86/pci/numachip.c | 2 +-
2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index c965b69..6374d94 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -153,20 +153,8 @@ static int __init numachip_probe(void)
return apic == &apic_numachip;
}

-static void __init map_csrs(void)
-{
- printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
- NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
-
- printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
- NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
-}
-
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
-
if (c->phys_proc_id != node) {
c->phys_proc_id = node;
per_cpu(cpu_llc_id, smp_processor_id()) = node;
@@ -180,19 +168,15 @@ bool is_numachip_system(void)

static int __init numachip_system_init(void)
{
- unsigned int val;
-
if (!is_numachip_system())
return 0;

+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
x86_init.pci.arch_init = pci_numachip_init;

- map_csrs();
-
- val = read_lcsr(CSR_G0_NODE_IDS);
- printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
-
return 0;
}
early_initcall(numachip_system_init);
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 7307d9d..2e565e6 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -103,7 +103,7 @@ static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus,
return 0;
}

-const struct pci_raw_ops pci_mmcfg_numachip = {
+static const struct pci_raw_ops pci_mmcfg_numachip = {
.read = pci_mmcfg_read_numachip,
.write = pci_mmcfg_write_numachip,
};
--
1.9.1

2014-11-02 08:10:48

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH v3 2/5] Numachip: Elide self-IPI ICR polling

The default self-IPI path polls the ICR to delay sending the IPI until
there is no IPI in progress. This is redundant on x86-64 APICs, since
IPIs are queued. See the AMD64 Architecture Programmer's Manual, vol 2,
p525.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 81d70ba..bd083c0 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector)

static void numachip_send_IPI_self(int vector)
{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+ apic_write(APIC_SELF_IPI, vector);
}

static int __init numachip_probe(void)
--
1.9.1

2014-11-03 19:38:35

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] Use 2GB memory block size on large x86-64 systems

On Sun, 2 Nov 2014, Daniel J Blueman wrote:

> On larger x64-64 systems, use a 2GB memory block size to reduce sysfs
> entry creation time by 16x. Large is defined as 64GB or more memory.

This changelog sucks.

It neither tells which sysfs entries are meant nor does it explain
what the actual effect of this change is aside from speeding up some
random sysfs thingy.

> @@ -1247,9 +1246,9 @@ static unsigned long probe_memory_block_size(void)
> /* start from 2g */
> unsigned long bz = 1UL<<31;
>
> -#ifdef CONFIG_X86_UV
> - if (is_uv_system()) {
> - printk(KERN_INFO "UV: memory block size 2GB\n");
> +#ifdef CONFIG_X86_64

And this brainless 's/CONFIG_X86_UV/CONFIG_X86_64/' sucks even
more. I'm sure you can figure out the WHY yourself.

> + if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
> + pr_info("Using 2GB memory block size for large-memory system\n");
> return 2UL * 1024 * 1024 * 1024;
> }
> #endif

Thanks,

tglx

2014-11-03 19:45:56

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] Numachip: Add safe is-present function

On Sun, 2 Nov 2014, Daniel J Blueman wrote:

> Add safe function to check if Numachip is detected, to be used elsewhere.

I cannot find a use case for this. I guess this is a left over of the
earlier 2G change, right?

Thanks,

tglx

2014-11-03 22:55:54

by Daniel J Blueman

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] Numachip: Add safe is-present function

On 11/04/2014 03:45 AM, Thomas Gleixner wrote:
> On Sun, 2 Nov 2014, Daniel J Blueman wrote:
>
>> Add safe function to check if Numachip is detected, to be used elsewhere.
>
> I cannot find a use case for this. I guess this is a left over of the
> earlier 2G change, right?

I left it in as it's common to apic_numachip.c and a Numachip2 patch
series I'll present later.

Daniel
--
Daniel J Blueman
Principal Software Engineer, Numascale

2014-11-03 22:58:49

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] Numachip: Add safe is-present function

On Tue, 4 Nov 2014, Daniel J Blueman wrote:

> On 11/04/2014 03:45 AM, Thomas Gleixner wrote:
> > On Sun, 2 Nov 2014, Daniel J Blueman wrote:
> >
> > > Add safe function to check if Numachip is detected, to be used elsewhere.
> >
> > I cannot find a use case for this. I guess this is a left over of the
> > earlier 2G change, right?
>
> I left it in as it's common to apic_numachip.c and a Numachip2 patch series
> I'll present later.

Please move it to that series then.

Thanks,

tglx

2014-11-03 23:15:54

by Daniel J Blueman

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] Use 2GB memory block size on large x86-64 systems

On 11/04/2014 03:38 AM, Thomas Gleixner wrote:
> On Sun, 2 Nov 2014, Daniel J Blueman wrote:
>
>> On larger x64-64 systems, use a 2GB memory block size to reduce sysfs
>> entry creation time by 16x. Large is defined as 64GB or more memory.
>
> This changelog sucks.
>
> It neither tells which sysfs entries are meant nor does it explain
> what the actual effect of this change is aside of speeding up some
> random sysfs thingy.

How about this?

On large-memory systems of 64GB or more with memory hot-plug enabled,
use a 2GB memory block size. Eg with 64GB memory, this reduces the
number of directories in /sys/devices/system/memory from 512 to 32,
making it more manageable, and reducing the creation time accordingly.

>> @@ -1247,9 +1246,9 @@ static unsigned long probe_memory_block_size(void)
>> /* start from 2g */
>> unsigned long bz = 1UL<<31;
>>
>> -#ifdef CONFIG_X86_UV
>> - if (is_uv_system()) {
>> - printk(KERN_INFO "UV: memory block size 2GB\n");
>> +#ifdef CONFIG_X86_64
>
> And this brainless 's/CONFIG_X86_UV/CONFIG_X86_64/' sucks even
> more. I'm sure you can figure out the WHY yourself.

The benefit of this is applicable to other architectures. I'm unable to
test the change, but if you agree it's conservative enough, I'll drop
the ifdef?

Thanks,
Daniel
--
Daniel J Blueman
Principal Software Engineer, Numascale

2014-11-03 23:37:10

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] Use 2GB memory block size on large x86-64 systems

On Tue, 4 Nov 2014, Daniel J Blueman wrote:

> On 11/04/2014 03:38 AM, Thomas Gleixner wrote:
> > On Sun, 2 Nov 2014, Daniel J Blueman wrote:
> >
> > > On larger x64-64 systems, use a 2GB memory block size to reduce sysfs
> > > entry creation time by 16x. Large is defined as 64GB or more memory.
> >
> > This changelog sucks.
> >
> > It neither tells which sysfs entries are meant nor does it explain
> > what the actual effect of this change is aside of speeding up some
> > random sysfs thingy.
>
> How about this?
>
> On large-memory systems of 64GB or more with memory hot-plug enabled, use a
> 2GB memory block size. Eg with 64GB memory, this reduces the number of
> directories in /sys/devices/system/memory from 512 to 32, making it more
> manageable, and reducing the creation time accordingly.

It still does not tell what the downside is of this and why you think
it does not matter.

> > > @@ -1247,9 +1246,9 @@ static unsigned long probe_memory_block_size(void)
> > > /* start from 2g */
> > > unsigned long bz = 1UL<<31;
> > >
> > > -#ifdef CONFIG_X86_UV
> > > - if (is_uv_system()) {
> > > - printk(KERN_INFO "UV: memory block size 2GB\n");
> > > +#ifdef CONFIG_X86_64
> >
> > And this brainless 's/CONFIG_X86_UV/CONFIG_X86_64/' sucks even
> > more. I'm sure you can figure out the WHY yourself.
>
> The benefit of this is applicable to other architectures. I'm unable to test
> the change, but if you agree it's conservative enough, I'll drop the ifdef?

Which other architectures? Care to turn on your brain before replying?

Thanks,

tglx

2014-11-04 07:30:22

by Daniel J Blueman

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] Use 2GB memory block size on large x86-64 systems

On 11/04/2014 07:36 AM, Thomas Gleixner wrote:
> On Tue, 4 Nov 2014, Daniel J Blueman wrote:
>
>> On 11/04/2014 03:38 AM, Thomas Gleixner wrote:
>>> On Sun, 2 Nov 2014, Daniel J Blueman wrote:
>>>
>>>> On larger x64-64 systems, use a 2GB memory block size to reduce sysfs
>>>> entry creation time by 16x. Large is defined as 64GB or more memory.
>>>
>>> This changelog sucks.
>>>
>>> It neither tells which sysfs entries are meant nor does it explain
>>> what the actual effect of this change is aside of speeding up some
>>> random sysfs thingy.
>>
>> How about this?
>>
>> On large-memory systems of 64GB or more with memory hot-plug enabled, use a
>> 2GB memory block size. Eg with 64GB memory, this reduces the number of
>> directories in /sys/devices/system/memory from 512 to 32, making it more
>> manageable, and reducing the creation time accordingly.
>
> It still does not tell what the downside is of this and why you think
> it does not matter.

Yes, let's make it explicit:

On large-memory systems of 64GB or more with memory hot-plug enabled,
use a 2GB memory block size. Eg with 64GB memory, this reduces the
number of directories in /sys/devices/system/memory from 512 to 32,
making it more manageable, and reducing the creation time accordingly.

The caveat is that the memory can't be offlined (for hotplug or
otherwise) with finer 128MB granularity, but this is unimportant due to
the high memory densities generally used with such large-memory systems,
where eg a single DIMM is the order of 16GB.

>>>> @@ -1247,9 +1246,9 @@ static unsigned long probe_memory_block_size(void)
>>>> /* start from 2g */
>>>> unsigned long bz = 1UL<<31;
>>>>
>>>> -#ifdef CONFIG_X86_UV
>>>> - if (is_uv_system()) {
>>>> - printk(KERN_INFO "UV: memory block size 2GB\n");
>>>> +#ifdef CONFIG_X86_64
>>>
>>> And this brainless 's/CONFIG_X86_UV/CONFIG_X86_64/' sucks even
>>> more. I'm sure you can figure out the WHY yourself.
>>
>> The benefit of this is applicable to other architectures. I'm unable to test
>> the change, but if you agree it's conservative enough, I'll drop the ifdef?
>
> Which other architectures? Care to turn on your brain before replying?

Clearly 64-bit architectures, including X86, MIPS, PARISC, SPARC,
AArch64, ia64, however, I must be missing something, as a
sizeof(long)/CONFIG_64BIT check would be redundant if we agree to drop
the ifdef, as we're already checking the number of physical pages, which
is bounded by the same limits.

Thanks,
Daniel
--
Daniel J Blueman
Principal Software Engineer, Numascale

2014-11-04 07:51:03

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] Use 2GB memory block size on large x86-64 systems

On Tue, 4 Nov 2014, Daniel J Blueman wrote:
> On 11/04/2014 07:36 AM, Thomas Gleixner wrote:
> > On Tue, 4 Nov 2014, Daniel J Blueman wrote:
> > > > > @@ -1247,9 +1246,9 @@ static unsigned long
> > > > > probe_memory_block_size(void)
> > > > > /* start from 2g */
> > > > > unsigned long bz = 1UL<<31;
> > > > >
> > > > > -#ifdef CONFIG_X86_UV
> > > > > - if (is_uv_system()) {
> > > > > - printk(KERN_INFO "UV: memory block size 2GB\n");
> > > > > +#ifdef CONFIG_X86_64
> > > >
> > > > And this brainless 's/CONFIG_X86_UV/CONFIG_X86_64/' sucks even
> > > > more. I'm sure you can figure out the WHY yourself.
> > >
> > > The benefit of this is applicable to other architectures. I'm unable to
> > > test
> > > the change, but if you agree it's conservative enough, I'll drop the
> > > ifdef?
> >
> > Which other architectures? Care to turn on your brain before replying?
>
> Clearly 64-bit architectures, including X86, MIPS, PARISC, SPARC, AArch64,
> ia64,

ROTFL

> however, I must be missing something, as a sizeof(long)/CONFIG_64BIT
> check would be redundant if we agree to drop the ifdef, as we're already
> checking the number of physical pages, which is bounded by the same limits.

# diffstat $this_patch

should precisely tell you what you're missing.

Thanks,

tglx