2014-11-04 08:31:11

by Daniel J Blueman

Subject: [PATCH v4 1/4] Numachip: Fix 16-bit APIC ID truncation

Prevent 16-bit APIC IDs from being truncated by using the correct mask. This
fixes booting large systems, where the wrong core would receive the startup
and init IPIs, causing a hang.

Signed-off-by: Daniel J Blueman <[email protected]>
---
apic_numachip.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 81d70ba..bd083c0 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x)
unsigned int id;

rdmsrl(MSR_FAM10H_NODE_ID, value);
- id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);

return id;
}
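
For illustration only (not part of the patch), a standalone userspace sketch
of the mask arithmetic in get_apic_id() above; the x and value inputs are
made-up examples chosen so the old and new masks give different results:

/* Illustration: how the old 0x3f00 mask truncates APIC IDs assembled as in
 * get_apic_id() above.  The x and value inputs are hypothetical. */
#include <stdio.h>

int main(void)
{
        unsigned long x = 0xabUL << 24;         /* low 8 APIC ID bits = 0xab */
        unsigned long value = 0x1234;           /* hypothetical NODE_ID MSR contents */

        unsigned int old_id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
        unsigned int new_id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);

        /* the old mask keeps only bits 8-13 of the shifted node ID, silently
         * dropping the two uppermost ID bits */
        printf("old: %#x new: %#x\n", old_id, new_id);  /* old: 0x8ab new: 0x48ab */
        return 0;
}
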
--
1.9.1


2014-11-04 08:31:21

by Daniel J Blueman

Subject: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On large-memory x86-64 systems of 64GB or more with memory hot-plug
enabled, use a 2GB memory block size. E.g. with 64GB of memory, this reduces
the number of directories in /sys/devices/system/memory from 512 to 32,
making it more manageable and reducing the creation time accordingly.

The caveat is that memory can't be offlined (for hotplug or otherwise)
with the finer 128MB granularity, but this is unimportant due to the high
memory densities generally used in such large-memory systems, where
e.g. a single DIMM is on the order of 16GB.

Signed-off-by: Daniel J Blueman <[email protected]>
---
init_64.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df1a992..9622ab2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -52,7 +52,6 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
-#include <asm/uv/uv.h>
#include <asm/setup.h>

#include "mm_internal.h"
@@ -1234,12 +1233,10 @@ static unsigned long probe_memory_block_size(void)
/* start from 2g */
unsigned long bz = 1UL<<31;

-#ifdef CONFIG_X86_UV
- if (is_uv_system()) {
- printk(KERN_INFO "UV: memory block size 2GB\n");
+ if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+ pr_info("Using 2GB memory block size for large-memory system\n");
return 2UL * 1024 * 1024 * 1024;
}
-#endif

/* less than 64g installed */
if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
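
As a quick cross-check of the changelog arithmetic (illustration only, not
part of the patch), assuming 4K pages, i.e. PAGE_SHIFT = 12:

/* Illustration: the 64GB threshold and the 512-vs-32 block counts quoted in
 * the changelog above, assuming PAGE_SHIFT = 12. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long long threshold = 64ULL << (30 - PAGE_SHIFT);     /* pages in 64GB */
        unsigned long long ram = 64ULL << 30;                          /* 64GB of RAM */

        printf("threshold: %llu pages = %llu GB\n",
               threshold, (threshold << PAGE_SHIFT) >> 30);            /* 16777216 pages = 64 GB */
        printf("blocks at 128MB: %llu\n", ram / (128ULL << 20));       /* 512 */
        printf("blocks at 2GB:   %llu\n", ram / (2ULL << 30));         /* 32 */
        return 0;
}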

2014-11-04 08:31:19

by Daniel J Blueman

Subject: [PATCH v4 3/4] Numachip: APIC driver cleanups

Drop printks that serve no purpose, as they only print fixed or known
values, and mark the constant structure appropriately.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 22 +++-------------------
arch/x86/pci/numachip.c | 2 +-
2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index c965b69..6374d94 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -153,20 +153,8 @@ static int __init numachip_probe(void)
return apic == &apic_numachip;
}

-static void __init map_csrs(void)
-{
- printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
- NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
-
- printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
- NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
-}
-
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
-
if (c->phys_proc_id != node) {
c->phys_proc_id = node;
per_cpu(cpu_llc_id, smp_processor_id()) = node;
@@ -180,19 +168,15 @@ bool is_numachip_system(void)

static int __init numachip_system_init(void)
{
- unsigned int val;
-
if (!numachip_system)
return 0;

+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
x86_init.pci.arch_init = pci_numachip_init;

- map_csrs();
-
- val = read_lcsr(CSR_G0_NODE_IDS);
- printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
-
return 0;
}
early_initcall(numachip_system_init);
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 7307d9d..2e565e6 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -103,7 +103,7 @@ static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus,
return 0;
}

-const struct pci_raw_ops pci_mmcfg_numachip = {
+static const struct pci_raw_ops pci_mmcfg_numachip = {
.read = pci_mmcfg_read_numachip,
.write = pci_mmcfg_write_numachip,
};
--
1.9.1

2014-11-04 08:31:16

by Daniel J Blueman

Subject: [PATCH v4 2/4] Numachip: Elide self-IPI ICR polling

The default self-IPI path polls the ICR to delay sending the IPI until
there is no IPI in progress. This is redundant on x86-64 APICs, since
IPIs are queued. See the AMD64 Architecture Programmer's Manual, vol 2,
p525.

Signed-off-by: Daniel J Blueman <[email protected]>
---
apic_numachip.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 81d70ba..bd083c0 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector)

static void numachip_send_IPI_self(int vector)
{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+ apic_write(APIC_SELF_IPI, vector);
}

static int __init numachip_probe(void)
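
As a conceptual sketch only (not kernel source), the difference between the
two self-IPI strategies; the APIC registers here are modelled as plain
variables and the constants are stand-ins:

#include <stdio.h>
#include <stdbool.h>

static unsigned int fake_icr;           /* stand-in for the ICR register */
static unsigned int fake_self_ipi;      /* stand-in for the SELF_IPI register */
static bool icr_busy;                   /* stand-in for the ICR delivery-status bit */

/* default shortcut path: poll until no IPI is in flight, then program the
 * ICR with the self destination shortcut and the vector */
static void send_self_ipi_via_icr(int vector)
{
        while (icr_busy)
                ;                       /* the polling this patch elides */
        fake_icr = 0x40000 /* self shortcut, stand-in value */ | vector;
}

/* patched path: the SELF_IPI register takes just the vector and needs no
 * polling, since the APIC queues IPIs */
static void send_self_ipi_direct(int vector)
{
        fake_self_ipi = vector;
}

int main(void)
{
        send_self_ipi_via_icr(0xf2);
        send_self_ipi_direct(0xf2);
        printf("icr=%#x self_ipi=%#x\n", fake_icr, fake_self_ipi);
        return 0;
}
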
--
1.9.1

Subject: [tip:x86/platform] x86: numachip: Fix 16-bit APIC ID truncation

Commit-ID: 00e7977dd1bbd46e336d7ef907d0fb6b6a4c294f
Gitweb: http://git.kernel.org/tip/00e7977dd1bbd46e336d7ef907d0fb6b6a4c294f
Author: Daniel J Blueman <[email protected]>
AuthorDate: Tue, 4 Nov 2014 16:29:41 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Tue, 4 Nov 2014 18:17:27 +0100

x86: numachip: Fix 16-bit APIC ID truncation

Prevent 16-bit APIC IDs from being truncated by using the correct mask. This
fixes booting large systems, where the wrong core would receive the startup
and init IPIs, causing a hang.

Signed-off-by: Daniel J Blueman <[email protected]>
Cc: Steffen Persvold <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 4128b5f..2aaee79 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x)
unsigned int id;

rdmsrl(MSR_FAM10H_NODE_ID, value);
- id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);

return id;
}

Subject: [tip:x86/platform] x86: numachip: Elide self-IPI ICR polling

Commit-ID: 25e5a76bae106e1673887db09e22b19cb1a86c45
Gitweb: http://git.kernel.org/tip/25e5a76bae106e1673887db09e22b19cb1a86c45
Author: Daniel J Blueman <[email protected]>
AuthorDate: Tue, 4 Nov 2014 16:29:42 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Tue, 4 Nov 2014 18:17:27 +0100

x86: numachip: Elide self-IPI ICR polling

The default self-IPI path polls the ICR to delay sending the IPI until
there is no IPI in progress. This is redundant on x86-64 APICs, since
IPIs are queued. See the AMD64 Architecture Programmer's Manual, vol 2,
p525.

Signed-off-by: Daniel J Blueman <[email protected]>
Cc: Steffen Persvold <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 2aaee79..7a31912 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector)

static void numachip_send_IPI_self(int vector)
{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+ apic_write(APIC_SELF_IPI, vector);
}

static int __init numachip_probe(void)

Subject: [tip:x86/platform] x86: numachip: APIC driver cleanups

Commit-ID: b980dcf25d0ee1f0f8c7b6afc0e715a2f5da5ec4
Gitweb: http://git.kernel.org/tip/b980dcf25d0ee1f0f8c7b6afc0e715a2f5da5ec4
Author: Daniel J Blueman <[email protected]>
AuthorDate: Tue, 4 Nov 2014 16:29:43 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Tue, 4 Nov 2014 18:17:27 +0100

x86: numachip: APIC driver cleanups

Drop printks that serve no purpose, as they only print fixed or known
values, and mark the constant structure appropriately.

Signed-off-by: Daniel J Blueman <[email protected]>
Cc: Steffen Persvold <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/apic_numachip.c | 22 +++-------------------
arch/x86/pci/numachip.c | 2 +-
2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 7a31912..c2fd21f 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -153,20 +153,8 @@ static int __init numachip_probe(void)
return apic == &apic_numachip;
}

-static void __init map_csrs(void)
-{
- printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
- NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
-
- printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
- NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
-}
-
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
-
if (c->phys_proc_id != node) {
c->phys_proc_id = node;
per_cpu(cpu_llc_id, smp_processor_id()) = node;
@@ -175,19 +163,15 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)

static int __init numachip_system_init(void)
{
- unsigned int val;
-
if (!numachip_system)
return 0;

+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
x86_init.pci.arch_init = pci_numachip_init;

- map_csrs();
-
- val = read_lcsr(CSR_G0_NODE_IDS);
- printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
-
return 0;
}
early_initcall(numachip_system_init);
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 7307d9d..2e565e6 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -103,7 +103,7 @@ static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus,
return 0;
}

-const struct pci_raw_ops pci_mmcfg_numachip = {
+static const struct pci_raw_ops pci_mmcfg_numachip = {
.read = pci_mmcfg_read_numachip,
.write = pci_mmcfg_write_numachip,
};

Subject: [tip:x86/mm] x86: mm: Use 2GB memory block size on large-memory x86-64 systems

Commit-ID: bdee237c0343a5d1a6cf72c7ea68e88338b26e08
Gitweb: http://git.kernel.org/tip/bdee237c0343a5d1a6cf72c7ea68e88338b26e08
Author: Daniel J Blueman <[email protected]>
AuthorDate: Tue, 4 Nov 2014 16:29:44 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Tue, 4 Nov 2014 18:19:27 +0100

x86: mm: Use 2GB memory block size on large-memory x86-64 systems

On large-memory x86-64 systems of 64GB or more with memory hot-plug
enabled, use a 2GB memory block size. E.g. with 64GB of memory, this reduces
the number of directories in /sys/devices/system/memory from 512 to 32,
making it more manageable and reducing the creation time accordingly.

The caveat is that memory can't be offlined (for hotplug or
otherwise) with the finer default 128MB granularity, but this is
unimportant due to the high memory densities generally used in such
large-memory systems, where e.g. a single DIMM is on the order of 16GB.

Signed-off-by: Daniel J Blueman <[email protected]>
Cc: Steffen Persvold <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/mm/init_64.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4cb8763..ebca30f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -52,7 +52,6 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
-#include <asm/uv/uv.h>
#include <asm/setup.h>

#include "mm_internal.h"
@@ -1247,12 +1246,10 @@ static unsigned long probe_memory_block_size(void)
/* start from 2g */
unsigned long bz = 1UL<<31;

-#ifdef CONFIG_X86_UV
- if (is_uv_system()) {
- printk(KERN_INFO "UV: memory block size 2GB\n");
+ if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+ pr_info("Using 2GB memory block size for large-memory system\n");
return 2UL * 1024 * 1024 * 1024;
}
-#endif

/* less than 64g installed */
if ((max_pfn << PAGE_SHIFT) < (16UL << 32))

2014-11-05 22:10:52

by Yinghai Lu

Subject: Re: [tip:x86/mm] x86: mm: Use 2GB memory block size on large-memory x86-64 systems

On Tue, Nov 4, 2014 at 9:22 AM, tip-bot for Daniel J Blueman
<[email protected]> wrote:
> Commit-ID: bdee237c0343a5d1a6cf72c7ea68e88338b26e08
> Gitweb: http://git.kernel.org/tip/bdee237c0343a5d1a6cf72c7ea68e88338b26e08
> Author: Daniel J Blueman <[email protected]>
> AuthorDate: Tue, 4 Nov 2014 16:29:44 +0800
> Committer: Thomas Gleixner <[email protected]>
> CommitDate: Tue, 4 Nov 2014 18:19:27 +0100
>
> x86: mm: Use 2GB memory block size on large-memory x86-64 systems
>
> On large-memory x86-64 systems of 64GB or more with memory hot-plug
> enabled, use a 2GB memory block size. Eg with 64GB memory, this reduces
> the number of directories in /sys/devices/system/memory from 512 to 32,
> making it more manageable, and reducing the creation time accordingly.
>
> This caveat is that the memory can't be offlined (for hotplug or
> otherwise) with the finer default 128MB granularity, but this is
> unimportant due to the high memory densities generally used with such
> large-memory systems, where eg a single DIMM is the order of 16GB.
>
> Signed-off-by: Daniel J Blueman <[email protected]>
> Cc: Steffen Persvold <[email protected]>
> Cc: Bjorn Helgaas <[email protected]>
> Link: http://lkml.kernel.org/r/[email protected]
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/mm/init_64.c | 7 ++-----
> 1 file changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 4cb8763..ebca30f 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -52,7 +52,6 @@
> #include <asm/numa.h>
> #include <asm/cacheflush.h>
> #include <asm/init.h>
> -#include <asm/uv/uv.h>
> #include <asm/setup.h>
>
> #include "mm_internal.h"
> @@ -1247,12 +1246,10 @@ static unsigned long probe_memory_block_size(void)
> /* start from 2g */
> unsigned long bz = 1UL<<31;
>
> -#ifdef CONFIG_X86_UV
> - if (is_uv_system()) {
> - printk(KERN_INFO "UV: memory block size 2GB\n");
> + if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
> + pr_info("Using 2GB memory block size for large-memory system\n");
> return 2UL * 1024 * 1024 * 1024;
> }
> -#endif
>
> /* less than 64g installed */
> if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
.....

Come on. Can you remove the following dead lines caused by this patch
at the same time?

Yinghai

2015-08-21 18:19:35

by Tony Luck

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Tue, Nov 04, 2014 at 04:29:44PM +0800, Daniel J Blueman wrote:
> On large-memory x86-64 systems of 64GB or more with memory hot-plug
> enabled, use a 2GB memory block size. Eg with 64GB memory, this reduces
> the number of directories in /sys/devices/system/memory from 512 to 32,
> making it more manageable, and reducing the creation time accordingly.
>
> This caveat is that the memory can't be offlined (for hotplug or otherwise)
> with finer 128MB granularity, but this is unimportant due to the high
> memory densities generally used with such large-memory systems, where
> eg a single DIMM is the order of 16GB.

git bisect points to this commit as the cause of a panic on my
machine:

[ 4.518415] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
[ 4.525882] PCI: MMCONFIG for domain 0000 [bus 00-ff] at [mem 0x80000000-0x8fffffff] (base 0x80000000)
[ 4.536280] PCI: MMCONFIG at [mem 0x80000000-0x8fffffff] reserved in E820
[ 4.544344] PCI: Using configuration type 1 for base access
[ 4.550778] BUG: unable to handle kernel paging request at ffffea0078000020
[ 4.558572] IP: [<ffffffff8142ab0d>] register_mem_sect_under_node+0x6d/0xe0
[ 4.566366] PGD 1dfffcc067 PUD 1dfffca067 PMD 0
[ 4.571554] Oops: 0000 [#1] SMP
[ 4.575181] Modules linked in:
[ 4.578604] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc2+ #17
[ 4.585800] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRBDXSD1.86B.0326.D03.1508171454 08/17/2015
[ 4.597347] task: ffff883b84960000 ti: ffff881d7ea14000 task.ti: ffff881d7ea14000
[ 4.605705] RIP: 0010:[<ffffffff8142ab0d>] [<ffffffff8142ab0d>] register_mem_sect_under_node+0x6d/0xe0
[ 4.616205] RSP: 0000:ffff881d7ea17d68 EFLAGS: 00010206
[ 4.622135] RAX: ffffea0078000020 RBX: 0000000000000001 RCX: 0000000001e00000
[ 4.630102] RDX: 0000000078000000 RSI: 0000000000000001 RDI: ffff881d7ccb6400
[ 4.638069] RBP: ffff881d7ea17d78 R08: 0000000001e7ffff R09: 0000000003c00000
[ 4.646035] R10: ffffffff813043a0 R11: ffffea0169efa600 R12: 0000000000000001
[ 4.654003] R13: 0000000000000001 R14: ffff881d7ccb6400 R15: 0000000000000000
[ 4.661972] FS: 0000000000000000(0000) GS:ffff881d8b400000(0000) knlGS:0000000000000000
[ 4.670996] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4.677411] CR2: ffffea0078000020 CR3: 00000000019a0000 CR4: 00000000003407f0
[ 4.685381] Stack:
[ 4.687627] 0000000001e70000 0000000000000001 ffff881d7ea17dc8 ffffffff8142af0a
[ 4.695926] ffff881d7ea17de8 0000000003c00000 ffff881d00000018 0000000000000002
[ 4.704225] 0000000000000400 0000000000000000 ffffffff81b101c5 0000000000000000
[ 4.712524] Call Trace:
[ 4.715261] [<ffffffff8142af0a>] register_one_node+0x18a/0x2b0
[ 4.721871] [<ffffffff81b101c5>] ? pci_iommu_alloc+0x6e/0x6e
[ 4.728287] [<ffffffff81b10201>] topology_init+0x3c/0x95
[ 4.734321] [<ffffffff81002144>] do_one_initcall+0xd4/0x210
[ 4.740645] [<ffffffff8109b515>] ? parse_args+0x245/0x480
[ 4.746774] [<ffffffff810bddc8>] ? __wake_up+0x48/0x60
[ 4.752611] [<ffffffff81b062f9>] kernel_init_freeable+0x19d/0x23c
[ 4.759511] [<ffffffff81b059e3>] ? initcall_blacklist+0xb6/0xb6
[ 4.766226] [<ffffffff816580d0>] ? rest_init+0x80/0x80
[ 4.772059] [<ffffffff816580de>] kernel_init+0xe/0xf0
[ 4.777803] [<ffffffff8167057c>] ret_from_fork+0x7c/0xb0
[ 4.783831] [<ffffffff816580d0>] ? rest_init+0x80/0x80
[ 4.789655] Code: 39 c1 77 59 48 c1 e2 15 48 b8 00 00 00 00 00 ea ff ff 48 8d 44 02 20 eb 12 0f 1f 44 00 00 48 83 c1 01 48 83 c0 40 49 39 c8 72 5b <48> 83 38 00 74 ed 48 8b 50 e0 48 c1 ea 36 39 d6 75 e1 48 8b 04
[ 4.811356] RIP [<ffffffff8142ab0d>] register_mem_sect_under_node+0x6d/0xe0
[ 4.819238] RSP <ffff881d7ea17d68>
[ 4.823132] CR2: ffffea0078000020
[ 4.826836] ---[ end trace 10b7bb944b11529f ]---
[ 4.831989] Kernel panic - not syncing: Fatal exception
[ 4.837866] ---[ end Kernel panic - not syncing: Fatal exception

reverting the commit indeed makes the problem go away.

Now the root problem for me is that I have an insane BIOS
that handed me an e820 table that is full of holes (for entries
above 4GB) ... and ends with an entry that is only 256M aligned:


[ 0.000000] e820: BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000008dfff] usable
[ 0.000000] BIOS-e820: [mem 0x000000000008e000-0x000000000008ffff] reserved
[ 0.000000] BIOS-e820: [mem 0x0000000000090000-0x000000000009ffff] usable
[ 0.000000] BIOS-e820: [mem 0x00000000000a0000-0x00000000000fffff] reserved
[ 0.000000] BIOS-e820: [mem 0x0000000000100000-0x000000005cc0afff] usable
[ 0.000000] BIOS-e820: [mem 0x000000005cc0b000-0x000000005e108fff] reserved
[ 0.000000] BIOS-e820: [mem 0x000000005e109000-0x000000006035cfff] ACPI NVS
[ 0.000000] BIOS-e820: [mem 0x000000006035d000-0x00000000604fcfff] ACPI data
[ 0.000000] BIOS-e820: [mem 0x00000000604fd000-0x000000007bafffff] usable
[ 0.000000] BIOS-e820: [mem 0x000000007bb00000-0x000000008fffffff] reserved
[ 0.000000] BIOS-e820: [mem 0x00000000fed1c000-0x00000000fed1ffff] reserved
[ 0.000000] BIOS-e820: [mem 0x0000000100000000-0x000000118fffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000001200000000-0x0000001dffffffff] usable
[ 0.000000] BIOS-e820: [mem 0x0000001e70000000-0x0000001f3fffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000002000000000-0x0000002cffffffff] usable
[ 0.000000] BIOS-e820: [mem 0x0000002da0000000-0x0000002e6fffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000002f00000000-0x0000003bffffffff] usable
[ 0.000000] BIOS-e820: [mem 0x0000003cd0000000-0x0000003d9fffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000003e00000000-0x0000004ccfffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000004d00000000-0x0000005affffffff] usable
[ 0.000000] BIOS-e820: [mem 0x0000005b30000000-0x0000005bffffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000005c00000000-0x00000069ffffffff] usable
[ 0.000000] BIOS-e820: [mem 0x0000006a60000000-0x0000006b2fffefff] usable
[ 0.000000] BIOS-e820: [mem 0x0000006c00000000-0x000000798fffffff] usable

so the older code will look at max_pfn and set memory block size:

[ 3.021752] memory block size : 256MB
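
(For reference, a standalone sketch of that older tail-alignment probe; it
mirrors the loop quoted later in this thread, with MIN_MEMORY_BLOCK_SIZE
assumed to be 128MB and the end address taken from the last usable e820
entry above. It reproduces the 256MB result:)

/* Illustration: how the pre-patch probe derives 256MB from max_pfn alignment. */
#include <stdio.h>

int main(void)
{
        unsigned long max_phys = 0x7990000000UL;        /* end of last usable e820 range + 1 */
        unsigned long min_block = 128UL << 20;          /* assumed MIN_MEMORY_BLOCK_SIZE */
        unsigned long bz = 1UL << 31;                   /* start from 2g */

        /* shrink the block size until it evenly divides the end of memory */
        while (bz > min_block) {
                if (!(max_phys & (bz - 1)))
                        break;
                bz >>= 1;
        }
        printf("memory block size : %luMB\n", bz >> 20);        /* prints 256 */
        return 0;
}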

I think the problem is more connected to the strange max_pfn than to
the holes ... but I will defer to wiser heads.

If the problem is with max_pfn ... I don't think it is a safe assumption
that systems with >64GB memory will have a 2GB-aligned max_pfn.

-Tony

2015-08-21 18:38:16

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Fri, Aug 21, 2015 at 11:19 AM, Luck, Tony <[email protected]> wrote:
> On Tue, Nov 04, 2014 at 04:29:44PM +0800, Daniel J Blueman wrote:
>> On large-memory x86-64 systems of 64GB or more with memory hot-plug
>> enabled, use a 2GB memory block size. Eg with 64GB memory, this reduces
>> the number of directories in /sys/devices/system/memory from 512 to 32,
>> making it more manageable, and reducing the creation time accordingly.
>>
>> This caveat is that the memory can't be offlined (for hotplug or otherwise)
>> with finer 128MB granularity, but this is unimportant due to the high
>> memory densities generally used with such large-memory systems, where
>> eg a single DIMM is the order of 16GB.
>
> git bisect points to this commit as the cause of a panic on my
> machine:
>
> [ 4.518415] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
> [ 4.525882] PCI: MMCONFIG for domain 0000 [bus 00-ff] at [mem 0x80000000-0x8fffffff] (base 0x80000000)
> [ 4.536280] PCI: MMCONFIG at [mem 0x80000000-0x8fffffff] reserved in E820
> [ 4.544344] PCI: Using configuration type 1 for base access
> [ 4.550778] BUG: unable to handle kernel paging request at ffffea0078000020
> [ 4.558572] IP: [<ffffffff8142ab0d>] register_mem_sect_under_
...
> so the older code will look at max_pfn and set memory block size:
>
> [ 3.021752] memory block size : 256MB
>
> I think the problem is more connected to the strange max_pfn rather
> than the holes ... but will defer to wiser heads.
>
> If the problem is with max_pfn ... I don't think it is a safe assumption
> that systems with >64GB memory will have 2GB aligned max_pfn.

That commit could be reverted. See
https://lkml.org/lkml/2014/11/10/123

I have had the attached revert patch in my test setups for a while.

Yinghai


Attachments:
revert_commit_bdee237.patch (1.49 kB)

2015-08-21 20:27:32

by Tony Luck

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Fri, Aug 21, 2015 at 11:38:13AM -0700, Yinghai Lu wrote:
> That commit could be reverted.
> According to
> https://lkml.org/lkml/2014/11/10/123

Do we really need to force the MIN_MEMORY_BLOCK_SIZE on small
systems?

What about this patch, which just uses max_pfn to choose
the block size?

It seems that many systems with large amounts of memory
will have a nicely aligned max_pfn ... so they will get
the 2GB block size. If they don't have a well-aligned
max_pfn, then they need to use a smaller size to avoid
the crash I saw.

-Tony


diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623e3ba5..e14e90fd1cf8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1195,15 +1195,6 @@ static unsigned long probe_memory_block_size(void)
/* start from 2g */
unsigned long bz = 1UL<<31;

- if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
- pr_info("Using 2GB memory block size for large-memory system\n");
- return 2UL * 1024 * 1024 * 1024;
- }
-
- /* less than 64g installed */
- if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
- return MIN_MEMORY_BLOCK_SIZE;
-
/* get the tail size */
while (bz > MIN_MEMORY_BLOCK_SIZE) {
if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))

2015-08-21 20:50:36

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Fri, Aug 21, 2015 at 1:27 PM, Luck, Tony <[email protected]> wrote:
> On Fri, Aug 21, 2015 at 11:38:13AM -0700, Yinghai Lu wrote:
>> That commit could be reverted.
>> According to
>> https://lkml.org/lkml/2014/11/10/123
>
> Do we really need to force the MIN_MEMORY_BLOCK_SIZE on small
> systems?

That was introduced in commit 982792c7 ("x86, mm: probe memory block
size for generic x86 64bit").
That patch is meant to make boot faster by creating fewer entries
in /sys/devices/system/memory/.
On systems with less than 64G of RAM, that will not create too many entries
even with MIN_MEMORY_BLOCK_SIZE.

>
> What about this patch - which just uses max_pfn to choose
> the block size.
>
> It seems that many systems with large amounts of memory
> will have a nicely aligned max_pfn ... so they will get
> the 2GB block size. If they don't have a well aligned
> max_pfn, then they need to use a smaller size to avoid
> the crash I saw.

Good to me.

> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 3fba623e3ba5..e14e90fd1cf8 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1195,15 +1195,6 @@ static unsigned long probe_memory_block_size(void)
> /* start from 2g */
> unsigned long bz = 1UL<<31;
>
> - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
> - pr_info("Using 2GB memory block size for large-memory system\n");
> - return 2UL * 1024 * 1024 * 1024;
> - }
> -
> - /* less than 64g installed */
> - if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
> - return MIN_MEMORY_BLOCK_SIZE;
> -
> /* get the tail size */
> while (bz > MIN_MEMORY_BLOCK_SIZE) {
> if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))

2015-08-21 23:54:15

by Tony Luck

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Fri, Aug 21, 2015 at 1:50 PM, Yinghai Lu <[email protected]> wrote:
>> It seems that many systems with large amounts of memory
>> will have a nicely aligned max_pfn ... so they will get
>> the 2GB block size. If they don't have a well aligned
>> max_pfn, then they need to use a smaller size to avoid
>> the crash I saw.
>
> Good to me.

Still stuff going on that I don't understand here. I increased the amount of
mirrored memory in this machine which moved max_pfn to 0x7560000
and probe_memory_block_size() picked 512MB as the memory_block_size,
which seemed plausible.

But my kernel still crashed during boot with this value. :-(
Forcing the block size to 128M made the system boot.

Maybe all the holes in the e820 map matter too (specifically the
alignment of the holes)?

-Tony

2015-08-24 17:46:50

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Fri, Aug 21, 2015 at 4:54 PM, Tony Luck <[email protected]> wrote:
> On Fri, Aug 21, 2015 at 1:50 PM, Yinghai Lu <[email protected]> wrote:
>
> Still stuff going on that I don't understand here. I increased the amount of
> mirrored memory in this machine which moved max_pfn to 0x7560000
> and probe_memory_block_size() picked 512MB as the memory_block_size,
> which seemed plausible.
>
> But my kernel still crashed during boot with this value. :-(
> Forcing the block size to 128M made the system boot.
>
> Maybe all the holes in the e820 map matter too (specifically the
> alignment of the holes)?

Then, what does the E820 look like?

Yinghai

2015-08-24 20:41:37

by Tony Luck

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Mon, Aug 24, 2015 at 10:46 AM, Yinghai Lu <[email protected]> wrote:
> Then, what does the E820 look like?

See attached serial console log of the latest crash

-Tony


Attachments:
dmesg (46.14 kB)

2015-08-24 21:25:48

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Mon, Aug 24, 2015 at 1:41 PM, Tony Luck <[email protected]> wrote:
> On Mon, Aug 24, 2015 at 10:46 AM, Yinghai Lu <[email protected]> wrote:
>> Then, what does the E820 look like?
>
> See attached serial console log of the latest crash

Can you boot with "debug ignore_loglevel" so we can see the following
printout for vmemmap?

[ 0.352486] [ffffea0000000000-ffffea0001ffffff] PMD -> [ffff88007de00000-ffff88007fdfffff] on node 0
[ 0.358758] [ffffea0004000000-ffffea0005ffffff] PMD -> [ffff88017d600000-ffff88017f5fffff] on node 1

2015-08-24 22:39:34

by Tony Luck

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Mon, Aug 24, 2015 at 2:25 PM, Yinghai Lu <[email protected]> wrote:

> Can you boot with "debug ignore_loglevel" so we can see following print out
> for vmemmap?

See attached. There are a few extra messages from my own debug printk()
calls. It seems that we successfully deal with node 0 from topology_init()
but die walking node 1. I see that the NODE_DATA limits for memory
on node 1 were from 1d70000 to 3a00000. But when we get into
register_mem_sect_under_node() we have rounded the start pfn down to
1d00000 ... and we panic processing that range (which is in a hole in e820).

We seem to die here:

for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int page_nid;

page_nid = get_nid_for_pfn(pfn);

-Tony


Attachments:
dmesg2 (57.42 kB)

2015-08-24 23:41:14

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Mon, Aug 24, 2015 at 3:39 PM, Tony Luck <[email protected]> wrote:
> On Mon, Aug 24, 2015 at 2:25 PM, Yinghai Lu <[email protected]> wrote:
>
>> Can you boot with "debug ignore_loglevel" so we can see following print out
>> for vmemmap?
>
> See attached. There are a few extra messages from my own debug printk()
> calls. It seems that we successfully deal with node 0 from topology_init()
> but die walking node 1. I see that the NODE_DATA limits for memory
> on node 1 were from 1d70000 to 3a00000. But when we get into
> register_mem_sect_under_node() we have rounded the start pfn down to
> 1d00000 ... and we panic processing that range (which is in a hole in e820).
>
> We seem to die here:
>
> for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> int page_nid;
>
> page_nid = get_nid_for_pfn(pfn);

oh, no.
register_mem_sect_under_node() is assuming that the first section in the
block is present and that the first page in that section is present.

2015-08-24 23:59:56

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Mon, Aug 24, 2015 at 4:41 PM, Yinghai Lu <[email protected]> wrote:
> On Mon, Aug 24, 2015 at 3:39 PM, Tony Luck <[email protected]> wrote:
>> On Mon, Aug 24, 2015 at 2:25 PM, Yinghai Lu <[email protected]> wrote:
>>
>>> Can you boot with "debug ignore_loglevel" so we can see following print out
>>> for vmemmap?
>>
>> See attached. There are a few extra messages from my own debug printk()
>> calls. It seems that we successfully deal with node 0 from topology_init()
>> but die walking node 1. I see that the NODE_DATA limits for memory
>> on node 1 were from 1d70000 to 3a00000. But when we get into
>> register_mem_sect_under_node() we have rounded the start pfn down to
>> 1d00000 ... and we panic processing that range (which is in a hole in e820).
>>
>> We seem to die here:
>>
>> for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
>> int page_nid;
>>
>> page_nid = get_nid_for_pfn(pfn);
>
> oh, no.
> register_mem_sect_under_node() is assuming:
> first section in the block is present and first page in that section is present.

attached should fix the problem:


Attachments:
check_section_for_memory_block_register.patch (1.37 kB)

2015-08-25 19:01:50

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Tue, Aug 25, 2015 at 10:03 AM, Tony Luck <[email protected]> wrote:
> On Mon, Aug 24, 2015 at 4:59 PM, Yinghai Lu <[email protected]> wrote:
>> attached should fix the problem:
>
> It does ... but this (attached) is simpler. Your patch and mine both
> allow the system to boot ...

The version that fixes this with the section_nr present check may save a
couple of thousand calls to get_nid_for_pfn(); section size / page size =
128M / 4k = 32k pages per section.

> but it is not happy. See all the chatter from systemd in the attached dmesg.

Is that because you have "debug ignore_loglevel"?

>
> x86 doesn't allow me to set CONFIG_HOLES_IN_ZONE ... but now I'm
> worried about all the other places use pfn_valid_within()
>
> Still trying to get an answer from the BIOS folks on whether these
> holes are normal when setting up mirrored areas of memory.

The problem only happens when the memory block size is 512M and the
section size is 128M.
When both are at 128M, the system works, so the current kernel should only
have a problem when a hole larger than 128M leaves some section not present.

Thanks

Yinghai

2015-08-25 22:06:34

by Tony Luck

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Tue, Aug 25, 2015 at 12:01 PM, Yinghai Lu <[email protected]> wrote:
>> It does ... but this (attached) is simpler. Your patch and mine both
>> allow the system to boot ...
>
> The version that fix with section_nr present checking may save couple thousands
> calling to get_nid_for_pfn(). section size / page_size = 128M/4k = 32k

Actually saves about 1.2 million calls. Your patch wins :-)

Reported-and-tested-by: Tony Luck <[email protected]>

-Tony

2015-08-26 04:17:23

by Ingo Molnar

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems


* Yinghai Lu <[email protected]> wrote:

> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -390,8 +390,14 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
> sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
> sect_end_pfn += PAGES_PER_SECTION - 1;
> for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> - int page_nid;
> + int page_nid, scn_nr;
>
> + scn_nr = pfn_to_section_nr(pfn);
> + if (!present_section_nr(scn_nr)) {
> + pfn = round_down(pfn + PAGES_PER_SECTION,
> + PAGES_PER_SECTION) - 1;
> + continue;
> + }
> page_nid = get_nid_for_pfn(pfn);
> if (page_nid < 0)
> continue;
> @@ -426,10 +432,18 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
> return -ENOMEM;
> nodes_clear(*unlinked_nodes);
>
> - sect_start_pfn = section_nr_to_pfn(phys_index);
> - sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
> + sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
> + sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
> + sect_end_pfn += PAGES_PER_SECTION - 1;
> for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> - int nid;
> + int nid, scn_nr;
> +
> + scn_nr = pfn_to_section_nr(pfn);
> + if (!present_section_nr(scn_nr)) {
> + pfn = round_down(pfn + PAGES_PER_SECTION,
> + PAGES_PER_SECTION) - 1;
> + continue;
> + }

NAK due to lack of cleanliness: the two loops look almost identical - this sure
can be factored out...

Thanks,

Ingo

2015-08-26 05:42:08

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Tue, Aug 25, 2015 at 9:17 PM, Ingo Molnar <[email protected]> wrote:
> NAK due to lack of cleanliness: the two loops look almost identical - this sure
> can be factored out...

Please check the complete version at

https://patchwork.kernel.org/patch/7074341/

Andrew,
Ingo NAKed the raw version of this patch, so you may need to remove it
from the -mm tree.

Thanks

Yinghai

2015-08-26 20:49:26

by Andrew Morton

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Tue, 25 Aug 2015 22:42:05 -0700 Yinghai Lu <[email protected]> wrote:

> On Tue, Aug 25, 2015 at 9:17 PM, Ingo Molnar <[email protected]> wrote:
> > NAK due to lack of cleanliness: the two loops look almost identical - this sure
> > can be factored out...
>
> Please check complete version at
>
> https://patchwork.kernel.org/patch/7074341/

That doesn't do what Ingo suggested: "can be factored out...".

Please review this?

--- a/drivers/base/node.c~mm-check-if-section-present-during-memory-block-unregistering-v2-fix
+++ a/drivers/base/node.c
@@ -375,6 +375,22 @@ static int __init_refok get_nid_for_pfn(
return pfn_to_nid(pfn);
}

+/*
+ * A memory block can have several absent sections. A helper function for
+ * skipping over these holes.
+ *
+ * If an absent section is detected, skip_absent_section() will advance *pfn
+ * to the final page in that section and will return true.
+ */
+static bool skip_absent_section(unsigned long *pfn)
+{
+ if (present_section_nr(pfn_to_section_nr(*pfn)))
+ return false;
+
+ *pfn = round_down(*pfn + PAGES_PER_SECTION, PAGES_PER_SECTION) - 1;
+ return true;
+}
+
/* register memory section under specified node if it spans that node */
int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
{
@@ -390,18 +406,10 @@ int register_mem_sect_under_node(struct
sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
sect_end_pfn += PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
- int page_nid, scn_nr;
+ int page_nid;

- /*
- * memory block could have several absent sections from start.
- * skip pfn range from absent section
- */
- scn_nr = pfn_to_section_nr(pfn);
- if (!present_section_nr(scn_nr)) {
- pfn = round_down(pfn + PAGES_PER_SECTION,
- PAGES_PER_SECTION) - 1;
+ if (skip_absent_section(&pfn))
continue;
- }

page_nid = get_nid_for_pfn(pfn);
if (page_nid < 0)
@@ -441,18 +449,10 @@ int unregister_mem_sect_under_nodes(stru
sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
sect_end_pfn += PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
- int nid, scn_nr;
+ int nid;

- /*
- * memory block could have several absent sections from start.
- * skip pfn range from absent section
- */
- scn_nr = pfn_to_section_nr(pfn);
- if (!present_section_nr(scn_nr)) {
- pfn = round_down(pfn + PAGES_PER_SECTION,
- PAGES_PER_SECTION) - 1;
+ if (skip_absent_section(&pfn))
continue;
- }

nid = get_nid_for_pfn(pfn);
if (nid < 0)
_


> Andrew,
> Ingo NAKed raw version of this patch, so you may need to remove it
> from -mm tree.

I don't know what that means. We have multiple patches under at least
two different Subject:s. Please be very careful and very specific when
identifying patches. Otherwise mistakes will be made.


I presently have three patches:

mm-check-if-section-present-during-memory-block-unregistering.patch
mm-check-if-section-present-during-memory-block-unregistering-v2.patch
mm-check-if-section-present-during-memory-block-unregistering-v2-fix.patch

When these are consolidated together, this is the result:


From: Yinghai Lu <[email protected]>
Subject: mm: check if section present during memory block (un)registering

Tony Luck found that on his setup, a memory block size of 512M causes a
crash during boot.

BUG: unable to handle kernel paging request at ffffea0074000020
IP: [<ffffffff81670527>] get_nid_for_pfn+0x17/0x40
PGD 128ffcb067 PUD 128ffc9067 PMD 0
Oops: 0000 [#1] SMP
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc8 #1
...
Call Trace:
[<ffffffff81453b56>] ? register_mem_sect_under_node+0x66/0xe0
[<ffffffff81453eeb>] register_one_node+0x17b/0x240
[<ffffffff81b1f1ed>] ? pci_iommu_alloc+0x6e/0x6e
[<ffffffff81b1f229>] topology_init+0x3c/0x95
[<ffffffff8100213d>] do_one_initcall+0xcd/0x1f0

The system has non continuous RAM address:
BIOS-e820: [mem 0x0000001300000000-0x0000001cffffffff] usable
BIOS-e820: [mem 0x0000001d70000000-0x0000001ec7ffefff] usable
BIOS-e820: [mem 0x0000001f00000000-0x0000002bffffffff] usable
BIOS-e820: [mem 0x0000002c18000000-0x0000002d6fffefff] usable
BIOS-e820: [mem 0x0000002e00000000-0x00000039ffffffff] usable

So there are memory blocks whose leading sections are not present.
For example, the 512M memory block [0x2c00000000, 0x2c20000000) has its
first three sections not present.

The current register_mem_sect_under_node() assumes the first section is
present, but a memory block's section number range
[start_section_nr, end_section_nr] can include sections that are not present.

For arches that support vmemmap, we don't set up the memmap (struct page
area) for sections that are not present.

So skip the pfn ranges that belong to sections that are not present.

Also fixes unregister_mem_sect_under_nodes().

Fixes: bdee237c0343 ("x86: mm: Use 2GB memory block size on large memory x86-64 systems")
Fixes: 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit")
[[email protected]: factor out common code]
Signed-off-by: Yinghai Lu <[email protected]>
Reported-by: Tony Luck <[email protected]>
Tested-by: Tony Luck <[email protected]>
Cc: Greg KH <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: <[email protected]> [3.15+]
Signed-off-by: Andrew Morton <[email protected]>
---

drivers/base/node.c | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)

diff -puN drivers/base/node.c~mm-check-if-section-present-during-memory-block-unregistering drivers/base/node.c
--- a/drivers/base/node.c~mm-check-if-section-present-during-memory-block-unregistering
+++ a/drivers/base/node.c
@@ -375,6 +375,22 @@ static int __init_refok get_nid_for_pfn(
return pfn_to_nid(pfn);
}

+/*
+ * A memory block can have several absent sections. A helper function for
+ * skipping over these holes.
+ *
+ * If an absent section is detected, skip_absent_section() will advance *pfn
+ * to the final page in that section and will return true.
+ */
+static bool skip_absent_section(unsigned long *pfn)
+{
+ if (present_section_nr(pfn_to_section_nr(*pfn)))
+ return false;
+
+ *pfn = round_down(*pfn + PAGES_PER_SECTION, PAGES_PER_SECTION) - 1;
+ return true;
+}
+
/* register memory section under specified node if it spans that node */
int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
{
@@ -392,6 +408,9 @@ int register_mem_sect_under_node(struct
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int page_nid;

+ if (skip_absent_section(&pfn))
+ continue;
+
page_nid = get_nid_for_pfn(pfn);
if (page_nid < 0)
continue;
@@ -426,11 +445,15 @@ int unregister_mem_sect_under_nodes(stru
return -ENOMEM;
nodes_clear(*unlinked_nodes);

- sect_start_pfn = section_nr_to_pfn(phys_index);
- sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
+ sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+ sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
+ sect_end_pfn += PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int nid;

+ if (skip_absent_section(&pfn))
+ continue;
+
nid = get_nid_for_pfn(pfn);
if (nid < 0)
continue;
_

2015-08-26 21:15:22

by Yinghai Lu

Subject: Re: [PATCH v4 4/4] Use 2GB memory block size on large-memory x86-64 systems

On Wed, Aug 26, 2015 at 1:49 PM, Andrew Morton
<[email protected]> wrote:
> On Tue, 25 Aug 2015 22:42:05 -0700 Yinghai Lu <[email protected]> wrote:
> I don't know what that means. We have multiple patches under at least
> two different Subject:s. Please be very careful and very specific when
> identifying patches. Otherwise mistakes will be made.
>
>
> I presently have three patches:
>
> mm-check-if-section-present-during-memory-block-unregistering.patch
> mm-check-if-section-present-during-memory-block-unregistering-v2.patch
> mm-check-if-section-present-during-memory-block-unregistering-v2-fix.patch
>
> When these are consolidated together, this is the result:

Please drop all three, and apply v3 directly from

https://patchwork.kernel.org/patch/7080111/

We should not touch the unregister path, as unregister_memory_section()
already checks whether the section is present.

Thanks

Yinghai