During my tests on an AMD machine with the kexec -l nr_cpus=x option, the
kernel failed to boot up, because some nodes' data structures could not be
allocated (on x86 they are initialized by
init_cpu_to_node()->init_memory_less_node()). But device->numa_node is used
as the preferred_nid parameter for __alloc_pages_nodemask(), which causes a
NULL dereference at
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
This patch fixes the issue by falling back to the first online node when
encountering such a corner case.
Notes about the crashing info:
1. kexec -l with nr_cpus=4
2. system info
NUMA node0 CPU(s): 0,8,16,24
NUMA node1 CPU(s): 2,10,18,26
NUMA node2 CPU(s): 4,12,20,28
NUMA node3 CPU(s): 6,14,22,30
NUMA node4 CPU(s): 1,9,17,25
NUMA node5 CPU(s): 3,11,19,27
NUMA node6 CPU(s): 5,13,21,29
NUMA node7 CPU(s): 7,15,23,31
3. panic stack
[...]
[ 5.721547] atomic64_test: passed for x86-64 platform with CX8 and with SSE
[ 5.729187] pcieport 0000:00:01.1: Signaling PME with IRQ 34
[ 5.735187] pcieport 0000:00:01.2: Signaling PME with IRQ 35
[ 5.741168] pcieport 0000:00:01.3: Signaling PME with IRQ 36
[ 5.747189] pcieport 0000:00:07.1: Signaling PME with IRQ 37
[ 5.754061] pcieport 0000:00:08.1: Signaling PME with IRQ 39
[ 5.760727] pcieport 0000:20:07.1: Signaling PME with IRQ 40
[ 5.766955] pcieport 0000:20:08.1: Signaling PME with IRQ 42
[ 5.772742] BUG: unable to handle kernel paging request at 0000000000002088
[ 5.773618] PGD 0 P4D 0
[ 5.773618] Oops: 0000 [#1] SMP NOPTI
[ 5.773618] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.20.0-rc1+ #3
[ 5.773618] Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.4.3 06/29/2018
[ 5.773618] RIP: 0010:__alloc_pages_nodemask+0xe2/0x2a0
[ 5.773618] Code: 00 00 44 89 ea 80 ca 80 41 83 f8 01 44 0f 44 ea 89 da c1 ea 08 83 e2 01 88 54 24 20 48 8b 54 24 08 48 85 d2 0f 85 46 01 00 00 <3b> 77 08 0f 82 3d 01 00 00 48 89 f8 44 89 ea 48 89
e1 44 89 e6 89
[ 5.773618] RSP: 0018:ffffaa600005fb20 EFLAGS: 00010246
[ 5.773618] RAX: 0000000000000000 RBX: 00000000006012c0 RCX: 0000000000000000
[ 5.773618] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
[ 5.773618] RBP: 00000000006012c0 R08: 0000000000000000 R09: 0000000000000002
[ 5.773618] R10: 00000000006080c0 R11: 0000000000000002 R12: 0000000000000000
[ 5.773618] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000002
[ 5.773618] FS: 0000000000000000(0000) GS:ffff8c69afe00000(0000) knlGS:0000000000000000
[ 5.773618] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5.773618] CR2: 0000000000002088 CR3: 000000087e00a000 CR4: 00000000003406e0
[ 5.773618] Call Trace:
[ 5.773618] new_slab+0xa9/0x570
[ 5.773618] ___slab_alloc+0x375/0x540
[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
[ 5.773618] __slab_alloc+0x1c/0x38
[ 5.773618] __kmalloc_node_track_caller+0xc8/0x270
[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
[ 5.773618] devm_kmalloc+0x28/0x60
[ 5.773618] pinctrl_bind_pins+0x2b/0x2a0
[ 5.773618] really_probe+0x73/0x420
[ 5.773618] driver_probe_device+0x115/0x130
[ 5.773618] __driver_attach+0x103/0x110
[ 5.773618] ? driver_probe_device+0x130/0x130
[ 5.773618] bus_for_each_dev+0x67/0xc0
[ 5.773618] ? klist_add_tail+0x3b/0x70
[ 5.773618] bus_add_driver+0x41/0x260
[ 5.773618] ? pcie_port_setup+0x4d/0x4d
[ 5.773618] driver_register+0x5b/0xe0
[ 5.773618] ? pcie_port_setup+0x4d/0x4d
[ 5.773618] do_one_initcall+0x4e/0x1d4
[ 5.773618] ? init_setup+0x25/0x28
[ 5.773618] kernel_init_freeable+0x1c1/0x26e
[ 5.773618] ? loglevel+0x5b/0x5b
[ 5.773618] ? rest_init+0xb0/0xb0
[ 5.773618] kernel_init+0xa/0x110
[ 5.773618] ret_from_fork+0x22/0x40
[ 5.773618] Modules linked in:
[ 5.773618] CR2: 0000000000002088
[ 5.773618] ---[ end trace 1030c9120a03d081 ]---
[...]
Other notes about reproducing this bug:
After applying the following patch:
commit 0d76bcc960e6057750fcf556b65da13f8bbdfd2b
Author: Bjorn Helgaas <[email protected]>
Date: Tue Nov 13 08:38:17 2018 -0600
Revert "ACPI/PCI: Pay attention to device-specific _PXM node values"
the bug is masked and no longer triggered on my test AMD machine.
But it should still exist, since dev->numa_node can be set by other
methods on other architectures when the nr_cpus parameter is used.
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Cc: Jonathan Cameron <[email protected]>
---
include/linux/gfp.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 76f8db0..8324953 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -453,6 +453,8 @@ static inline int gfp_zonelist(gfp_t flags)
*/
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
+ if (unlikely(!node_online(nid)))
+ nid = first_online_node;
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
--
2.7.4
On Tue, 4 Dec 2018, Pingfan Liu wrote:
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 76f8db0..8324953 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -453,6 +453,8 @@ static inline int gfp_zonelist(gfp_t flags)
> */
> static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> {
> + if (unlikely(!node_online(nid)))
> + nid = first_online_node;
> return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> }
>
So we're passing the node id from dev_to_node() to kmalloc which
interprets that as the preferred node and then does node_zonelist() to
find the zonelist at allocation time.
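For reference, a simplified sketch of that path as of this kernel (the
dev_to_node() plumbing is the part that matters; details trimmed):

    /* drivers/base/devres.c, simplified */
    void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp)
    {
            struct devres *dr;

            /* dev_to_node(dev) is dev->numa_node -- an offline node here */
            dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev));
            if (unlikely(!dr))
                    return NULL;
            ...
    }

    static __always_inline struct devres *alloc_dr(dr_release_t release,
                            size_t size, gfp_t gfp, int nid)
    {
            ...
            dr = kmalloc_node_track_caller(tot_size, gfp, nid);
            ...
    }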
What happens if we fix this in alloc_dr()? Does anything else cause
problems?
And rather than using first_online_node, would next_online_node() work?
I'm thinking about this:
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -100,6 +100,8 @@ static __always_inline struct devres * alloc_dr(dr_release_t release,
&tot_size)))
return NULL;
+ if (unlikely(!node_online(nid)))
+ nid = next_online_node(nid);
dr = kmalloc_node_track_caller(tot_size, gfp, nid);
if (unlikely(!dr))
return NULL;
On Tue, Dec 04, 2018 at 11:05:57AM +0800, Pingfan Liu wrote:
>During my test on some AMD machine, with kexec -l nr_cpus=x option, the
>kernel failed to bootup, because some node's data struct can not be allocated,
>e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
>device->numa_node info is used as preferred_nid param for
Could we fix preferred_nid before it is passed to
__alloc_pages_nodemask()?
BTW, I can't follow the function call flow to this point. Would you mind
giving me some hints?
--
Wei Yang
Help you, Help me
On Tue, Dec 4, 2018 at 11:53 AM David Rientjes <[email protected]> wrote:
>
> On Tue, 4 Dec 2018, Pingfan Liu wrote:
>
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 76f8db0..8324953 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -453,6 +453,8 @@ static inline int gfp_zonelist(gfp_t flags)
> > */
> > static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> > {
> > + if (unlikely(!node_online(nid)))
> > + nid = first_online_node;
> > return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> > }
> >
>
> So we're passing the node id from dev_to_node() to kmalloc which
> interprets that as the preferred node and then does node_zonelist() to
> find the zonelist at allocation time.
>
> What happens if we fix this in alloc_dr()? Does anything else cause
> problems?
>
I think it is better to fix it in mm, since that protects against any
similar bug in the future, while fixing it in alloc_dr() only works for the
present case.
> And rather than using first_online_node, would next_online_node() work?
>
What is the gain? Is it about memory pressure on node 0?
Thanks,
Pingfan
> I'm thinking about this:
>
> diff --git a/drivers/base/devres.c b/drivers/base/devres.c
> --- a/drivers/base/devres.c
> +++ b/drivers/base/devres.c
> @@ -100,6 +100,8 @@ static __always_inline struct devres * alloc_dr(dr_release_t release,
> &tot_size)))
> return NULL;
>
> + if (unlikely(!node_online(nid)))
> + nid = next_online_node(nid);
> dr = kmalloc_node_track_caller(tot_size, gfp, nid);
> if (unlikely(!dr))
> return NULL;
On Tue, Dec 4, 2018 at 2:54 PM Wei Yang <[email protected]> wrote:
>
> On Tue, Dec 04, 2018 at 11:05:57AM +0800, Pingfan Liu wrote:
> >During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> >kernel failed to bootup, because some node's data struct can not be allocated,
> >e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> >device->numa_node info is used as preferred_nid param for
>
> could we fix the preferred_nid before passed to
> __alloc_pages_nodemask()?
>
Yes, we could do it there too, but what is the gain?
> BTW, I don't catch the function call flow to this point. Would you mind
> giving me some hint?
>
You can trace the code along slab_alloc() -> ... -> __alloc_pages_nodemask()
Thanks,
Pingfan
On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> kernel failed to bootup, because some node's data struct can not be allocated,
> e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> device->numa_node info is used as preferred_nid param for
> __alloc_pages_nodemask(), which causes NULL reference
> ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> This patch tries to fix the issue by falling back to the first online node,
> when encountering such corner case.
We have seen similar issues already, and the bug was usually that the
zonelists were not initialized yet or that the node is completely bogus.
Zonelists should be initialized by build_all_zonelists() quite early, so I
am wondering whether the latter is the case. What is the actual node
number the device is associated with?
Your patch is not correct, btw, because we want to fall back to a node in
distance order rather than to the first online node.
--
Michal Hocko
SUSE Labs
On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
>
> On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > kernel failed to bootup, because some node's data struct can not be allocated,
> > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > device->numa_node info is used as preferred_nid param for
> > __alloc_pages_nodemask(), which causes NULL reference
> > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > This patch tries to fix the issue by falling back to the first online node,
> > when encountering such corner case.
>
> We have seen similar issues already and the bug was usually that the
> zonelists were not initialized yet or the node is completely bogus.
> Zonelists should be initialized by build_all_zonelists quite early so I
> am wondering whether the later is the case. What is the actual node
> number the device is associated with?
>
The device's node number is 2. And in my case, I used the nr_cpus
parameter. Since init_cpu_to_node() normally initializes all the possible
nodes, it is hard for me to see how, without this parameter, a zonelist
could be accessed before the page allocator is ready.
> Your patch is not correct btw, because we want to fallback into the node in
> the distance order rather into the first online node.
> --
What about this:
+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
+
/*
* We get the zone list from the current node and the gfp_mask.
* This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
@@ -453,6 +455,11 @@ static inline int gfp_zonelist(gfp_t flags)
*/
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
+ if (unlikely(!node_online(nid))) {
+ nodemask_t used_mask;
+ nodes_complement(used_mask, node_online_map);
+ nid = find_next_best_node(nid, &used_mask);
+ }
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
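(One practical note on the sketch above: as far as I can see,
find_next_best_node() is static in mm/page_alloc.c at this point, so making
it visible to gfp.h would also need something like:

    -static int find_next_best_node(int node, nodemask_t *used_node_mask)
    +int find_next_best_node(int node, nodemask_t *used_node_mask)

for the extern declaration to link.)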
I have just finished compiling this, but have not tested it yet, since the
machine is not at hand; it will take some time to get it again.
Thanks,
Pingfan
On Tue, Dec 04, 2018 at 03:20:13PM +0800, Pingfan Liu wrote:
>On Tue, Dec 4, 2018 at 2:54 PM Wei Yang <[email protected]> wrote:
>>
>> On Tue, Dec 04, 2018 at 11:05:57AM +0800, Pingfan Liu wrote:
>> >During my test on some AMD machine, with kexec -l nr_cpus=x option, the
>> >kernel failed to bootup, because some node's data struct can not be allocated,
>> >e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
>> >device->numa_node info is used as preferred_nid param for
>>
>> could we fix the preferred_nid before passed to
>> __alloc_pages_nodemask()?
>>
>Yes, we can doit too, but what is the gain?
node_zonelist() is used in several places. If we are sure where the problem
is, it is not necessary to spread the fix to other places.
>
>> BTW, I don't catch the function call flow to this point. Would you mind
>> giving me some hint?
>>
>You can track the code along slab_alloc() ->...->__alloc_pages_nodemask()
slab_alloc() passes NUMA_NO_NODE down, so I am lost as to where
preferred_nid gets assigned.
>
>Thanks,
>Pingfan
--
Wei Yang
Help you, Help me
On Tue, Dec 04, 2018 at 04:20:32PM +0800, Pingfan Liu wrote:
>On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
>>
>> On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
>> > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
>> > kernel failed to bootup, because some node's data struct can not be allocated,
>> > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
>> > device->numa_node info is used as preferred_nid param for
>> > __alloc_pages_nodemask(), which causes NULL reference
>> > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
>> > This patch tries to fix the issue by falling back to the first online node,
>> > when encountering such corner case.
>>
>> We have seen similar issues already and the bug was usually that the
>> zonelists were not initialized yet or the node is completely bogus.
>> Zonelists should be initialized by build_all_zonelists quite early so I
>> am wondering whether the later is the case. What is the actual node
>> number the device is associated with?
>>
>The device's node num is 2. And in my case, I used nr_cpus param. Due
>to init_cpu_to_node() initialize all the possible node. It is hard
>for me to figure out without this param, how zonelists is accessed
>before page allocator works.
If my understanding is correct, we can't do page allocation before the
zonelists are initialized.
I guess Michal's point is to figure out the reason for this.
>
>> Your patch is not correct btw, because we want to fallback into the node in
>> the distance order rather into the first online node.
>> --
>What about this:
>+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
>+
> /*
> * We get the zone list from the current node and the gfp_mask.
> * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
>@@ -453,6 +455,11 @@ static inline int gfp_zonelist(gfp_t flags)
> */
> static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> {
>+ if (unlikely(!node_online(nid))) {
>+ nodemask_t used_mask;
>+ nodes_complement(used_mask, node_online_map);
>+ nid = find_next_best_node(nid, &used_mask);
>+ }
> return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> }
>
>I just finished the compiling, not test it yet, since the machine is
>not on hand yet. It needs some time to get it again.
>
>Thanks,
>Pingfan
--
Wei Yang
Help you, Help me
On Tue, Dec 4, 2018 at 4:34 PM Wei Yang <[email protected]> wrote:
>
> On Tue, Dec 04, 2018 at 03:20:13PM +0800, Pingfan Liu wrote:
> >On Tue, Dec 4, 2018 at 2:54 PM Wei Yang <[email protected]> wrote:
> >>
> >> On Tue, Dec 04, 2018 at 11:05:57AM +0800, Pingfan Liu wrote:
> >> >During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> >> >kernel failed to bootup, because some node's data struct can not be allocated,
> >> >e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> >> >device->numa_node info is used as preferred_nid param for
> >>
> >> could we fix the preferred_nid before passed to
> >> __alloc_pages_nodemask()?
> >>
> >Yes, we can doit too, but what is the gain?
>
> node_zonelist() is used some places. If we are sure where the problem
> is, it is not necessary to spread to other places.
>
> >
> >> BTW, I don't catch the function call flow to this point. Would you mind
> >> giving me some hint?
> >>
> >You can track the code along slab_alloc() ->...->__alloc_pages_nodemask()
>
> slab_alloc() pass NUMA_NO_NODE down, so I am lost in where the
> preferred_nid is assigned.
>
You can follow:
[ 5.773618] new_slab+0xa9/0x570
[ 5.773618] ___slab_alloc+0x375/0x540
[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
where static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
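To make the link to dev->numa_node explicit, the path in this trace is
roughly the following (simplified; note it is the _node variant, not
slab_alloc(), that carries the node id down):

    devm_kmalloc(dev, ...)                    /* from pinctrl_bind_pins() */
      alloc_dr(..., gfp, dev_to_node(dev))    /* nid = dev->numa_node = 2 */
        kmalloc_node_track_caller(tot_size, gfp, nid)
          __slab_alloc() -> ___slab_alloc(..., node)
            new_slab(s, flags, node)
              allocate_slab() -> alloc_slab_page()  /* node != NUMA_NO_NODE */
                __alloc_pages_node(node, ...)
                  __alloc_pages_nodemask(gfp, order, node, NULL)
                    node_zonelist(node, ...)  /* NODE_DATA(2) is NULL: oops */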
Thanks,
Pingfan
On Tue 04-12-18 16:20:32, Pingfan Liu wrote:
> On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> >
> > On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > > kernel failed to bootup, because some node's data struct can not be allocated,
> > > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > > device->numa_node info is used as preferred_nid param for
> > > __alloc_pages_nodemask(), which causes NULL reference
> > > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > > This patch tries to fix the issue by falling back to the first online node,
> > > when encountering such corner case.
> >
> > We have seen similar issues already and the bug was usually that the
> > zonelists were not initialized yet or the node is completely bogus.
> > Zonelists should be initialized by build_all_zonelists quite early so I
> > am wondering whether the later is the case. What is the actual node
> > number the device is associated with?
> >
> The device's node num is 2. And in my case, I used nr_cpus param. Due
> to init_cpu_to_node() initialize all the possible node. It is hard
> for me to figure out without this param, how zonelists is accessed
> before page allocator works.
I believe we should focus on this. Why does the node have no zonelist
even though all zonelists should be initialized already? Maybe this is a
nr_cpus peculiarity and we do not initialize all the existing NUMA nodes.
Or maybe the device is associated with a non-existing node with that
setup. A full dmesg might help us here.
> > Your patch is not correct btw, because we want to fallback into the node in
> > the distance order rather into the first online node.
> > --
> What about this:
> +extern int find_next_best_node(int node, nodemask_t *used_node_mask);
> +
> /*
> * We get the zone list from the current node and the gfp_mask.
> * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
> @@ -453,6 +455,11 @@ static inline int gfp_zonelist(gfp_t flags)
> */
> static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> {
> + if (unlikely(!node_online(nid))) {
> + nodemask_t used_mask;
> + nodes_complement(used_mask, node_online_map);
> + nid = find_next_best_node(nid, &used_mask);
> + }
> return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> }
>
> I just finished the compiling, not test it yet, since the machine is
> not on hand yet. It needs some time to get it again.
This is clearly a no-go. nodemask_t can be giant, and you cannot have it
on the stack in allocation paths, which might already be called with a
deep stack. Also, this is called from the allocator hot paths, and each
branch counts.
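For scale, the type in question (include/linux/nodemask.h, simplified):

    typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
    /* with CONFIG_NODES_SHIFT=10, MAX_NUMNODES is 1024, i.e. 128 bytes
     * of stack for every allocation taking this path */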
--
Michal Hocko
SUSE Labs
On Tue, Dec 4, 2018 at 4:40 PM Wei Yang <[email protected]> wrote:
>
> On Tue, Dec 04, 2018 at 04:20:32PM +0800, Pingfan Liu wrote:
> >On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> >>
> >> On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> >> > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> >> > kernel failed to bootup, because some node's data struct can not be allocated,
> >> > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> >> > device->numa_node info is used as preferred_nid param for
> >> > __alloc_pages_nodemask(), which causes NULL reference
> >> > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> >> > This patch tries to fix the issue by falling back to the first online node,
> >> > when encountering such corner case.
> >>
> >> We have seen similar issues already and the bug was usually that the
> >> zonelists were not initialized yet or the node is completely bogus.
> >> Zonelists should be initialized by build_all_zonelists quite early so I
> >> am wondering whether the later is the case. What is the actual node
> >> number the device is associated with?
> >>
> >The device's node num is 2. And in my case, I used nr_cpus param. Due
> >to init_cpu_to_node() initialize all the possible node. It is hard
> >for me to figure out without this param, how zonelists is accessed
> >before page allocator works.
>
> If my understanding is correct, we can't do page alloc before zonelist
> is initialized.
>
> I guess Michal's point is to figure out this reason.
>
Yeah, I know. I just want to emphasize that I hit this bug using
nr_cpus, which is probably rarely used. Hence it may be a different bug
from the ones Michal has seen. Sorry if my English caused any confusion.
Thanks
[...]
On Tue, Dec 04, 2018 at 04:52:52PM +0800, Pingfan Liu wrote:
>On Tue, Dec 4, 2018 at 4:34 PM Wei Yang <[email protected]> wrote:
>>
>> On Tue, Dec 04, 2018 at 03:20:13PM +0800, Pingfan Liu wrote:
>> >On Tue, Dec 4, 2018 at 2:54 PM Wei Yang <[email protected]> wrote:
>> >>
>> >> On Tue, Dec 04, 2018 at 11:05:57AM +0800, Pingfan Liu wrote:
>> >> >During my test on some AMD machine, with kexec -l nr_cpus=x option, the
>> >> >kernel failed to bootup, because some node's data struct can not be allocated,
>> >> >e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
>> >> >device->numa_node info is used as preferred_nid param for
>> >>
>> >> could we fix the preferred_nid before passed to
>> >> __alloc_pages_nodemask()?
>> >>
>> >Yes, we can doit too, but what is the gain?
>>
>> node_zonelist() is used some places. If we are sure where the problem
>> is, it is not necessary to spread to other places.
>>
>> >
>> >> BTW, I don't catch the function call flow to this point. Would you mind
>> >> giving me some hint?
>> >>
>> >You can track the code along slab_alloc() ->...->__alloc_pages_nodemask()
>>
>> slab_alloc() pass NUMA_NO_NODE down, so I am lost in where the
>> preferred_nid is assigned.
>>
>You can follow:
>[ 5.773618] new_slab+0xa9/0x570
>[ 5.773618] ___slab_alloc+0x375/0x540
>[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
>where static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
>
Well, thanks for your patience, but I still don't get it.
new_slab(node)
  allocate_slab(node)
    alloc_slab_page(node)
      if (node == NUMA_NO_NODE)
        alloc_pages()
      else
        __alloc_pages_node(node)
As you mentioned, this starts from slab_alloc(), which passes NUMA_NO_NODE.
That means it goes to alloc_pages() and then alloc_pages_current() ->
__alloc_pages_nodemask(). Here we use policy_node() to get the
preferred_nid.
I didn't catch the relationship between policy_node() and
device->numa_node. Maybe I got something wrong somewhere. Would you mind
sharing more?
>Thanks,
>Pingfan
--
Wei Yang
Help you, Help me
On 12/4/18 9:56 AM, Michal Hocko wrote:
>> The device's node num is 2. And in my case, I used nr_cpus param. Due
>> to init_cpu_to_node() initialize all the possible node. It is hard
>> for me to figure out without this param, how zonelists is accessed
>> before page allocator works.
> I believe we should focus on this. Why does the node have no zonelist
> even though all zonelists should be initialized already? Maybe this is
> nr_cpus pecularity and we do not initialize all the existing numa nodes.
> Or maybe the device is associated to a non-existing node with that
> setup. A full dmesg might help us here.
Yes, a full dmesg should contain a line such as this one:
[ 0.137407] Built 1 zonelists, mobility grouping on. Total pages: 6181664
That should at least tell us if nr_cpus=X resulted in some node's
zonelists not being built.
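For reference, that line comes from the tail of build_all_zonelists()
(mm/page_alloc.c, roughly):

    pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
            nr_online_nodes,
            page_group_by_mobility_disabled ? "off" : "on",
            vm_total_pages);

so the count printed is nr_online_nodes at the time the zonelists were
built.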
On Tue, Dec 4, 2018 at 4:56 PM Michal Hocko <[email protected]> wrote:
>
> On Tue 04-12-18 16:20:32, Pingfan Liu wrote:
> > On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > > > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > > > kernel failed to bootup, because some node's data struct can not be allocated,
> > > > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > > > device->numa_node info is used as preferred_nid param for
> > > > __alloc_pages_nodemask(), which causes NULL reference
> > > > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > > > This patch tries to fix the issue by falling back to the first online node,
> > > > when encountering such corner case.
> > >
> > > We have seen similar issues already and the bug was usually that the
> > > zonelists were not initialized yet or the node is completely bogus.
> > > Zonelists should be initialized by build_all_zonelists quite early so I
> > > am wondering whether the later is the case. What is the actual node
> > > number the device is associated with?
> > >
> > The device's node num is 2. And in my case, I used nr_cpus param. Due
> > to init_cpu_to_node() initialize all the possible node. It is hard
> > for me to figure out without this param, how zonelists is accessed
> > before page allocator works.
>
> I believe we should focus on this. Why does the node have no zonelist
> even though all zonelists should be initialized already? Maybe this is
> nr_cpus pecularity and we do not initialize all the existing numa nodes.
> Or maybe the device is associated to a non-existing node with that
> setup. A full dmesg might help us here.
>
After getting hold of the machine again, I see the following without the nr_cpus option:
[root@dell-per7425-03 ~]# cd /sys/devices/system/node/
[root@dell-per7425-03 node]# ls
has_cpu has_memory has_normal_memory node0 node1 node2 node3
node4 node5 node6 node7 online possible power uevent
[root@dell-per7425-03 node]# cat has_cpu
0-7
[root@dell-per7425-03 node]# cat has_memory
1,5
[root@dell-per7425-03 node]# cat online
0-7
[root@dell-per7425-03 node]# cat possible
0-7
And lscpu shows the following numa-cpu info:
NUMA node0 CPU(s): 0,8,16,24
NUMA node1 CPU(s): 2,10,18,26
NUMA node2 CPU(s): 4,12,20,28
NUMA node3 CPU(s): 6,14,22,30
NUMA node4 CPU(s): 1,9,17,25
NUMA node5 CPU(s): 3,11,19,27
NUMA node6 CPU(s): 5,13,21,29
NUMA node7 CPU(s): 7,15,23,31
For the full panic message (I masked some hostname info with xx),
please see the attachment.
In short, it seems to be a problem with nr_cpus: without this option,
the kernel boots up correctly.
> > > Your patch is not correct btw, because we want to fallback into the node in
> > > the distance order rather into the first online node.
> > > --
> > What about this:
> > +extern int find_next_best_node(int node, nodemask_t *used_node_mask);
> > +
> > /*
> > * We get the zone list from the current node and the gfp_mask.
> > * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
> > @@ -453,6 +455,11 @@ static inline int gfp_zonelist(gfp_t flags)
> > */
> > static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> > {
> > + if (unlikely(!node_online(nid))) {
> > + nodemask_t used_mask;
> > + nodes_complement(used_mask, node_online_map);
> > + nid = find_next_best_node(nid, &used_mask);
> > + }
> > return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> > }
> >
> > I just finished the compiling, not test it yet, since the machine is
> > not on hand yet. It needs some time to get it again.
>
> This is clearly a no-go. nodemask_t can be giant and you cannot have it
> on the stack for allocation paths which might be called from a deep
What about using a static variable:
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
+ if (unlikely(!node_online(nid))) {
+ WARN_ONCE(1, "Try to alloc mem from not online node\n");
+ nid = find_best_node(nid);
+ }
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
+int find_best_node(int node)
+{
+ static nodemask_t used_mask;
+ static DEFINE_SPINLOCK(lock);
+ static int best_neigh[MAX_NUMNODES] = {0};
+ int nid;
+
+ spin_lock(&lock);
+ if (likely( best_neigh[node] != 0)) {
+ nid = best_neigh[node];
+ goto unlock;
+ }
+ nodes_complement(used_mask, node_online_map);
+ nid = find_next_best_node(node, &used_mask);
+ best_neigh[node] = nid;
+
+unlock:
+ spin_unlock(&lock);
+ return nid;
+}
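One wrinkle in the sketch above: 0 is a valid node id, so using 0 as the
"not yet computed" marker means a cached result of node 0 never hits the
cache and gets recomputed every time. Seeding the array with NUMA_NO_NODE
would avoid that, e.g.:

    static int best_neigh[MAX_NUMNODES] =
            { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
    ...
    if (likely(best_neigh[node] != NUMA_NO_NODE)) {
            nid = best_neigh[node];
            goto unlock;
    }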
> stack already. Also this is called from the allocator hot paths and each
> branch counts.
>
I am puzzled by this. Doesn't unlikely() work for it?
Thanks,
Pingfan
On Tue, Dec 4, 2018 at 5:09 PM Wei Yang <[email protected]> wrote:
>
> On Tue, Dec 04, 2018 at 04:52:52PM +0800, Pingfan Liu wrote:
> >On Tue, Dec 4, 2018 at 4:34 PM Wei Yang <[email protected]> wrote:
> >>
> >> On Tue, Dec 04, 2018 at 03:20:13PM +0800, Pingfan Liu wrote:
> >> >On Tue, Dec 4, 2018 at 2:54 PM Wei Yang <[email protected]> wrote:
> >> >>
> >> >> On Tue, Dec 04, 2018 at 11:05:57AM +0800, Pingfan Liu wrote:
> >> >> >During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> >> >> >kernel failed to bootup, because some node's data struct can not be allocated,
> >> >> >e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> >> >> >device->numa_node info is used as preferred_nid param for
> >> >>
> >> >> could we fix the preferred_nid before passed to
> >> >> __alloc_pages_nodemask()?
> >> >>
> >> >Yes, we can doit too, but what is the gain?
> >>
> >> node_zonelist() is used some places. If we are sure where the problem
> >> is, it is not necessary to spread to other places.
> >>
> >> >
> >> >> BTW, I don't catch the function call flow to this point. Would you mind
> >> >> giving me some hint?
> >> >>
> >> >You can track the code along slab_alloc() ->...->__alloc_pages_nodemask()
> >>
> >> slab_alloc() pass NUMA_NO_NODE down, so I am lost in where the
> >> preferred_nid is assigned.
> >>
> >You can follow:
> >[ 5.773618] new_slab+0xa9/0x570
> >[ 5.773618] ___slab_alloc+0x375/0x540
> >[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
> >where static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> >
>
> Well, thanks for your patience, but I still don't get it.
>
> new_slab(node)
> allocate_slab(node)
> alloc_slab_page(node)
> if (node == NUMA_NO_NODE)
> alloc_pages()
> eles
> __alloc_pages_node(node)
>
> As you mentioned, this starts from slab_alloc() which pass NUMA_NO_NODE.
> This means it goes to alloc_pages() and then alloc_pages_current() ->
> __alloc_pages_nodemask(). Here we use policy_node() to get the
> preferred_nid.
>
> I didn't catch the relathionship between policy_node() and
> device->numa_node. Maybe I got wrong in some place. Would you minding
> sharing more?
>
Have uploaded the full panic log. Enjoy it.
Regards,
Pingfan
On Tue, Dec 4, 2018 at 3:16 PM Pingfan Liu <[email protected]> wrote:
>
> On Tue, Dec 4, 2018 at 11:53 AM David Rientjes <[email protected]> wrote:
> >
> > On Tue, 4 Dec 2018, Pingfan Liu wrote:
> >
> > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > > index 76f8db0..8324953 100644
> > > --- a/include/linux/gfp.h
> > > +++ b/include/linux/gfp.h
> > > @@ -453,6 +453,8 @@ static inline int gfp_zonelist(gfp_t flags)
> > > */
> > > static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> > > {
> > > + if (unlikely(!node_online(nid)))
> > > + nid = first_online_node;
> > > return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> > > }
> > >
> >
> > So we're passing the node id from dev_to_node() to kmalloc which
> > interprets that as the preferred node and then does node_zonelist() to
> > find the zonelist at allocation time.
> >
> > What happens if we fix this in alloc_dr()? Does anything else cause
> > problems?
> >
> I think it is better to fix it mm, since it can protect any new
> similar bug in future. While fixing in alloc_dr() just work at present
>
> > And rather than using first_online_node, would next_online_node() work?
> >
> What is the gain? Is it for memory pressure on node0?
>
Maybe I get your point now. Are you trying to give a cheap approximation
of the nearest neighbor of this node?
Thanks,
Pingfan
On Wed 05-12-18 13:38:17, Pingfan Liu wrote:
> On Tue, Dec 4, 2018 at 4:56 PM Michal Hocko <[email protected]> wrote:
> >
> > On Tue 04-12-18 16:20:32, Pingfan Liu wrote:
> > > On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> > > >
> > > > On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > > > > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > > > > kernel failed to bootup, because some node's data struct can not be allocated,
> > > > > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > > > > device->numa_node info is used as preferred_nid param for
> > > > > __alloc_pages_nodemask(), which causes NULL reference
> > > > > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > > > > This patch tries to fix the issue by falling back to the first online node,
> > > > > when encountering such corner case.
> > > >
> > > > We have seen similar issues already and the bug was usually that the
> > > > zonelists were not initialized yet or the node is completely bogus.
> > > > Zonelists should be initialized by build_all_zonelists quite early so I
> > > > am wondering whether the later is the case. What is the actual node
> > > > number the device is associated with?
> > > >
> > > The device's node num is 2. And in my case, I used nr_cpus param. Due
> > > to init_cpu_to_node() initialize all the possible node. It is hard
> > > for me to figure out without this param, how zonelists is accessed
> > > before page allocator works.
> >
> > I believe we should focus on this. Why does the node have no zonelist
> > even though all zonelists should be initialized already? Maybe this is
> > nr_cpus pecularity and we do not initialize all the existing numa nodes.
> > Or maybe the device is associated to a non-existing node with that
> > setup. A full dmesg might help us here.
> >
> Requiring the machine again, and I got the following without nr_cpus option
> [root@dell-per7425-03 ~]# cd /sys/devices/system/node/
> [root@dell-per7425-03 node]# ls
> has_cpu has_memory has_normal_memory node0 node1 node2 node3
> node4 node5 node6 node7 online possible power uevent
> [root@dell-per7425-03 node]# cat has_cpu
> 0-7
> [root@dell-per7425-03 node]# cat has_memory
> 1,5
> [root@dell-per7425-03 node]# cat online
> 0-7
> [root@dell-per7425-03 node]# cat possible
> 0-7
> And lscpu shows the following numa-cpu info:
> NUMA node0 CPU(s): 0,8,16,24
> NUMA node1 CPU(s): 2,10,18,26
> NUMA node2 CPU(s): 4,12,20,28
> NUMA node3 CPU(s): 6,14,22,30
> NUMA node4 CPU(s): 1,9,17,25
> NUMA node5 CPU(s): 3,11,19,27
> NUMA node6 CPU(s): 5,13,21,29
> NUMA node7 CPU(s): 7,15,23,31
>
> For the full panic message (I masked some hostname info with xx),
> please see the attachment.
> In a short word, it seems a problem with nr_cpus, if without this
> option, the kernel can bootup correctly.
Yep.
[ 0.007418] Early memory node ranges
[ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
[ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
[ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
[ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
[ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
[ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
[ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
There is clearly no node2. Where did the driver get the node2 from?
--
Michal Hocko
SUSE Labs
On Wed, Dec 5, 2018 at 5:21 PM Michal Hocko <[email protected]> wrote:
>
> On Wed 05-12-18 13:38:17, Pingfan Liu wrote:
> > On Tue, Dec 4, 2018 at 4:56 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Tue 04-12-18 16:20:32, Pingfan Liu wrote:
> > > > On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> > > > >
> > > > > On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > > > > > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > > > > > kernel failed to bootup, because some node's data struct can not be allocated,
> > > > > > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > > > > > device->numa_node info is used as preferred_nid param for
> > > > > > __alloc_pages_nodemask(), which causes NULL reference
> > > > > > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > > > > > This patch tries to fix the issue by falling back to the first online node,
> > > > > > when encountering such corner case.
> > > > >
> > > > > We have seen similar issues already and the bug was usually that the
> > > > > zonelists were not initialized yet or the node is completely bogus.
> > > > > Zonelists should be initialized by build_all_zonelists quite early so I
> > > > > am wondering whether the later is the case. What is the actual node
> > > > > number the device is associated with?
> > > > >
> > > > The device's node num is 2. And in my case, I used nr_cpus param. Due
> > > > to init_cpu_to_node() initialize all the possible node. It is hard
> > > > for me to figure out without this param, how zonelists is accessed
> > > > before page allocator works.
> > >
> > > I believe we should focus on this. Why does the node have no zonelist
> > > even though all zonelists should be initialized already? Maybe this is
> > > nr_cpus pecularity and we do not initialize all the existing numa nodes.
> > > Or maybe the device is associated to a non-existing node with that
> > > setup. A full dmesg might help us here.
> > >
> > Requiring the machine again, and I got the following without nr_cpus option
> > [root@dell-per7425-03 ~]# cd /sys/devices/system/node/
> > [root@dell-per7425-03 node]# ls
> > has_cpu has_memory has_normal_memory node0 node1 node2 node3
> > node4 node5 node6 node7 online possible power uevent
> > [root@dell-per7425-03 node]# cat has_cpu
> > 0-7
> > [root@dell-per7425-03 node]# cat has_memory
> > 1,5
> > [root@dell-per7425-03 node]# cat online
> > 0-7
> > [root@dell-per7425-03 node]# cat possible
> > 0-7
> > And lscpu shows the following numa-cpu info:
> > NUMA node0 CPU(s): 0,8,16,24
> > NUMA node1 CPU(s): 2,10,18,26
> > NUMA node2 CPU(s): 4,12,20,28
> > NUMA node3 CPU(s): 6,14,22,30
> > NUMA node4 CPU(s): 1,9,17,25
> > NUMA node5 CPU(s): 3,11,19,27
> > NUMA node6 CPU(s): 5,13,21,29
> > NUMA node7 CPU(s): 7,15,23,31
> >
> > For the full panic message (I masked some hostname info with xx),
> > please see the attachment.
> > In a short word, it seems a problem with nr_cpus, if without this
> > option, the kernel can bootup correctly.
>
> Yep.
> [ 0.007418] Early memory node ranges
> [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
> [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
>
> There is clearly no node2. Where did the driver get the node2 from?
Since nr_cpus=4 is used, node 2 is not instantiated by the x86
initialization code. For a normal bootup, we have the following:
[ 0.007704] Movable zone start for each node
[ 0.007707] Early memory node ranges
[ 0.007708] node 1: [mem 0x0000000000001000-0x000000000008efff]
[ 0.007709] node 1: [mem 0x0000000000090000-0x000000000009ffff]
[ 0.007711] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
[ 0.007712] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
[ 0.007712] node 1: [mem 0x000000006c528000-0x000000006fffffff]
[ 0.007713] node 1: [mem 0x0000000100000000-0x000000047fffffff]
[ 0.007714] node 5: [mem 0x0000000480000000-0x000000087effffff]
[ 0.008434] Zeroed struct page in unavailable ranges: 46490 pages
[ 0.008435] Initmem setup node 1 [mem 0x0000000000001000-0x000000047fffffff]
[ 0.022826] Initmem setup node 5 [mem 0x0000000480000000-0x000000087effffff]
[ 0.024303] ACPI: PM-Timer IO Port: 0x408
[ 0.024320] ACPI: LAPIC_NMI (acpi_id[0xff] high edge lint[0x1])
[ 0.024349] IOAPIC[0]: apic_id 128, version 33, address 0xfec00000, GSI 0-23
[ 0.024354] IOAPIC[1]: apic_id 129, version 33, address 0xfd880000, GSI 24-55
[ 0.024360] IOAPIC[2]: apic_id 130, version 33, address 0xea900000, GSI 56-87
[ 0.024365] IOAPIC[3]: apic_id 131, version 33, address 0xdd900000,
GSI 88-119
[ 0.024371] IOAPIC[4]: apic_id 132, version 33, address 0xd0900000,
GSI 120-151
[ 0.024378] IOAPIC[5]: apic_id 133, version 33, address 0xc3900000,
GSI 152-183
[ 0.024385] IOAPIC[6]: apic_id 134, version 33, address 0xb6900000,
GSI 184-215
[ 0.024391] IOAPIC[7]: apic_id 135, version 33, address 0xa9900000,
GSI 216-247
[ 0.024397] IOAPIC[8]: apic_id 136, version 33, address 0x9c900000,
GSI 248-279
[ 0.024400] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
[ 0.024402] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
[ 0.024408] Using ACPI (MADT) for SMP configuration information
[ 0.024410] ACPI: HPET id: 0x10228201 base: 0xfed00000
[ 0.024418] smpboot: Allowing 128 CPUs, 96 hotplug CPUs
[ 0.024422] NODE_DATA(0) allocated [mem 0x87efa1000-0x87efcbfff]
[ 0.024424] NODE_DATA(0) on node 5
[ 0.024457] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
[ 0.024460] NODE_DATA(4) allocated [mem 0x87ef76000-0x87efa0fff]
[ 0.024461] NODE_DATA(4) on node 5
[ 0.024494] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
[ 0.024497] NODE_DATA(2) allocated [mem 0x87ef4b000-0x87ef75fff]
[ 0.024498] NODE_DATA(2) on node 5
---------------------------------------------------------------------------------->nid=2
[ 0.024530] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
[ 0.024533] NODE_DATA(6) allocated [mem 0x87ef20000-0x87ef4afff]
[ 0.024534] NODE_DATA(6) on node 5
[ 0.024566] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
[ 0.024568] NODE_DATA(3) allocated [mem 0x87eef5000-0x87ef1ffff]
Hence, this should be an issue specific to nr_cpus. The attachment is the
full message from a normal bootup.
Thanks,
Pingfan
On Wed 05-12-18 17:29:31, Pingfan Liu wrote:
> On Wed, Dec 5, 2018 at 5:21 PM Michal Hocko <[email protected]> wrote:
> >
> > On Wed 05-12-18 13:38:17, Pingfan Liu wrote:
> > > On Tue, Dec 4, 2018 at 4:56 PM Michal Hocko <[email protected]> wrote:
> > > >
> > > > On Tue 04-12-18 16:20:32, Pingfan Liu wrote:
> > > > > On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> > > > > >
> > > > > > On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > > > > > > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > > > > > > kernel failed to bootup, because some node's data struct can not be allocated,
> > > > > > > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > > > > > > device->numa_node info is used as preferred_nid param for
> > > > > > > __alloc_pages_nodemask(), which causes NULL reference
> > > > > > > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > > > > > > This patch tries to fix the issue by falling back to the first online node,
> > > > > > > when encountering such corner case.
> > > > > >
> > > > > > We have seen similar issues already and the bug was usually that the
> > > > > > zonelists were not initialized yet or the node is completely bogus.
> > > > > > Zonelists should be initialized by build_all_zonelists quite early so I
> > > > > > am wondering whether the later is the case. What is the actual node
> > > > > > number the device is associated with?
> > > > > >
> > > > > The device's node num is 2. And in my case, I used nr_cpus param. Due
> > > > > to init_cpu_to_node() initialize all the possible node. It is hard
> > > > > for me to figure out without this param, how zonelists is accessed
> > > > > before page allocator works.
> > > >
> > > > I believe we should focus on this. Why does the node have no zonelist
> > > > even though all zonelists should be initialized already? Maybe this is
> > > > nr_cpus pecularity and we do not initialize all the existing numa nodes.
> > > > Or maybe the device is associated to a non-existing node with that
> > > > setup. A full dmesg might help us here.
> > > >
> > > Requiring the machine again, and I got the following without nr_cpus option
> > > [root@dell-per7425-03 ~]# cd /sys/devices/system/node/
> > > [root@dell-per7425-03 node]# ls
> > > has_cpu has_memory has_normal_memory node0 node1 node2 node3
> > > node4 node5 node6 node7 online possible power uevent
> > > [root@dell-per7425-03 node]# cat has_cpu
> > > 0-7
> > > [root@dell-per7425-03 node]# cat has_memory
> > > 1,5
> > > [root@dell-per7425-03 node]# cat online
> > > 0-7
> > > [root@dell-per7425-03 node]# cat possible
> > > 0-7
> > > And lscpu shows the following numa-cpu info:
> > > NUMA node0 CPU(s): 0,8,16,24
> > > NUMA node1 CPU(s): 2,10,18,26
> > > NUMA node2 CPU(s): 4,12,20,28
> > > NUMA node3 CPU(s): 6,14,22,30
> > > NUMA node4 CPU(s): 1,9,17,25
> > > NUMA node5 CPU(s): 3,11,19,27
> > > NUMA node6 CPU(s): 5,13,21,29
> > > NUMA node7 CPU(s): 7,15,23,31
> > >
> > > For the full panic message (I masked some hostname info with xx),
> > > please see the attachment.
> > > In a short word, it seems a problem with nr_cpus, if without this
> > > option, the kernel can bootup correctly.
> >
> > Yep.
> > [ 0.007418] Early memory node ranges
> > [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
> > [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> > [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> > [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> > [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> > [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> > [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
> >
> > There is clearly no node2. Where did the driver get the node2 from?
> Since using nr_cpus=4 , the node2 is not be instanced by x86 initalizing code.
> For the normal bootup, having the following:
> [ 0.007704] Movable zone start for each node
> [ 0.007707] Early memory node ranges
> [ 0.007708] node 1: [mem 0x0000000000001000-0x000000000008efff]
> [ 0.007709] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> [ 0.007711] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> [ 0.007712] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> [ 0.007712] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> [ 0.007713] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> [ 0.007714] node 5: [mem 0x0000000480000000-0x000000087effffff]
> [ 0.008434] Zeroed struct page in unavailable ranges: 46490 pages
Hmm, this is even more interesting. So even a normal boot doesn't have
node 2. So where exactly does the device get its affinity from?
I suspect we are looking at two issues here. The first, and more
important, one is that there is a NUMA affinity configured for the device
to a non-existent node. The second is that nr_cpus affects the
initialization of possible nodes.
--
Michal Hocko
SUSE Labs
On 12/5/18 10:29 AM, Pingfan Liu wrote:
>> [ 0.007418] Early memory node ranges
>> [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
>> [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
>> [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
>> [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
>> [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
>> [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
>> [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
>>
>> There is clearly no node2. Where did the driver get the node2 from?
I don't understand these tables very well, but it seems the other nodes
exist even without memory ranges:
[ 0.007393] SRAT: PXM 2 -> APIC 0x20 -> Node 2
Maybe the nodes are hotpluggable or something?
> Since using nr_cpus=4 , the node2 is not be instanced by x86 initalizing code.
Indeed, nr_cpus seems to restrict what nodes we allocate and populate
zonelists for.
On Wed, 5 Dec 2018, Pingfan Liu wrote:
> > > And rather than using first_online_node, would next_online_node() work?
> > >
> > What is the gain? Is it for memory pressure on node0?
> >
> Maybe I got your point now. Do you try to give a cheap assumption on
> nearest neigh of this node?
>
It's likely better than first_online_node, but probably going to be the
same based on the node ids that you have reported since the nodemask will
simply wrap around back to the first node.
On Wed, Dec 5, 2018 at 5:40 PM Vlastimil Babka <[email protected]> wrote:
>
> On 12/5/18 10:29 AM, Pingfan Liu wrote:
> >> [ 0.007418] Early memory node ranges
> >> [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
> >> [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> >> [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> >> [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> >> [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> >> [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> >> [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
> >>
> >> There is clearly no node2. Where did the driver get the node2 from?
>
> I don't understand these tables too much, but it seems the other nodes
> exist without them:
>
> [ 0.007393] SRAT: PXM 2 -> APIC 0x20 -> Node 2
>
> Maybe the nodes are hotplugable or something?
>
I am also not sure about it; I have only had a hurried look at the ACPI
spec. I will reply in another email, and have Cc'ed some ACPI folks about it.
> > Since using nr_cpus=4 , the node2 is not be instanced by x86 initalizing code.
>
> Indeed, nr_cpus seems to restrict what nodes we allocate and populate
> zonelists for.
Yes, in init_cpu_to_node(): nr_cpus limits the possible CPUs, which
affects the loop for_each_possible_cpu(cpu) and skips node 2 in this
case.
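A simplified sketch of that loop (arch/x86/mm/numa.c, as of this kernel):

    void __init init_cpu_to_node(void)
    {
            int cpu;

            for_each_possible_cpu(cpu) {    /* nr_cpus=4 shrinks this set */
                    int node = numa_cpu_node(cpu);

                    if (node == NUMA_NO_NODE)
                            continue;

                    /* never reached for node 2 once its CPUs are gone */
                    if (!node_online(node))
                            init_memory_less_node(node);

                    numa_set_node(cpu, node);
            }
    }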
Thanks,
Pingfan
On Wed, Dec 5, 2018 at 5:43 PM Michal Hocko <[email protected]> wrote:
>
> On Wed 05-12-18 17:29:31, Pingfan Liu wrote:
> > On Wed, Dec 5, 2018 at 5:21 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Wed 05-12-18 13:38:17, Pingfan Liu wrote:
> > > > On Tue, Dec 4, 2018 at 4:56 PM Michal Hocko <[email protected]> wrote:
> > > > >
> > > > > On Tue 04-12-18 16:20:32, Pingfan Liu wrote:
> > > > > > On Tue, Dec 4, 2018 at 3:22 PM Michal Hocko <[email protected]> wrote:
> > > > > > >
> > > > > > > On Tue 04-12-18 11:05:57, Pingfan Liu wrote:
> > > > > > > > During my test on some AMD machine, with kexec -l nr_cpus=x option, the
> > > > > > > > kernel failed to bootup, because some node's data struct can not be allocated,
> > > > > > > > e.g, on x86, initialized by init_cpu_to_node()->init_memory_less_node(). But
> > > > > > > > device->numa_node info is used as preferred_nid param for
> > > > > > > > __alloc_pages_nodemask(), which causes NULL reference
> > > > > > > > ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > > > > > > > This patch tries to fix the issue by falling back to the first online node,
> > > > > > > > when encountering such corner case.
> > > > > > >
> > > > > > > We have seen similar issues already and the bug was usually that the
> > > > > > > zonelists were not initialized yet or the node is completely bogus.
> > > > > > > Zonelists should be initialized by build_all_zonelists quite early so I
> > > > > > > am wondering whether the later is the case. What is the actual node
> > > > > > > number the device is associated with?
> > > > > > >
> > > > > > The device's node num is 2. And in my case, I used nr_cpus param. Due
> > > > > > to init_cpu_to_node() initialize all the possible node. It is hard
> > > > > > for me to figure out without this param, how zonelists is accessed
> > > > > > before page allocator works.
> > > > >
> > > > > I believe we should focus on this. Why does the node have no zonelist
> > > > > even though all zonelists should be initialized already? Maybe this is
> > > > > nr_cpus pecularity and we do not initialize all the existing numa nodes.
> > > > > Or maybe the device is associated to a non-existing node with that
> > > > > setup. A full dmesg might help us here.
> > > > >
> > > > Requiring the machine again, and I got the following without nr_cpus option
> > > > [root@dell-per7425-03 ~]# cd /sys/devices/system/node/
> > > > [root@dell-per7425-03 node]# ls
> > > > has_cpu has_memory has_normal_memory node0 node1 node2 node3
> > > > node4 node5 node6 node7 online possible power uevent
> > > > [root@dell-per7425-03 node]# cat has_cpu
> > > > 0-7
> > > > [root@dell-per7425-03 node]# cat has_memory
> > > > 1,5
> > > > [root@dell-per7425-03 node]# cat online
> > > > 0-7
> > > > [root@dell-per7425-03 node]# cat possible
> > > > 0-7
> > > > And lscpu shows the following numa-cpu info:
> > > > NUMA node0 CPU(s): 0,8,16,24
> > > > NUMA node1 CPU(s): 2,10,18,26
> > > > NUMA node2 CPU(s): 4,12,20,28
> > > > NUMA node3 CPU(s): 6,14,22,30
> > > > NUMA node4 CPU(s): 1,9,17,25
> > > > NUMA node5 CPU(s): 3,11,19,27
> > > > NUMA node6 CPU(s): 5,13,21,29
> > > > NUMA node7 CPU(s): 7,15,23,31
> > > >
> > > > For the full panic message (I masked some hostname info with xx),
> > > > please see the attachment.
> > > > In a short word, it seems a problem with nr_cpus, if without this
> > > > option, the kernel can bootup correctly.
> > >
> > > Yep.
> > > [ 0.007418] Early memory node ranges
> > > [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
> > > [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> > > [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> > > [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> > > [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> > > [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> > > [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
> > >
> > > There is clearly no node2. Where did the driver get the node2 from?
> > Since using nr_cpus=4 , the node2 is not be instanced by x86 initalizing code.
> > For the normal bootup, having the following:
> > [ 0.007704] Movable zone start for each node
> > [ 0.007707] Early memory node ranges
> > [ 0.007708] node 1: [mem 0x0000000000001000-0x000000000008efff]
> > [ 0.007709] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> > [ 0.007711] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> > [ 0.007712] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> > [ 0.007712] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> > [ 0.007713] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> > [ 0.007714] node 5: [mem 0x0000000480000000-0x000000087effffff]
> > [ 0.008434] Zeroed struct page in unavailable ranges: 46490 pages
>
> Hmm, this is even more interesting. So even a normal boot doesn't have
> node 2. So where exactly does the device get its affinity from?
>
I am afraid there may be some misunderstanding, but for a normal bootup,
the full boot message does show the existence of node 2.
First, the following can be observed:
[ 0.007385] SRAT: PXM 0 -> APIC 0x00 -> Node 0
[ 0.007386] SRAT: PXM 0 -> APIC 0x01 -> Node 0
[ 0.007387] SRAT: PXM 0 -> APIC 0x08 -> Node 0
[ 0.007388] SRAT: PXM 0 -> APIC 0x09 -> Node 0
[ 0.007389] SRAT: PXM 1 -> APIC 0x10 -> Node 1
[ 0.007390] SRAT: PXM 1 -> APIC 0x11 -> Node 1
[ 0.007391] SRAT: PXM 1 -> APIC 0x18 -> Node 1
[ 0.007392] SRAT: PXM 1 -> APIC 0x19 -> Node 1
[ 0.007393] SRAT: PXM 2 -> APIC 0x20 -> Node 2
[ 0.007394] SRAT: PXM 2 -> APIC 0x21 -> Node 2
[ 0.007395] SRAT: PXM 2 -> APIC 0x28 -> Node 2
[ 0.007396] SRAT: PXM 2 -> APIC 0x29 -> Node 2
[ 0.007397] SRAT: PXM 3 -> APIC 0x30 -> Node 3
[ 0.007398] SRAT: PXM 3 -> APIC 0x31 -> Node 3
[ 0.007399] SRAT: PXM 3 -> APIC 0x38 -> Node 3
[ 0.007400] SRAT: PXM 3 -> APIC 0x39 -> Node 3
[ 0.007401] SRAT: PXM 4 -> APIC 0x40 -> Node 4
[ 0.007402] SRAT: PXM 4 -> APIC 0x41 -> Node 4
[ 0.007403] SRAT: PXM 4 -> APIC 0x48 -> Node 4
[ 0.007403] SRAT: PXM 4 -> APIC 0x49 -> Node 4
[ 0.007404] SRAT: PXM 5 -> APIC 0x50 -> Node 5
[ 0.007405] SRAT: PXM 5 -> APIC 0x51 -> Node 5
[ 0.007406] SRAT: PXM 5 -> APIC 0x58 -> Node 5
[ 0.007407] SRAT: PXM 5 -> APIC 0x59 -> Node 5
[ 0.007408] SRAT: PXM 6 -> APIC 0x60 -> Node 6
[ 0.007409] SRAT: PXM 6 -> APIC 0x61 -> Node 6
[ 0.007410] SRAT: PXM 6 -> APIC 0x68 -> Node 6
[ 0.007411] SRAT: PXM 6 -> APIC 0x69 -> Node 6
[ 0.007412] SRAT: PXM 7 -> APIC 0x70 -> Node 7
[ 0.007413] SRAT: PXM 7 -> APIC 0x71 -> Node 7
[ 0.007414] SRAT: PXM 7 -> APIC 0x78 -> Node 7
[ 0.007414] SRAT: PXM 7 -> APIC 0x79 -> Node 7
Second:
[ 0.024497] NODE_DATA(2) allocated [mem 0x87ef4b000-0x87ef75fff]
[ 0.024498] NODE_DATA(2) on node 5
[ 0.024530] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
Besides these, as I pasted, on a normal bootup lscpu shows: NUMA
node2 CPU(s): 4,12,20,28
and
[root@dell-per7425-03 node]# cat has_cpu
0-7
[root@dell-per7425-03 node]# cat has_memory
1,5
So I think node 2 exists; it has cpus but no memory.
I attach the SRAT, which shows there are eight Proximity Domains. In my
understanding, each Proximity Domain corresponds to a NUMA node.
Cc'ed the ACPI folks and hope they can give some hints.
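To make the distinction concrete: the has_cpu/has_memory files above are
backed by the per-node state masks (N_CPU and N_MEMORY) in
drivers/base/node.c, and a node can be online with cpus but without
memory. A minimal sketch, with a made-up debug helper (only the masks
and node_state() are real):

#include <linux/init.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

/* hypothetical helper: dump the state masks for one node; on this
 * machine, a normal bootup would show node 2 with cpu=1 but memory=0 */
static void __init dump_node_state(int nid)
{
	pr_info("node %d: possible=%d online=%d memory=%d cpu=%d\n",
		nid,
		node_state(nid, N_POSSIBLE),
		node_state(nid, N_ONLINE),
		node_state(nid, N_MEMORY),
		node_state(nid, N_CPU));
}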
> I suspect we are looking at two issues here. The first one, and a more
> important one is that there is a NUMA affinity configured for the device
> to a non-existing node. The second one is that nr_cpus affects
> initialization of possible nodes.
The dev->numa_node info is extracted from the acpi table and does not
depend on the instantiation of the numa node, which may be limited by
nr_cpus. Hence the node exists, it is just not instantiated.
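To make the failure path concrete, here is a sketch (the wrapper is made
up; the callees follow the 4.20-era allocator entry points):

#include <linux/device.h>
#include <linux/gfp.h>

/* hypothetical wrapper: a device-affine allocation takes its nid
 * straight from the ACPI SRAT, whether or not the node was set up */
static struct page *alloc_page_for_dev(struct device *dev, gfp_t gfp)
{
	int nid = dev_to_node(dev);	/* node 2 in this report */

	/* __alloc_pages_nodemask() looks up node_zonelist(nid, gfp),
	 * i.e. NODE_DATA(nid)->node_zonelists; with NODE_DATA(2) == NULL
	 * that dereference faults at a small offset, consistent with
	 * the low CR2 value in the splat */
	return __alloc_pages_nodemask(gfp, 0, nid, NULL);
}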
Thanks,
Pingfan
On Thu 06-12-18 11:34:30, Pingfan Liu wrote:
[...]
> > I suspect we are looking at two issues here. The first one, and a more
> > important one is that there is a NUMA affinity configured for the device
> > to a non-existing node. The second one is that nr_cpus affects
> > initialization of possible nodes.
>
> The dev->numa_node info is extracted from the acpi table and does not
> depend on the instantiation of the numa node, which may be limited by
> nr_cpus. Hence the node exists, it is just not instantiated.
Hmm, binding to a memory-less node is quite dubious. But OK. I am not sure
how much sanitization we can do. We need to fall back anyway, so we had
better make sure that all possible nodes are initialized regardless of
nr_cpus. I will look into that.
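Concretely, the invariant to aim for is that every possible node has its
pgdat set up (and zonelists built) before drivers start probing. As a
sketch, with a made-up sanity check:

#include <linux/bug.h>
#include <linux/init.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>

/* hypothetical check for the invariant described above */
static void __init check_possible_nodes(void)
{
	int nid;

	/* if this holds, node_zonelist(nid, gfp) is safe no matter
	 * which possible node a device's numa_node points at */
	for_each_node_mask(nid, node_possible_map)
		WARN_ON(!NODE_DATA(nid));
}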
--
Michal Hocko
SUSE Labs
On Thu 06-12-18 11:07:33, Pingfan Liu wrote:
> On Wed, Dec 5, 2018 at 5:40 PM Vlastimil Babka <[email protected]> wrote:
> >
> > On 12/5/18 10:29 AM, Pingfan Liu wrote:
> > >> [ 0.007418] Early memory node ranges
> > >> [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
> > >> [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> > >> [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> > >> [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> > >> [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> > >> [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> > >> [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
> > >>
> > >> There is clearly no node2. Where did the driver get the node2 from?
> >
> > I don't understand these tables too much, but it seems the other nodes
> > exist even without memory ranges:
> >
> > [ 0.007393] SRAT: PXM 2 -> APIC 0x20 -> Node 2
> >
> > Maybe the nodes are hotpluggable or something?
> >
> I am also not sure about it; I just had a hurried look at the ACPI spec. I
> will reply in another email, and have Cc'ed some ACPI folks about it.
>
> > > Since nr_cpus=4 is used, node2 is not instantiated by the x86 initialization code.
> >
> > Indeed, nr_cpus seems to restrict what nodes we allocate and populate
> > zonelists for.
>
> Yes, in init_cpu_to_node(), since nr_cpus limits the possible cpus to
> 0-3 (which map to nodes 0, 4, 1 and 5), the loop
> for_each_possible_cpu(cpu) never visits node2 in this case.
Thanks for pointing this out. It made my life easier. So I think the
bug is that we call init_memory_less_node from this path. I suspect
numa_register_memblks is the right place to do this. I admit I
am not 100% sure, but could you give this a try please?
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..4575ae4d5449 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -527,6 +527,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}
+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -592,6 +605,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ if (!end)
+ init_memory_less_node(nid);
}
/* Dump memblock with node info and return. */
@@ -721,21 +736,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
--
Michal Hocko
SUSE Labs
[...]
> Thanks for pointing this out. It made my life easier. So I think the
> bug is that we call init_memory_less_node from this path. I suspect
> numa_register_memblks is the right place to do this. I admit I
> am not 100% sure, but could you give this a try please?
>
Sure.
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..4575ae4d5449 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -527,6 +527,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> }
> }
>
> +static void __init init_memory_less_node(int nid)
> +{
> + unsigned long zones_size[MAX_NR_ZONES] = {0};
> + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> +
> + free_area_init_node(nid, zones_size, 0, zholes_size);
> +
> + /*
> + * All zonelists will be built later in start_kernel() after per cpu
> + * areas are initialized.
> + */
> +}
> +
> static int __init numa_register_memblks(struct numa_meminfo *mi)
> {
> unsigned long uninitialized_var(pfn_align);
> @@ -592,6 +605,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (!end)
> + init_memory_less_node(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +736,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> --
Which commit is this patch based on? I cannot apply it to the latest Linux tree.
Thanks,
Pingfan
On Thu, Dec 6, 2018 at 6:03 PM Pingfan Liu <[email protected]> wrote:
>
> [...]
> > Thanks for pointing this out. It made my life easier. So I think the
> > bug is that we call init_memory_less_node from this path. I suspect
> > numa_register_memblks is the right place to do this. I admit I
> > am not 100% sure, but could you give this a try please?
> >
> Sure.
>
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f5408bf7..4575ae4d5449 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -527,6 +527,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> > }
> > }
> >
> > +static void __init init_memory_less_node(int nid)
> > +{
> > + unsigned long zones_size[MAX_NR_ZONES] = {0};
> > + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > +
> > + free_area_init_node(nid, zones_size, 0, zholes_size);
> > +
> > + /*
> > + * All zonelists will be built later in start_kernel() after per cpu
> > + * areas are initialized.
> > + */
> > +}
> > +
> > static int __init numa_register_memblks(struct numa_meminfo *mi)
> > {
> > unsigned long uninitialized_var(pfn_align);
> > @@ -592,6 +605,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > continue;
> >
> > alloc_node_data(nid);
> > + if (!end)
> > + init_memory_less_node(nid);
> > }
> >
> > /* Dump memblock with node info and return. */
> > @@ -721,21 +736,6 @@ void __init x86_numa_init(void)
> > numa_init(dummy_numa_init);
> > }
> >
> > -static void __init init_memory_less_node(int nid)
> > -{
> > - unsigned long zones_size[MAX_NR_ZONES] = {0};
> > - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > -
> > - /* Allocate and initialize node data. Memory-less node is now online.*/
> > - alloc_node_data(nid);
> > - free_area_init_node(nid, zones_size, 0, zholes_size);
> > -
> > - /*
> > - * All zonelists will be built later in start_kernel() after per cpu
> > - * areas are initialized.
> > - */
> > -}
> > -
> > /*
> > * Setup early cpu_to_node.
> > *
> > @@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
> > if (node == NUMA_NO_NODE)
> > continue;
> >
> > - if (!node_online(node))
> > - init_memory_less_node(node);
> > -
> > numa_set_node(cpu, node);
> > }
> > }
> > --
> Which commit is this patch based on? I cannot apply it to the latest Linux tree.
>
I applied it manually and will see the test result. I think it should
work since you instantiate all the nodes.
But there are two things worth considering:
-1st. why does x86 not bring up all nodes by default? Apparently it would
be simpler that way.
-2nd. there are other archs; do they obey the same rules?
Thanks,
Pingfan
On Thu 06-12-18 18:44:03, Pingfan Liu wrote:
> On Thu, Dec 6, 2018 at 6:03 PM Pingfan Liu <[email protected]> wrote:
[...]
> > Which commit is this patch based on? I cannot apply it to the latest Linux tree.
> >
> I applied it manually and will see the test result. I think it should
> work since you instantiate all the nodes.
> But there are two things worth considering:
> -1st. why does x86 not bring up all nodes by default? Apparently it would
> be simpler that way.
What do you mean? Why weren't they brought up before? Or do you see some
nodes not being brought up after this patch?
> -2nd. there are other archs; do they obey the same rules?
I am afraid that each arch does its own initialization.
--
Michal Hocko
SUSE Labs
On Thu, Dec 6, 2018 at 8:11 PM Michal Hocko <[email protected]> wrote:
>
> On Thu 06-12-18 18:44:03, Pingfan Liu wrote:
> > On Thu, Dec 6, 2018 at 6:03 PM Pingfan Liu <[email protected]> wrote:
> [...]
> > > Which commit is this patch based on? I cannot apply it to the latest Linux tree.
> > >
> > I applied it manually and will see the test result. I think it should
> > work since you instantiate all the nodes.
> > But there are two things worth considering:
> > -1st. why does x86 not bring up all nodes by default? Apparently it would
> > be simpler that way.
>
> What do you mean? Why weren't they brought up before? Or do you see some
Yes, this is what I mean. But maybe the author did not consider
nr_cpus; otherwise, using:
+ for_each_node(node)
+ if (!node_online(node))
+ init_memory_less_node(node);
in init_cpu_to_node() would be simpler.
> nodes not being brought up after this patch?
>
> > -2nd. there are other archs; do they obey the same rules?
>
> I am afraid that each arch does its own initialization.
Then it is arguable whether to fix this issue in the memory core or
let each arch fix it. I checked the powerpc code; it also needs a fix,
and it may be the same on arm and mips.
BTW, your patch does not work for normal bootup, and the kernel hangs
without any kernel message.
I think it is due to a bug in the patch:
alloc_node_data(nid);
+ if (!end)
+ init_memory_less_node(nid); /* which also calls alloc_node_data(nid) */
How about the following:
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f54..4dc497d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
{
int cpu;
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+ int node, nr;
BUG_ON(cpu_to_apicid == NULL);
+ nr = cpumask_weight(cpu_possible_mask);
+
+ /* bring up all possible node, since dev->numa_node */
+ //should check acpi works for node possible,
+ for_each_node(node)
+ if (!node_online(node))
+ init_memory_less_node(node);
for_each_possible_cpu(cpu) {
- int node = numa_cpu_node(cpu);
+ node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
Although it works, I hesitate about the idea due to the semantics of
an online node: does an online node require either cpus or memory
inside the node?
In short, the fix should consider two factors: the
semantics of an online node, and the effect on all archs.
Thanks,
Pingfan
> --
> Michal Hocko
> SUSE Labs
On Fri 07-12-18 10:56:51, Pingfan Liu wrote:
[...]
> In short, the fix should consider two factors: the
> semantics of an online node, and the effect on all archs.
I am pretty sure there is a lot of room for unification in this area.
Nevertheless I strongly believe the bug should be fixed first in the
simplest way, and all the cleanup should be done on top.
Do I get it right that the diff worked for you and I can prepare a full
patch?
--
Michal Hocko
SUSE Labs
On Fri, Dec 7, 2018 at 3:53 PM Michal Hocko <[email protected]> wrote:
>
> On Fri 07-12-18 10:56:51, Pingfan Liu wrote:
> [...]
> > In short, the fix should consider two factors: the
> > semantics of an online node, and the effect on all archs.
>
> I am pretty sure there is a lot of room for unification in this area.
> Nevertheless I strongly believe the bug should be fixed first in the
> simplest way, and all the cleanup should be done on top.
>
> Do I get it right that the diff worked for you and I can prepare a full
> patch?
>
Sure, I am glad to test your new patch.
Thanks,
Pingfan
On Fri 07-12-18 17:40:09, Pingfan Liu wrote:
> On Fri, Dec 7, 2018 at 3:53 PM Michal Hocko <[email protected]> wrote:
> >
> > On Fri 07-12-18 10:56:51, Pingfan Liu wrote:
> > [...]
> > > In short, the fix should consider two factors: the
> > > semantics of an online node, and the effect on all archs.
> >
> > I am pretty sure there is a lot of room for unification in this area.
> > Nevertheless I strongly believe the bug should be fixed first in the
> > simplest way, and all the cleanup should be done on top.
> >
> > Do I get it right that the diff worked for you and I can prepare a full
> > patch?
> >
> Sure, I am glad to test your new patch.
From 46e68be89d9c299fd497b2b8bea3f2add144f17f Mon Sep 17 00:00:00 2001
From: Michal Hocko <[email protected]>
Date: Fri, 7 Dec 2018 12:23:32 +0100
Subject: [PATCH] x86, numa: always initialize all possible nodes
Pingfan Liu has reported the following splat
[ 5.772742] BUG: unable to handle kernel paging request at 0000000000002088
[ 5.773618] PGD 0 P4D 0
[ 5.773618] Oops: 0000 [#1] SMP NOPTI
[ 5.773618] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.20.0-rc1+ #3
[ 5.773618] Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.4.3 06/29/2018
[ 5.773618] RIP: 0010:__alloc_pages_nodemask+0xe2/0x2a0
[ 5.773618] Code: 00 00 44 89 ea 80 ca 80 41 83 f8 01 44 0f 44 ea 89 da c1 ea 08 83 e2 01 88 54 24 20 48 8b 54 24 08 48 85 d2 0f 85 46 01 00 00 <3b> 77 08 0f 82 3d 01 00 00 48 89 f8 44 89 ea 48 89
e1 44 89 e6 89
[ 5.773618] RSP: 0018:ffffaa600005fb20 EFLAGS: 00010246
[ 5.773618] RAX: 0000000000000000 RBX: 00000000006012c0 RCX: 0000000000000000
[ 5.773618] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
[ 5.773618] RBP: 00000000006012c0 R08: 0000000000000000 R09: 0000000000000002
[ 5.773618] R10: 00000000006080c0 R11: 0000000000000002 R12: 0000000000000000
[ 5.773618] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000002
[ 5.773618] FS: 0000000000000000(0000) GS:ffff8c69afe00000(0000) knlGS:0000000000000000
[ 5.773618] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5.773618] CR2: 0000000000002088 CR3: 000000087e00a000 CR4: 00000000003406e0
[ 5.773618] Call Trace:
[ 5.773618] new_slab+0xa9/0x570
[ 5.773618] ___slab_alloc+0x375/0x540
[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
[ 5.773618] __slab_alloc+0x1c/0x38
[ 5.773618] __kmalloc_node_track_caller+0xc8/0x270
[ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
[ 5.773618] devm_kmalloc+0x28/0x60
[ 5.773618] pinctrl_bind_pins+0x2b/0x2a0
[ 5.773618] really_probe+0x73/0x420
[ 5.773618] driver_probe_device+0x115/0x130
[ 5.773618] __driver_attach+0x103/0x110
[ 5.773618] ? driver_probe_device+0x130/0x130
[ 5.773618] bus_for_each_dev+0x67/0xc0
[ 5.773618] ? klist_add_tail+0x3b/0x70
[ 5.773618] bus_add_driver+0x41/0x260
[ 5.773618] ? pcie_port_setup+0x4d/0x4d
[ 5.773618] driver_register+0x5b/0xe0
[ 5.773618] ? pcie_port_setup+0x4d/0x4d
[ 5.773618] do_one_initcall+0x4e/0x1d4
[ 5.773618] ? init_setup+0x25/0x28
[ 5.773618] kernel_init_freeable+0x1c1/0x26e
[ 5.773618] ? loglevel+0x5b/0x5b
[ 5.773618] ? rest_init+0xb0/0xb0
[ 5.773618] kernel_init+0xa/0x110
[ 5.773618] ret_from_fork+0x22/0x40
[ 5.773618] Modules linked in:
[ 5.773618] CR2: 0000000000002088
[ 5.773618] ---[ end trace 1030c9120a03d081 ]---
with his AMD machine with the following topology
NUMA node0 CPU(s): 0,8,16,24
NUMA node1 CPU(s): 2,10,18,26
NUMA node2 CPU(s): 4,12,20,28
NUMA node3 CPU(s): 6,14,22,30
NUMA node4 CPU(s): 1,9,17,25
NUMA node5 CPU(s): 3,11,19,27
NUMA node6 CPU(s): 5,13,21,29
NUMA node7 CPU(s): 7,15,23,31
[ 0.007418] Early memory node ranges
[ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
[ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
[ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
[ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
[ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
[ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
[ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
and nr_cpus set to 4. The underlying reason is that the device is bound
to node 2, which doesn't have any memory, and init_cpu_to_node only
initializes memory-less nodes for possible cpus, which nr_cpus restricts.
This in turn means that proper zonelists are not allocated and the page
allocator blows up.
Fix the issue by moving init_memory_less_node into numa_register_memblks
and always initialize all possible nodes consistently at a single place.
Reported-by: Pingfan Liu <[email protected]>
Signed-off-by: Michal Hocko <[email protected]>
---
arch/x86/mm/numa.c | 33 +++++++++++++++------------------
1 file changed, 15 insertions(+), 18 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..4575ae4d5449 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -527,6 +527,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}
+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -592,6 +605,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ if (!end)
+ init_memory_less_node(nid);
}
/* Dump memblock with node info and return. */
@@ -721,21 +736,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
--
2.19.2
--
Michal Hocko
SUSE Labs
On Fri, Dec 7, 2018 at 7:30 PM Michal Hocko <[email protected]> wrote:
>
[...]
> On Fri 07-12-18 17:40:09, Pingfan Liu wrote:
> > On Fri, Dec 7, 2018 at 3:53 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Fri 07-12-18 10:56:51, Pingfan Liu wrote:
> > > [...]
> > > > In short, the fix should consider two factors: the
> > > > semantics of an online node, and the effect on all archs.
> > >
> > > I am pretty sure there is a lot of room for unification in this area.
> > > Nevertheless I strongly believe the bug should be fixed first in the
> > > simplest way, and all the cleanup should be done on top.
> > >
> > > Do I get it right that the diff worked for you and I can prepare a full
> > > patch?
> > >
> > Sure, I am glad to test your new patch.
>
> From 46e68be89d9c299fd497b2b8bea3f2add144f17f Mon Sep 17 00:00:00 2001
> From: Michal Hocko <[email protected]>
> Date: Fri, 7 Dec 2018 12:23:32 +0100
> Subject: [PATCH] x86, numa: always initialize all possible nodes
>
> Pingfan Liu has reported the following splat
> [ 5.772742] BUG: unable to handle kernel paging request at 0000000000002088
> [ 5.773618] PGD 0 P4D 0
> [ 5.773618] Oops: 0000 [#1] SMP NOPTI
> [ 5.773618] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.20.0-rc1+ #3
> [ 5.773618] Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.4.3 06/29/2018
> [ 5.773618] RIP: 0010:__alloc_pages_nodemask+0xe2/0x2a0
> [ 5.773618] Code: 00 00 44 89 ea 80 ca 80 41 83 f8 01 44 0f 44 ea 89 da c1 ea 08 83 e2 01 88 54 24 20 48 8b 54 24 08 48 85 d2 0f 85 46 01 00 00 <3b> 77 08 0f 82 3d 01 00 00 48 89 f8 44 89 ea 48 89
> e1 44 89 e6 89
> [ 5.773618] RSP: 0018:ffffaa600005fb20 EFLAGS: 00010246
> [ 5.773618] RAX: 0000000000000000 RBX: 00000000006012c0 RCX: 0000000000000000
> [ 5.773618] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
> [ 5.773618] RBP: 00000000006012c0 R08: 0000000000000000 R09: 0000000000000002
> [ 5.773618] R10: 00000000006080c0 R11: 0000000000000002 R12: 0000000000000000
> [ 5.773618] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000002
> [ 5.773618] FS: 0000000000000000(0000) GS:ffff8c69afe00000(0000) knlGS:0000000000000000
> [ 5.773618] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 5.773618] CR2: 0000000000002088 CR3: 000000087e00a000 CR4: 00000000003406e0
> [ 5.773618] Call Trace:
> [ 5.773618] new_slab+0xa9/0x570
> [ 5.773618] ___slab_alloc+0x375/0x540
> [ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
> [ 5.773618] __slab_alloc+0x1c/0x38
> [ 5.773618] __kmalloc_node_track_caller+0xc8/0x270
> [ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0
> [ 5.773618] devm_kmalloc+0x28/0x60
> [ 5.773618] pinctrl_bind_pins+0x2b/0x2a0
> [ 5.773618] really_probe+0x73/0x420
> [ 5.773618] driver_probe_device+0x115/0x130
> [ 5.773618] __driver_attach+0x103/0x110
> [ 5.773618] ? driver_probe_device+0x130/0x130
> [ 5.773618] bus_for_each_dev+0x67/0xc0
> [ 5.773618] ? klist_add_tail+0x3b/0x70
> [ 5.773618] bus_add_driver+0x41/0x260
> [ 5.773618] ? pcie_port_setup+0x4d/0x4d
> [ 5.773618] driver_register+0x5b/0xe0
> [ 5.773618] ? pcie_port_setup+0x4d/0x4d
> [ 5.773618] do_one_initcall+0x4e/0x1d4
> [ 5.773618] ? init_setup+0x25/0x28
> [ 5.773618] kernel_init_freeable+0x1c1/0x26e
> [ 5.773618] ? loglevel+0x5b/0x5b
> [ 5.773618] ? rest_init+0xb0/0xb0
> [ 5.773618] kernel_init+0xa/0x110
> [ 5.773618] ret_from_fork+0x22/0x40
> [ 5.773618] Modules linked in:
> [ 5.773618] CR2: 0000000000002088
> [ 5.773618] ---[ end trace 1030c9120a03d081 ]---
>
> with his AMD machine with the following topology
> NUMA node0 CPU(s): 0,8,16,24
> NUMA node1 CPU(s): 2,10,18,26
> NUMA node2 CPU(s): 4,12,20,28
> NUMA node3 CPU(s): 6,14,22,30
> NUMA node4 CPU(s): 1,9,17,25
> NUMA node5 CPU(s): 3,11,19,27
> NUMA node6 CPU(s): 5,13,21,29
> NUMA node7 CPU(s): 7,15,23,31
>
> [ 0.007418] Early memory node ranges
> [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff]
> [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff]
>
> and nr_cpus set to 4. The underlying reason is that the device is bound
> to node 2, which doesn't have any memory, and init_cpu_to_node only
> initializes memory-less nodes for possible cpus, which nr_cpus restricts.
> This in turn means that proper zonelists are not allocated and the page
> allocator blows up.
>
> Fix the issue by moving init_memory_less_node into numa_register_memblks
> and always initialize all possible nodes consistently at a single place.
>
> Reported-by: Pingfan Liu <[email protected]>
> Signed-off-by: Michal Hocko <[email protected]>
> ---
> arch/x86/mm/numa.c | 33 +++++++++++++++------------------
> 1 file changed, 15 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..4575ae4d5449 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -527,6 +527,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> }
> }
>
> +static void __init init_memory_less_node(int nid)
> +{
> + unsigned long zones_size[MAX_NR_ZONES] = {0};
> + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> +
> + free_area_init_node(nid, zones_size, 0, zholes_size);
> +
> + /*
> + * All zonelists will be built later in start_kernel() after per cpu
> + * areas are initialized.
> + */
> +}
> +
> static int __init numa_register_memblks(struct numa_meminfo *mi)
> {
> unsigned long uninitialized_var(pfn_align);
> @@ -592,6 +605,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (!end)
> + init_memory_less_node(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +736,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> --
> 2.19.2
>
Hi Michal,
As I mentioned in my previous email, I manually applied the patch,
and it does not work for normal bootup. Your new patch seems to
have no essential changes; I applied it and had a try. It still does
not work.
Thanks,
Pingfan
On Fri 07-12-18 21:20:17, Pingfan Liu wrote:
[...]
> Hi Michal,
>
> As I mentioned in my previous email, I manually applied the patch,
> and it does not work for normal bootup.
I am sorry, I have misread your previous response. Is there anything
interesting on the serial console by any chance?
--
Michal Hocko
SUSE Labs
On Fri, Dec 7, 2018 at 10:22 PM Michal Hocko <[email protected]> wrote:
>
> On Fri 07-12-18 21:20:17, Pingfan Liu wrote:
> [...]
> > Hi Michal,
> >
> > As I mentioned in my previous email, I manually applied the patch,
> > and it does not work for normal bootup.
>
> I am sorry, I have misread your previous response. Is there anything
> interesting on the serial console by any chance?
Nothing. It needs more effort to debug. But as I mentioned, if all of
the remaining possible nodes are brought up, then it works. Maybe that
can be of some help to you.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f54..4dc497d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
{
int cpu;
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+ int node, nr;
BUG_ON(cpu_to_apicid == NULL);
+ nr = cpumask_weight(cpu_possible_mask);
+
+ /* bring up all possible node, since dev->numa_node */
+ //should check acpi works for node possible,
+ for_each_node(node)
+ if (!node_online(node))
+ init_memory_less_node(node);
for_each_possible_cpu(cpu) {
- int node = numa_cpu_node(cpu);
+ node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
Thanks,
Pingfan
On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> On Fri, Dec 7, 2018 at 10:22 PM Michal Hocko <[email protected]> wrote:
> >
> > On Fri 07-12-18 21:20:17, Pingfan Liu wrote:
> > [...]
> > > Hi Michal,
> > >
> > > As I mentioned in my previous email, I manually applied the patch,
> > > and it does not work for normal bootup.
> >
> > I am sorry, I have misread your previous response. Is there anything
> > interesting on the serial console by any chance?
>
> Nothing. It needs more effort to debug. But as I mentioned, if all of
> the remaining possible nodes are brought up, then it works. Maybe that
> can be of some help to you.
I will have a look. Thanks!
--
Michal Hocko
SUSE Labs
On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
[...]
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f54..4dc497d 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> {
> int cpu;
> u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> + int node, nr;
>
> BUG_ON(cpu_to_apicid == NULL);
> + nr = cpumask_weight(cpu_possible_mask);
> +
> + /* bring up all possible node, since dev->numa_node */
> + //should check acpi works for node possible,
> + for_each_node(node)
> + if (!node_online(node))
> + init_memory_less_node(node);
I suspect there is no change if you replace for_each_node by
for_each_node_mask(nid, node_possible_map)
here. If that is the case then we are probably calling
free_area_init_node too early. I do not see it yet though.
--
Michal Hocko
SUSE Labs
On Fri, Dec 7, 2018 at 11:56 PM Michal Hocko <[email protected]> wrote:
>
> On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> [...]
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f54..4dc497d 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > {
> > int cpu;
> > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > + int node, nr;
> >
> > BUG_ON(cpu_to_apicid == NULL);
> > + nr = cpumask_weight(cpu_possible_mask);
> > +
> > + /* bring up all possible node, since dev->numa_node */
> > + //should check acpi works for node possible,
> > + for_each_node(node)
> > + if (!node_online(node))
> > + init_memory_less_node(node);
>
> I suspect there is no change if you replace for_each_node by
> for_each_node_mask(nid, node_possible_map)
>
> here. If that is the case then we are probably calling
> free_area_init_node too early. I do not see it yet though.
Maybe I do not clearly get your meaning, so I am just guessing. But if you
are worried about node_possible_map, it is dynamically set by
alloc_node_data(). The map is changed after the first call to
free_area_init_node() for a node with memory. This logic is the
same as the current x86 code.
Thanks,
Pingfan
On Mon, Dec 10, 2018 at 12:00 PM Pingfan Liu <[email protected]> wrote:
>
> On Fri, Dec 7, 2018 at 11:56 PM Michal Hocko <[email protected]> wrote:
> >
> > On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> > [...]
> > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > index 1308f54..4dc497d 100644
> > > --- a/arch/x86/mm/numa.c
> > > +++ b/arch/x86/mm/numa.c
> > > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > > {
> > > int cpu;
> > > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > > + int node, nr;
> > >
> > > BUG_ON(cpu_to_apicid == NULL);
> > > + nr = cpumask_weight(cpu_possible_mask);
> > > +
> > > + /* bring up all possible node, since dev->numa_node */
> > > + //should check acpi works for node possible,
> > > + for_each_node(node)
> > > + if (!node_online(node))
> > > + init_memory_less_node(node);
> >
> > I suspect there is no change if you replace for_each_node by
> > for_each_node_mask(nid, node_possible_map)
> >
> > here. If that is the case then we are probably calling
> > free_area_init_node too early. I do not see it yet though.
>
> Maybe I do not clearly get your meaning, so I am just guessing. But if you
> are worried about node_possible_map, it is dynamically set by
> alloc_node_data(). The map is changed after the first call to
A mistake: it should be node_online_map. And in free_area_init_nodes():
for_each_online_node(nid) {
free_area_init_node(nid, NULL,..
So at this point, we do not need to worry about the memory-less node.
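For context, the loop there looks roughly like this (quoted from memory,
so treat it as a sketch rather than the exact source):

/* mm/page_alloc.c, free_area_init_nodes(), roughly: only online nodes
 * are walked, and N_MEMORY is set only for nodes with present pages */
for_each_online_node(nid) {
	pg_data_t *pgdat = NODE_DATA(nid);

	free_area_init_node(nid, NULL,
			    find_min_pfn_for_node(nid), NULL);
	if (pgdat->node_present_pages)
		node_set_state(nid, N_MEMORY);
	check_for_memory(pgdat, nid);
}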
> free_area_init_node() for a node with memory. This logic is the
> same as the current x86 code.
>
> Thanks,
> Pingfan
On Fri 07-12-18 16:56:27, Michal Hocko wrote:
> On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> [...]
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f54..4dc497d 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > {
> > int cpu;
> > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > + int node, nr;
> >
> > BUG_ON(cpu_to_apicid == NULL);
> > + nr = cpumask_weight(cpu_possible_mask);
> > +
> > + /* bring up all possible node, since dev->numa_node */
> > + //should check acpi works for node possible,
> > + for_each_node(node)
> > + if (!node_online(node))
> > + init_memory_less_node(node);
>
> I suspect there is no change if you replace for_each_node by
> for_each_node_mask(nid, node_possible_map)
>
> here. If that is the case then we are probably calling
> free_area_init_node too early. I do not see it yet though.
OK, so it is not about calling it late or soon. It is just that
node_possible_map is a misnomer and it has a different semantic than
I've expected. numa_nodemask_from_meminfo simply considers only nodes
with some memory. So my patch didn't really make any difference and the
node stayed uninitialized.
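For reference, the helper looks roughly like this (from memory, so a
sketch rather than an exact quote):

/* arch/x86/mm/numa.c, roughly: only nodes owning a non-empty memblock
 * are set in the mask, so memory-less nodes never make it in */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}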
In other words. Does the following work? I am sorry to wildguess this
way but I am not able to recreate your setups to play with this myself.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..d51643e10d00 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
}
/**
@@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}
+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
/* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
+ for_each_node(nid) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;
@@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ if (!end)
+ init_memory_less_node(nid);
+ else
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
@@ -721,21 +736,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
--
Michal Hocko
SUSE Labs
On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
>
> On Fri 07-12-18 16:56:27, Michal Hocko wrote:
> > On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> > [...]
> > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > index 1308f54..4dc497d 100644
> > > --- a/arch/x86/mm/numa.c
> > > +++ b/arch/x86/mm/numa.c
> > > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > > {
> > > int cpu;
> > > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > > + int node, nr;
> > >
> > > BUG_ON(cpu_to_apicid == NULL);
> > > + nr = cpumask_weight(cpu_possible_mask);
> > > +
> > > + /* bring up all possible node, since dev->numa_node */
> > > + //should check acpi works for node possible,
> > > + for_each_node(node)
> > > + if (!node_online(node))
> > > + init_memory_less_node(node);
> >
> > I suspect there is no change if you replace for_each_node by
> > for_each_node_mask(nid, node_possible_map)
> >
> > here. If that is the case then we are probably calling
> > free_area_init_node too early. I do not see it yet though.
>
> OK, so it is not about calling it late or soon. It is just that
> node_possible_map is a misnomer and it has a different semantic than
> I've expected. numa_nodemask_from_meminfo simply considers only nodes
> with some memory. So my patch didn't really make any difference and the
> node stayed uninitialized.
>
> In other words. Does the following work? I am sorry to wildguess this
> way but I am not able to recreate your setups to play with this myself.
>
No problem. Yeah, in order to debug the patch, you need a numa machine
with a memory-less node. And unluckily, the patch does not work with either
grub bootup or kexec -l boot. There is nothing, just silence. I will
dig into numa_register_memblks() to figure out the problem.
Thanks,
Pingfan
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..d51643e10d00 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
>
> node_data[nid] = nd;
> memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> }
>
> /**
> @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> }
> }
>
> +static void __init init_memory_less_node(int nid)
> +{
> + unsigned long zones_size[MAX_NR_ZONES] = {0};
> + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> +
> + free_area_init_node(nid, zones_size, 0, zholes_size);
> +
> + /*
> + * All zonelists will be built later in start_kernel() after per cpu
> + * areas are initialized.
> + */
> +}
> +
> static int __init numa_register_memblks(struct numa_meminfo *mi)
> {
> unsigned long uninitialized_var(pfn_align);
> @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> return -EINVAL;
>
> /* Finally register nodes. */
> - for_each_node_mask(nid, node_possible_map) {
> + for_each_node(nid) {
> u64 start = PFN_PHYS(max_pfn);
> u64 end = 0;
>
> @@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (!end)
> + init_memory_less_node(nid);
> + else
> + node_set_online(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +736,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> --
> Michal Hocko
> SUSE Labs
On Tue 11-12-18 16:05:58, Pingfan Liu wrote:
> On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
> >
> > On Fri 07-12-18 16:56:27, Michal Hocko wrote:
> > > On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> > > [...]
> > > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > > index 1308f54..4dc497d 100644
> > > > --- a/arch/x86/mm/numa.c
> > > > +++ b/arch/x86/mm/numa.c
> > > > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > > > {
> > > > int cpu;
> > > > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > > > + int node, nr;
> > > >
> > > > BUG_ON(cpu_to_apicid == NULL);
> > > > + nr = cpumask_weight(cpu_possible_mask);
> > > > +
> > > > + /* bring up all possible node, since dev->numa_node */
> > > > + //should check acpi works for node possible,
> > > > + for_each_node(node)
> > > > + if (!node_online(node))
> > > > + init_memory_less_node(node);
> > >
> > > I suspect there is no change if you replace for_each_node by
> > > for_each_node_mask(nid, node_possible_map)
> > >
> > > here. If that is the case then we are probably calling
> > > free_area_init_node too early. I do not see it yet though.
> >
> > OK, so it is not about calling it late or soon. It is just that
> > node_possible_map is a misnomer and it has a different semantic than
> > I've expected. numa_nodemask_from_meminfo simply considers only nodes
> > with some memory. So my patch didn't really make any difference and the
> > node stayed uninitialized.
> >
> > In other words. Does the following work? I am sorry to wildguess this
> > way but I am not able to recreate your setups to play with this myself.
> >
> No problem. Yeah, in order to debug the patch, you need a numa machine
> with a memory-less node. And unluckily, the patch does not work with either
> grub bootup or kexec -l boot. There is nothing, just silence. I will
> dig into numa_register_memblks() to figure out the problem.
I do not have such a machine handy. Anyway, can you post the full serial
console log? Maybe I can infer something. It is quite weird that this
patch would make an existing situation any worse.
--
Michal Hocko
SUSE Labs
On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
>
[...]
>
> In other words. Does the following work? I am sorry to wildguess this
> way but I am not able to recreate your setups to play with this myself.
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..d51643e10d00 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
>
> node_data[nid] = nd;
> memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> }
>
> /**
> @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> }
> }
>
> +static void __init init_memory_less_node(int nid)
> +{
> + unsigned long zones_size[MAX_NR_ZONES] = {0};
> + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> +
> + free_area_init_node(nid, zones_size, 0, zholes_size);
> +
> + /*
> + * All zonelists will be built later in start_kernel() after per cpu
> + * areas are initialized.
> + */
> +}
> +
> static int __init numa_register_memblks(struct numa_meminfo *mi)
> {
> unsigned long uninitialized_var(pfn_align);
> @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> return -EINVAL;
>
> /* Finally register nodes. */
> - for_each_node_mask(nid, node_possible_map) {
> + for_each_node(nid) {
> u64 start = PFN_PHYS(max_pfn);
> u64 end = 0;
>
> @@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (!end)
Here comes the bug: the !end case can never reach this point, since the
earlier "if (start >= end) continue;" check already skips memory-less nodes.
> + init_memory_less_node(nid);
> + else
> + node_set_online(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +736,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
After passing an extra param for earlyprintk, I finally got the
following; please see the attachment.
BTW, based on your patch, I tried the following, and it works.
---
arch/x86/mm/numa.c | 42 +++++++++++++++++++-----------------------
1 file changed, 19 insertions(+), 23 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f54..4874248 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -216,7 +216,6 @@ static void __init alloc_node_data(int nid)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
node_set_online(nid);
}
@@ -527,6 +526,21 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}
+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ alloc_node_data(nid);
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+ node_set_online(nid);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -570,7 +584,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
/* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
+ for_each_node(nid) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;
@@ -581,15 +595,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
- if (start >= end)
- continue;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
- if (end && (end - start) < NODE_MIN_SIZE)
+ if ( start >= end || (end && (end - start) < NODE_MIN_SIZE)) {
+ init_memory_less_node(nid);
continue;
+ }
alloc_node_data(nid);
}
@@ -721,21 +735,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +762,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
--
1.8.3.1
> --
> Michal Hocko
> SUSE Labs
On Tue, Dec 11, 2018 at 5:44 PM Michal Hocko <[email protected]> wrote:
>
> On Tue 11-12-18 16:05:58, Pingfan Liu wrote:
> > On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Fri 07-12-18 16:56:27, Michal Hocko wrote:
> > > > On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> > > > [...]
> > > > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > > > index 1308f54..4dc497d 100644
> > > > > --- a/arch/x86/mm/numa.c
> > > > > +++ b/arch/x86/mm/numa.c
> > > > > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > > > > {
> > > > > int cpu;
> > > > > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > > > > + int node, nr;
> > > > >
> > > > > BUG_ON(cpu_to_apicid == NULL);
> > > > > + nr = cpumask_weight(cpu_possible_mask);
> > > > > +
> > > > > + /* bring up all possible node, since dev->numa_node */
> > > > > + //should check acpi works for node possible,
> > > > > + for_each_node(node)
> > > > > + if (!node_online(node))
> > > > > + init_memory_less_node(node);
> > > >
> > > > I suspect there is no change if you replace for_each_node by
> > > > for_each_node_mask(nid, node_possible_map)
> > > >
> > > > here. If that is the case then we are probably calling
> > > > free_area_init_node too early. I do not see it yet though.
> > >
> > > OK, so it is not about calling it late or soon. It is just that
> > > node_possible_map is a misnomer and it has a different semantic than
> > > I've expected. numa_nodemask_from_meminfo simply considers only nodes
> > > with some memory. So my patch didn't really make any difference and the
> > > node stayed uninitialized.
> > >
> > > In other words. Does the following work? I am sorry to wildguess this
> > > way but I am not able to recreate your setups to play with this myself.
> > >
> > No problem. Yeah, in order to debug the patch, you need a numa machine
> > with a memory-less node. And unluckily, the patch does not work with either
> > grub bootup or kexec -l boot. There is nothing, just silence. I will
> > dig into numa_register_memblks() to figure out the problem.
>
> I do not have such a machine handy. Anyway, can you post the full serial
> console log? Maybe I can infer something. It is quite weird that this
> patch would make an existing situation any worse.
After passing an extra param to earlyprintk, I finally got something. I
replied in another mail, with some notes on your code.
Thanks,
Pingfan
On Wed 12-12-18 16:31:35, Pingfan Liu wrote:
> On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
> >
> [...]
> >
> > In other words. Does the following work? I am sorry to wildguess this
> > way but I am not able to recreate your setups to play with this myself.
> >
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f5408bf7..d51643e10d00 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
> >
> > node_data[nid] = nd;
> > memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > -
> > - node_set_online(nid);
> > }
> >
> > /**
> > @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> > }
> > }
> >
> > +static void __init init_memory_less_node(int nid)
> > +{
> > + unsigned long zones_size[MAX_NR_ZONES] = {0};
> > + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > +
> > + free_area_init_node(nid, zones_size, 0, zholes_size);
> > +
> > + /*
> > + * All zonelists will be built later in start_kernel() after per cpu
> > + * areas are initialized.
> > + */
> > +}
> > +
> > static int __init numa_register_memblks(struct numa_meminfo *mi)
> > {
> > unsigned long uninitialized_var(pfn_align);
> > @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > return -EINVAL;
> >
> > /* Finally register nodes. */
> > - for_each_node_mask(nid, node_possible_map) {
> > + for_each_node(nid) {
> > u64 start = PFN_PHYS(max_pfn);
> > u64 end = 0;
> >
> > @@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > continue;
> >
> > alloc_node_data(nid);
> > + if (!end)
>
> > Here comes the bug: the !end case can never reach this point, since the
> > earlier "if (start >= end) continue;" check already skips memory-less nodes.
You are right. I am dumb. I've just completely missed that. Sigh.
Anyway, I think the code is more complicated than necessary and we can
simply drop the check. I do not think we really have to worry about
the start overflowing end. So the end patch should look as follows.
Btw. I believe it is better to pull alloc_node_data out of init_memory_less_node
because a) there is no need to duplicate the call and moreover we want
to pull node_set_online as well. The code also seems cleaner this way.
Thanks for your testing and your patience with me here.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..a5548fe668fb 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
}
/**
@@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}
+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
/* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
+ for_each_node(nid) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;
@@ -581,9 +592,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
- if (start >= end)
- continue;
-
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
@@ -592,6 +600,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ if (!end)
+ init_memory_less_node(nid);
+ else
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
@@ -721,21 +733,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +760,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
--
Michal Hocko
SUSE Labs
On Wed, Dec 12, 2018 at 7:53 PM Michal Hocko <[email protected]> wrote:
>
> On Wed 12-12-18 16:31:35, Pingfan Liu wrote:
> > On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
> > >
> > [...]
> > >
> > > In other words. Does the following work? I am sorry to wildguess this
> > > way but I am not able to recreate your setups to play with this myself.
> > >
> > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > index 1308f5408bf7..d51643e10d00 100644
> > > --- a/arch/x86/mm/numa.c
> > > +++ b/arch/x86/mm/numa.c
> > > @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
> > >
> > > node_data[nid] = nd;
> > > memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > > -
> > > - node_set_online(nid);
> > > }
> > >
> > > /**
> > > @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> > > }
> > > }
> > >
> > > +static void __init init_memory_less_node(int nid)
> > > +{
> > > + unsigned long zones_size[MAX_NR_ZONES] = {0};
> > > + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > > +
> > > + free_area_init_node(nid, zones_size, 0, zholes_size);
> > > +
> > > + /*
> > > + * All zonelists will be built later in start_kernel() after per cpu
> > > + * areas are initialized.
> > > + */
> > > +}
> > > +
> > > static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > {
> > > unsigned long uninitialized_var(pfn_align);
> > > @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > return -EINVAL;
> > >
> > > /* Finally register nodes. */
> > > - for_each_node_mask(nid, node_possible_map) {
> > > + for_each_node(nid) {
> > > u64 start = PFN_PHYS(max_pfn);
> > > u64 end = 0;
> > >
> > > @@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > continue;
> > >
> > > alloc_node_data(nid);
> > > + if (!end)
> >
> > Here comes the bug: since the earlier "if (start >= end) continue;"
> > check is still in place, the !end case can never be reached here.
>
> You are right. I am dumb. I've just completely missed that. Sigh.
> Anyway, I think the code is more complicated than necessary and we can
> simply drop the check. I do not think we really have to worry about
> start overflowing end. So the final patch should look as follows.
> Btw. I believe it is better to pull alloc_node_data() out of
> init_memory_less_node(), because there is no need to duplicate the call,
> and moreover we want to pull node_set_online() out as well. The code
> also seems cleaner this way.
>
I have no strong opinion here.
> Thanks for your testing and your patience with me here.
Np.
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..a5548fe668fb 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
>
> node_data[nid] = nd;
> memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> }
>
> /**
> @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> }
> }
>
> +static void __init init_memory_less_node(int nid)
> +{
> + unsigned long zones_size[MAX_NR_ZONES] = {0};
> + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> +
> + free_area_init_node(nid, zones_size, 0, zholes_size);
> +
> + /*
> + * All zonelists will be built later in start_kernel() after per cpu
> + * areas are initialized.
> + */
> +}
> +
> static int __init numa_register_memblks(struct numa_meminfo *mi)
> {
> unsigned long uninitialized_var(pfn_align);
> @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> return -EINVAL;
>
> /* Finally register nodes. */
> - for_each_node_mask(nid, node_possible_map) {
> + for_each_node(nid) {
> u64 start = PFN_PHYS(max_pfn);
> u64 end = 0;
>
> @@ -581,9 +592,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> end = max(mi->blk[i].end, end);
> }
>
> - if (start >= end)
> - continue;
> -
> /*
> * Don't confuse VM with a node that doesn't have the
> * minimum amount of memory:
> @@ -592,6 +600,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (!end)
> + init_memory_less_node(nid);
> + else
> + node_set_online(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +733,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +760,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> --
Regretfully, it still has a bug, and I got a panic. Log attached.
Thanks,
Pingfan
On Thu, Dec 13, 2018 at 4:37 PM Pingfan Liu <[email protected]> wrote:
>
> On Wed, Dec 12, 2018 at 7:53 PM Michal Hocko <[email protected]> wrote:
> >
> > On Wed 12-12-18 16:31:35, Pingfan Liu wrote:
> > > On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <[email protected]> wrote:
> > > >
> > > [...]
> > > >
> > > > In other words. Does the following work? I am sorry to wildguess this
> > > > way but I am not able to recreate your setups to play with this myself.
> > > >
> > > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > > index 1308f5408bf7..d51643e10d00 100644
> > > > --- a/arch/x86/mm/numa.c
> > > > +++ b/arch/x86/mm/numa.c
> > > > @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
> > > >
> > > > node_data[nid] = nd;
> > > > memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > > > -
> > > > - node_set_online(nid);
> > > > }
> > > >
> > > > /**
> > > > @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> > > > }
> > > > }
> > > >
> > > > +static void __init init_memory_less_node(int nid)
> > > > +{
> > > > + unsigned long zones_size[MAX_NR_ZONES] = {0};
> > > > + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > > > +
> > > > + free_area_init_node(nid, zones_size, 0, zholes_size);
> > > > +
> > > > + /*
> > > > + * All zonelists will be built later in start_kernel() after per cpu
> > > > + * areas are initialized.
> > > > + */
> > > > +}
> > > > +
> > > > static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > > {
> > > > unsigned long uninitialized_var(pfn_align);
> > > > @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > > return -EINVAL;
> > > >
> > > > /* Finally register nodes. */
> > > > - for_each_node_mask(nid, node_possible_map) {
> > > > + for_each_node(nid) {
> > > > u64 start = PFN_PHYS(max_pfn);
> > > > u64 end = 0;
> > > >
> > > > @@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > > continue;
> > > >
> > > > alloc_node_data(nid);
> > > > + if (!end)
> > >
> > > Here comes the bug: since the earlier "if (start >= end) continue;"
> > > check is still in place, the !end case can never be reached here.
> >
> > You are right. I am dumb. I've just completely missed that. Sigh.
> > Anyway, I think the code is more complicated than necessary and we can
> > simply drop the check. I do not think we really have to worry about
> > start overflowing end. So the final patch should look as follows.
> > Btw. I believe it is better to pull alloc_node_data() out of
> > init_memory_less_node(), because there is no need to duplicate the call,
> > and moreover we want to pull node_set_online() out as well. The code
> > also seems cleaner this way.
> >
> I have no strong opinion here.
> > Thanks for your testing and your patience with me here.
> Np.
> >
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f5408bf7..a5548fe668fb 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
> >
> > node_data[nid] = nd;
> > memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > -
> > - node_set_online(nid);
> > }
> >
> > /**
> > @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> > }
> > }
> >
> > +static void __init init_memory_less_node(int nid)
> > +{
> > + unsigned long zones_size[MAX_NR_ZONES] = {0};
> > + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > +
> > + free_area_init_node(nid, zones_size, 0, zholes_size);
> > +
> > + /*
> > + * All zonelists will be built later in start_kernel() after per cpu
> > + * areas are initialized.
> > + */
> > +}
> > +
> > static int __init numa_register_memblks(struct numa_meminfo *mi)
> > {
> > unsigned long uninitialized_var(pfn_align);
> > @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > return -EINVAL;
> >
> > /* Finally register nodes. */
> > - for_each_node_mask(nid, node_possible_map) {
> > + for_each_node(nid) {
> > u64 start = PFN_PHYS(max_pfn);
> > u64 end = 0;
> >
> > @@ -581,9 +592,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > end = max(mi->blk[i].end, end);
> > }
> >
> > - if (start >= end)
> > - continue;
> > -
> > /*
> > * Don't confuse VM with a node that doesn't have the
> > * minimum amount of memory:
> > @@ -592,6 +600,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > continue;
> >
> > alloc_node_data(nid);
> > + if (!end)
> > + init_memory_less_node(nid);
Let me share some thoughts on this. There are two issues here. First, is
this node online? I do not see node_set_online() called for it in this
patch. Second, if the node is online here, then init_memory_less_node()->
free_area_init_node() ends up being called a second time from
free_area_init_nodes(). That would be a critical design issue.
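To illustrate the second concern, this is the call flow I am worried
about (a rough sketch of the 4.20-era x86 boot path from memory; the
exact call sites may differ slightly):

setup_arch()
  x86_numa_init()
    numa_register_memblks()
      alloc_node_data(nid)
      init_memory_less_node(nid)       /* for a memoryless node */
        free_area_init_node(nid, ...)  /* first initialization */
  ...
  zone_sizes_init()
    free_area_init_nodes()
      for_each_online_node(nid)
        free_area_init_node(nid, ...)  /* second initialization,
                                          if nid was set online */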
Thanks,
Pingfan
> > + else
> > + node_set_online(nid);
> > }
> >
> > /* Dump memblock with node info and return. */
> > @@ -721,21 +733,6 @@ void __init x86_numa_init(void)
> > numa_init(dummy_numa_init);
> > }
> >
> > -static void __init init_memory_less_node(int nid)
> > -{
> > - unsigned long zones_size[MAX_NR_ZONES] = {0};
> > - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > -
> > - /* Allocate and initialize node data. Memory-less node is now online.*/
> > - alloc_node_data(nid);
> > - free_area_init_node(nid, zones_size, 0, zholes_size);
> > -
> > - /*
> > - * All zonelists will be built later in start_kernel() after per cpu
> > - * areas are initialized.
> > - */
> > -}
> > -
> > /*
> > * Setup early cpu_to_node.
> > *
> > @@ -763,9 +760,6 @@ void __init init_cpu_to_node(void)
> > if (node == NUMA_NO_NODE)
> > continue;
> >
> > - if (!node_online(node))
> > - init_memory_less_node(node);
> > -
> > numa_set_node(cpu, node);
> > }
> > }
> > --
> Regretfully, it still has a bug, and I got a panic. Log attached.
>
> Thanks,
> Pingfan
On Thu 13-12-18 16:37:35, Pingfan Liu wrote:
[...]
> [ 0.409667] NUMA: Node 1 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00000000-0x7fffffff]
> [ 0.419885] NUMA: Node 1 [mem 0x00000000-0x7fffffff] + [mem 0x100000000-0x47fffffff] -> [mem 0x00000000-0x47fffffff]
> [ 0.430386] NODE_DATA(0) allocated [mem 0x87efd4000-0x87effefff]
> [ 0.436352] NODE_DATA(0) on node 5
> [ 0.440124] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.447104] NODE_DATA(1) allocated [mem 0x47ffd5000-0x47fffffff]
> [ 0.453110] NODE_DATA(2) allocated [mem 0x87efa9000-0x87efd3fff]
> [ 0.459060] NODE_DATA(2) on node 5
> [ 0.462855] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.469809] NODE_DATA(3) allocated [mem 0x87ef7e000-0x87efa8fff]
> [ 0.475788] NODE_DATA(3) on node 5
> [ 0.479554] Initmem setup node 3 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.486536] NODE_DATA(4) allocated [mem 0x87ef53000-0x87ef7dfff]
> [ 0.492518] NODE_DATA(4) on node 5
> [ 0.496280] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.503266] NODE_DATA(5) allocated [mem 0x87ef28000-0x87ef52fff]
> [ 0.509281] NODE_DATA(6) allocated [mem 0x87eefd000-0x87ef27fff]
> [ 0.515224] NODE_DATA(6) on node 5
> [ 0.518987] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.525974] NODE_DATA(7) allocated [mem 0x87eed2000-0x87eefcfff]
> [ 0.531953] NODE_DATA(7) on node 5
> [ 0.535716] Initmem setup node 7 [mem 0x0000000000000000-0x0000000000000000]
OK, so we have allocated node_data for all NUMA nodes. Good!
> [ 0.542839] Reserving 500MB of memory at 384MB for crashkernel (System RAM: 32314MB)
> [ 0.550465] Zone ranges:
> [ 0.552927] DMA [mem 0x0000000000001000-0x0000000000ffffff]
> [ 0.559081] DMA32 [mem 0x0000000001000000-0x00000000ffffffff]
> [ 0.565235] Normal [mem 0x0000000100000000-0x000000087effffff]
> [ 0.571388] Device empty
> [ 0.574249] Movable zone start for each node
> [ 0.578498] Early memory node ranges
> [ 0.582049] node 1: [mem 0x0000000000001000-0x000000000008efff]
> [ 0.588291] node 1: [mem 0x0000000000090000-0x000000000009ffff]
> [ 0.594530] node 1: [mem 0x0000000000100000-0x000000005c3d6fff]
> [ 0.600772] node 1: [mem 0x00000000643df000-0x0000000068ff7fff]
> [ 0.607011] node 1: [mem 0x000000006c528000-0x000000006fffffff]
> [ 0.613251] node 1: [mem 0x0000000100000000-0x000000047fffffff]
> [ 0.619493] node 5: [mem 0x0000000480000000-0x000000087effffff]
> [ 0.626479] Zeroed struct page in unavailable ranges: 46490 pages
> [ 0.626480] Initmem setup node 1 [mem 0x0000000000001000-0x000000047fffffff]
> [ 0.655261] Initmem setup node 5 [mem 0x0000000480000000-0x000000087effffff]
[...]
> [ 1.066324] Built 2 zonelists, mobility grouping off. Total pages: 0
There are 2 zonelists built, but for some reason vm_total_pages is 0 and
that is clearly wrong.
The allocation failure (which later leads to the NULL ptr) tells us
there is quite a lot of memory, so one reason might be that the zonelist
for memoryless nodes is initialized incorrectly. nr_free_zone_pages()
relies on the local node's zonelist, so if the code happened to run on a
CPU associated with Node 2 then we could indeed get vm_total_pages=0.
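For reference, vm_total_pages is set from nr_free_pagecache_pages(),
which boils down to roughly the following (paraphrased from this
kernel's mm/page_alloc.c, so take the details with a grain of salt):

static unsigned long nr_free_zone_pages(int offset)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long sum = 0;
	/*
	 * Walks the fallback zonelist of whichever node the calling
	 * CPU maps to - an empty zonelist yields 0.
	 */
	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

	for_each_zone_zonelist(zone, z, zonelist, offset) {
		unsigned long size = zone->managed_pages;
		unsigned long high = high_wmark_pages(zone);

		if (size > high)
			sum += size - high;
	}

	return sum;
}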
> [ 1.439440] Node 1 DMA: 2*4kB (U) 2*8kB (U) 2*16kB (U) 3*32kB (U) 2*64kB (U) 2*128kB (U) 2*256kB (U) 1*512kB (U) 0*1024kB 1*2048kB (M) 3*4096kB (M) = 15896kB
> [ 1.453482] Node 1 DMA32: 2*4kB (M) 1*8kB (M) 1*16kB (M) 2*32kB (M) 3*64kB (M) 2*128kB (M) 3*256kB (M) 3*512kB (M) 2*1024kB (M) 3*2048kB (M) 255*4096kB (M) = 1055520kB
> [ 1.468388] Node 1 Normal: 1*4kB (U) 1*8kB (U) 1*16kB (U) 1*32kB (U) 1*64kB (U) 1*128kB (U) 1*256kB (U) 1*512kB (U) 1*1024kB (U) 1*2048kB (U) 31*4096kB (M) = 131068kB
> [ 1.483211] Node 5 Normal: 1*4kB (U) 1*8kB (U) 1*16kB (U) 1*32kB (U) 1*64kB (U) 1*128kB (U) 1*256kB (U) 1*512kB (U) 1*1024kB (U) 1*2048kB (U) 31*4096kB (M) = 131068kB
I am investigating what the hell is going on here. Maybe the former hack
to re-initialize memory-less nodes is working around some ordering
issues.
--
Michal Hocko
SUSE Labs
On Thu 13-12-18 17:04:01, Pingfan Liu wrote:
[...]
> > > @@ -592,6 +600,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > continue;
> > >
> > > alloc_node_data(nid);
> > > + if (!end)
> > > + init_memory_less_node(nid);
>
> Let me share some thoughts on this. There are two issues here. First, is
> this node online?
It shouldn't be, as it doesn't have any memory.
> I do not see node_set_online() called for it in this patch.
It is called below, for nodes with some memory.
> Second, if the node is online here, then init_memory_less_node()->
> free_area_init_node() ends up being called a second time from
> free_area_init_nodes(). That would be a critical design issue.
I am still trying to wrap my head around the expected code flow here.
numa_init does the following for all CPUs within nr_cpu_ids (i.e., it is
nr_cpus aware):

	if (!node_online(nid))
		numa_clear_node(i);

I do not really understand why we do this. But this forces
init_cpu_to_node to do init_memory_less_node (with the current upstream
code), and that marks the node online again so that zonelists are built
properly. My patch couldn't help in that respect because the node is
offline (as it should be, IMHO).
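For completeness, the loop in question looks roughly like this (from
arch/x86/mm/numa.c:numa_init() in this kernel):

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;

		/* a CPU on a node that is not online loses its mapping */
		if (!node_online(nid))
			numa_clear_node(i);
	}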
So let's try another attempt with some larger surgery (on top of the
previous patch). It will also dump the zonelist after it is built for
each node. Let's see whether something more is lurking there.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a5548fe668fb..eb7c905d5d86 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -525,19 +525,6 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5411de93a363..99252a0b6551 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2045,6 +2045,8 @@ extern void __init pagecache_init(void);
extern void free_area_init(unsigned long * zones_size);
extern void __init free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
+extern void init_memory_less_node(int nid);
+
extern void free_initmem(void);
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ec9cc407216..a5c035fd6307 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
+ struct zone *zone;
+ struct zoneref *z;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+
+ pr_info("node[%d] zonelist: ", pgdat->node_id);
+ for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
+ pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -5447,6 +5454,20 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+ __build_all_zonelists(NODE_DATA(nid));
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
--
Michal Hocko
SUSE Labs
Hi Michal,
With this patch applied on top of the old one, I got the following
message. Please see the attachment.
Thanks,
Pingfan
On Mon, Dec 17, 2018 at 9:29 PM Michal Hocko <[email protected]> wrote:
>
> On Thu 13-12-18 17:04:01, Pingfan Liu wrote:
> [...]
> > > > @@ -592,6 +600,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > > continue;
> > > >
> > > > alloc_node_data(nid);
> > > > + if (!end)
> > > > + init_memory_less_node(nid);
> >
> > Let me share some thoughts on this. There are two issues here. First, is
> > this node online?
>
>
> It shouldn't be, as it doesn't have any memory.
>
> > I do not see node_set_online() called for it in this patch.
>
> It is called below, for nodes with some memory.
>
> > Second, if the node is online here, then init_memory_less_node()->
> > free_area_init_node() ends up being called a second time from
> > free_area_init_nodes(). That would be a critical design issue.
>
> I am still trying to wrap my head around the expected code flow here.
> numa_init does the following for all CPUs within nr_cpu_ids (i.e., it is
> nr_cpus aware):
>
> 	if (!node_online(nid))
> 		numa_clear_node(i);
>
> I do not really understand why we do this. But this forces
> init_cpu_to_node to do init_memory_less_node (with the current upstream
> code), and that marks the node online again so that zonelists are built
> properly. My patch couldn't help in that respect because the node is
> offline (as it should be, IMHO).
>
> So let's try another attempt with some larger surgery (on top of the
> previous patch). It will also dump the zonelist after it is built for
> each node. Let's see whether something more is lurking there.
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index a5548fe668fb..eb7c905d5d86 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -525,19 +525,6 @@ static void __init numa_clear_kernel_node_hotplug(void)
> }
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> static int __init numa_register_memblks(struct numa_meminfo *mi)
> {
> unsigned long uninitialized_var(pfn_align);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5411de93a363..99252a0b6551 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2045,6 +2045,8 @@ extern void __init pagecache_init(void);
> extern void free_area_init(unsigned long * zones_size);
> extern void __init free_area_init_node(int nid, unsigned long * zones_size,
> unsigned long zone_start_pfn, unsigned long *zholes_size);
> +extern void init_memory_less_node(int nid);
> +
> extern void free_initmem(void);
>
> /*
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2ec9cc407216..a5c035fd6307 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
> int node, load, nr_nodes = 0;
> nodemask_t used_mask;
> int local_node, prev_node;
> + struct zone *zone;
> + struct zoneref *z;
>
> /* NUMA-aware ordering of nodes */
> local_node = pgdat->node_id;
> @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
>
> build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> build_thisnode_zonelists(pgdat);
> +
> + pr_info("node[%d] zonelist: ", pgdat->node_id);
> + for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> + pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> + pr_cont("\n");
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -5447,6 +5454,20 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
> #endif
> }
>
> +void __init init_memory_less_node(int nid)
> +{
> + unsigned long zones_size[MAX_NR_ZONES] = {0};
> + unsigned long zholes_size[MAX_NR_ZONES] = {0};
> +
> + free_area_init_node(nid, zones_size, 0, zholes_size);
> + __build_all_zonelists(NODE_DATA(nid));
> +
> + /*
> + * All zonelists will be built later in start_kernel() after per cpu
> + * areas are initialized.
> + */
> +}
> +
> /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
> static bool __meminit
> overlap_memmap_init(unsigned long zone, unsigned long *pfn)
> --
> Michal Hocko
> SUSE Labs
On Thu 20-12-18 15:19:39, Pingfan Liu wrote:
> Hi Michal,
>
> With this patch applied on top of the old one, I got the following
> message. Please see the attachment.
[...]
> [ 0.409637] NUMA: Node 1 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00000000-0x7fffffff]
> [ 0.419858] NUMA: Node 1 [mem 0x00000000-0x7fffffff] + [mem 0x100000000-0x47fffffff] -> [mem 0x00000000-0x47fffffff]
> [ 0.430356] NODE_DATA(0) allocated [mem 0x87efd4000-0x87effefff]
> [ 0.436325] NODE_DATA(0) on node 5
> [ 0.440092] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.447078] node[0] zonelist:
> [ 0.450106] NODE_DATA(1) allocated [mem 0x47ffd5000-0x47fffffff]
> [ 0.456114] NODE_DATA(2) allocated [mem 0x87efa9000-0x87efd3fff]
> [ 0.462064] NODE_DATA(2) on node 5
> [ 0.465852] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.472813] node[2] zonelist:
> [ 0.475846] NODE_DATA(3) allocated [mem 0x87ef7e000-0x87efa8fff]
> [ 0.481827] NODE_DATA(3) on node 5
> [ 0.485590] Initmem setup node 3 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.492575] node[3] zonelist:
> [ 0.495608] NODE_DATA(4) allocated [mem 0x87ef53000-0x87ef7dfff]
> [ 0.501587] NODE_DATA(4) on node 5
> [ 0.505349] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.512334] node[4] zonelist:
> [ 0.515370] NODE_DATA(5) allocated [mem 0x87ef28000-0x87ef52fff]
> [ 0.521384] NODE_DATA(6) allocated [mem 0x87eefd000-0x87ef27fff]
> [ 0.527329] NODE_DATA(6) on node 5
> [ 0.531091] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.538076] node[6] zonelist:
> [ 0.541109] NODE_DATA(7) allocated [mem 0x87eed2000-0x87eefcfff]
> [ 0.547090] NODE_DATA(7) on node 5
> [ 0.550851] Initmem setup node 7 [mem 0x0000000000000000-0x0000000000000000]
> [ 0.557836] node[7] zonelist:
OK, so it is clear that building zonelists this early is not going to
fly. We do not have the complete information yet. I am not sure when we
get that at this moment, but I suspect we either need to move that
initialization to a sooner stage or reconsider whether the phase when we
build zonelists really needs to consider only online NUMA nodes.
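For reference, the relevant ordering in init/main.c:start_kernel() is
roughly the following (heavily abbreviated):

	setup_arch(&command_line);	/* x86_numa_init() and
					 * free_area_init_nodes() run here */
	...
	setup_per_cpu_areas();		/* per cpu areas become usable */
	...
	build_all_zonelists(NULL);	/* complete zonelists exist only
					 * from this point on */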
[...]
> [ 1.067658] percpu: Embedded 46 pages/cpu @(____ptrval____) s151552 r8192 d28672 u262144
> [ 1.075692] node[1] zonelist: 1:Normal 1:DMA32 1:DMA 5:Normal
> [ 1.081376] node[5] zonelist: 5:Normal 1:Normal 1:DMA32 1:DMA
I hope to get to this before I leave for Christmas vacation; if not, I
will stare into it afterwards.
Thanks!
--
Michal Hocko
SUSE Labs
On Thu 20-12-18 10:19:34, Michal Hocko wrote:
> On Thu 20-12-18 15:19:39, Pingfan Liu wrote:
> > Hi Michal,
> >
> > With this patch applied on top of the old one, I got the following
> > message. Please see the attachment.
> [...]
> > [ 0.409637] NUMA: Node 1 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00000000-0x7fffffff]
> > [ 0.419858] NUMA: Node 1 [mem 0x00000000-0x7fffffff] + [mem 0x100000000-0x47fffffff] -> [mem 0x00000000-0x47fffffff]
> > [ 0.430356] NODE_DATA(0) allocated [mem 0x87efd4000-0x87effefff]
> > [ 0.436325] NODE_DATA(0) on node 5
> > [ 0.440092] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
> > [ 0.447078] node[0] zonelist:
> > [ 0.450106] NODE_DATA(1) allocated [mem 0x47ffd5000-0x47fffffff]
> > [ 0.456114] NODE_DATA(2) allocated [mem 0x87efa9000-0x87efd3fff]
> > [ 0.462064] NODE_DATA(2) on node 5
> > [ 0.465852] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
> > [ 0.472813] node[2] zonelist:
> > [ 0.475846] NODE_DATA(3) allocated [mem 0x87ef7e000-0x87efa8fff]
> > [ 0.481827] NODE_DATA(3) on node 5
> > [ 0.485590] Initmem setup node 3 [mem 0x0000000000000000-0x0000000000000000]
> > [ 0.492575] node[3] zonelist:
> > [ 0.495608] NODE_DATA(4) allocated [mem 0x87ef53000-0x87ef7dfff]
> > [ 0.501587] NODE_DATA(4) on node 5
> > [ 0.505349] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
> > [ 0.512334] node[4] zonelist:
> > [ 0.515370] NODE_DATA(5) allocated [mem 0x87ef28000-0x87ef52fff]
> > [ 0.521384] NODE_DATA(6) allocated [mem 0x87eefd000-0x87ef27fff]
> > [ 0.527329] NODE_DATA(6) on node 5
> > [ 0.531091] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
> > [ 0.538076] node[6] zonelist:
> > [ 0.541109] NODE_DATA(7) allocated [mem 0x87eed2000-0x87eefcfff]
> > [ 0.547090] NODE_DATA(7) on node 5
> > [ 0.550851] Initmem setup node 7 [mem 0x0000000000000000-0x0000000000000000]
> > [ 0.557836] node[7] zonelist:
>
> OK, so it is clear that building zonelists this early is not going to
> fly. We do not have the complete information yet. I am not sure when we
> get that at this moment, but I suspect we either need to move that
> initialization to a sooner stage or reconsider whether the phase when we
> build zonelists really needs to consider only online NUMA nodes.
>
> [...]
> > [ 1.067658] percpu: Embedded 46 pages/cpu @(____ptrval____) s151552 r8192 d28672 u262144
> > [ 1.075692] node[1] zonelist: 1:Normal 1:DMA32 1:DMA 5:Normal
> > [ 1.081376] node[5] zonelist: 5:Normal 1:Normal 1:DMA32 1:DMA
>
> I hope to get to this before I leave for Christmas vacation; if not, I
> will stare into it afterwards.
I am sorry, but I didn't get to this sooner. I've got another idea,
though. I concluded that the whole dance is simply bogus and we should
treat memoryless nodes, well, as nodes with no memory ranges rather than
special-case them. Could you give the following a spin, please?
---
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..0e79445cfd85 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
}
/**
@@ -535,6 +533,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
/* Account for nodes with cpus and no memory */
node_possible_map = numa_nodes_parsed;
numa_nodemask_from_meminfo(&node_possible_map, mi);
+ pr_info("parsed=%*pbl, possible=%*pbl\n", nodemask_pr_args(&numa_nodes_parsed), nodemask_pr_args(&node_possible_map));
if (WARN_ON(nodes_empty(node_possible_map)))
return -EINVAL;
@@ -570,7 +569,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
/* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
+ for_each_node_mask(nid, numa_nodes_parsed) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;
@@ -581,9 +580,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
- if (start >= end)
- continue;
-
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
@@ -592,6 +588,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ if (end)
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
@@ -721,21 +719,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +746,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ec9cc407216..52e54d16662a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
+ struct zone *zone;
+ struct zoneref *z;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+
+ pr_info("node[%d] zonelist: ", pgdat->node_id);
+ for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
+ pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -5361,10 +5368,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
- for_each_online_node(nid) {
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- build_zonelists(pgdat);
+ if (pgdat)
+ build_zonelists(pgdat);
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -6644,10 +6652,8 @@ static unsigned long __init find_min_pfn_for_node(int nid)
for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
min_pfn = min(min_pfn, start_pfn);
- if (min_pfn == ULONG_MAX) {
- pr_warn("Could not find start_pfn for node %d\n", nid);
+ if (min_pfn == ULONG_MAX)
return 0;
- }
return min_pfn;
}
@@ -6991,8 +6997,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
mminit_verify_pageflags_layout();
setup_nr_node_ids();
zero_resv_unavail();
- for_each_online_node(nid) {
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (!pgdat)
+ continue;
+
free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
--
Michal Hocko
SUSE Labs
On Tue, Jan 8, 2019 at 10:34 PM Michal Hocko <[email protected]> wrote:
>
> On Thu 20-12-18 10:19:34, Michal Hocko wrote:
> > On Thu 20-12-18 15:19:39, Pingfan Liu wrote:
> > > Hi Michal,
> > >
> > > With this patch applied on top of the old one, I got the following
> > > message. Please see the attachment.
> > [...]
> > > [ 0.409637] NUMA: Node 1 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00000000-0x7fffffff]
> > > [ 0.419858] NUMA: Node 1 [mem 0x00000000-0x7fffffff] + [mem 0x100000000-0x47fffffff] -> [mem 0x00000000-0x47fffffff]
> > > [ 0.430356] NODE_DATA(0) allocated [mem 0x87efd4000-0x87effefff]
> > > [ 0.436325] NODE_DATA(0) on node 5
> > > [ 0.440092] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.447078] node[0] zonelist:
> > > [ 0.450106] NODE_DATA(1) allocated [mem 0x47ffd5000-0x47fffffff]
> > > [ 0.456114] NODE_DATA(2) allocated [mem 0x87efa9000-0x87efd3fff]
> > > [ 0.462064] NODE_DATA(2) on node 5
> > > [ 0.465852] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.472813] node[2] zonelist:
> > > [ 0.475846] NODE_DATA(3) allocated [mem 0x87ef7e000-0x87efa8fff]
> > > [ 0.481827] NODE_DATA(3) on node 5
> > > [ 0.485590] Initmem setup node 3 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.492575] node[3] zonelist:
> > > [ 0.495608] NODE_DATA(4) allocated [mem 0x87ef53000-0x87ef7dfff]
> > > [ 0.501587] NODE_DATA(4) on node 5
> > > [ 0.505349] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.512334] node[4] zonelist:
> > > [ 0.515370] NODE_DATA(5) allocated [mem 0x87ef28000-0x87ef52fff]
> > > [ 0.521384] NODE_DATA(6) allocated [mem 0x87eefd000-0x87ef27fff]
> > > [ 0.527329] NODE_DATA(6) on node 5
> > > [ 0.531091] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.538076] node[6] zonelist:
> > > [ 0.541109] NODE_DATA(7) allocated [mem 0x87eed2000-0x87eefcfff]
> > > [ 0.547090] NODE_DATA(7) on node 5
> > > [ 0.550851] Initmem setup node 7 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.557836] node[7] zonelist:
> >
> > OK, so it is clear that building zonelists this early is not going to
> > fly. We do not have the complete information yet. I am not sure when we
> > get that at this moment, but I suspect we either need to move that
> > initialization to a sooner stage or reconsider whether the phase when we
> > build zonelists really needs to consider only online NUMA nodes.
> >
> > [...]
> > > [ 1.067658] percpu: Embedded 46 pages/cpu @(____ptrval____) s151552 r8192 d28672 u262144
> > > [ 1.075692] node[1] zonelist: 1:Normal 1:DMA32 1:DMA 5:Normal
> > > [ 1.081376] node[5] zonelist: 5:Normal 1:Normal 1:DMA32 1:DMA
> >
> > I hope to get to this before I leave for Christmas vacation; if not, I
> > will stare into it afterwards.
>
> I am sorry, but I didn't get to this sooner. I've got another idea,
> though. I concluded that the whole dance is simply bogus and we should
> treat memoryless nodes, well, as nodes with no memory ranges rather than
> special-case them. Could you give the following a spin, please?
>
Sure, I have queued a loan request for the remote machine. It will take some time.
Regards,
Pingfan
> ---
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..0e79445cfd85 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
>
> node_data[nid] = nd;
> memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> }
>
> /**
> @@ -535,6 +533,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> /* Account for nodes with cpus and no memory */
> node_possible_map = numa_nodes_parsed;
> numa_nodemask_from_meminfo(&node_possible_map, mi);
> + pr_info("parsed=%*pbl, possible=%*pbl\n", nodemask_pr_args(&numa_nodes_parsed), nodemask_pr_args(&node_possible_map));
> if (WARN_ON(nodes_empty(node_possible_map)))
> return -EINVAL;
>
> @@ -570,7 +569,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> return -EINVAL;
>
> /* Finally register nodes. */
> - for_each_node_mask(nid, node_possible_map) {
> + for_each_node_mask(nid, numa_nodes_parsed) {
> u64 start = PFN_PHYS(max_pfn);
> u64 end = 0;
>
> @@ -581,9 +580,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> end = max(mi->blk[i].end, end);
> }
>
> - if (start >= end)
> - continue;
> -
> /*
> * Don't confuse VM with a node that doesn't have the
> * minimum amount of memory:
> @@ -592,6 +588,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (end)
> + node_set_online(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +719,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +746,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2ec9cc407216..52e54d16662a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
> int node, load, nr_nodes = 0;
> nodemask_t used_mask;
> int local_node, prev_node;
> + struct zone *zone;
> + struct zoneref *z;
>
> /* NUMA-aware ordering of nodes */
> local_node = pgdat->node_id;
> @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
>
> build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> build_thisnode_zonelists(pgdat);
> +
> + pr_info("node[%d] zonelist: ", pgdat->node_id);
> + for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> + pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> + pr_cont("\n");
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -5361,10 +5368,11 @@ static void __build_all_zonelists(void *data)
> if (self && !node_online(self->node_id)) {
> build_zonelists(self);
> } else {
> - for_each_online_node(nid) {
> + for_each_node(nid) {
> pg_data_t *pgdat = NODE_DATA(nid);
>
> - build_zonelists(pgdat);
> + if (pgdat)
> + build_zonelists(pgdat);
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -6644,10 +6652,8 @@ static unsigned long __init find_min_pfn_for_node(int nid)
> for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
> min_pfn = min(min_pfn, start_pfn);
>
> - if (min_pfn == ULONG_MAX) {
> - pr_warn("Could not find start_pfn for node %d\n", nid);
> + if (min_pfn == ULONG_MAX)
> return 0;
> - }
>
> return min_pfn;
> }
> @@ -6991,8 +6997,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
> mminit_verify_pageflags_layout();
> setup_nr_node_ids();
> zero_resv_unavail();
> - for_each_online_node(nid) {
> + for_each_node(nid) {
> pg_data_t *pgdat = NODE_DATA(nid);
> +
> + if (!pgdat)
> + continue;
> +
> free_area_init_node(nid, NULL,
> find_min_pfn_for_node(nid), NULL);
>
> --
> Michal Hocko
> SUSE Labs
On Tue, Jan 8, 2019 at 10:34 PM Michal Hocko <[email protected]> wrote:
>
> On Thu 20-12-18 10:19:34, Michal Hocko wrote:
> > On Thu 20-12-18 15:19:39, Pingfan Liu wrote:
> > > Hi Michal,
> > >
> > > With this patch applied on top of the old one, I got the following
> > > message. Please see the attachment.
> > [...]
> > > [ 0.409637] NUMA: Node 1 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00000000-0x7fffffff]
> > > [ 0.419858] NUMA: Node 1 [mem 0x00000000-0x7fffffff] + [mem 0x100000000-0x47fffffff] -> [mem 0x00000000-0x47fffffff]
> > > [ 0.430356] NODE_DATA(0) allocated [mem 0x87efd4000-0x87effefff]
> > > [ 0.436325] NODE_DATA(0) on node 5
> > > [ 0.440092] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.447078] node[0] zonelist:
> > > [ 0.450106] NODE_DATA(1) allocated [mem 0x47ffd5000-0x47fffffff]
> > > [ 0.456114] NODE_DATA(2) allocated [mem 0x87efa9000-0x87efd3fff]
> > > [ 0.462064] NODE_DATA(2) on node 5
> > > [ 0.465852] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.472813] node[2] zonelist:
> > > [ 0.475846] NODE_DATA(3) allocated [mem 0x87ef7e000-0x87efa8fff]
> > > [ 0.481827] NODE_DATA(3) on node 5
> > > [ 0.485590] Initmem setup node 3 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.492575] node[3] zonelist:
> > > [ 0.495608] NODE_DATA(4) allocated [mem 0x87ef53000-0x87ef7dfff]
> > > [ 0.501587] NODE_DATA(4) on node 5
> > > [ 0.505349] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.512334] node[4] zonelist:
> > > [ 0.515370] NODE_DATA(5) allocated [mem 0x87ef28000-0x87ef52fff]
> > > [ 0.521384] NODE_DATA(6) allocated [mem 0x87eefd000-0x87ef27fff]
> > > [ 0.527329] NODE_DATA(6) on node 5
> > > [ 0.531091] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.538076] node[6] zonelist:
> > > [ 0.541109] NODE_DATA(7) allocated [mem 0x87eed2000-0x87eefcfff]
> > > [ 0.547090] NODE_DATA(7) on node 5
> > > [ 0.550851] Initmem setup node 7 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.557836] node[7] zonelist:
> >
> > OK, so it is clear that building zonelists this early is not going to
> > fly. We do not have the complete information yet. I am not sure when we
> > get that at this moment, but I suspect we either need to move that
> > initialization to a sooner stage or reconsider whether the phase when we
> > build zonelists really needs to consider only online NUMA nodes.
> >
> > [...]
> > > [ 1.067658] percpu: Embedded 46 pages/cpu @(____ptrval____) s151552 r8192 d28672 u262144
> > > [ 1.075692] node[1] zonelist: 1:Normal 1:DMA32 1:DMA 5:Normal
> > > [ 1.081376] node[5] zonelist: 5:Normal 1:Normal 1:DMA32 1:DMA
> >
> > I hope to get to this before I leave for Christmas vacation; if not, I
> > will stare into it afterwards.
>
> I am sorry, but I didn't get to this sooner. I've got another idea,
> though. I concluded that the whole dance is simply bogus and we should
> treat memoryless nodes, well, as nodes with no memory ranges rather than
> special-case them. Could you give the following a spin, please?
>
> ---
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..0e79445cfd85 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
>
> node_data[nid] = nd;
> memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> }
>
> /**
> @@ -535,6 +533,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> /* Account for nodes with cpus and no memory */
> node_possible_map = numa_nodes_parsed;
> numa_nodemask_from_meminfo(&node_possible_map, mi);
> + pr_info("parsed=%*pbl, possible=%*pbl\n", nodemask_pr_args(&numa_nodes_parsed), nodemask_pr_args(&node_possible_map));
> if (WARN_ON(nodes_empty(node_possible_map)))
> return -EINVAL;
>
> @@ -570,7 +569,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> return -EINVAL;
>
> /* Finally register nodes. */
> - for_each_node_mask(nid, node_possible_map) {
> + for_each_node_mask(nid, numa_nodes_parsed) {
> u64 start = PFN_PHYS(max_pfn);
> u64 end = 0;
>
> @@ -581,9 +580,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> end = max(mi->blk[i].end, end);
> }
>
> - if (start >= end)
> - continue;
> -
> /*
> * Don't confuse VM with a node that doesn't have the
> * minimum amount of memory:
> @@ -592,6 +588,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (end)
> + node_set_online(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +719,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +746,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2ec9cc407216..52e54d16662a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
> int node, load, nr_nodes = 0;
> nodemask_t used_mask;
> int local_node, prev_node;
> + struct zone *zone;
> + struct zoneref *z;
>
> /* NUMA-aware ordering of nodes */
> local_node = pgdat->node_id;
> @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
>
> build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> build_thisnode_zonelists(pgdat);
> +
> + pr_info("node[%d] zonelist: ", pgdat->node_id);
> + for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> + pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> + pr_cont("\n");
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -5361,10 +5368,11 @@ static void __build_all_zonelists(void *data)
> if (self && !node_online(self->node_id)) {
> build_zonelists(self);
> } else {
> - for_each_online_node(nid) {
> + for_each_node(nid) {
> pg_data_t *pgdat = NODE_DATA(nid);
>
> - build_zonelists(pgdat);
> + if (pgdat)
> + build_zonelists(pgdat);
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -6644,10 +6652,8 @@ static unsigned long __init find_min_pfn_for_node(int nid)
> for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
> min_pfn = min(min_pfn, start_pfn);
>
> - if (min_pfn == ULONG_MAX) {
> - pr_warn("Could not find start_pfn for node %d\n", nid);
> + if (min_pfn == ULONG_MAX)
> return 0;
> - }
>
> return min_pfn;
> }
> @@ -6991,8 +6997,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
> mminit_verify_pageflags_layout();
> setup_nr_node_ids();
> zero_resv_unavail();
> - for_each_online_node(nid) {
> + for_each_node(nid) {
> pg_data_t *pgdat = NODE_DATA(nid);
> +
> + if (!pgdat)
> + continue;
> +
> free_area_init_node(nid, NULL,
> find_min_pfn_for_node(nid), NULL);
>
Hi, this patch works! Feel free to add my Tested-by.
Best Regards
Pingfan
On Fri 11-01-19 11:12:45, Pingfan Liu wrote:
[...]
> Hi, this patch works! Feel free to add my Tested-by.
Thanks a lot for your testing! Now it is time to think seriously about
whether this is the right thing to do, and to sync all the other arches
that might have the same problem. I will take care of it. Thanks for
your patience and effort. I will post something, hopefully soon, in a
separate thread, as this one has grown too large already.
--
Michal Hocko
SUSE Labs