In the current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
node_data is dynamically allocated. If CONFIG_NEED_MULTIPLE_NODES=n,
we use a global variable named "contig_page_data".
If CONFIG_DEBUG_VIRTUAL is not enabled, __pa() can handle both the dynamic
allocation and symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
"virt_to_phys used for non-linear address" warning when booting.
To fix it, create a small function to handle both translations.
Warning message:
[ 0.000000] ------------[ cut here ]------------
[ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
[ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
[ 0.000000] Hardware name: linux,dummy-virt (DT)
[ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
[ 0.000000] pc : __virt_to_phys+0x58/0x68
[ 0.000000] lr : __virt_to_phys+0x54/0x68
[ 0.000000] sp : ffff800011833e70
[ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
[ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
[ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
[ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
[ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
[ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
[ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
[ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
[ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
[ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
[ 0.000000] Call trace:
[ 0.000000] __virt_to_phys+0x58/0x68
[ 0.000000] check_usemap_section_nr+0x50/0xfc
[ 0.000000] sparse_init_nid+0x1ac/0x28c
[ 0.000000] sparse_init+0x1c4/0x1e0
[ 0.000000] bootmem_init+0x60/0x90
[ 0.000000] setup_arch+0x184/0x1f0
[ 0.000000] start_kernel+0x78/0x488
[ 0.000000] ---[ end trace f68728a0d3053b60 ]---
Signed-off-by: Miles Chen <[email protected]>
---
mm/sparse.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/mm/sparse.c b/mm/sparse.c
index b2ada9dc00cb..55c18aff3e42 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
return sizeof(struct mem_section_usage) + usemap_size();
}
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ return __pa_symbol(pgdat);
+#else
+ return __pa(pgdat);
+#endif
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
@@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
@@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
}
usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
- pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
--
2.18.0
On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> we use a global variable named "contig_page_data".
>
> If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> "virt_to_phys used for non-linear address" warning when booting.
Maybe we'll just allocate pgdat for CONFIG_NEED_MULTIPLE_NODES=n (which is
essentially !NUMA) case in, say, free_area_init()?
> To fix it, create a small function to handle both translation.
>
> Warning message:
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> [ 0.000000] Modules linked in:
> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> [ 0.000000] Hardware name: linux,dummy-virt (DT)
> [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> [ 0.000000] pc : __virt_to_phys+0x58/0x68
> [ 0.000000] lr : __virt_to_phys+0x54/0x68
> [ 0.000000] sp : ffff800011833e70
> [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> [ 0.000000] Call trace:
> [ 0.000000] __virt_to_phys+0x58/0x68
> [ 0.000000] check_usemap_section_nr+0x50/0xfc
> [ 0.000000] sparse_init_nid+0x1ac/0x28c
> [ 0.000000] sparse_init+0x1c4/0x1e0
> [ 0.000000] bootmem_init+0x60/0x90
> [ 0.000000] setup_arch+0x184/0x1f0
> [ 0.000000] start_kernel+0x78/0x488
> [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
>
> Signed-off-by: Miles Chen <[email protected]>
> ---
> mm/sparse.c | 13 +++++++++++--
> 1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/mm/sparse.c b/mm/sparse.c
> index b2ada9dc00cb..55c18aff3e42 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> return sizeof(struct mem_section_usage) + usemap_size();
> }
>
> +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> +{
> +#ifndef CONFIG_NEED_MULTIPLE_NODES
> + return __pa_symbol(pgdat);
> +#else
> + return __pa(pgdat);
> +#endif
> +}
> +
> #ifdef CONFIG_MEMORY_HOTREMOVE
> static struct mem_section_usage * __init
> sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> * from the same section as the pgdat where possible to avoid
> * this problem.
> */
> - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> limit = goal + (1UL << PA_SECTION_SHIFT);
> nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> again:
> @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> }
>
> usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> if (usemap_snr == pgdat_snr)
> return;
>
> --
> 2.18.0
--
Sincerely yours,
Mike.
On Tue, 2021-05-11 at 13:24 +0300, Mike Rapoport wrote:
> On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> > we use a global variable named "contig_page_data".
> >
> > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > "virt_to_phys used for non-linear address" warning when booting.
>
> Maybe we'll just allocate pgdat for CONFIG_NEED_MULTIPLE_NODES=n (which is
> essentially !NUMA) case in, say, free_area_init()?
thanks for your comment.
I checked the source tree and found that contig_page_data is used by
crash_core.c as a symbol. I am not familiar with crash_core but I guess
allocate pgdat may break this crash_core users.
For example: some userspace scripts want to read the address of
contig_page_data symbol from a corefile.
kernel/crash_core.c:460: VMCOREINFO_SYMBOL(contig_page_data);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
VMCOREINFO_SYMBOL(contig_page_data);
#endif
Perhaps we should use current simple approach?
>
> > To fix it, create a small function to handle both translation.
> >
> > Warning message:
> > [ 0.000000] ------------[ cut here ]------------
> > [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> > [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> > [ 0.000000] Modules linked in:
> > [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> > [ 0.000000] Hardware name: linux,dummy-virt (DT)
> > [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> > [ 0.000000] pc : __virt_to_phys+0x58/0x68
> > [ 0.000000] lr : __virt_to_phys+0x54/0x68
> > [ 0.000000] sp : ffff800011833e70
> > [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> > [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> > [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> > [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> > [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> > [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> > [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> > [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> > [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> > [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> > [ 0.000000] Call trace:
> > [ 0.000000] __virt_to_phys+0x58/0x68
> > [ 0.000000] check_usemap_section_nr+0x50/0xfc
> > [ 0.000000] sparse_init_nid+0x1ac/0x28c
> > [ 0.000000] sparse_init+0x1c4/0x1e0
> > [ 0.000000] bootmem_init+0x60/0x90
> > [ 0.000000] setup_arch+0x184/0x1f0
> > [ 0.000000] start_kernel+0x78/0x488
> > [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
> >
> > Signed-off-by: Miles Chen <[email protected]>
> > ---
> > mm/sparse.c | 13 +++++++++++--
> > 1 file changed, 11 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/sparse.c b/mm/sparse.c
> > index b2ada9dc00cb..55c18aff3e42 100644
> > --- a/mm/sparse.c
> > +++ b/mm/sparse.c
> > @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> > return sizeof(struct mem_section_usage) + usemap_size();
> > }
> >
> > +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> > +{
> > +#ifndef CONFIG_NEED_MULTIPLE_NODES
> > + return __pa_symbol(pgdat);
> > +#else
> > + return __pa(pgdat);
> > +#endif
> > +}
> > +
> > #ifdef CONFIG_MEMORY_HOTREMOVE
> > static struct mem_section_usage * __init
> > sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > * from the same section as the pgdat where possible to avoid
> > * this problem.
> > */
> > - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > limit = goal + (1UL << PA_SECTION_SHIFT);
> > nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> > again:
> > @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> > }
> >
> > usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> > - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> > + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> > if (usemap_snr == pgdat_snr)
> > return;
> >
> > --
> > 2.18.0
>
On Wed, May 12, 2021 at 01:33:20PM +0800, Miles Chen wrote:
> On Tue, 2021-05-11 at 13:24 +0300, Mike Rapoport wrote:
> > On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> > > we use a global variable named "contig_page_data".
> > >
> > > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> > > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > > "virt_to_phys used for non-linear address" warning when booting.
> >
> > Maybe we'll just allocate pgdat for CONFIG_NEED_MULTIPLE_NODES=n (which is
> > essentially !NUMA) case in, say, free_area_init()?
>
>
> thanks for your comment.
>
> I check the source tree and found that contig_page_data is used by
> crash_core.c as a symbol. I am not familiar with crash_core but I guess
> allocate pgdat may break this crash_core users.
>
> For example: some userspace scripts want to read the address of
> contig_page_data symbol from a corefile.
>
> kernel/crash_core.c:460: VMCOREINFO_SYMBOL(contig_page_data);
>
> #ifndef CONFIG_NEED_MULTIPLE_NODES
> VMCOREINFO_SYMBOL(mem_map);
> VMCOREINFO_SYMBOL(contig_page_data);
> #endif
My understanding is that VMCOREINFO_SYMBOL() should correspond to actual
symbol. If there is no contig_page_data symbol, there is no need for
VMCOREINFO_SYMBOL() either.
> Perhaps we should use current simple approach?
>
>
> >
> > > To fix it, create a small function to handle both translation.
> > >
> > > Warning message:
> > > [ 0.000000] ------------[ cut here ]------------
> > > [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> > > [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> > > [ 0.000000] Modules linked in:
> > > [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> > > [ 0.000000] Hardware name: linux,dummy-virt (DT)
> > > [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> > > [ 0.000000] pc : __virt_to_phys+0x58/0x68
> > > [ 0.000000] lr : __virt_to_phys+0x54/0x68
> > > [ 0.000000] sp : ffff800011833e70
> > > [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> > > [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> > > [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> > > [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> > > [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> > > [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> > > [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> > > [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> > > [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> > > [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> > > [ 0.000000] Call trace:
> > > [ 0.000000] __virt_to_phys+0x58/0x68
> > > [ 0.000000] check_usemap_section_nr+0x50/0xfc
> > > [ 0.000000] sparse_init_nid+0x1ac/0x28c
> > > [ 0.000000] sparse_init+0x1c4/0x1e0
> > > [ 0.000000] bootmem_init+0x60/0x90
> > > [ 0.000000] setup_arch+0x184/0x1f0
> > > [ 0.000000] start_kernel+0x78/0x488
> > > [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
> > >
> > > Signed-off-by: Miles Chen <[email protected]>
> > > ---
> > > mm/sparse.c | 13 +++++++++++--
> > > 1 file changed, 11 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/sparse.c b/mm/sparse.c
> > > index b2ada9dc00cb..55c18aff3e42 100644
> > > --- a/mm/sparse.c
> > > +++ b/mm/sparse.c
> > > @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> > > return sizeof(struct mem_section_usage) + usemap_size();
> > > }
> > >
> > > +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> > > +{
> > > +#ifndef CONFIG_NEED_MULTIPLE_NODES
> > > + return __pa_symbol(pgdat);
> > > +#else
> > > + return __pa(pgdat);
> > > +#endif
> > > +}
> > > +
> > > #ifdef CONFIG_MEMORY_HOTREMOVE
> > > static struct mem_section_usage * __init
> > > sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > > @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > > * from the same section as the pgdat where possible to avoid
> > > * this problem.
> > > */
> > > - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > > + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > > limit = goal + (1UL << PA_SECTION_SHIFT);
> > > nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> > > again:
> > > @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> > > }
> > >
> > > usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> > > - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> > > + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> > > if (usemap_snr == pgdat_snr)
> > > return;
> > >
> > > --
> > > 2.18.0
> >
>
--
Sincerely yours,
Mike.
On 05/12/21 at 12:37pm, Mike Rapoport wrote:
> On Wed, May 12, 2021 at 01:33:20PM +0800, Miles Chen wrote:
> > On Tue, 2021-05-11 at 13:24 +0300, Mike Rapoport wrote:
> > > On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > > > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > > > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> > > > we use a global variable named "contig_page_data".
> > > >
> > > > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> > > > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > > > "virt_to_phys used for non-linear address" warning when booting.
> > >
> > > Maybe we'll just allocate pgdat for CONFIG_NEED_MULTIPLE_NODES=n (which is
> > > essentially !NUMA) case in, say, free_area_init()?
> >
> >
> > thanks for your comment.
> >
> > I check the source tree and found that contig_page_data is used by
> > crash_core.c as a symbol. I am not familiar with crash_core but I guess
> > allocate pgdat may break this crash_core users.
> >
> > For example: some userspace scripts want to read the address of
> > contig_page_data symbol from a corefile.
> >
> > kernel/crash_core.c:460: VMCOREINFO_SYMBOL(contig_page_data);
> >
> > #ifndef CONFIG_NEED_MULTIPLE_NODES
> > VMCOREINFO_SYMBOL(mem_map);
> > VMCOREINFO_SYMBOL(contig_page_data);
> > #endif
>
> My understanding is that VMCOREINFO_SYMBOL() should correspond to actual
> symbol. If there is no contig_page_data symbol, there is no need for
> VMCOREINFO_SYMBOL() either.
Yeah, it's exported for makedumpfile and crash utility to parse and get
the memory layout of the corrupted kernel. If removing it, makedumpfile
will get it from node_data[]. Looks like a good idea to unify code for
numa|!numa on pglist_data instances.
Add Kazu to CC since he maintain makedumpfile and Crash utilities.
My concern is that this only happens on arm/arm64/riscv, does it mean the
warning is not necessary, so can be removed? Or we need to check if
CONFIG_DEBUG_VIRTUAL doesn't work well in this case.
Thanks
Baoquan
On Thu, 2021-05-13 at 09:16 +0800, Baoquan He wrote:
> On 05/12/21 at 12:37pm, Mike Rapoport wrote:
> > On Wed, May 12, 2021 at 01:33:20PM +0800, Miles Chen wrote:
> > > On Tue, 2021-05-11 at 13:24 +0300, Mike Rapoport wrote:
> > > > On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > > > > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > > > > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> > > > > we use a global variable named "contig_page_data".
> > > > >
> > > > > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> > > > > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > > > > "virt_to_phys used for non-linear address" warning when booting.
> > > >
> > > > Maybe we'll just allocate pgdat for CONFIG_NEED_MULTIPLE_NODES=n (which is
> > > > essentially !NUMA) case in, say, free_area_init()?
> > >
> > >
> > > thanks for your comment.
> > >
> > > I check the source tree and found that contig_page_data is used by
> > > crash_core.c as a symbol. I am not familiar with crash_core but I guess
> > > allocate pgdat may break this crash_core users.
> > >
> > > For example: some userspace scripts want to read the address of
> > > contig_page_data symbol from a corefile.
> > >
> > > kernel/crash_core.c:460: VMCOREINFO_SYMBOL(contig_page_data);
> > >
> > > #ifndef CONFIG_NEED_MULTIPLE_NODES
> > > VMCOREINFO_SYMBOL(mem_map);
> > > VMCOREINFO_SYMBOL(contig_page_data);
> > > #endif
> >
> > My understanding is that VMCOREINFO_SYMBOL() should correspond to actual
> > symbol. If there is no contig_page_data symbol, there is no need for
> > VMCOREINFO_SYMBOL() either.
>
> Yeah, it's exported for makedumpfile and crash utility to parse and get
> the memory layout of the corrupted kernel. If removing it, makedumpfile
> will get it from node_data[]. Looks like a good idea to unify code for
> numa|!numa on pglist_data instances.
>
> Add Kazu to CC since he maintain makedumpfile and Crash utilities.
thanks for adding the experts in. (I searched the source code of crash
last night and found that contig_page_data is used in memory.c)
I will move the allocation and initialization to free_area_init() and
submit patch v2.
Miles
>
> My concern is that that only happens on arm/arm64/riscv, does it mean the
> warning is not necessary, so can be removed? Or we need to check if
> CONFIG_DEBUG_VIRTUAL doesn't work well in this case.
>
> Thanks
> Baoquan
>
(add arm64 people)
On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
The node structures are allocated from memblock rather than kzmalloc, so
I'd suggest to use "dynamically allocated":
... node_data is dynamically allocated
> we use a global variable named "contig_page_data".
>
> If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
^ comma
And here as well, rather then mention kzalloc
... __pa() can handle both dynamic allocation and symbol cases.
> symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> "virt_to_phys used for non-linear address" warning when booting.
>
> To fix it, create a small function to handle both translation.
More generally, I wonder how other architectures than support DEBUG_VIRTUAL
cope with this?
Maybe such lack of consistency between debug and no-debug version of
arm64::virt_to_phys() is what needs to be fixed at the first place?
> Warning message:
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> [ 0.000000] Modules linked in:
> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> [ 0.000000] Hardware name: linux,dummy-virt (DT)
> [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> [ 0.000000] pc : __virt_to_phys+0x58/0x68
> [ 0.000000] lr : __virt_to_phys+0x54/0x68
> [ 0.000000] sp : ffff800011833e70
> [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> [ 0.000000] Call trace:
> [ 0.000000] __virt_to_phys+0x58/0x68
> [ 0.000000] check_usemap_section_nr+0x50/0xfc
> [ 0.000000] sparse_init_nid+0x1ac/0x28c
> [ 0.000000] sparse_init+0x1c4/0x1e0
> [ 0.000000] bootmem_init+0x60/0x90
> [ 0.000000] setup_arch+0x184/0x1f0
> [ 0.000000] start_kernel+0x78/0x488
> [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
>
> Signed-off-by: Miles Chen <[email protected]>
> ---
> mm/sparse.c | 13 +++++++++++--
> 1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/mm/sparse.c b/mm/sparse.c
> index b2ada9dc00cb..55c18aff3e42 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> return sizeof(struct mem_section_usage) + usemap_size();
> }
>
> +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> +{
> +#ifndef CONFIG_NEED_MULTIPLE_NODES
> + return __pa_symbol(pgdat);
> +#else
> + return __pa(pgdat);
> +#endif
> +}
> +
> #ifdef CONFIG_MEMORY_HOTREMOVE
> static struct mem_section_usage * __init
> sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> * from the same section as the pgdat where possible to avoid
> * this problem.
> */
> - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> limit = goal + (1UL << PA_SECTION_SHIFT);
> nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> again:
> @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> }
>
> usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> if (usemap_snr == pgdat_snr)
> return;
>
> --
> 2.18.0
--
Sincerely yours,
Mike.
On Wed, 2021-05-19 at 08:30 +0300, Mike Rapoport wrote:
> (add arm64 people)
>
> On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
>
> The node structures are allocated from memblock rather than kzmalloc, so
> I'd suggest to use "dynamically allocated":
>
> ... node_data is dynamically allocated
no problem. "dynamically allocated" is better.
>
> > we use a global variable named "contig_page_data".
> >
> > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
>
> ^ coma
>
> And here as well, rather then mention kzalloc
ok, will fix this in v3
>
> ... __pa() can handle both dynamic allocation and symbol cases.
>
> > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > "virt_to_phys used for non-linear address" warning when booting.
> >
> > To fix it, create a small function to handle both translation.
>
> More generally, I wonder how other architectures than support DEBUG_VIRTUAL
> cope with this?
arm64 has supported DEBUG_VIRTUAL since v4.11. Before v4.11, a driver could get
incorrect PA by using __pa() for non-linear addresses and create
hard-to-debug issues.
arm32...I remember an old issue in ext4: __pa() is used to convert
non-linear kmap'ed addresses (when CONFIG_HIGHMEM=y).
There are multiple architectures which support CONFIG_DEBUG_VIRTUAL.
It helps people to catch non-linear addresses at the first place.
find arch -name 'physaddr.c'
arch/arm/mm/physaddr.c
arch/arm64/mm/physaddr.c
arch/mips/mm/physaddr.c
arch/riscv/mm/physaddr.c
arch/x86/mm/physaddr.c
>
> Maybe such lack of consistency between debug and no-debug version of
> arm64::virt_to_phys() is what needs to be fixed at the first place?
>
> > Warning message:
> > [ 0.000000] ------------[ cut here ]------------
> > [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> > [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> > [ 0.000000] Modules linked in:
> > [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> > [ 0.000000] Hardware name: linux,dummy-virt (DT)
> > [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> > [ 0.000000] pc : __virt_to_phys+0x58/0x68
> > [ 0.000000] lr : __virt_to_phys+0x54/0x68
> > [ 0.000000] sp : ffff800011833e70
> > [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> > [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> > [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> > [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> > [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> > [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> > [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> > [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> > [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> > [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> > [ 0.000000] Call trace:
> > [ 0.000000] __virt_to_phys+0x58/0x68
> > [ 0.000000] check_usemap_section_nr+0x50/0xfc
> > [ 0.000000] sparse_init_nid+0x1ac/0x28c
> > [ 0.000000] sparse_init+0x1c4/0x1e0
> > [ 0.000000] bootmem_init+0x60/0x90
> > [ 0.000000] setup_arch+0x184/0x1f0
> > [ 0.000000] start_kernel+0x78/0x488
> > [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
> >
> > Signed-off-by: Miles Chen <[email protected]>
> > ---
> > mm/sparse.c | 13 +++++++++++--
> > 1 file changed, 11 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/sparse.c b/mm/sparse.c
> > index b2ada9dc00cb..55c18aff3e42 100644
> > --- a/mm/sparse.c
> > +++ b/mm/sparse.c
> > @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> > return sizeof(struct mem_section_usage) + usemap_size();
> > }
> >
> > +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> > +{
> > +#ifndef CONFIG_NEED_MULTIPLE_NODES
> > + return __pa_symbol(pgdat);
> > +#else
> > + return __pa(pgdat);
> > +#endif
> > +}
> > +
> > #ifdef CONFIG_MEMORY_HOTREMOVE
> > static struct mem_section_usage * __init
> > sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > * from the same section as the pgdat where possible to avoid
> > * this problem.
> > */
> > - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > limit = goal + (1UL << PA_SECTION_SHIFT);
> > nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> > again:
> > @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> > }
> >
> > usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> > - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> > + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> > if (usemap_snr == pgdat_snr)
> > return;
> >
> > --
> > 2.18.0
>
On Wed, May 19, 2021 at 03:18:49PM +0800, Miles Chen wrote:
> On Wed, 2021-05-19 at 08:30 +0300, Mike Rapoport wrote:
> > (add arm64 people)
> >
> > On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> >
> > The node structures are allocated from memblock rather than kzmalloc, so
> > I'd suggest to use "dynamically allocated":
> >
> > ... node_data is dynamically allocated
>
>
> no problem. "dynamically allocated" is better.
>
> >
> > > we use a global variable named "contig_page_data".
> > >
> > > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> >
> > ^ comma
> >
> > And here as well, rather than mentioning kzalloc
>
>
> ok, will fix this in v3
>
> >
> > ... __pa() can handle both dynamic allocation and symbol cases.
> >
> > > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > > "virt_to_phys used for non-linear address" warning when booting.
> > >
> > > To fix it, create a small function to handle both translation.
> >
> > More generally, I wonder how other architectures that support DEBUG_VIRTUAL
> > cope with this?
>
> arm64 has supported DEBUG_VIRTUAL since v4.11. Before v4.11, a driver could get
> an incorrect PA by using __pa() on non-linear addresses and create
> hard-to-debug issues.
>
> arm32...I remember an old issue in ext4: __pa() is used to convert
> non-linear kmap'ed addresses (when CONFIG_HIGHMEM=y).
>
> There are multiple architectures which support CONFIG_DEBUG_VIRTUAL.
> It helps people to catch non-linear addresses in the first place.
>
> find arch -name 'physaddr.c'
> arch/arm/mm/physaddr.c
> arch/arm64/mm/physaddr.c
> arch/mips/mm/physaddr.c
> arch/riscv/mm/physaddr.c
> arch/x86/mm/physaddr.c
Right, but it seems they check for different things on different
architectures...
Just to make it clear, I'm Ok with your patch.
All I'm saying is that we probably need to revisit DEBUG_VIRTUAL to make it
consistent across architectures.
> >
> > Maybe such lack of consistency between debug and no-debug version of
> > arm64::virt_to_phys() is what needs to be fixed in the first place?
> >
> > > Warning message:
> > > [ 0.000000] ------------[ cut here ]------------
> > > [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> > > [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> > > [ 0.000000] Modules linked in:
> > > [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> > > [ 0.000000] Hardware name: linux,dummy-virt (DT)
> > > [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> > > [ 0.000000] pc : __virt_to_phys+0x58/0x68
> > > [ 0.000000] lr : __virt_to_phys+0x54/0x68
> > > [ 0.000000] sp : ffff800011833e70
> > > [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> > > [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> > > [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> > > [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> > > [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> > > [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> > > [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> > > [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> > > [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> > > [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> > > [ 0.000000] Call trace:
> > > [ 0.000000] __virt_to_phys+0x58/0x68
> > > [ 0.000000] check_usemap_section_nr+0x50/0xfc
> > > [ 0.000000] sparse_init_nid+0x1ac/0x28c
> > > [ 0.000000] sparse_init+0x1c4/0x1e0
> > > [ 0.000000] bootmem_init+0x60/0x90
> > > [ 0.000000] setup_arch+0x184/0x1f0
> > > [ 0.000000] start_kernel+0x78/0x488
> > > [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
> > >
> > > Signed-off-by: Miles Chen <[email protected]>
> > > ---
> > > mm/sparse.c | 13 +++++++++++--
> > > 1 file changed, 11 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/sparse.c b/mm/sparse.c
> > > index b2ada9dc00cb..55c18aff3e42 100644
> > > --- a/mm/sparse.c
> > > +++ b/mm/sparse.c
> > > @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> > > return sizeof(struct mem_section_usage) + usemap_size();
> > > }
> > >
> > > +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> > > +{
> > > +#ifndef CONFIG_NEED_MULTIPLE_NODES
> > > + return __pa_symbol(pgdat);
> > > +#else
> > > + return __pa(pgdat);
> > > +#endif
> > > +}
> > > +
> > > #ifdef CONFIG_MEMORY_HOTREMOVE
> > > static struct mem_section_usage * __init
> > > sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > > @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > > * from the same section as the pgdat where possible to avoid
> > > * this problem.
> > > */
> > > - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > > + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > > limit = goal + (1UL << PA_SECTION_SHIFT);
> > > nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> > > again:
> > > @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> > > }
> > >
> > > usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> > > - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> > > + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> > > if (usemap_snr == pgdat_snr)
> > > return;
> > >
> > > --
> > > 2.18.0
> >
>
--
Sincerely yours,
Mike.
On Wed, 2021-05-19 at 10:56 +0300, Mike Rapoport wrote:
> On Wed, May 19, 2021 at 03:18:49PM +0800, Miles Chen wrote:
> > On Wed, 2021-05-19 at 08:30 +0300, Mike Rapoport wrote:
> > > (add arm64 people)
> > >
> > > On Tue, May 11, 2021 at 05:31:14PM +0800, Miles Chen wrote:
> > > > In current implementation of node_data, if CONFIG_NEED_MULTIPLE_NODES=y,
> > > > node_data is allocated by kzmalloc. If CONFIG_NEED_MULTIPLE_NODES=n,
> > >
> > > The node structures are allocated from memblock rather than kzmalloc, so
> > > I'd suggest to use "dynamically allocated":
> > >
> > > ... node_data is dynamically allocated
> >
> >
> > no problem. "dynamically allocated" is better.
> >
> > >
> > > > we use a global variable named "contig_page_data".
> > > >
> > > > If CONFIG_DEBUG_VIRTUAL is not enabled. __pa() can handle both kzalloc and
> > >
> > > ^ comma
> > >
> > > And here as well, rather than mentioning kzalloc
> >
> >
> > ok, will fix this in v3
> >
> > >
> > > ... __pa() can handle both dynamic allocation and symbol cases.
> > >
> > > > symbol cases. But if CONFIG_DEBUG_VIRTUAL is set, we will have the
> > > > "virt_to_phys used for non-linear address" warning when booting.
> > > >
> > > > To fix it, create a small function to handle both translation.
> > >
> > > More generally, I wonder how other architectures that support DEBUG_VIRTUAL
> > > cope with this?
> >
> > arm64 has supported DEBUG_VIRTUAL since v4.11. Before v4.11, a driver could get
> > an incorrect PA by using __pa() on non-linear addresses and create
> > hard-to-debug issues.
> >
> > arm32...I remember an old issue in ext4: __pa() is used to convert
> > non-linear kmap'ed addresses (when CONFIG_HIGHMEM=y).
> >
> > There are multiple architectures which support CONFIG_DEBUG_VIRTUAL.
> > It helps people to catch non-linear addresses in the first place.
> >
> > find arch -name 'physaddr.c'
> > arch/arm/mm/physaddr.c
> > arch/arm64/mm/physaddr.c
> > arch/mips/mm/physaddr.c
> > arch/riscv/mm/physaddr.c
> > arch/x86/mm/physaddr.c
>
> Right, but it seems they check for different things on different
> architectures...
>
> Just to make it clear, I'm Ok with your patch.
> All I'm saying is that we probably need to revisit DEBUG_VIRTUAL to make it
> consistent across architectures.
>
Understood, thanks for your review.
(I think different things are checked because the "linear address
space" is architecture-dependent and may change over time. For
example, on arm64, symbol addresses used to be in the linear address space
and were moved to the "non-linear address space" for address randomization.)
Miles
> > >
> > > Maybe such lack of consistency between debug and no-debug version of
> > > arm64::virt_to_phys() is what needs to be fixed at the first place?
> > >
> > > > Warning message:
> > > > [ 0.000000] ------------[ cut here ]------------
> > > > [ 0.000000] virt_to_phys used for non-linear address: (____ptrval____) (contig_page_data+0x0/0x1c00)
> > > > [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x58/0x68
> > > > [ 0.000000] Modules linked in:
> > > > [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.13.0-rc1-00074-g1140ab592e2e #3
> > > > [ 0.000000] Hardware name: linux,dummy-virt (DT)
> > > > [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--)
> > > > [ 0.000000] pc : __virt_to_phys+0x58/0x68
> > > > [ 0.000000] lr : __virt_to_phys+0x54/0x68
> > > > [ 0.000000] sp : ffff800011833e70
> > > > [ 0.000000] x29: ffff800011833e70 x28: 00000000418a0018 x27: 0000000000000000
> > > > [ 0.000000] x26: 000000000000000a x25: ffff800011b70000 x24: ffff800011b70000
> > > > [ 0.000000] x23: fffffc0001c00000 x22: ffff800011b70000 x21: 0000000047ffffb0
> > > > [ 0.000000] x20: 0000000000000008 x19: ffff800011b082c0 x18: ffffffffffffffff
> > > > [ 0.000000] x17: 0000000000000000 x16: ffff800011833bf9 x15: 0000000000000004
> > > > [ 0.000000] x14: 0000000000000fff x13: ffff80001186a548 x12: 0000000000000000
> > > > [ 0.000000] x11: 0000000000000000 x10: 00000000ffffffff x9 : 0000000000000000
> > > > [ 0.000000] x8 : ffff8000115c9000 x7 : 737520737968705f x6 : ffff800011b62ef8
> > > > [ 0.000000] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000
> > > > [ 0.000000] x2 : 0000000000000000 x1 : ffff80001159585e x0 : 0000000000000058
> > > > [ 0.000000] Call trace:
> > > > [ 0.000000] __virt_to_phys+0x58/0x68
> > > > [ 0.000000] check_usemap_section_nr+0x50/0xfc
> > > > [ 0.000000] sparse_init_nid+0x1ac/0x28c
> > > > [ 0.000000] sparse_init+0x1c4/0x1e0
> > > > [ 0.000000] bootmem_init+0x60/0x90
> > > > [ 0.000000] setup_arch+0x184/0x1f0
> > > > [ 0.000000] start_kernel+0x78/0x488
> > > > [ 0.000000] ---[ end trace f68728a0d3053b60 ]---
> > > >
> > > > Signed-off-by: Miles Chen <[email protected]>
> > > > ---
> > > > mm/sparse.c | 13 +++++++++++--
> > > > 1 file changed, 11 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/mm/sparse.c b/mm/sparse.c
> > > > index b2ada9dc00cb..55c18aff3e42 100644
> > > > --- a/mm/sparse.c
> > > > +++ b/mm/sparse.c
> > > > @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
> > > > return sizeof(struct mem_section_usage) + usemap_size();
> > > > }
> > > >
> > > > +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
> > > > +{
> > > > +#ifndef CONFIG_NEED_MULTIPLE_NODES
> > > > + return __pa_symbol(pgdat);
> > > > +#else
> > > > + return __pa(pgdat);
> > > > +#endif
> > > > +}
> > > > +
> > > > #ifdef CONFIG_MEMORY_HOTREMOVE
> > > > static struct mem_section_usage * __init
> > > > sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > > > @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> > > > * from the same section as the pgdat where possible to avoid
> > > > * this problem.
> > > > */
> > > > - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > > > + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
> > > > limit = goal + (1UL << PA_SECTION_SHIFT);
> > > > nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> > > > again:
> > > > @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
> > > > }
> > > >
> > > > usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> > > > - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> > > > + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
> > > > if (usemap_snr == pgdat_snr)
> > > > return;
> > > >
> > > > --
> > > > 2.18.0
> > >
> >
>