2023-01-26 16:44:16

by Georgi Djakov

[permalink] [raw]
Subject: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

From: Chris Goldsworthy <[email protected]>

It's useful to have an option to disable ZONE_DMA32 during boot, as
CONFIG_ZONE_DMA32 is enabled by default (on multiplatform kernels, for
example). There are platforms that do not use this zone, and in some high
memory pressure scenarios this would help ease the load on kswapd (leaving
file-backed memory intact / unreclaimed). When ZONE_DMA32 is enabled on
these platforms, kswapd is woken up more easily and drains the file cache,
which leads to some performance issues.

Signed-off-by: Chris Goldsworthy <[email protected]>
[georgi: updated commit text]
Signed-off-by: Georgi Djakov <[email protected]>
---
The main question here is whether we can have a kernel command line
option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
I can imagine this being useful also for Linux distros.

.../admin-guide/kernel-parameters.txt | 5 +++++
arch/arm64/mm/init.c | 20 ++++++++++++++++-
arch/x86/mm/init.c | 20 ++++++++++++++++-
include/linux/dma-direct.h | 22 +++++++++++++++++++
kernel/dma/direct.c | 5 +++--
kernel/dma/pool.c | 8 +++----
6 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb12df402161..854ff65ac6b0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1070,6 +1070,11 @@
Disable Dynamic DMA Window support. Use this
to workaround buggy firmware.

+ disable_dma32= [KNL]
+ Dynamically disable ZONE_DMA32 on kernels compiled with
+ CONFIG_ZONE_DMA32=y.
+
+
disable_ipv6= [IPV6]
See Documentation/networking/ipv6.rst.

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 58a0bb2c17f1..1a56098c0e19 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
return 0;
}

+/*
+ * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
+
/*
* reserve_crashkernel() - reserves memory for crash kernel
*
@@ -244,7 +250,7 @@ static void __init zone_sizes_init(void)
max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
+ max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : PFN_DOWN(dma32_phys_limit);
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = dma32_phys_limit;
#endif
@@ -253,6 +259,18 @@ static void __init zone_sizes_init(void)
free_area_init(max_zone_pfns);
}

+static int __init early_disable_dma32(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (!strcmp(buf, "on"))
+ disable_dma32 = true;
+
+ return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
+
int pfn_is_map_memory(unsigned long pfn)
{
phys_addr_t addr = PFN_PHYS(pfn);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cb258f58fdc8..b8af7e2f21f5 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;

static bool __initdata can_use_brk_pgt = true;

+/*
+ * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
+
/*
* Pages returned are already directly mapped.
*
@@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
+ max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
@@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
free_area_init(max_zone_pfns);
}

+static int __init early_disable_dma32(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (!strcmp(buf, "on"))
+ disable_dma32 = true;
+
+ return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
+
__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 18aade195884..ed69618cf1fc 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -24,6 +24,28 @@ struct bus_dma_region {
u64 offset;
};

+static inline bool zone_dma32_is_empty(int node)
+{
+#ifdef CONFIG_ZONE_DMA32
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
+#else
+ return true;
+#endif
+}
+
+static inline bool zone_dma32_are_empty(void)
+{
+ int node;
+
+ for_each_node(node)
+ if (!zone_dma32_is_empty(node))
+ return false;
+
+ return true;
+}
+
static inline dma_addr_t translate_phys_to_dma(struct device *dev,
phys_addr_t paddr)
{
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 63859a101ed8..754210c65658 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
*phys_limit = dma_to_phys(dev, dma_limit);
if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
return GFP_DMA;
- if (*phys_limit <= DMA_BIT_MASK(32))
+ if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
return GFP_DMA32;
return 0;
}
@@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,

if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
phys_limit < DMA_BIT_MASK(64) &&
- !(gfp & (GFP_DMA32 | GFP_DMA))) {
+ !(gfp & (GFP_DMA32 | GFP_DMA)) &&
+ !zone_dma32_is_empty(node)) {
gfp |= GFP_DMA32;
goto again;
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 4d40dcce7604..8e79903fbda8 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
end = cma_get_base(cma) + size - 1;
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
return end <= DMA_BIT_MASK(zone_dma_bits);
- if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
return end <= DMA_BIT_MASK(32);
return true;
}
@@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
if (IS_ENABLED(CONFIG_ZONE_DMA))
atomic_pool_resize(atomic_pool_dma,
GFP_KERNEL | GFP_DMA);
- if (IS_ENABLED(CONFIG_ZONE_DMA32))
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
atomic_pool_resize(atomic_pool_dma32,
GFP_KERNEL | GFP_DMA32);
atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
@@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
if (!atomic_pool_dma)
ret = -ENOMEM;
}
- if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA32);
if (!atomic_pool_dma32)
@@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
{
if (prev == NULL) {
- if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
return atomic_pool_dma32;
if (atomic_pool_dma && (gfp & GFP_DMA))
return atomic_pool_dma;


2023-01-26 18:51:23

by Dave Hansen

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On 1/26/23 08:43, Georgi Djakov wrote:
> From: Chris Goldsworthy <[email protected]>
>
> It's useful to have an option to disable the ZONE_DMA32 during boot as
> CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> example). There are platforms that do not use this zone and in some high
> memory pressure scenarios this would help on easing kswapd (to leave file
> backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> these platforms - kswapd is woken up more easily and drains the file cache
> which leads to some performance issues.

Any chance you could expound on "some performance issues", maybe with
actual workloads and numbers?

Also, what are the practical implications here? There are obviously an
ever decreasing number of 32-bit DMA devices out there. Somebody that
has one and uses this option might be sad because now they're stuck
using ZONE_DMA which is quite tiny.

What other ZONE_DMA32 users are left? Will anyone else care? There is
some DMA32 slab and vmalloc() functionality remaining. Is it impacted?

What should the default be? If disable_dma32=0 by default, this seems
like code that will never get tested. There are some rather subtle
relationships between ZONE_DMA32 and other zones that could get missed
without sufficient testing, like...

alloc_flags_nofragment() has some rather relevant comments that are left
unaddressed:

> /*
> * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
> * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
> * on UMA that if Normal is populated then so is DMA32.
> */

This patch appears to break the "on UMA" assumption.


> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 58a0bb2c17f1..1a56098c0e19 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
> return 0;
> }
>
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;

Is this referenced in any hot paths? It might deserve a static branch.

> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index cb258f58fdc8..b8af7e2f21f5 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>
> static bool __initdata can_use_brk_pgt = true;
>
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
> /*
> * Pages returned are already directly mapped.
> *
> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
> max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
> #endif
> #ifdef CONFIG_ZONE_DMA32
> - max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
> + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
> #endif
> max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
> #ifdef CONFIG_HIGHMEM
> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
> free_area_init(max_zone_pfns);
> }
>
> +static int __init early_disable_dma32(char *buf)
> +{
> + if (!buf)
> + return -EINVAL;
> +
> + if (!strcmp(buf, "on"))
> + disable_dma32 = true;
> +
> + return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);

Ick. Is there no way to do this other than a cross-arch copy/paste?

> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 18aade195884..ed69618cf1fc 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -24,6 +24,28 @@ struct bus_dma_region {
> u64 offset;
> };
>
> +static inline bool zone_dma32_is_empty(int node)
> +{
> +#ifdef CONFIG_ZONE_DMA32
> + pg_data_t *pgdat = NODE_DATA(node);
> +
> + return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
> +#else
> + return true;
> +#endif
> +}
> +
> +static inline bool zone_dma32_are_empty(void)
> +{
> + int node;
> +
> + for_each_node(node)
> + if (!zone_dma32_is_empty(node))
> + return false;
> +
> + return true;
> +}

That for_each_node() loop can be 1024 nodes long. I hope this isn't in
any hot paths. It'll be fine on DMA32 systems because it'll fall out of
the loop on the first pass. But, if someone uses your shiny new option,
they're in for a long loop.

Oh, and this is:

#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)

Did you really want all *possible* nodes? Or will
for_each_online_node() do?

Also, I'd be rather shocked if there wasn't one-stop-shopping for this
somewhere. Consulting your new 'disable_dma32' variable would make this
function rather cheap. I suspect a totally empty set of DMA32 will also
leave its mark in arch_zone_*_possible_pfn[].

> static inline dma_addr_t translate_phys_to_dma(struct device *dev,
> phys_addr_t paddr)
> {
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 63859a101ed8..754210c65658 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
> *phys_limit = dma_to_phys(dev, dma_limit);
> if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> return GFP_DMA;
> - if (*phys_limit <= DMA_BIT_MASK(32))
> + if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
> return GFP_DMA32;
> return 0;
> }
> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>
> if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
> phys_limit < DMA_BIT_MASK(64) &&
> - !(gfp & (GFP_DMA32 | GFP_DMA))) {
> + !(gfp & (GFP_DMA32 | GFP_DMA)) &&
> + !zone_dma32_is_empty(node)) {
> gfp |= GFP_DMA32;
> goto again;
> }

The zone_dma32_is_empty() value is known at compile-time if
!CONFIG_ZONE_DMA32. That would make it redundant to check both.

> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 4d40dcce7604..8e79903fbda8 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
> end = cma_get_base(cma) + size - 1;
> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> return end <= DMA_BIT_MASK(zone_dma_bits);
> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
> return end <= DMA_BIT_MASK(32);
> return true;
> }
> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
> if (IS_ENABLED(CONFIG_ZONE_DMA))
> atomic_pool_resize(atomic_pool_dma,
> GFP_KERNEL | GFP_DMA);
> - if (IS_ENABLED(CONFIG_ZONE_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
> atomic_pool_resize(atomic_pool_dma32,
> GFP_KERNEL | GFP_DMA32);
> atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
> if (!atomic_pool_dma)
> ret = -ENOMEM;
> }
> - if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
> atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
> GFP_KERNEL | GFP_DMA32);
> if (!atomic_pool_dma32)
> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
> static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
> {
> if (prev == NULL) {
> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
> return atomic_pool_dma32;
> if (atomic_pool_dma && (gfp & GFP_DMA))
> return atomic_pool_dma;

I think every single call to zone_dma32_are_empty() is !'d. It seems
like it chose the wrong polarity.

2023-01-26 19:15:42

by Robin Murphy

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On 2023-01-26 16:43, Georgi Djakov wrote:
> From: Chris Goldsworthy <[email protected]>
>
> It's useful to have an option to disable the ZONE_DMA32 during boot as
> CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> example). There are platforms that do not use this zone and in some high
> memory pressure scenarios this would help on easing kswapd (to leave file
> backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> these platforms - kswapd is woken up more easily and drains the file cache
> which leads to some performance issues.
>
> Signed-off-by: Chris Goldsworthy <[email protected]>
> [georgi: updated commit text]
> Signed-off-by: Georgi Djakov <[email protected]>
> ---
> The main question here is whether we can have a kernel command line
> option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
> I can imagine this being useful also for Linux distros.

FWIW I'd say that "disabled" and "left empty then awkwardly tiptoed
around in a few places" are very different notions...

However, I'm just going to take a step back and read the commit message
a few more times... Given what it claims, I can't help but ask why
wouldn't we want a parameter to control kswapd's behaviour and address
that issue directly, rather than a massive hammer that breaks everyone
allocating explicitly or implicitly with __GFP_DMA32 (especially on
systems where it doesn't normally matter because all memory is below 4GB
anyway), just to achieve one rather niche side-effect?

Thanks,
Robin.

> .../admin-guide/kernel-parameters.txt | 5 +++++
> arch/arm64/mm/init.c | 20 ++++++++++++++++-
> arch/x86/mm/init.c | 20 ++++++++++++++++-
> include/linux/dma-direct.h | 22 +++++++++++++++++++
> kernel/dma/direct.c | 5 +++--
> kernel/dma/pool.c | 8 +++----
> 6 files changed, 72 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index cb12df402161..854ff65ac6b0 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1070,6 +1070,11 @@
> Disable Dynamic DMA Window support. Use this
> to workaround buggy firmware.
>
> + disable_dma32= [KNL]
> + Dynamically disable ZONE_DMA32 on kernels compiled with
> + CONFIG_ZONE_DMA32=y.
> +
> +
> disable_ipv6= [IPV6]
> See Documentation/networking/ipv6.rst.
>
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 58a0bb2c17f1..1a56098c0e19 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
> return 0;
> }
>
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
> /*
> * reserve_crashkernel() - reserves memory for crash kernel
> *
> @@ -244,7 +250,7 @@ static void __init zone_sizes_init(void)
> max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
> #endif
> #ifdef CONFIG_ZONE_DMA32
> - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
> + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : PFN_DOWN(dma32_phys_limit);
> if (!arm64_dma_phys_limit)
> arm64_dma_phys_limit = dma32_phys_limit;
> #endif
> @@ -253,6 +259,18 @@ static void __init zone_sizes_init(void)
> free_area_init(max_zone_pfns);
> }
>
> +static int __init early_disable_dma32(char *buf)
> +{
> + if (!buf)
> + return -EINVAL;
> +
> + if (!strcmp(buf, "on"))
> + disable_dma32 = true;
> +
> + return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);
> +
> int pfn_is_map_memory(unsigned long pfn)
> {
> phys_addr_t addr = PFN_PHYS(pfn);
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index cb258f58fdc8..b8af7e2f21f5 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>
> static bool __initdata can_use_brk_pgt = true;
>
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
> /*
> * Pages returned are already directly mapped.
> *
> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
> max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
> #endif
> #ifdef CONFIG_ZONE_DMA32
> - max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
> + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
> #endif
> max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
> #ifdef CONFIG_HIGHMEM
> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
> free_area_init(max_zone_pfns);
> }
>
> +static int __init early_disable_dma32(char *buf)
> +{
> + if (!buf)
> + return -EINVAL;
> +
> + if (!strcmp(buf, "on"))
> + disable_dma32 = true;
> +
> + return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);
> +
> __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
> .loaded_mm = &init_mm,
> .next_asid = 1,
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 18aade195884..ed69618cf1fc 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -24,6 +24,28 @@ struct bus_dma_region {
> u64 offset;
> };
>
> +static inline bool zone_dma32_is_empty(int node)
> +{
> +#ifdef CONFIG_ZONE_DMA32
> + pg_data_t *pgdat = NODE_DATA(node);
> +
> + return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
> +#else
> + return true;
> +#endif
> +}
> +
> +static inline bool zone_dma32_are_empty(void)
> +{
> + int node;
> +
> + for_each_node(node)
> + if (!zone_dma32_is_empty(node))
> + return false;
> +
> + return true;
> +}
> +
> static inline dma_addr_t translate_phys_to_dma(struct device *dev,
> phys_addr_t paddr)
> {
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 63859a101ed8..754210c65658 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
> *phys_limit = dma_to_phys(dev, dma_limit);
> if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> return GFP_DMA;
> - if (*phys_limit <= DMA_BIT_MASK(32))
> + if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
> return GFP_DMA32;
> return 0;
> }
> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>
> if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
> phys_limit < DMA_BIT_MASK(64) &&
> - !(gfp & (GFP_DMA32 | GFP_DMA))) {
> + !(gfp & (GFP_DMA32 | GFP_DMA)) &&
> + !zone_dma32_is_empty(node)) {
> gfp |= GFP_DMA32;
> goto again;
> }
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 4d40dcce7604..8e79903fbda8 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
> end = cma_get_base(cma) + size - 1;
> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> return end <= DMA_BIT_MASK(zone_dma_bits);
> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
> return end <= DMA_BIT_MASK(32);
> return true;
> }
> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
> if (IS_ENABLED(CONFIG_ZONE_DMA))
> atomic_pool_resize(atomic_pool_dma,
> GFP_KERNEL | GFP_DMA);
> - if (IS_ENABLED(CONFIG_ZONE_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
> atomic_pool_resize(atomic_pool_dma32,
> GFP_KERNEL | GFP_DMA32);
> atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
> if (!atomic_pool_dma)
> ret = -ENOMEM;
> }
> - if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
> atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
> GFP_KERNEL | GFP_DMA32);
> if (!atomic_pool_dma32)
> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
> static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
> {
> if (prev == NULL) {
> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
> return atomic_pool_dma32;
> if (atomic_pool_dma && (gfp & GFP_DMA))
> return atomic_pool_dma;

2023-01-26 22:57:05

by Georgi Djakov

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

Hi Dave,

Thanks for the detailed review and comments!

On 26.01.23 20:51, Dave Hansen wrote:
> On 1/26/23 08:43, Georgi Djakov wrote:
>> From: Chris Goldsworthy <[email protected]>
>>
>> It's useful to have an option to disable the ZONE_DMA32 during boot as
>> CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
>> example). There are platforms that do not use this zone and in some high
>> memory pressure scenarios this would help on easing kswapd (to leave file
>> backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
>> these platforms - kswapd is woken up more easily and drains the file cache
>> which leads to some performance issues.
>
> Any chance you could expound on "some performance issues", maybe with
> actual workloads and numbers?

The goal is to prevent any reclaim at all, or to defer it as much as possible.
I can gather some numbers, but let me explain the scenario first. Let's say
you have 100 processes and you put some to sleep. After some time, kswapd
reclaims file-cached memory. You wake one of these processes back up and
some lagging is observed, because files have to be re-opened, as they were
reclaimed from the file cache.

>
> Also, what are the practical implications here? There are obviously an
> ever decreasing number of 32-bit DMA devices out there. Somebody that
> has one and uses this option might be sad because now they're stuck
> using ZONE_DMA which is quite tiny.

True. There are side effects indeed.

I think that Robin's suggestion to try to control kswapd's behavior is
really something worth exploring!

Thanks,
Georgi

> What other ZONE_DMA32 users are left? Will anyone else care? There is
> some DMA32 slab and vmalloc() functionality remaining. Is it impacted?
>
> What should the default be? If disable_dma32=0 by default, this seems
> like code that will never get tested. There are some rather subtle
> relationships between ZONE_DMA32 and other zones that could get missed
> without sufficient testing, like...
>
> alloc_flags_nofragment() has some rather relevant comments that are left
> unaddressed:
>
>> /*
>> * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
>> * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
>> * on UMA that if Normal is populated then so is DMA32.
>> */
>
> This patch appears to break the "on UMA" assumption.
>
>
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 58a0bb2c17f1..1a56098c0e19 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>> return 0;
>> }
>>
>> +/*
>> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
>> + * CONFIG_ZONE_DMA32.
>> + */
>> +static bool disable_dma32 __ro_after_init;
>
> Is this referenced in any hot paths? It might deserve a static branch.
>
>> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
>> index cb258f58fdc8..b8af7e2f21f5 100644
>> --- a/arch/x86/mm/init.c
>> +++ b/arch/x86/mm/init.c
>> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>>
>> static bool __initdata can_use_brk_pgt = true;
>>
>> +/*
>> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
>> + * CONFIG_ZONE_DMA32.
>> + */
>> +static bool disable_dma32 __ro_after_init;
>> +
>> /*
>> * Pages returned are already directly mapped.
>> *
>> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
>> max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
>> #endif
>> #ifdef CONFIG_ZONE_DMA32
>> - max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
>> + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
>> #endif
>> max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
>> #ifdef CONFIG_HIGHMEM
>> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
>> free_area_init(max_zone_pfns);
>> }
>>
>> +static int __init early_disable_dma32(char *buf)
>> +{
>> + if (!buf)
>> + return -EINVAL;
>> +
>> + if (!strcmp(buf, "on"))
>> + disable_dma32 = true;
>> +
>> + return 0;
>> +}
>> +early_param("disable_dma32", early_disable_dma32);
>
> Ick. Is there no way to do this other than a cross-arch copy/paste?
>
>> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
>> index 18aade195884..ed69618cf1fc 100644
>> --- a/include/linux/dma-direct.h
>> +++ b/include/linux/dma-direct.h
>> @@ -24,6 +24,28 @@ struct bus_dma_region {
>> u64 offset;
>> };
>>
>> +static inline bool zone_dma32_is_empty(int node)
>> +{
>> +#ifdef CONFIG_ZONE_DMA32
>> + pg_data_t *pgdat = NODE_DATA(node);
>> +
>> + return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
>> +#else
>> + return true;
>> +#endif
>> +}
>> +
>> +static inline bool zone_dma32_are_empty(void)
>> +{
>> + int node;
>> +
>> + for_each_node(node)
>> + if (!zone_dma32_is_empty(node))
>> + return false;
>> +
>> + return true;
>> +}
>
> That for_each_node() loop can be 1024 nodes long. I hope this isn't in
> any hot paths. It'll be fine on DMA32 systems because it'll fall out of
> the loop on the first pass. But, if someone uses your shiny new option,
> they're in for a long loop.
>
> Oh, and this is:
>
> #define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
>
> Did you really want all *possible* nodes? Or will
> for_each_online_node() do?
>
> Also, I'd be rather shocked if there wasn't one-stop-shopping for this
> somewhere. Consulting your new 'disable_dma32' variable would make this
> function rather cheap. I suspect a totally empty set of DMA32 will also
> leave its mark in arch_zone_*_possible_pfn[].
>
>> static inline dma_addr_t translate_phys_to_dma(struct device *dev,
>> phys_addr_t paddr)
>> {
>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>> index 63859a101ed8..754210c65658 100644
>> --- a/kernel/dma/direct.c
>> +++ b/kernel/dma/direct.c
>> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
>> *phys_limit = dma_to_phys(dev, dma_limit);
>> if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
>> return GFP_DMA;
>> - if (*phys_limit <= DMA_BIT_MASK(32))
>> + if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
>> return GFP_DMA32;
>> return 0;
>> }
>> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>>
>> if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
>> phys_limit < DMA_BIT_MASK(64) &&
>> - !(gfp & (GFP_DMA32 | GFP_DMA))) {
>> + !(gfp & (GFP_DMA32 | GFP_DMA)) &&
>> + !zone_dma32_is_empty(node)) {
>> gfp |= GFP_DMA32;
>> goto again;
>> }
>
> The zone_dma32_is_empty() value is known at compile-time if
> !CONFIG_ZONE_DMA32. That would make it redundant to check both.
>
>> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
>> index 4d40dcce7604..8e79903fbda8 100644
>> --- a/kernel/dma/pool.c
>> +++ b/kernel/dma/pool.c
>> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
>> end = cma_get_base(cma) + size - 1;
>> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
>> return end <= DMA_BIT_MASK(zone_dma_bits);
>> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
>> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>> return end <= DMA_BIT_MASK(32);
>> return true;
>> }
>> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
>> if (IS_ENABLED(CONFIG_ZONE_DMA))
>> atomic_pool_resize(atomic_pool_dma,
>> GFP_KERNEL | GFP_DMA);
>> - if (IS_ENABLED(CONFIG_ZONE_DMA32))
>> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
>> atomic_pool_resize(atomic_pool_dma32,
>> GFP_KERNEL | GFP_DMA32);
>> atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
>> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
>> if (!atomic_pool_dma)
>> ret = -ENOMEM;
>> }
>> - if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
>> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
>> atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
>> GFP_KERNEL | GFP_DMA32);
>> if (!atomic_pool_dma32)
>> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
>> static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
>> {
>> if (prev == NULL) {
>> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
>> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>> return atomic_pool_dma32;
>> if (atomic_pool_dma && (gfp & GFP_DMA))
>> return atomic_pool_dma;
>
> I think every single call to zone_dma32_are_empty() is !'d. It seems
> like it chose the wrong polarity.


2023-01-27 00:57:58

by Randy Dunlap

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line



On 1/26/23 10:51, Dave Hansen wrote:
> On 1/26/23 08:43, Georgi Djakov wrote:
>> From: Chris Goldsworthy <[email protected]>


>> +static int __init early_disable_dma32(char *buf)
>> +{
>> + if (!buf)
>> + return -EINVAL;
>> +
>> + if (!strcmp(buf, "on"))
>> + disable_dma32 = true;
>> +
>> + return 0;
>> +}
>> +early_param("disable_dma32", early_disable_dma32);
>
> Ick. Is there no way to do this other than a cross-arch copy/paste?

I think that using __setup() instead of early_param() would allow that.

--
~Randy

2023-01-27 02:21:27

by Chris Goldsworthy

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On Thu, Jan 26, 2023 at 07:15:26PM +0000, Robin Murphy wrote:
> On 2023-01-26 16:43, Georgi Djakov wrote:
> >From: Chris Goldsworthy <[email protected]>
> >
> >It's useful to have an option to disable the ZONE_DMA32 during boot as
> >CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> >example). There are platforms that do not use this zone and in some high
> >memory pressure scenarios this would help on easing kswapd (to leave file
> >backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> >these platforms - kswapd is woken up more easily and drains the file cache
> >which leads to some performance issues.
> >
> >Signed-off-by: Chris Goldsworthy <[email protected]>
> >[georgi: updated commit text]
> >Signed-off-by: Georgi Djakov <[email protected]>
> >---
> >The main question here is whether we can have a kernel command line
> >option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
> >I can imagine this being useful also for Linux distros.
>
> FWIW I'd say that "disabled" and "left empty then awkwardly tiptoed around
> in a few places" are very different notions...
>
> However, I'm just going to take a step back and read the commit message a
> few more times... Given what it claims, I can't help but ask why wouldn't we
> want a parameter to control kswapd's behaviour and address that issue
> directly, rather than a massive hammer that breaks everyone allocating
> explicitly or implicitly with __GFP_DMA32 (especially on systems where it
> doesn't normally matter because all memory is below 4GB anyway), just to
> achieve one rather niche side-effect?
>
> Thanks,
> Robin.

Hi Robin,

The commit text doesn't spell out the scenario we want to avoid, so I
will do that for clarity. We use a kernel binary compiled for us, and
by default has CONFIG_ZONE_DMA32 (and it can't be disabled for now as
another party needs it). Our higher-end SoCs are usually used with
8-12 GB of DDR, so using a 12 GB device as an example, we would have 8
GB of ZONE_NORMAL memory and 4 GB of ZONE_MOVABLE memory with the
feature, and 4 GB of ZONE_DMA32, 4 GB of ZONE_NORMAL and 4 GB of
ZONE_MOVABLE otherwise.

Without the feature enabled, consider a GFP_KERNEL allocation that
causes a low watermark breach in ZONE_NORMAL, such that
ZONE_DMA32 is almost full. This will cause kswapd to start reclaiming
memory, despite the fact that we might have gigabytes of free
memory in ZONE_DMA32 that can be used by anyone (since GFP_MOVABLE and
GFP_NORMAL can fall back to using ZONE_DMA32).

So, fleshing out your suggestion to make it concrete for our case, we
would want to stop kswapd from doing reclaim on ZONE_NORMAL watermark
breaches when ZONE_DMA32 is present (since anything targeting
ZONE_NORMAL can fall back to ZONE_DMA32).

Thanks,

Chris.

2023-01-27 06:36:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On Thu, Jan 26, 2023 at 10:51:17AM -0800, Dave Hansen wrote:
>
> Also, what are the practical implications here? There are obviously an
> ever decreasing number of 32-bit DMA devices out there. Somebody that
> has one and uses this option might be sad because now they're stuck
> using ZONE_DMA which is quite tiny.
>
> What other ZONE_DMA32 users are left? Will anyone else care? There is
> some DMA32 slab and vmalloc() functionality remaining. Is it impacted?

DMA32 never supported slab. But < 64-bit DMA devices are unfortunately
still not uncommon, and configuring out ZONE_DMA32 breaks them pretty
badly as we guarantee that a DMA mask of 32-bit always works.

So I'm not only very much against this patch, but also the currently
existing way to configure out ZONE_DMA32 on arm64, which needs to
go away.

If people want ZONE_DMA32 to go away we need something to replace
it first, like a large enough CMA region in the 32-bit addressable
range.

2023-01-27 06:53:56

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On January 26, 2023 10:35:55 PM PST, Christoph Hellwig <[email protected]> wrote:
>On Thu, Jan 26, 2023 at 10:51:17AM -0800, Dave Hansen wrote:
>>
>> Also, what are the practical implications here? There are obviously an
>> ever decreasing number of 32-bit DMA devices out there. Somebody that
>> has one and uses this option might be sad because now they're stuck
>> using ZONE_DMA which is quite tiny.
>>
>> What other ZONE_DMA32 users are left? Will anyone else care? There is
>> some DMA32 slab and vmalloc() functionality remaining. Is it impacted?
>
>DMA32 never supported slab. But < 64-bit DMA devices are unfortunately
>still not uncommon, and configuring out ZONE_DMA32 breaks them pretty
>badly as we guarantee that a DMA mask of 32-bit always works.
>
>So I'm not only very much against this patch, but also the currently
>existing way to configure out ZONE_DMA32 on arm64, which needs to
>go away.
>
>If people want ZONE_DMA32 to go away we need something to replace
>it first, like a large enough CMA region in the 32-bit addressable
>range.

Not to mention all kinds of odd masks like 30, 31, 39, 40, 46, ... bits.

2023-01-27 07:07:14

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On Thu, Jan 26, 2023 at 10:52:43PM -0800, H. Peter Anvin wrote:
> >If people want ZONE_DMA32 to go away we need something to replace
> >it first, like a large enough CMA region in the 32-bit addressable
> >range.
>
> Not to mention all kinds of odd masks like 30, 31, 39, 40, 46, ... bits.

Yes. Out of those all >= 32 are falling straight into ZONE_DMA32,
the lower ones we do a first try in ZONE_DMA32 and then fall back to
ZONE_DMA. <= 29 masks OTOH are really rare in modern systems for
actual devices. So with a CMA region for what is currently ZONE_DMA
and one for the first 1G we'd probably cover most of what's actually
needed for x86_64. Of course on 32-bit architectures things become
a lot more complicated due to highmem.

2023-01-30 05:29:19

by kernel test robot

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

Greeting,

FYI, we noticed BUG:kernel_NULL_pointer_dereference,address due to commit (built with gcc-11):

commit: f7562f00aeee914d2d1fc45c9464826a29f7e823 ("[RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line")
url: https://github.com/intel-lab-lkp/linux/commits/Georgi-Djakov/mm-Allow-ZONE_DMA32-to-be-disabled-via-kernel-command-line/20230128-105803
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

in testcase: boot

on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


[ 5.564449][ T1] BUG: kernel NULL pointer dereference, address: 0000000000000690
[ 5.566019][ T1] #PF: supervisor read access in kernel mode
[ 5.567187][ T1] #PF: error_code(0x0000) - not-present page
[ 5.568357][ T1] PGD 0 P4D 0
[ 5.569101][ T1] Oops: 0000 [#1] SMP PTI
[ 5.569997][ T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc4-00588-gf7562f00aeee #8
[ 5.571695][ T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[ 5.573657][ T1] RIP: __dma_direct_alloc_pages+0xf3/0x330
[ 5.575069][ T1] Code: 40 74 ed b8 01 00 00 00 48 d3 e0 48 83 e8 01 48 39 04 24 76 db 48 8b 04 24 48 c1 e8 20 75 1f 49 63 c4 48 8b 04 c5 20 68 d8 82 <48> 83 b8 90 06 00 00 00 0f 95 c0 0f b6 c0 c1 e0 02 41 09 c6 49 8d
All code
========
0: 40 74 ed rex je 0xfffffffffffffff0
3: b8 01 00 00 00 mov $0x1,%eax
8: 48 d3 e0 shl %cl,%rax
b: 48 83 e8 01 sub $0x1,%rax
f: 48 39 04 24 cmp %rax,(%rsp)
13: 76 db jbe 0xfffffffffffffff0
15: 48 8b 04 24 mov (%rsp),%rax
19: 48 c1 e8 20 shr $0x20,%rax
1d: 75 1f jne 0x3e
1f: 49 63 c4 movslq %r12d,%rax
22: 48 8b 04 c5 20 68 d8 mov -0x7d2797e0(,%rax,8),%rax
29: 82
2a:* 48 83 b8 90 06 00 00 cmpq $0x0,0x690(%rax) <-- trapping instruction
31: 00
32: 0f 95 c0 setne %al
35: 0f b6 c0 movzbl %al,%eax
38: c1 e0 02 shl $0x2,%eax
3b: 41 09 c6 or %eax,%r14d
3e: 49 rex.WB
3f: 8d .byte 0x8d

Code starting with the faulting instruction
===========================================
0: 48 83 b8 90 06 00 00 cmpq $0x0,0x690(%rax)
7: 00
8: 0f 95 c0 setne %al
b: 0f b6 c0 movzbl %al,%eax
e: c1 e0 02 shl $0x2,%eax
11: 41 09 c6 or %eax,%r14d
14: 49 rex.WB
15: 8d .byte 0x8d
[ 5.578626][ T1] RSP: 0018:ffffc90000013bd8 EFLAGS: 00010247
[ 5.579804][ T1] RAX: 0000000000000000 RBX: 0000000000000cc0 RCX: 0000000000000018
[ 5.581366][ T1] RDX: 0000000000000000 RSI: 0000000000001000 RDI: 0000000000000000
[ 5.582922][ T1] RBP: ffff8881003b30d0 R08: 0000000000000000 R09: 000fffffffffffff
[ 5.584518][ T1] R10: 0000000000000008 R11: ffff88843ffc6000 R12: 00000000ffffffff
[ 5.586080][ T1] R13: 0000000000001000 R14: 0000000000000cc0 R15: 0000000000001000
[ 5.587644][ T1] FS: 0000000000000000(0000) GS:ffff88842fd00000(0000) knlGS:0000000000000000
[ 5.589376][ T1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5.590616][ T1] CR2: 0000000000000690 CR3: 000000000280a000 CR4: 00000000000406e0
[ 5.592197][ T1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 5.593739][ T1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 5.595340][ T1] Call Trace:
[ 5.597097][ T1] <TASK>
[ 5.597778][ T1] ? __vmalloc_node_range (mm/vmalloc.c:3182)
[ 5.598829][ T1] dma_direct_alloc (kernel/dma/direct.c:269)
[ 5.599794][ T1] e1000_setup_tx_resources (drivers/net/ethernet/intel/e1000/e1000_main.c:1514)
[ 5.601912][ T1] e1000_setup_all_tx_resources (drivers/net/ethernet/intel/e1000/e1000_main.c:1574)
[ 5.603106][ T1] e1000_open (drivers/net/ethernet/intel/e1000/e1000_main.c:1367)
[ 5.603998][ T1] ? raw_notifier_call_chain (kernel/notifier.c:92 kernel/notifier.c:455)
[ 5.605069][ T1] __dev_open (net/core/dev.c:1419)
[ 5.605963][ T1] __dev_change_flags (net/core/dev.c:8530)
[ 5.607006][ T1] ? __dev_notify_flags (net/core/dev.c:8574)
[ 5.607959][ T1] dev_change_flags (net/core/dev.c:8602)
[ 5.608915][ T1] ic_open_devs (net/ipv4/ipconfig.c:242)
[ 5.609818][ T1] ip_auto_config (net/ipv4/ipconfig.c:1508)
[ 5.610775][ T1] ? __pfx_ip_auto_config (net/ipv4/ipconfig.c:1471)
[ 5.611816][ T1] do_one_initcall (init/main.c:1306)
[ 5.612782][ T1] do_initcalls (init/main.c:1378 init/main.c:1395)
[ 5.613649][ T1] kernel_init_freeable (init/main.c:1638)
[ 5.614747][ T1] ? __pfx_kernel_init (init/main.c:1514)
[ 5.615771][ T1] kernel_init (init/main.c:1524)
[ 5.616681][ T1] ret_from_fork (arch/x86/entry/entry_64.S:314)
[ 5.617557][ T1] </TASK>
[ 5.618264][ T1] Modules linked in:
[ 5.619088][ T1] CR2: 0000000000000690
[ 5.619963][ T1] ---[ end trace 0000000000000000 ]---
[ 5.621042][ T1] RIP: __dma_direct_alloc_pages+0xf3/0x330
[ 5.622410][ T1] Code: 40 74 ed b8 01 00 00 00 48 d3 e0 48 83 e8 01 48 39 04 24 76 db 48 8b 04 24 48 c1 e8 20 75 1f 49 63 c4 48 8b 04 c5 20 68 d8 82 <48> 83 b8 90 06 00 00 00 0f 95 c0 0f b6 c0 c1 e0 02 41 09 c6 49 8d
All code
========
0: 40 74 ed rex je 0xfffffffffffffff0
3: b8 01 00 00 00 mov $0x1,%eax
8: 48 d3 e0 shl %cl,%rax
b: 48 83 e8 01 sub $0x1,%rax
f: 48 39 04 24 cmp %rax,(%rsp)
13: 76 db jbe 0xfffffffffffffff0
15: 48 8b 04 24 mov (%rsp),%rax
19: 48 c1 e8 20 shr $0x20,%rax
1d: 75 1f jne 0x3e
1f: 49 63 c4 movslq %r12d,%rax
22: 48 8b 04 c5 20 68 d8 mov -0x7d2797e0(,%rax,8),%rax
29: 82
2a:* 48 83 b8 90 06 00 00 cmpq $0x0,0x690(%rax) <-- trapping instruction
31: 00
32: 0f 95 c0 setne %al
35: 0f b6 c0 movzbl %al,%eax
38: c1 e0 02 shl $0x2,%eax
3b: 41 09 c6 or %eax,%r14d
3e: 49 rex.WB
3f: 8d .byte 0x8d

Code starting with the faulting instruction
===========================================
0: 48 83 b8 90 06 00 00 cmpq $0x0,0x690(%rax)
7: 00
8: 0f 95 c0 setne %al
b: 0f b6 c0 movzbl %al,%eax
e: c1 e0 02 shl $0x2,%eax
11: 41 09 c6 or %eax,%r14d
14: 49 rex.WB
15: 8d .byte 0x8d


If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-lkp/[email protected]


To reproduce:

# build kernel
cd linux
cp config-6.2.0-rc4-00588-gf7562f00aeee .config
make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
cd <mod-install-dir>
find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.


--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests


Attachments:
(No filename) (7.65 kB)
config-6.2.0-rc4-00588-gf7562f00aeee (163.18 kB)
job-script (4.80 kB)
dmesg.xz (25.10 kB)
Download all attachments

2023-02-01 04:10:02

by Chris Goldsworthy

[permalink] [raw]
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

On Fri, Jan 27, 2023 at 04:55:53PM +0800, Hillf Danton wrote:
> On Thu, 26 Jan 2023 18:20:26 -0800 Chris Goldsworthy <[email protected]>
> > On Thu, Jan 26, 2023 at 07:15:26PM +0000, Robin Murphy wrote:
> > > However, I'm just going to take a step back and read the commit message a
> > > few more times... Given what it claims, I can't help but ask why wouldn't we
> > > want a parameter to control kswapd's behaviour and address that issue
> > > directly, rather than a massive hammer that breaks everyone allocating
> > > explicitly or implicitly with __GFP_DMA32 (especially on systems where it
> > > doesn't normally matter because all memory is below 4GB anyway), just to
> > > achieve one rather niche side-effect?
> > >
> > > Thanks,
> > > Robin.
> >
> > Hi Robin,
> >
> > The commit text doesn't spell out the scenario we want to avoid, so I
> > will do that for clarity. We use a kernel binary compiled for us, and
> > by default has CONFIG_ZONE_DMA32 (and it can't be disabled for now as
> > another party needs it). Our higher-end SoCs are usually used with
> > 8-12 GB of DDR, so using a 12 GB device as an example, we would have 8
> > GB of ZONE_NORMAL memory and 4 GB of ZONE_MOVABLE memory with the
> > feature, and 4 GB of ZONE_DMA32, 4 GB of ZONE_NORMAL and 4 GB of
> > ZONE_MOVABLE otherwise.
> >
> > Without the feature enabled, consider a GFP_KERNEL allocation that
> > causes a low watermark breach in ZONE_NORMAL, such that
> > ZONE_DMA32 is almost full. This will cause kswapd to start reclaiming
> > memory, despite the fact that we might have gigabytes of free
> > memory in ZONE_DMA32 that can be used by anyone (since GFP_MOVABLE and
> > GFP_NORMAL can fall back to using ZONE_DMA32).
>
> If kswapd is busy reclaiming pages even given gigabytes of free memory
> in the DMA32 zone then it is a CPU hog.
>
> Feel free to check pgdat_balanced() and prepare_kswapd_sleep().

Thanks for pointing out this gap in my understanding - I'm taking a closer look
at these paths to see whether there is room for what Robin suggested.