Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.
Signed-off-by: Charlie Jenkins <[email protected]>
---
arch/riscv/include/asm/elf.h | 2 +-
arch/riscv/include/asm/pgtable.h | 12 +++++++-
arch/riscv/include/asm/processor.h | 46 +++++++++++++++++++++++++-----
3 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index c24280774caf..5d3368d5585c 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -49,7 +49,7 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
* the loader. We need to make sure that it is out of the way of the program
* that it will "exec", and that there is sufficient room for the brk.
*/
-#define ELF_ET_DYN_BASE ((TASK_SIZE / 3) * 2)
+#define ELF_ET_DYN_BASE ((DEFAULT_MAP_WINDOW / 3) * 2)
#ifdef CONFIG_64BIT
#ifdef CONFIG_COMPAT
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 75970ee2bda2..e13f5872bfe9 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -63,12 +63,22 @@
* position vmemmap directly below the VMALLOC region.
*/
#ifdef CONFIG_64BIT
+#define VA_BITS_SV39 39
+#define VA_BITS_SV48 48
+#define VA_BITS_SV57 57
+
+#define VA_USER_SV39 (UL(1) << (VA_BITS_SV39 - 1))
+#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
+#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))
+
#define VA_BITS (pgtable_l5_enabled ? \
- 57 : (pgtable_l4_enabled ? 48 : 39))
+ VA_BITS_SV57 : (pgtable_l4_enabled ? VA_BITS_SV48 : VA_BITS_SV39))
#else
#define VA_BITS 32
#endif
+#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
+
#define VMEMMAP_SHIFT \
(VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index c950a8d9edef..14a5396eed3d 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -13,20 +13,52 @@
#include <asm/ptrace.h>
-/*
- * This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3)
-
-#define STACK_TOP TASK_SIZE
#ifdef CONFIG_64BIT
+#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
#define STACK_TOP_MAX TASK_SIZE_64
+
+#define arch_get_mmap_end(addr, len, flags) \
+({ \
+ unsigned long mmap_end; \
+ if ((addr) >= VA_USER_SV57) \
+ mmap_end = STACK_TOP_MAX; \
+ else if ((((addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
+ mmap_end = VA_USER_SV48; \
+ else if ((addr) == 0) \
+ mmap_end = DEFAULT_MAP_WINDOW; \
+ else \
+ mmap_end = VA_USER_SV39; \
+ mmap_end; \
+})
+
+#define arch_get_mmap_base(addr, base) \
+({ \
+ unsigned long mmap_base; \
+ if (((addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
+ mmap_base = (base) + (VA_USER_SV57 - DEFAULT_MAP_WINDOW); \
+ else if ((((addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
+ mmap_base = (base) + (VA_USER_SV48 - DEFAULT_MAP_WINDOW); \
+ else if ((addr) == 0) \
+ mmap_base = (base); \
+ else \
+ mmap_base = (base) + (VA_USER_SV39 - DEFAULT_MAP_WINDOW); \
+ mmap_base; \
+})
+
#else
+#define DEFAULT_MAP_WINDOW TASK_SIZE
#define STACK_TOP_MAX TASK_SIZE
#endif
#define STACK_ALIGN 16
+#define STACK_TOP DEFAULT_MAP_WINDOW
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE PAGE_ALIGN(DEFAULT_MAP_WINDOW / 3)
+
#ifndef __ASSEMBLY__
struct task_struct;
--
2.41.0
On Fri, Jul 14, 2023 at 6:55 PM Charlie Jenkins <[email protected]> wrote:
>
> Make sv48 the default address space for mmap as some applications
> currently depend on this assumption. A hint address passed to mmap will
> cause the largest address space that fits entirely into the hint to be
> used. If the hint is less than or equal to 1<<38, an sv39 address will
> be used. An exception is that if the hint address is 0, then a sv48
> address will be used. After an address space is completely full, the next
> smallest address space will be used.
>
> Signed-off-by: Charlie Jenkins <[email protected]>
> ---
> arch/riscv/include/asm/elf.h | 2 +-
> arch/riscv/include/asm/pgtable.h | 12 +++++++-
> arch/riscv/include/asm/processor.h | 46 +++++++++++++++++++++++++-----
> 3 files changed, 51 insertions(+), 9 deletions(-)
>
> diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
> index c24280774caf..5d3368d5585c 100644
> --- a/arch/riscv/include/asm/elf.h
> +++ b/arch/riscv/include/asm/elf.h
> @@ -49,7 +49,7 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
> * the loader. We need to make sure that it is out of the way of the program
> * that it will "exec", and that there is sufficient room for the brk.
> */
> -#define ELF_ET_DYN_BASE ((TASK_SIZE / 3) * 2)
> +#define ELF_ET_DYN_BASE ((DEFAULT_MAP_WINDOW / 3) * 2)
>
> #ifdef CONFIG_64BIT
> #ifdef CONFIG_COMPAT
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 75970ee2bda2..e13f5872bfe9 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -63,12 +63,22 @@
> * position vmemmap directly below the VMALLOC region.
> */
> #ifdef CONFIG_64BIT
> +#define VA_BITS_SV39 39
> +#define VA_BITS_SV48 48
> +#define VA_BITS_SV57 57
> +
> +#define VA_USER_SV39 (UL(1) << (VA_BITS_SV39 - 1))
> +#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
> +#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))
> +
> #define VA_BITS (pgtable_l5_enabled ? \
> - 57 : (pgtable_l4_enabled ? 48 : 39))
> + VA_BITS_SV57 : (pgtable_l4_enabled ? VA_BITS_SV48 : VA_BITS_SV39))
> #else
> #define VA_BITS 32
> #endif
>
> +#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
> +
> #define VMEMMAP_SHIFT \
> (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
> #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
> diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> index c950a8d9edef..14a5396eed3d 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -13,20 +13,52 @@
>
> #include <asm/ptrace.h>
>
> -/*
> - * This decides where the kernel will search for a free chunk of vm
> - * space during mmap's.
> - */
> -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3)
> -
> -#define STACK_TOP TASK_SIZE
> #ifdef CONFIG_64BIT
> +#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> #define STACK_TOP_MAX TASK_SIZE_64
> +
> +#define arch_get_mmap_end(addr, len, flags) \
> +({ \
> + unsigned long mmap_end; \
> + if ((addr) >= VA_USER_SV57) \
> + mmap_end = STACK_TOP_MAX; \
> + else if ((((addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
> + mmap_end = VA_USER_SV48; \
> + else if ((addr) == 0) \
> + mmap_end = DEFAULT_MAP_WINDOW; \
> + else \
> + mmap_end = VA_USER_SV39; \
> + mmap_end; \
> +})
What about the following instead:
#define arch_get_mmap_end(addr, len, flags) \
({ \
unsigned long mmap_end; \
if ((addr) >= VA_USER_SV57) \
mmap_end = STACK_TOP_MAX; \ // Maybe a comment here that
says it returns the max user address of the current mode, not obvious
at first sight.
else \
mmap_end = DEFAULT_MAP_WINDOW; \
mmap_end; \
})
The only corner case is when sv57 is active, then only a hint greater
than VA_USER_SV57 can return a sv57 user address. Otherwise, we just
need to return the default mmap end right?
> +
> +#define arch_get_mmap_base(addr, base) \
> +({ \
> + unsigned long mmap_base; \
> + if (((addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
> + mmap_base = (base) + (VA_USER_SV57 - DEFAULT_MAP_WINDOW); \
> + else if ((((addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
> + mmap_base = (base) + (VA_USER_SV48 - DEFAULT_MAP_WINDOW); \
> + else if ((addr) == 0) \
> + mmap_base = (base); \
> + else \
> + mmap_base = (base) + (VA_USER_SV39 - DEFAULT_MAP_WINDOW); \
> + mmap_base; \
> +})
> +
From arch_pick_mmap_layout()
(https://elixir.bootlin.com/linux/latest/source/mm/util.c#L433), the
"base" argument is:
- either STACK_TOP in top-down (more or less some random offset)
- or TASK_UNMAPPED_BASE in bottom-up (more or less some random offset)
When bottom-up is the current mode, we should not change the base, so
adding (VA_USER_SV57 - DEFAULT_MAP_WINDOW) in the first case is not
right for me. When sv48 or sv57 are the active mode,
DEFAULT_MAP_WINDOW is equal to VA_USER_SV48 right? So (VA_USER_SV48 -
DEFAULT_MAP_WINDOW) is 0, so not useful. And for the last case, when
the user asks for a sv39 address whereas the active mode is sv48 or
sv57, then (VA_USER_SV39 - DEFAULT_MAP_WINDOW) is negative and the
base is smaller which is not correct.
In the bottom-up case, we should preserve the base and I think that
again, only sv57 is the corner case to deal with.
> #else
> +#define DEFAULT_MAP_WINDOW TASK_SIZE
> #define STACK_TOP_MAX TASK_SIZE
> #endif
> #define STACK_ALIGN 16
>
> +#define STACK_TOP DEFAULT_MAP_WINDOW
> +
> +/*
> + * This decides where the kernel will search for a free chunk of vm
> + * space during mmap's.
> + */
> +#define TASK_UNMAPPED_BASE PAGE_ALIGN(DEFAULT_MAP_WINDOW / 3)
> +
> #ifndef __ASSEMBLY__
>
> struct task_struct;
> --
> 2.41.0
>
On Thu, Jul 20, 2023 at 10:13:54AM +0200, Alexandre Ghiti wrote:
> On Fri, Jul 14, 2023 at 6:55 PM Charlie Jenkins <[email protected]> wrote:
> >
> > Make sv48 the default address space for mmap as some applications
> > currently depend on this assumption. A hint address passed to mmap will
> > cause the largest address space that fits entirely into the hint to be
> > used. If the hint is less than or equal to 1<<38, an sv39 address will
> > be used. An exception is that if the hint address is 0, then a sv48
> > address will be used. After an address space is completely full, the next
> > smallest address space will be used.
> >
> > Signed-off-by: Charlie Jenkins <[email protected]>
> > ---
> > arch/riscv/include/asm/elf.h | 2 +-
> > arch/riscv/include/asm/pgtable.h | 12 +++++++-
> > arch/riscv/include/asm/processor.h | 46 +++++++++++++++++++++++++-----
> > 3 files changed, 51 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
> > index c24280774caf..5d3368d5585c 100644
> > --- a/arch/riscv/include/asm/elf.h
> > +++ b/arch/riscv/include/asm/elf.h
> > @@ -49,7 +49,7 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
> > * the loader. We need to make sure that it is out of the way of the program
> > * that it will "exec", and that there is sufficient room for the brk.
> > */
> > -#define ELF_ET_DYN_BASE ((TASK_SIZE / 3) * 2)
> > +#define ELF_ET_DYN_BASE ((DEFAULT_MAP_WINDOW / 3) * 2)
> >
> > #ifdef CONFIG_64BIT
> > #ifdef CONFIG_COMPAT
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index 75970ee2bda2..e13f5872bfe9 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -63,12 +63,22 @@
> > * position vmemmap directly below the VMALLOC region.
> > */
> > #ifdef CONFIG_64BIT
> > +#define VA_BITS_SV39 39
> > +#define VA_BITS_SV48 48
> > +#define VA_BITS_SV57 57
> > +
> > +#define VA_USER_SV39 (UL(1) << (VA_BITS_SV39 - 1))
> > +#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
> > +#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))
> > +
> > #define VA_BITS (pgtable_l5_enabled ? \
> > - 57 : (pgtable_l4_enabled ? 48 : 39))
> > + VA_BITS_SV57 : (pgtable_l4_enabled ? VA_BITS_SV48 : VA_BITS_SV39))
> > #else
> > #define VA_BITS 32
> > #endif
> >
> > +#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
> > +
> > #define VMEMMAP_SHIFT \
> > (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
> > #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
> > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > index c950a8d9edef..14a5396eed3d 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -13,20 +13,52 @@
> >
> > #include <asm/ptrace.h>
> >
> > -/*
> > - * This decides where the kernel will search for a free chunk of vm
> > - * space during mmap's.
> > - */
> > -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3)
> > -
> > -#define STACK_TOP TASK_SIZE
> > #ifdef CONFIG_64BIT
> > +#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > #define STACK_TOP_MAX TASK_SIZE_64
> > +
> > +#define arch_get_mmap_end(addr, len, flags) \
> > +({ \
> > + unsigned long mmap_end; \
> > + if ((addr) >= VA_USER_SV57) \
> > + mmap_end = STACK_TOP_MAX; \
> > + else if ((((addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
> > + mmap_end = VA_USER_SV48; \
> > + else if ((addr) == 0) \
> > + mmap_end = DEFAULT_MAP_WINDOW; \
> > + else \
> > + mmap_end = VA_USER_SV39; \
> > + mmap_end; \
> > +})
>
> What about the following instead:
>
> #define arch_get_mmap_end(addr, len, flags) \
> ({ \
> unsigned long mmap_end; \
> if ((addr) >= VA_USER_SV57) \
> mmap_end = STACK_TOP_MAX; \ // Maybe a comment here that
> says it returns the max user address of the current mode, not obvious
> at first sight.
> else \
> mmap_end = DEFAULT_MAP_WINDOW; \
> mmap_end; \
> })
>
> The only corner case is when sv57 is active, then only a hint greater
> than VA_USER_SV57 can return a sv57 user address. Otherwise, we just
> need to return the default mmap end right?
>
> > +
> > +#define arch_get_mmap_base(addr, base) \
> > +({ \
> > + unsigned long mmap_base; \
> > + if (((addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
> > + mmap_base = (base) + (VA_USER_SV57 - DEFAULT_MAP_WINDOW); \
> > + else if ((((addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
> > + mmap_base = (base) + (VA_USER_SV48 - DEFAULT_MAP_WINDOW); \
> > + else if ((addr) == 0) \
> > + mmap_base = (base); \
> > + else \
> > + mmap_base = (base) + (VA_USER_SV39 - DEFAULT_MAP_WINDOW); \
> > + mmap_base; \
> > +})
> > +
>
> From arch_pick_mmap_layout()
> (https://elixir.bootlin.com/linux/latest/source/mm/util.c#L433), the
> "base" argument is:
>
> - either STACK_TOP in top-down (more or less some random offset)
> - or TASK_UNMAPPED_BASE in bottom-up (more or less some random offset)
>
> When bottom-up is the current mode, we should not change the base, so
> adding (VA_USER_SV57 - DEFAULT_MAP_WINDOW) in the first case is not
> right for me. When sv48 or sv57 are the active mode,
> DEFAULT_MAP_WINDOW is equal to VA_USER_SV48 right? So (VA_USER_SV48 -
> DEFAULT_MAP_WINDOW) is 0, so not useful. And for the last case, when
> the user asks for a sv39 address whereas the active mode is sv48 or
> sv57, then (VA_USER_SV39 - DEFAULT_MAP_WINDOW) is negative and the
> base is smaller which is not correct.
>
In the bottom-up case mm->mmap_base is directly used instead of
arch_get_mmap_base so base will not be modified in that case. The math
here is confusing so I can refactor it. I like your suggestion to
extract out the rnd value of base with (base) - DEFAULT_MAP_WINDOW since
base is defined as PAGE_ALIGN(STACK_TOP - gap - rnd) and STACK_TOP is
DEFAULT_MAP_WINDOW. This (-gap-rnd) value can then directly be used
instead of redoing the math in each if.
> In the bottom-up case, we should preserve the base and I think that
> again, only sv57 is the corner case to deal with.
>
>
For both this comment and the one above, I think we should allow the
user to default back to sv38. I talked to Palmer and he would prefer
allowing selection of sv38. Since there is no overhead, there is no
need to prevent the user from doing so.
> > #else
> > +#define DEFAULT_MAP_WINDOW TASK_SIZE
> > #define STACK_TOP_MAX TASK_SIZE
> > #endif
> > #define STACK_ALIGN 16
> >
> > +#define STACK_TOP DEFAULT_MAP_WINDOW
> > +
> > +/*
> > + * This decides where the kernel will search for a free chunk of vm
> > + * space during mmap's.
> > + */
> > +#define TASK_UNMAPPED_BASE PAGE_ALIGN(DEFAULT_MAP_WINDOW / 3)
> > +
> > #ifndef __ASSEMBLY__
> >
> > struct task_struct;
> > --
> > 2.41.0
> >