Report available page shifts in arch independent manner, so that poor
userspace developers won't have to parse /proc/cpuinfo hunting for
arch-specific flag strings:
unsigned long val = getauxval(AT_PAGE_SHIFT_LIST);
while (val && (val & 255) != 30) {
val >>= 8;
}
if (val) {
page_size_1gib = true;
} else {
page_size_1gib = false;
}
Note!
This is strictly for userspace: if some page size is shut down due
to a kernel command line option or CPU bug workaround, then it must not
be reported in the aux vector!
x86_64 machine with 1 GiB pages:
$ hexdump -C /proc/self/auxv
00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
00000040 1d 00 00 00 00 00 00 00 0c 15 1e 00 00 00 00 00
x86_64 machine with 2MiB pages only:
$ hexdump -C /proc/self/auxv
00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
00000040 1d 00 00 00 00 00 00 00 0c 15 00 00 00 00 00 00
AT_PAGESZ is always 4096 which is not much information.
Signed-off-by: Alexey Dobriyan <[email protected]>
---
arch/x86/include/asm/elf.h | 13 +++++++++++++
fs/binfmt_elf.c | 3 +++
include/uapi/linux/auxvec.h | 17 +++++++++++++++++
3 files changed, 33 insertions(+)
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -358,6 +358,19 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+#define ARCH_AT_PAGE_SHIFT_LIST \
+ do { \
+ u32 val = 12; \
+ int s = 0; \
+ if (boot_cpu_has(X86_FEATURE_PSE)) { \
+ val |= 21 << (s += 8); \
+ } \
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) { \
+ val |= 30 << (s += 8); \
+ } \
+ NEW_AUX_ENT(AT_PAGE_SHIFT_LIST, val); \
+ } while (0)
+
#endif /* !CONFIG_X86_32 */
#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
#endif
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
+#ifdef ARCH_AT_PAGE_SHIFT_LIST
+ ARCH_AT_PAGE_SHIFT_LIST;
+#endif
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -33,6 +33,23 @@
#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
+/*
+ * Page sizes available for mmap(2) encoded as 1 page shift per byte in
+ * increasing order.
+ *
+ * Thus 32-bit systems get 4 shifts, 64-bit systems get 8 shifts tops.
+ *
+ * Example:
+ * x86_64 system with "pdpe1gb" reports 4 KiB, 2 MiB and 1 GiB page support.
+ *
+ * $ hexdump -C /proc/self/auxv
+ * 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
+ * 00000040 1d 00 00 00 00 00 00 00 0c 15 1e 00 00 00 00 00
+ *
+ * For 2^256 hugepage support please contact your Universe sales representative.
+ */
+#define AT_PAGE_SHIFT_LIST 29
+
#define AT_EXECFN 31 /* filename of program */
#ifndef AT_MINSIGSTKSZ
* Alexey Dobriyan:
> +/*
> + * Page sizes available for mmap(2) encoded as 1 page shift per byte in
> + * increasing order.
> + *
> + * Thus 32-bit systems get 4 shifts, 64-bit systems get 8 shifts tops.
Couldn't you use the bits in a long instead, to indicate which shifts
are present? That's always going to be enough.
Thanks,
Florian
On Tue, Dec 05, 2023 at 10:51:39AM +0100, Florian Weimer wrote:
> * Alexey Dobriyan:
>
> > +/*
> > + * Page sizes available for mmap(2) encoded as 1 page shift per byte in
> > + * increasing order.
> > + *
> > + * Thus 32-bit systems get 4 shifts, 64-bit systems get 8 shifts tops.
>
> Couldn't you use the bits in a long instead, to indicate which shifts
> are present? That's always going to be enough.
Yes!
I was so proud of myself for this line:
val |= 21 << (s += 8);
Now it is boring bitmask again :-)
Report available page shifts in arch independent manner, so that
userspace developers won't have to parse /proc/cpuinfo hunting
for arch specific strings:
Note!
This is strictly for userspace: if some page size is shut down due
to a kernel command line option or CPU bug workaround, then it must not
be reported in the aux vector!
x86_64 machine with 1 GiB pages:
00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
x86_64 machine with 2 MiB pages only:
00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
00000040 1d 00 00 00 00 00 00 00 00 10 20 00 00 00 00 00
AT_PAGESZ is always 4096 which is not that interesting.
Signed-off-by: Alexey Dobriyan <[email protected]>
---
v2: switch to 1 bit per page shift (bitmask)
arch/x86/include/asm/elf.h | 12 ++++++++++++
fs/binfmt_elf.c | 3 +++
include/uapi/linux/auxvec.h | 14 ++++++++++++++
3 files changed, 29 insertions(+)
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -358,6 +358,18 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+#define ARCH_AT_PAGE_SHIFT_MASK \
+ do { \
+ u32 val = 1 << 12; \
+ if (boot_cpu_has(X86_FEATURE_PSE)) { \
+ val |= 1 << 21; \
+ } \
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) { \
+ val |= 1 << 30; \
+ } \
+ NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, val); \
+ } while (0)
+
#endif /* !CONFIG_X86_32 */
#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
#endif
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
+#ifdef ARCH_AT_PAGE_SHIFT_MASK
+ ARCH_AT_PAGE_SHIFT_MASK;
+#endif
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -33,6 +33,20 @@
#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
+/*
+ * Page sizes available for mmap(2) encoded as bitmask.
+ *
+ * Example: x86_64 system with pse, pdpe1gb /proc/cpuinfo flags reports
+ * 4 KiB, 2 MiB and 1 GiB page support.
+ *
+ * $ hexdump -C /proc/self/auxv
+ * 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
+ * 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
+ *
+ * For 2^64 hugepage support please contact your Universe sales representative.
+ */
+#define AT_PAGE_SHIFT_MASK 29
+
#define AT_EXECFN 31 /* filename of program */
#ifndef AT_MINSIGSTKSZ
On Tue, Dec 05, 2023 at 07:01:34PM +0300, Alexey Dobriyan wrote:
> Report available page shifts in arch independent manner, so that
> userspace developers won't have to parse /proc/cpuinfo hunting
> for arch specific strings:
>
> Note!
>
> This is strictly for userspace, if some page size is shutdown due
> to kernel command line option or CPU bug workaround, than is must not
> be reported in aux vector!
Given Florian in CC, I assume this is something glibc would like to be
using? Please mention this in the commit log.
>
> x86_64 machine with 1 GiB pages:
>
> 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
>
> x86_64 machine with 2 MiB pages only:
>
> 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> 00000040 1d 00 00 00 00 00 00 00 00 10 20 00 00 00 00 00
>
> AT_PAGESZ is always 4096 which is not that interesting.
That's not always true. For example, see arm64:
arch/arm64/include/asm/elf.h:#define ELF_EXEC_PAGESIZE PAGE_SIZE
I'm not actually sure why x86 forces it to 4096. I'd need to go look
through the history there.
>
> Signed-off-by: Alexey Dobriyan <[email protected]>
> ---
>
> v2: switch to 1 bit per page shift (bitmask)
>
> arch/x86/include/asm/elf.h | 12 ++++++++++++
> fs/binfmt_elf.c | 3 +++
> include/uapi/linux/auxvec.h | 14 ++++++++++++++
> 3 files changed, 29 insertions(+)
>
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -358,6 +358,18 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
>
> #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
>
> +#define ARCH_AT_PAGE_SHIFT_MASK \
> + do { \
> + u32 val = 1 << 12; \
> + if (boot_cpu_has(X86_FEATURE_PSE)) { \
> + val |= 1 << 21; \
> + } \
> + if (boot_cpu_has(X86_FEATURE_GBPAGES)) { \
> + val |= 1 << 30; \
> + } \
> + NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, val); \
> + } while (0)
> +
> #endif /* !CONFIG_X86_32 */
Can't we have a generic ARCH_AT_PAGE_SHIFT_MASK too? Something like:
#ifndef ARCH_AT_PAGE_SHIFT_MASK
#define ARCH_AT_PAGE_SHIFT_MASK
NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, 1 << PAGE_SHIFT)
#endif
Or am I misunderstanding something here?
>
> #define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> #endif
> NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
> NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
> +#ifdef ARCH_AT_PAGE_SHIFT_MASK
> + ARCH_AT_PAGE_SHIFT_MASK;
> +#endif
That way we can avoid an #ifdef in the .c file.
> NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
> NEW_AUX_ENT(AT_PHDR, phdr_addr);
> NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
> --- a/include/uapi/linux/auxvec.h
> +++ b/include/uapi/linux/auxvec.h
> @@ -33,6 +33,20 @@
> #define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
> #define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
>
> +/*
> + * Page sizes available for mmap(2) encoded as bitmask.
> + *
> + * Example: x86_64 system with pse, pdpe1gb /proc/cpuinfo flags reports
> + * 4 KiB, 2 MiB and 1 GiB page support.
> + *
> + * $ hexdump -C /proc/self/auxv
FWIW, a more readable form is: $ LD_SHOW_AUXV=1 /bin/true
> + * 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> + * 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
> + *
> + * For 2^64 hugepage support please contact your Universe sales representative.
> + */
> +#define AT_PAGE_SHIFT_MASK 29
... hmm, why is 29 unused?
> +
> #define AT_EXECFN 31 /* filename of program */
>
> #ifndef AT_MINSIGSTKSZ
This will need a man page update for "getauxval" as well...
--
Kees Cook
* Kees Cook:
> On Tue, Dec 05, 2023 at 07:01:34PM +0300, Alexey Dobriyan wrote:
>> Report available page shifts in arch independent manner, so that
>> userspace developers won't have to parse /proc/cpuinfo hunting
>> for arch specific strings:
>>
>> Note!
>>
>> This is strictly for userspace, if some page size is shutdown due
>> to kernel command line option or CPU bug workaround, than is must not
>> be reported in aux vector!
>
> Given Florian in CC, I assume this is something glibc would like to be
> using? Please mention this in the commit log.
Nope, I just wrote a random drive-by comment on the first version.
>> x86_64 machine with 1 GiB pages:
>>
>> 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
>> 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
>>
>> x86_64 machine with 2 MiB pages only:
>>
>> 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
>> 00000040 1d 00 00 00 00 00 00 00 00 10 20 00 00 00 00 00
>>
>> AT_PAGESZ is always 4096 which is not that interesting.
>
> That's not always true. For example, see arm64:
> arch/arm64/include/asm/elf.h:#define ELF_EXEC_PAGESIZE PAGE_SIZE
I'm pretty sure the comment refers to the x86-64 situation. 8-)
> I'm not actually sure why x86 forces it to 4096. I'd need to go look
> through the history there.
On x86-64, page size 4096 is architectural. Likewise on s390x and a few
other architectures.
Thanks,
Florian
On Wed, Dec 06, 2023 at 10:05:36PM +0100, Florian Weimer wrote:
> * Kees Cook:
>
> > On Tue, Dec 05, 2023 at 07:01:34PM +0300, Alexey Dobriyan wrote:
> >> Report available page shifts in arch independent manner, so that
> >> userspace developers won't have to parse /proc/cpuinfo hunting
> >> for arch specific strings:
> >>
> >> Note!
> >>
> >> This is strictly for userspace, if some page size is shutdown due
> >> to kernel command line option or CPU bug workaround, than is must not
> >> be reported in aux vector!
> >
> > Given Florian in CC, I assume this is something glibc would like to be
> > using? Please mention this in the commit log.
>
> Nope, I just wrote a random drive-by comment on the first version.
Ah, okay. Then Alexey, who do you expect to be the consumer of this new
AT value?
--
Kees Cook
On Wed, Dec 06, 2023 at 12:47:27PM -0800, Kees Cook wrote:
> On Tue, Dec 05, 2023 at 07:01:34PM +0300, Alexey Dobriyan wrote:
> > Report available page shifts in arch independent manner, so that
> > userspace developers won't have to parse /proc/cpuinfo hunting
> > for arch specific strings:
> >
> > Note!
> >
> > This is strictly for userspace, if some page size is shutdown due
> > to kernel command line option or CPU bug workaround, than is must not
> > be reported in aux vector!
>
> Given Florian in CC, I assume this is something glibc would like to be
> using? Please mention this in the commit log.
glibc can use it. Main user is libhugetlbfs, I guess:
https://github.com/libhugetlbfs/libhugetlbfs/blob/master/hugeutils.c#L915
Loop inside getauxval() can run faster than opendir().
> > x86_64 machine with 1 GiB pages:
> >
> > 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> > 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
> >
> > x86_64 machine with 2 MiB pages only:
> >
> > 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> > 00000040 1d 00 00 00 00 00 00 00 00 10 20 00 00 00 00 00
> >
> > AT_PAGESZ is always 4096 which is not that interesting.
>
> That's not always true. For example, see arm64:
> arch/arm64/include/asm/elf.h:#define ELF_EXEC_PAGESIZE PAGE_SIZE
Yes, I'm x86_64 guy, AT_PAGESZ remark is about x86_64.
> I'm not actually sure why x86 forces it to 4096. I'd need to go look
> through the history there.
> > --- a/arch/x86/include/asm/elf.h
> > +++ b/arch/x86/include/asm/elf.h
> > @@ -358,6 +358,18 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
> >
> > #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
> >
> > +#define ARCH_AT_PAGE_SHIFT_MASK \
> > + do { \
> > + u32 val = 1 << 12; \
> > + if (boot_cpu_has(X86_FEATURE_PSE)) { \
> > + val |= 1 << 21; \
> > + } \
> > + if (boot_cpu_has(X86_FEATURE_GBPAGES)) { \
> > + val |= 1 << 30; \
> > + } \
> > + NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, val); \
> > + } while (0)
> > +
> > #endif /* !CONFIG_X86_32 */
>
> Can't we have a generic ARCH_AT_PAGE_SHIFT_MASK too? Something like:
>
> #ifndef ARCH_AT_PAGE_SHIFT_MASK
> #define ARCH_AT_PAGE_SHIFT_MASK
> NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, 1 << PAGE_SHIFT)
> #endif
>
> Or am I misunderstanding something here?
1) Arch maintainers can opt into this new way to report information at
their own pace.
2) AT_PAGE_SHIFT_MASK is about _all_ pagesizes supported by CPU.
Reporting just one is missing the point.
I'll clarify the comment: mmap() support requires many things, including
tests for hugetlbfs being mounted; this is about CPU support.
> > --- a/fs/binfmt_elf.c
> > +++ b/fs/binfmt_elf.c
> > @@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> > #endif
> > NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
> > NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
> > +#ifdef ARCH_AT_PAGE_SHIFT_MASK
> > + ARCH_AT_PAGE_SHIFT_MASK;
> > +#endif
>
> That way we can avoid an #ifdef in the .c file.
That's a false economy. ifdefs aren't bad inherently.
When all archs implement AT_PAGE_SHIFT_MASK, ifdef will be removed.
> > --- a/include/uapi/linux/auxvec.h
> > +++ b/include/uapi/linux/auxvec.h
> > @@ -33,6 +33,20 @@
> > #define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
> > #define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
> >
> > +/*
> > + * Page sizes available for mmap(2) encoded as bitmask.
> > + *
> > + * Example: x86_64 system with pse, pdpe1gb /proc/cpuinfo flags reports
> > + * 4 KiB, 2 MiB and 1 GiB page support.
> > + *
> > + * $ hexdump -C /proc/self/auxv
>
> FWIW, a more readable form is: $ LD_SHOW_AUXV=1 /bin/true
OK. It doesn't show new values as text, but OK.
> > + * 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> > + * 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
> > + *
> > + * For 2^64 hugepage support please contact your Universe sales representative.
> > + */
> > +#define AT_PAGE_SHIFT_MASK 29
>
> ... hmm, why is 29 unused?
>
> > +
> > #define AT_EXECFN 31 /* filename of program */
> >
> > #ifndef AT_MINSIGSTKSZ
>
> This will need a man page update for "getauxval" as well...
Hear, hear!
On Wed, Dec 06, 2023 at 01:09:01PM -0800, Kees Cook wrote:
> On Wed, Dec 06, 2023 at 10:05:36PM +0100, Florian Weimer wrote:
> > * Kees Cook:
> >
> > > On Tue, Dec 05, 2023 at 07:01:34PM +0300, Alexey Dobriyan wrote:
> > >> Report available page shifts in arch independent manner, so that
> > >> userspace developers won't have to parse /proc/cpuinfo hunting
> > >> for arch specific strings:
> > >>
> > >> Note!
> > >>
> > >> This is strictly for userspace, if some page size is shutdown due
> > >> to kernel command line option or CPU bug workaround, than is must not
> > >> be reported in aux vector!
> > >
> > > Given Florian in CC, I assume this is something glibc would like to be
> > > using? Please mention this in the commit log.
> >
> > Nope, I just wrote a random drive-by comment on the first version.
>
> Ah, okay. Then Alexey, who do you expect to be the consumer of this new
> AT value?
libhugetlbfs and everyone who is using 2 MiB pages.
New code should look like this:
#ifndef AT_PAGE_SHIFT_MASK
#define AT_PAGE_SHIFT_MASK 29
#endif
unsigned long val = getauxval(AT_PAGE_SHIFT_MASK);
if (val) {
g_page_size_2mib = val & (1UL << 21);
return;
}
// old 2 MiB page detection code
It is a few lines of fast code before the code they're already using.
* Alexey Dobriyan:
> On Wed, Dec 06, 2023 at 12:47:27PM -0800, Kees Cook wrote:
>> On Tue, Dec 05, 2023 at 07:01:34PM +0300, Alexey Dobriyan wrote:
>> > Report available page shifts in arch independent manner, so that
>> > userspace developers won't have to parse /proc/cpuinfo hunting
>> > for arch specific strings:
>> >
>> > Note!
>> >
>> > This is strictly for userspace, if some page size is shutdown due
>> > to kernel command line option or CPU bug workaround, than is must not
>> > be reported in aux vector!
>>
>> Given Florian in CC, I assume this is something glibc would like to be
>> using? Please mention this in the commit log.
>
> glibc can use it. Main user is libhugetlbfs, I guess:
>
> https://github.com/libhugetlbfs/libhugetlbfs/blob/master/hugeutils.c#L915
>
> Loop inside getauxval() can run faster than opendir().
Is libhugetlbfs still maintained? Last commit was three years ago?
Thanks,
Florian
Report available page shifts in arch independent manner, so that
userspace developers won't have to parse /proc/cpuinfo hunting
for arch specific strings.
Main users are supposed to be libhugetlbfs-like libraries which try
to abstract huge mappings across multiple architectures. Regular code
which queries hugepage support before using them benefits too because
it doesn't have to deal with descriptors and parsing sysfs hierarchies
while enjoying the simplicity and speed of getauxval(AT_PAGE_SHIFT_MASK).
Note!
This is strictly for userspace: if some page size is shut down due
to a kernel command line option or CPU bug workaround, then it must
not be reported in the aux vector!
x86_64 machine with 1 GiB pages:
00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
x86_64 machine with 2 MiB pages only:
00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
00000040 1d 00 00 00 00 00 00 00 00 10 20 00 00 00 00 00
AT_PAGESZ always reports only the single smallest page size, which is not interesting.
Signed-off-by: Alexey Dobriyan <[email protected]>
---
v3: better comment and changelog
v2: switch to 1 bit per page shift (bitmask), rename to ARCH_AT_PAGE_SHIFT_MASK
arch/x86/include/asm/elf.h | 12 ++++++++++++
fs/binfmt_elf.c | 3 +++
include/uapi/linux/auxvec.h | 13 +++++++++++++
3 files changed, 28 insertions(+)
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -358,6 +358,18 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+#define ARCH_AT_PAGE_SHIFT_MASK \
+ do { \
+ u32 val = 1 << 12; \
+ if (boot_cpu_has(X86_FEATURE_PSE)) { \
+ val |= 1 << 21; \
+ } \
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) { \
+ val |= 1 << 30; \
+ } \
+ NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, val); \
+ } while (0)
+
#endif /* !CONFIG_X86_32 */
#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
#endif
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
+#ifdef ARCH_AT_PAGE_SHIFT_MASK
+ ARCH_AT_PAGE_SHIFT_MASK;
+#endif
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -33,6 +33,19 @@
#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
+/*
+ * All page sizes supported by CPU encoded as bitmask.
+ *
+ * Example: x86_64 system with pse, pdpe1gb /proc/cpuinfo flags
+ * reports 4 KiB, 2 MiB and 1 GiB page support.
+ *
+ * $ LD_SHOW_AUXV=1 $(which true) | grep -e AT_PAGE_SHIFT_MASK
+ * AT_PAGE_SHIFT_MASK: 0x40201000
+ *
+ * For 2^64 hugepage support please contact your Universe sales representative.
+ */
+#define AT_PAGE_SHIFT_MASK 29
+
#define AT_EXECFN 31 /* filename of program */
#ifndef AT_MINSIGSTKSZ
On Thu, Dec 07, 2023 at 05:57:05PM +0300, Alexey Dobriyan wrote:
> On Wed, Dec 06, 2023 at 12:47:27PM -0800, Kees Cook wrote:
> > Can't we have a generic ARCH_AT_PAGE_SHIFT_MASK too? Something like:
> >
> > #ifndef ARCH_AT_PAGE_SHIFT_MASK
> > #define ARCH_AT_PAGE_SHIFT_MASK
> > NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, 1 << PAGE_SHIFT)
> > #endif
> >
> > Or am I misunderstanding something here?
>
> 1) Arch maintainers can opt into this new way to report information at
> their own pace.
>
> 2) AT_PAGE_SHIFT_MASK is about _all_ pagesizes supported by CPU.
> Reporting just one is missing the point.
>
> I'll clarify comment: mmap() support require many things including
> tests for hugetlbfs being mounted, this is about CPU support.
I significantly prefer APIs not being arch-specific, so I'd prefer we
always include AT_PAGE_SHIFT_MASK. For an architecture that doesn't
define its own ARCH_AT_PAGE_SHIFT_MASK, it's not _inaccurate_ to report
1 << PAGE_SHIFT, but it might be incomplete.
--
Kees Cook
* Kees Cook:
> I significantly prefer APIs not being arch-specific, so I'd prefer we
> always include AT_PAGE_SHIFT_MASK. For an architecture that doesn't
> define its own ARCH_AT_PAGE_SHIFT_MASK, it's not _inaccurate_ to report
> 1 << PAGE_SHIFT, but it might be incomplete.
The downside is that as an application programmer, I have to go and
chase for the information the legacy way if I encounter
getauxval(AT_PAGE_SHIFT_MASK) == getpagesize() for a longer time
because the interface does not signal the absence of any extended
page sizes.
Thanks,
Florian
On Fri, Dec 08, 2023 at 07:35:25PM +0100, Florian Weimer wrote:
> * Kees Cook:
>
> > I significantly prefer APIs not being arch-specific, so I'd prefer we
> > always include AT_PAGE_SHIFT_MASK. For an architecture that doesn't
> > define its own ARCH_AT_PAGE_SHIFT_MASK, it's not _inaccurate_ to report
> > 1 << PAGE_SHIFT, but it might be incomplete.
>
> The downside is that as an application programmer, I have to go and
> chase for the information the legacy way if I encounter
> getauxval(AT_PAGE_SHIFT_MASK) == getpagesize() for a longer time
> because the interface does not signal the absence of any extended
> page sizes.
Are there architectures besides x86 where AT_PAGE_SHIFT_MASK isn't a
single bit? If so, let's get them added now along with x86.
--
Kees Cook
On Fri, Dec 08, 2023 at 10:29:25AM -0800, Kees Cook wrote:
> On Thu, Dec 07, 2023 at 05:57:05PM +0300, Alexey Dobriyan wrote:
> > On Wed, Dec 06, 2023 at 12:47:27PM -0800, Kees Cook wrote:
> > > Can't we have a generic ARCH_AT_PAGE_SHIFT_MASK too? Something like:
> > >
> > > #ifndef ARCH_AT_PAGE_SHIFT_MASK
> > > #define ARCH_AT_PAGE_SHIFT_MASK
> > > NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, 1 << PAGE_SHIFT)
> > > #endif
> > >
> > > Or am I misunderstanding something here?
> >
> > 1) Arch maintainers can opt into this new way to report information at
> > their own pace.
> >
> > 2) AT_PAGE_SHIFT_MASK is about _all_ pagesizes supported by CPU.
> > Reporting just one is missing the point.
> >
> > I'll clarify comment: mmap() support require many things including
> > tests for hugetlbfs being mounted, this is about CPU support.
>
> I significantly prefer APIs not being arch-specific,
It will become arch-independent once all relevant archs opt-in.
I doubt anyone is writing new software for sparc or alpha.
> so I'd prefer we
> always include AT_PAGE_SHIFT_MASK. For an architecture that doesn't
> define its own ARCH_AT_PAGE_SHIFT_MASK, it's not _inaccurate_ to report
> 1 << PAGE_SHIFT, but it might be incomplete.
It is inaccurate if ARCH_AT_PAGE_SHIFT_MASK is defined as "_all_ page
shifts the CPU supports". The inaccurate version is called AT_PAGESZ, which lists
just 1 page size; there is no need for 2 inaccurate APIs.
On Thu, Dec 07, 2023 at 09:44:33PM +0300, Alexey Dobriyan wrote:
> Report available page shifts in arch independent manner, so that
> userspace developers won't have to parse /proc/cpuinfo hunting
> for arch specific strings.
>
> Main users are supposed to be libhugetlbfs-like libraries which try
> to abstract huge mappings across multiple architectures. Regular code
> which queries hugepage support before using them benefits too because
> it doesn't have to deal with descriptors and parsing sysfs hierarchies
> while enjoying the simplicity and speed of getauxval(AT_PAGE_SHIFT_MASK).
>
> Note!
>
> This is strictly for userspace, if some page size is shutdown due
> to kernel command line option or CPU bug workaround, than it must
> not be reported in aux vector!
>
> x86_64 machine with 1 GiB pages:
>
> 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> 00000040 1d 00 00 00 00 00 00 00 00 10 20 40 00 00 00 00
>
> x86_64 machine with 2 MiB pages only:
>
> 00000030 06 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00
> 00000040 1d 00 00 00 00 00 00 00 00 10 20 00 00 00 00 00
>
> AT_PAGESZ always reports one smallest page size which is not interesting.
>
> Signed-off-by: Alexey Dobriyan <[email protected]>
> ---
>
> v3: better comment and changelog
> v2: switch to page shifts, rename to ARCH_AT_PAGE_SHIFT_MASK
>
> arch/x86/include/asm/elf.h | 12 ++++++++++++
> fs/binfmt_elf.c | 3 +++
> include/uapi/linux/auxvec.h | 13 +++++++++++++
> 3 files changed, 28 insertions(+)
>
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -358,6 +358,18 @@ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
>
> #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
>
> +#define ARCH_AT_PAGE_SHIFT_MASK \
> + do { \
> + u32 val = 1 << 12; \
> + if (boot_cpu_has(X86_FEATURE_PSE)) { \
> + val |= 1 << 21; \
> + } \
> + if (boot_cpu_has(X86_FEATURE_GBPAGES)) { \
> + val |= 1 << 30; \
> + } \
> + NEW_AUX_ENT(AT_PAGE_SHIFT_MASK, val); \
> + } while (0)
> +
> #endif /* !CONFIG_X86_32 */
>
> #define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
If I can get an Ack from x86 maintainers for this, I can carry it in my
execve tree.
Thanks for the updates to the commit log and comments, it reads better
now.
-Kees
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> #endif
> NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
> NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
> +#ifdef ARCH_AT_PAGE_SHIFT_MASK
> + ARCH_AT_PAGE_SHIFT_MASK;
> +#endif
> NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
> NEW_AUX_ENT(AT_PHDR, phdr_addr);
> NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
> --- a/include/uapi/linux/auxvec.h
> +++ b/include/uapi/linux/auxvec.h
> @@ -33,6 +33,19 @@
> #define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
> #define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
>
> +/*
> + * All page sizes supported by CPU encoded as bitmask.
> + *
> + * Example: x86_64 system with pse, pdpe1gb /proc/cpuinfo flags
> + * reports 4 KiB, 2 MiB and 1 GiB page support.
> + *
> + * $ LD_SHOW_AUXV=1 $(which true) | grep -e AT_PAGE_SHIFT_MASK
> + * AT_PAGE_SHIFT_MASK: 0x40201000
> + *
> + * For 2^64 hugepage support please contact your Universe sales representative.
> + */
> +#define AT_PAGE_SHIFT_MASK 29
> +
> #define AT_EXECFN 31 /* filename of program */
>
> #ifndef AT_MINSIGSTKSZ
--
Kees Cook