2018-06-25 03:18:11

by Baoquan He

Subject: [PATCH v2 0/2] x86/boot/KASLR: Skip specified number of 1GB huge pages when doing physical randomization

This is a regression fix. Luiz's team reported that 1GB huge page
allocation randomly gets one less page than requested when KASLR is
enabled. On their KVM guest with 4GB RAM, which has only one good 1GB
huge page, they found that the 1GB huge page allocation sometimes fails
after adding the following kernel options:

default_hugepagesz=1G hugepagesz=1G hugepages=1

This is because the kernel may be randomized into the only good 1GB huge
page.

Likewise, on bare-metal machines with larger memory, one less 1GB huge
page might be seen with KASLR enabled than with 'nokaslr' specified,
again because the kernel might be randomized into one of the good 1GB
huge pages.

The solution in this patchset is to skip the specified number of 1GB huge
pages when doing kernel physical randomization. If the specified number
of 1GB huge pages is larger than the number of good 1GB huge pages the
system can provide, the behavior is consistent with the current huge page
implementation.
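
As a concrete illustration (using the e820 numbers shown in patch 2/2):
on the 4GB KVM guest, usable RAM below 4GB spans [0x100000, 0xbffdffff],
which contains exactly one 1GB-aligned, 1GB-sized range, [0x40000000,
0x7fffffff]. With 'hugepages=1' specified, the randomization code now
treats that range as off-limits for the kernel image, so the subsequent
1GB huge page reservation can succeed regardless of where the kernel
lands.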

v1->v2:
  Fix several code style problems and typos pointed out by Ingo.

Baoquan He (2):
x86/boot/KASLR: Add two functions for 1GB huge pages handling
x86/boot/KASLR: Skip specified number of 1GB huge pages when doing
  physical randomization

arch/x86/boot/compressed/kaslr.c | 96 +++++++++++++++++++++++++++++++++++++---
1 file changed, 91 insertions(+), 5 deletions(-)

--
2.13.6



2018-06-25 03:18:19

by Baoquan He

Subject: [PATCH v2 1/2] x86/boot/KASLR: Add two functions for 1GB huge pages handling

Introduce two functions, parse_gb_huge_pages() and
process_gb_huge_pages(), to handle the conflict between KASLR and
1GB huge pages; they will be used in the next patch.

parse_gb_huge_pages() parses the kernel command-line to get the
number of 1GB huge pages that have been specified. A static global
variable 'max_gb_huge_pages' is added to store the number.

process_gb_huge_pages() skips as many 1GB huge pages as possible
in the passed-in memory region, according to the specified number.
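
To illustrate the split that process_gb_huge_pages() performs, below is
a minimal user-space sketch (an illustration only, not the kernel code;
the ALIGN_UP() helper and the example region values are assumptions
picked for demonstration):

    /* User-space sketch of the process_gb_huge_pages() split. */
    #include <stdio.h>

    #define PUD_SIZE       (1UL << 30)  /* 1GB, as in the kernel */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            /* Example: the usable [0x100000, 0xbffdffff] range of a 4GB guest. */
            unsigned long start = 0x100000;
            unsigned long size  = 0xbffdffff - 0x100000 + 1;
            unsigned long want  = 1;        /* hugepages=1 */
            unsigned long addr  = ALIGN_UP(start, PUD_SIZE);
            unsigned long rest  = (addr < start + size) ? size - (addr - start) : 0;
            unsigned long i     = 0;

            /* Count how many aligned 1GB pages can be carved out. */
            while (rest > PUD_SIZE && want) {
                    rest -= PUD_SIZE;
                    want--;
                    i++;
            }

            printf("head slot region: [%#lx, %#lx)\n", start, addr);
            printf("skipped %lu 1GB page(s) starting at %#lx\n", i, addr);
            printf("tail slot region: [%#lx, %#lx)\n",
                   addr + i * PUD_SIZE, start + size);
            return 0;
    }

On these example numbers it reports a head region below 0x40000000 and a
tail region starting at 0x80000000, with the single 1GB page
[0x40000000, 0x7fffffff] excluded from randomization.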

Signed-off-by: Baoquan He <[email protected]>
---
arch/x86/boot/compressed/kaslr.c | 83 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index b87a7582853d..0fea96f9cc28 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -215,6 +215,35 @@ static void mem_avoid_memmap(char *str)
         memmap_too_large = true;
 }
 
+/* Store the number of 1GB huge pages which users specified. */
+static unsigned long max_gb_huge_pages;
+
+static void parse_gb_huge_pages(char *param, char *val)
+{
+        static bool gbpage_sz;
+        char *p;
+
+        if (!strcmp(param, "hugepagesz")) {
+                p = val;
+                if (memparse(p, &p) != PUD_SIZE) {
+                        gbpage_sz = false;
+                        return;
+                }
+
+                if (gbpage_sz)
+                        warn("Repeatedly set hugeTLB page size of 1G!\n");
+                gbpage_sz = true;
+                return;
+        }
+
+        if (!strcmp(param, "hugepages") && gbpage_sz) {
+                p = val;
+                max_gb_huge_pages = simple_strtoull(p, &p, 0);
+                return;
+        }
+}
+
+
 static int handle_mem_memmap(void)
 {
         char *args = (char *)get_cmd_line_ptr();
@@ -466,6 +495,60 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
         }
 }
 
+/*
+ * Skip as many 1GB huge pages as possible in the passed region
+ * according to the number which users specified.
+ */
+static void
+process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
+{
+        unsigned long addr, size = 0;
+        struct mem_vector tmp;
+        int i = 0;
+
+        if (!max_gb_huge_pages) {
+                store_slot_info(region, image_size);
+                return;
+        }
+
+        addr = ALIGN(region->start, PUD_SIZE);
+        /* Did we raise the address above the passed in memory entry? */
+        if (addr < region->start + region->size)
+                size = region->size - (addr - region->start);
+
+        /* Check how many 1GB huge pages can be filtered out */
+        while (size > PUD_SIZE && max_gb_huge_pages) {
+                size -= PUD_SIZE;
+                max_gb_huge_pages--;
+                i++;
+        }
+
+        /* No good 1GB huge pages found. */
+        if (!i) {
+                store_slot_info(region, image_size);
+                return;
+        }
+
+        /*
+         * Skip those 'i'*1GB good huge pages. And continue checking and
+         * processing the remaining head or tail part of the passed region
+         * if available.
+         */
+
+        if (addr >= region->start + image_size) {
+                tmp.start = region->start;
+                tmp.size = addr - region->start;
+                store_slot_info(&tmp, image_size);
+        }
+
+        size = region->size - (addr - region->start) - i * PUD_SIZE;
+        if (size >= image_size) {
+                tmp.start = addr + i * PUD_SIZE;
+                tmp.size = size;
+                store_slot_info(&tmp, image_size);
+        }
+}
+
 static unsigned long slots_fetch_random(void)
 {
         unsigned long slot;
--
2.13.6


2018-06-25 03:18:43

by Baoquan He

Subject: [PATCH v2 2/2] x86/boot/KASLR: Skip specified number of 1GB huge pages when doing physical randomization

A regression in 1GB huge page allocation can be triggered when KASLR is
enabled. On a KVM guest with 4GB RAM, add the following to the kernel
command-line:

'default_hugepagesz=1G hugepagesz=1G hugepages=1'

Then boot the guest and check the number of 1GB pages reserved:

# grep HugePages_Total /proc/meminfo

When booting with "nokaslr", HugePages_Total is always 1, while booting
without "nokaslr", HugePages_Total is sometimes 0 (that is, reserving
the 1GB page fails). Note that it may take a few boots to trigger the
issue.

After investigation, the root cause is that the kernel may be randomly
placed into the only good 1GB huge page, [0x40000000, 0x7fffffff]. Below
is a dmesg snippet from the KVM guest. Only the [0x40000000, 0x7fffffff]
region is a good 1GB huge page (the sole 1GB-aligned, 1GB-sized range
that fits inside usable memory below 4GB), while
[0x100000000, 0x13fffffff] will be touched by the memblock top-down
allocation:

[...] e820: BIOS-provided physical RAM map:
[...] BIOS-e820: [mem 0x0000000000000000-0x000000000009fbff] usable
[...] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved
[...] BIOS-e820: [mem 0x00000000000f0000-0x00000000000fffff] reserved
[...] BIOS-e820: [mem 0x0000000000100000-0x00000000bffdffff] usable
[...] BIOS-e820: [mem 0x00000000bffe0000-0x00000000bfffffff] reserved
[...] BIOS-e820: [mem 0x00000000feffc000-0x00000000feffffff] reserved
[...] BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved
[...] BIOS-e820: [mem 0x0000000100000000-0x000000013fffffff] usable
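
To make this concrete, here is a small standalone sketch (written for
this write-up; it is not part of the patch) that scans the usable ranges
above for 1GB-aligned, 1GB-sized candidates:

    #include <stdio.h>

    #define GB (1UL << 30)

    int main(void)
    {
            /* The "usable" ranges from the e820 map above, as [start, end]: */
            unsigned long usable[][2] = {
                    { 0x0,         0x9fbff     },
                    { 0x100000,    0xbffdffff  },
                    { 0x100000000, 0x13fffffff },
            };
            unsigned int i;

            for (i = 0; i < sizeof(usable) / sizeof(usable[0]); i++) {
                    /* Round the start up to the next 1GB boundary. */
                    unsigned long a = (usable[i][0] + GB - 1) & ~(GB - 1);

                    while (a + GB - 1 <= usable[i][1]) {
                            printf("candidate: [%#lx, %#lx]\n", a, a + GB - 1);
                            a += GB;
                    }
            }
            return 0;
    }

This prints exactly two candidates, [0x40000000, 0x7fffffff] and
[0x100000000, 0x13fffffff]; since the latter is consumed by the memblock
top-down allocation during early boot, only one good 1GB huge page
remains, and KASLR could previously place the kernel inside it.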

Besides, on bare-metal machines with larger memory, one less 1GB huge
page might be obtained with KASLR enabled, again because the kernel
might be randomized into one of the good 1GB huge pages.

To fix this, first parse the kernel command-line to get the number of
1GB huge pages specified. Then try to skip that many 1GB huge pages
when deciding which memory regions the kernel can be randomized into.

Also rename handle_mem_memmap() to handle_mem_options(), since it now
handles not only 'mem=' and 'memmap=' but also the 'hugepagesz=' and
'hugepages=' options.
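
The parsing order matters here: 'hugepages=' is honored only after a
preceding 'hugepagesz=1G'. Below is a hypothetical user-space trace of
that dispatch (function body simplified, with memparse() replaced by a
string compare; an illustration only):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int gbpage_sz;                   /* mirrors the static flag */
    static unsigned long max_gb_huge_pages;

    static void parse_gb_huge_pages(const char *param, const char *val)
    {
            if (!strcmp(param, "hugepagesz")) {
                    gbpage_sz = !strcmp(val, "1G");  /* stand-in for memparse() */
                    return;
            }
            if (!strcmp(param, "hugepages") && gbpage_sz)
                    max_gb_huge_pages = strtoul(val, NULL, 0);
    }

    int main(void)
    {
            /* 'param=val' pairs as split out of the example command-line: */
            parse_gb_huge_pages("default_hugepagesz", "1G"); /* no exact match: ignored */
            parse_gb_huge_pages("hugepagesz", "1G");         /* arms the 1G flag */
            parse_gb_huge_pages("hugepages", "1");           /* stores the count */

            printf("max_gb_huge_pages = %lu\n", max_gb_huge_pages); /* prints 1 */
            return 0;
    }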

Signed-off-by: Baoquan He <[email protected]>
---
arch/x86/boot/compressed/kaslr.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 0fea96f9cc28..ff8a865de36b 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -244,7 +244,7 @@ static void parse_gb_huge_pages(char *param, char *val)
 }
 
 
-static int handle_mem_memmap(void)
+static int handle_mem_options(void)
 {
         char *args = (char *)get_cmd_line_ptr();
         size_t len = strlen((char *)args);
@@ -252,7 +252,8 @@ static int handle_mem_memmap(void)
         char *param, *val;
         u64 mem_size;
 
-        if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+        if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
+            !strstr(args, "hugepages"))
                 return 0;
 
         tmp_cmdline = malloc(len + 1);
@@ -277,6 +278,8 @@ static int handle_mem_memmap(void)

                 if (!strcmp(param, "memmap")) {
                         mem_avoid_memmap(val);
+                } else if (strstr(param, "hugepages")) {
+                        parse_gb_huge_pages(param, val);
                 } else if (!strcmp(param, "mem")) {
                         char *p = val;

@@ -416,7 +419,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
         /* We don't need to set a mapping for setup_data. */
 
         /* Mark the memmap regions we need to avoid */
-        handle_mem_memmap();
+        handle_mem_options();
 
 #ifdef CONFIG_X86_VERBOSE_BOOTUP
         /* Make sure video RAM can be used. */
@@ -629,7 +632,7 @@ static void process_mem_region(struct mem_vector *entry,

         /* If nothing overlaps, store the region and return. */
         if (!mem_avoid_overlap(&region, &overlap)) {
-                store_slot_info(&region, image_size);
+                process_gb_huge_pages(&region, image_size);
                 return;
         }

@@ -639,7 +642,7 @@ static void process_mem_region(struct mem_vector *entry,

                 beginning.start = region.start;
                 beginning.size = overlap.start - region.start;
-                store_slot_info(&beginning, image_size);
+                process_gb_huge_pages(&beginning, image_size);
         }

         /* Return if overlap extends to or past end of region. */
--
2.13.6


2018-06-26 15:29:10

by Luiz Capitulino

Subject: Re: [PATCH v2 0/2] x86/boot/KASLR: Skip specified number of 1GB huge pages when doing physical randomization

On Mon, 25 Jun 2018 11:16:54 +0800
Baoquan He <[email protected]> wrote:

> This is a regression fix. Luiz's team reported that 1GB huge page
> allocation randomly gets one less page than requested when KASLR is
> enabled. On their KVM guest with 4GB RAM, which has only one good 1GB
> huge page, they found that the 1GB huge page allocation sometimes fails
> after adding the following kernel options:
>
> default_hugepagesz=1G hugepagesz=1G hugepages=1
>
> This is because the kernel may be randomized into the only good 1GB huge
> page.
>
> Likewise, on bare-metal machines with larger memory, one less 1GB huge
> page might be seen with KASLR enabled than with 'nokaslr' specified,
> again because the kernel might be randomized into one of the good 1GB
> huge pages.
>
> The solution in this patchset is to skip the specified number of 1GB huge
> pages when doing kernel physical randomization. If the specified number
> of 1GB huge pages is larger than the number of good 1GB huge pages the
> system can provide, the behavior is consistent with the current huge page
> implementation.

Reviewed-and-Tested-by: Luiz Capitulino <[email protected]>

>
> v1->v2:
>   Fix several code style problems and typos pointed out by Ingo.
>
> Baoquan He (2):
> x86/boot/KASLR: Add two functions for 1GB huge pages handling
> x86/boot/KASLR: Skip specified number of 1GB huge pages when doing
>   physical randomization
>
> arch/x86/boot/compressed/kaslr.c | 96 +++++++++++++++++++++++++++++++++++++---
> 1 file changed, 91 insertions(+), 5 deletions(-)
>


Subject: [tip:x86/boot] x86/boot/KASLR: Add two new functions for 1GB huge pages handling

Commit-ID: 9b912485e0e74a74e042e4f2dd87f262e46fcdf1
Gitweb: https://git.kernel.org/tip/9b912485e0e74a74e042e4f2dd87f262e46fcdf1
Author: Baoquan He <[email protected]>
AuthorDate: Mon, 25 Jun 2018 11:16:55 +0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 3 Jul 2018 10:50:12 +0200

x86/boot/KASLR: Add two new functions for 1GB huge pages handling

Introduce two new functions: parse_gb_huge_pages() and process_gb_huge_pages(),
which handle the conflict between KASLR and 1GB huge pages.

These two functions will be used in the next patch:

 - parse_gb_huge_pages() is used to parse the kernel command-line and
   obtain how many 1GB huge pages have been specified. A static global
   variable 'max_gb_huge_pages' is added to store the number.

 - process_gb_huge_pages() is used to skip as many 1GB huge pages as
   possible from the passed-in memory region, according to the
   specified number.

Signed-off-by: Baoquan He <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/boot/compressed/kaslr.c | 83 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index b87a7582853d..d97647b5ffb7 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -215,6 +215,35 @@ static void mem_avoid_memmap(char *str)
         memmap_too_large = true;
 }
 
+/* Store the number of 1GB huge pages which users specified: */
+static unsigned long max_gb_huge_pages;
+
+static void parse_gb_huge_pages(char *param, char *val)
+{
+        static bool gbpage_sz;
+        char *p;
+
+        if (!strcmp(param, "hugepagesz")) {
+                p = val;
+                if (memparse(p, &p) != PUD_SIZE) {
+                        gbpage_sz = false;
+                        return;
+                }
+
+                if (gbpage_sz)
+                        warn("Repeatedly set hugeTLB page size of 1G!\n");
+                gbpage_sz = true;
+                return;
+        }
+
+        if (!strcmp(param, "hugepages") && gbpage_sz) {
+                p = val;
+                max_gb_huge_pages = simple_strtoull(p, &p, 0);
+                return;
+        }
+}
+
+
 static int handle_mem_memmap(void)
 {
         char *args = (char *)get_cmd_line_ptr();
@@ -466,6 +495,60 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
         }
 }
 
+/*
+ * Skip as many 1GB huge pages as possible in the passed region
+ * according to the number which users specified:
+ */
+static void
+process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
+{
+        unsigned long addr, size = 0;
+        struct mem_vector tmp;
+        int i = 0;
+
+        if (!max_gb_huge_pages) {
+                store_slot_info(region, image_size);
+                return;
+        }
+
+        addr = ALIGN(region->start, PUD_SIZE);
+        /* Did we raise the address above the passed in memory entry? */
+        if (addr < region->start + region->size)
+                size = region->size - (addr - region->start);
+
+        /* Check how many 1GB huge pages can be filtered out: */
+        while (size > PUD_SIZE && max_gb_huge_pages) {
+                size -= PUD_SIZE;
+                max_gb_huge_pages--;
+                i++;
+        }
+
+        /* No good 1GB huge pages found: */
+        if (!i) {
+                store_slot_info(region, image_size);
+                return;
+        }
+
+        /*
+         * Skip those 'i'*1GB good huge pages, and continue checking and
+         * processing the remaining head or tail part of the passed region
+         * if available.
+         */
+
+        if (addr >= region->start + image_size) {
+                tmp.start = region->start;
+                tmp.size = addr - region->start;
+                store_slot_info(&tmp, image_size);
+        }
+
+        size = region->size - (addr - region->start) - i * PUD_SIZE;
+        if (size >= image_size) {
+                tmp.start = addr + i * PUD_SIZE;
+                tmp.size = size;
+                store_slot_info(&tmp, image_size);
+        }
+}
+
 static unsigned long slots_fetch_random(void)
 {
         unsigned long slot;

Subject: [tip:x86/boot] x86/boot/KASLR: Skip specified number of 1GB huge pages when doing physical randomization (KASLR)

Commit-ID: 747ff6265db4c2b77e8c7384f8054916a0c1eb39
Gitweb: https://git.kernel.org/tip/747ff6265db4c2b77e8c7384f8054916a0c1eb39
Author: Baoquan He <[email protected]>
AuthorDate: Mon, 25 Jun 2018 11:16:56 +0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 3 Jul 2018 10:50:13 +0200

x86/boot/KASLR: Skip specified number of 1GB huge pages when doing physical randomization (KASLR)

When KASLR is enabled, 1GB huge page allocations might regress
sporadically.

To reproduce on a KVM guest with 4GB RAM:

- add the following options to the kernel command-line:

'default_hugepagesz=1G hugepagesz=1G hugepages=1'

- boot the guest and check number of 1GB pages reserved:

# grep HugePages_Total /proc/meminfo

- sporadically, every couple of bootups, the output of this command
  shows that HugePages_Total is always 1 when booting with "nokaslr",
  while booting without "nokaslr" sometimes leaves HugePages_Total at 0
  (that is, reserving the 1GB page failed).

Note that you may need to boot a few times to trigger the issue,
because it's somewhat non-deterministic.

The root cause is that the kernel may be randomly placed into the only
good 1GB huge page in the [0x40000000, 0x7fffffff] physical range.

Below is a dmesg output snippet from the KVM guest. We can see that only
the [0x40000000, 0x7fffffff] region is a good 1GB huge page;
[0x100000000, 0x13fffffff] will be touched by the memblock top-down allocation:

[...] e820: BIOS-provided physical RAM map:
[...] BIOS-e820: [mem 0x0000000000000000-0x000000000009fbff] usable
[...] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved
[...] BIOS-e820: [mem 0x00000000000f0000-0x00000000000fffff] reserved
[...] BIOS-e820: [mem 0x0000000000100000-0x00000000bffdffff] usable
[...] BIOS-e820: [mem 0x00000000bffe0000-0x00000000bfffffff] reserved
[...] BIOS-e820: [mem 0x00000000feffc000-0x00000000feffffff] reserved
[...] BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved
[...] BIOS-e820: [mem 0x0000000100000000-0x000000013fffffff] usable

Besides, on bare-metal machines with larger memory, one less 1GB huge page
might be available with KASLR enabled. That too is because the kernel
image might be randomized into those "good" 1GB huge pages.

To fix this, first parse the kernel command-line to get how many 1GB huge
pages are specified. Then try to skip the specified number of 1GB huge
pages when deciding which memory regions the kernel can be randomized into.

Also rename handle_mem_memmap() to handle_mem_options(), since it now
handles not only 'mem=' and 'memmap=', but also the 'hugepages' options.

Signed-off-by: Baoquan He <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
[ Rewrote the changelog, fixed style problems in the code. ]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/boot/compressed/kaslr.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index d97647b5ffb7..531c9876f573 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -244,7 +244,7 @@ static void parse_gb_huge_pages(char *param, char *val)
 }
 
 
-static int handle_mem_memmap(void)
+static int handle_mem_options(void)
 {
         char *args = (char *)get_cmd_line_ptr();
         size_t len = strlen((char *)args);
@@ -252,7 +252,8 @@ static int handle_mem_memmap(void)
         char *param, *val;
         u64 mem_size;
 
-        if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+        if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
+            !strstr(args, "hugepages"))
                 return 0;
 
         tmp_cmdline = malloc(len + 1);
@@ -277,6 +278,8 @@ static int handle_mem_memmap(void)

                 if (!strcmp(param, "memmap")) {
                         mem_avoid_memmap(val);
+                } else if (strstr(param, "hugepages")) {
+                        parse_gb_huge_pages(param, val);
                 } else if (!strcmp(param, "mem")) {
                         char *p = val;

@@ -416,7 +419,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
         /* We don't need to set a mapping for setup_data. */
 
         /* Mark the memmap regions we need to avoid */
-        handle_mem_memmap();
+        handle_mem_options();
 
 #ifdef CONFIG_X86_VERBOSE_BOOTUP
         /* Make sure video RAM can be used. */
@@ -629,7 +632,7 @@ static void process_mem_region(struct mem_vector *entry,

         /* If nothing overlaps, store the region and return. */
         if (!mem_avoid_overlap(&region, &overlap)) {
-                store_slot_info(&region, image_size);
+                process_gb_huge_pages(&region, image_size);
                 return;
         }

@@ -639,7 +642,7 @@ static void process_mem_region(struct mem_vector *entry,

                 beginning.start = region.start;
                 beginning.size = overlap.start - region.start;
-                store_slot_info(&beginning, image_size);
+                process_gb_huge_pages(&beginning, image_size);
         }

         /* Return if overlap extends to or past end of region. */