The first patch fixes a bug in populate_pud that results in the
requested mapping not being fully installed when the CPU does not
support GB pages.
The next three are small cleanups.
Thanks.
Arvind Sankar (4):
x86/mm/pat: Handle no-GBPAGES case correctly in populate_pud
x86/mm/pat: Ensure that populate_pud only handles one PUD
x86/mm/pat: Propagate all errors out of populate_pud
x86/mm/pat: Make num_pages consistent in populate_{pte,pud,pgd}
arch/x86/include/asm/pgtable_types.h | 2 +-
arch/x86/mm/pat/set_memory.c | 74 +++++++++++++++++-----------
2 files changed, 45 insertions(+), 31 deletions(-)
--
2.24.1
populate_pud tries to return the number of pages mapped so far if
populate_pmd fails. This is of dubious utility: if populate_pmd did any
work before failing, the returned number of pages will be inconsistent
with cpa->pfn, and the loop in __change_page_attr_set_clr will retry
with that inconsistent state. Further, if the number of pages mapped
before the failure is zero, the zero return triggers the BUG_ON in
__change_page_attr_set_clr.
Just return all errors up the stack and let the original caller deal
with it.
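For reference, a simplified sketch of the caller's retry loop
(paraphrased from __change_page_attr_set_clr, not the verbatim code):

	while (rempages) {
		cpa->numpages = rempages;

		ret = __change_page_attr(cpa, checkalias);
		if (ret)
			goto out;

		/* A zero or stale cpa->numpages trips this: */
		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
		rempages -= cpa->numpages;
		cpa->curpage += cpa->numpages;
	}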
Signed-off-by: Arvind Sankar <[email protected]>
---
arch/x86/mm/pat/set_memory.c | 43 ++++++++++++++++--------------------
1 file changed, 19 insertions(+), 24 deletions(-)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index a1003bc9fdf6..2f98423ef69a 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1247,9 +1247,9 @@ static void populate_pte(struct cpa_data *cpa,
}
}
-static long populate_pmd(struct cpa_data *cpa,
- unsigned long start, unsigned long end,
- unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+static int populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
long cur_pages = 0;
pmd_t *pmd;
@@ -1283,7 +1283,7 @@ static long populate_pmd(struct cpa_data *cpa,
* We mapped them all?
*/
if (num_pages == cur_pages)
- return cur_pages;
+ return 0;
pmd_pgprot = pgprot_4k_2_large(pgprot);
@@ -1318,7 +1318,7 @@ static long populate_pmd(struct cpa_data *cpa,
populate_pte(cpa, start, end, num_pages - cur_pages,
pmd, pgprot);
}
- return num_pages;
+ return 0;
}
static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
@@ -1328,6 +1328,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
unsigned long end;
long cur_pages = 0;
pgprot_t pud_pgprot;
+ int ret;
end = start + (cpa->numpages << PAGE_SHIFT);
@@ -1352,17 +1353,16 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
if (alloc_pmd_page(pud))
return -1;
- cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
- pud, pgprot);
- if (cur_pages < 0)
- return cur_pages;
+ ret = populate_pmd(cpa, start, pre_end, cur_pages, pud, pgprot);
+ if (ret < 0)
+ return ret;
start = pre_end;
}
/* We mapped them all? */
if (cpa->numpages == cur_pages)
- return cur_pages;
+ return 0;
pud = pud_offset(p4d, start);
pud_pgprot = pgprot_4k_2_large(pgprot);
@@ -1379,10 +1379,10 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
- if (populate_pmd(cpa, start, start + PUD_SIZE,
- PUD_SIZE >> PAGE_SHIFT,
- pud, pgprot) < 0)
- return cur_pages;
+ ret = populate_pmd(cpa, start, start + PUD_SIZE,
+ PUD_SIZE >> PAGE_SHIFT, pud, pgprot);
+ if (ret < 0)
+ return ret;
}
start += PUD_SIZE;
@@ -1392,21 +1392,17 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
/* Map trailing leftover */
if (start < end) {
- long tmp;
-
pud = pud_offset(p4d, start);
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
- tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+ ret = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
pud, pgprot);
- if (tmp < 0)
- return cur_pages;
-
- cur_pages += tmp;
+ if (ret < 0)
+ return ret;
}
- return cur_pages;
+ return 0;
}
/*
@@ -1419,7 +1415,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
pud_t *pud = NULL; /* shut up gcc */
p4d_t *p4d;
pgd_t *pgd_entry;
- long ret;
+ int ret;
unsigned long end, end_p4d;
pgd_entry = cpa->pgd + pgd_index(addr);
@@ -1468,7 +1464,6 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
return ret;
}
- cpa->numpages = ret;
return 0;
}
--
2.24.1
The number of pages is variously typed as int, unsigned int, long and
unsigned long in different places.
Change it to be consistently unsigned long.
Remove the unnecessary min(num_pages, cur_pages): pre_end has already
been clamped to start + (num_pages << PAGE_SHIFT), so cur_pages cannot
exceed num_pages. This gets rid of two conversions to int/unsigned int.
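For reference, the computation in populate_pmd (lightly abbreviated
from the existing code) is:

	pre_end   = start + (num_pages << PAGE_SHIFT);
	next_page = (start + PMD_SIZE) & PMD_MASK;
	pre_end   = min_t(unsigned long, pre_end, next_page);
	cur_pages = (pre_end - start) >> PAGE_SHIFT;

so cur_pages is already bounded above by num_pages.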
Signed-off-by: Arvind Sankar <[email protected]>
---
arch/x86/include/asm/pgtable_types.h | 2 +-
arch/x86/mm/pat/set_memory.c | 13 ++++++-------
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0239998d8cdc..894569255a95 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -574,7 +574,7 @@ extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
unsigned long address,
- unsigned numpages,
+ unsigned long numpages,
unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
unsigned long numpages);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 2f98423ef69a..51b64937cc16 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1230,7 +1230,7 @@ static int alloc_pmd_page(pud_t *pud)
static void populate_pte(struct cpa_data *cpa,
unsigned long start, unsigned long end,
- unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+ unsigned long num_pages, pmd_t *pmd, pgprot_t pgprot)
{
pte_t *pte;
@@ -1249,9 +1249,9 @@ static void populate_pte(struct cpa_data *cpa,
static int populate_pmd(struct cpa_data *cpa,
unsigned long start, unsigned long end,
- unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+ unsigned long num_pages, pud_t *pud, pgprot_t pgprot)
{
- long cur_pages = 0;
+ unsigned long cur_pages = 0;
pmd_t *pmd;
pgprot_t pmd_pgprot;
@@ -1264,7 +1264,6 @@ static int populate_pmd(struct cpa_data *cpa,
pre_end = min_t(unsigned long, pre_end, next_page);
cur_pages = (pre_end - start) >> PAGE_SHIFT;
- cur_pages = min_t(unsigned int, num_pages, cur_pages);
/*
* Need a PTE page?
@@ -1326,7 +1325,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
{
pud_t *pud;
unsigned long end;
- long cur_pages = 0;
+ unsigned long cur_pages = 0;
pgprot_t pud_pgprot;
int ret;
@@ -1342,7 +1341,6 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
pre_end = min_t(unsigned long, end, next_page);
cur_pages = (pre_end - start) >> PAGE_SHIFT;
- cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
pud = pud_offset(p4d, start);
@@ -2231,7 +2229,8 @@ bool kernel_page_present(struct page *page)
#endif /* CONFIG_HIBERNATION */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
- unsigned numpages, unsigned long page_flags)
+ unsigned long numpages,
+ unsigned long page_flags)
{
int retval = -EINVAL;
--
2.24.1
Commit d367cef0a7f0 ("x86/mm/pat: Fix boot crash when 1GB pages are not
supported by the CPU") added a check for CPU support of 1G pages before
using them.
However, when support is not present, nothing is done to map the
intermediate 1G regions: we fall straight through to the code that
normally maps the remainder after the 1G mappings have been made. That
code can only handle mappings that fit inside a single PUD entry, but
there is no check for this; it silently produces a corrupted mapping up
to the end of the PUD entry, no mapping at all beyond it, and still
returns success.
This bug is encountered on EFI machines in mixed mode (32-bit firmware
with 64-bit kernel), with RAM beyond 2G. The EFI support code
direct-maps all the RAM, so a memory range from below 1G to above 2G
triggers the bug and results in no mapping above 2G, and an incorrect
mapping in the 1G-2G range. If the kernel resides in the 1G-2G range, a
firmware call does not return correctly, and if it resides above 2G, we
end up passing addresses that are not mapped in the EFI pagetable.
Fix this by mapping the 1G regions using 2M pages when 1G page support
is not available.
Signed-off-by: Arvind Sankar <[email protected]>
---
arch/x86/mm/pat/set_memory.c | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index c4aedd00c1ba..d0b7b06253a5 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1370,12 +1370,22 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
/*
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
- while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
- set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
- canon_pgprot(pud_pgprot))));
+ while (end - start >= PUD_SIZE) {
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
+ cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
+ } else {
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+ if (populate_pmd(cpa, start, start + PUD_SIZE,
+ PUD_SIZE >> PAGE_SHIFT,
+ pud, pgprot) < 0)
+ return cur_pages;
+ }
start += PUD_SIZE;
- cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
cur_pages += PUD_SIZE >> PAGE_SHIFT;
pud++;
}
--
2.24.1
Add a check in populate_pgd to make sure that populate_pud is called on
a range that actually fits inside a PUD.
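To illustrate the clamp with made-up numbers (assuming 4-level paging,
where P4D_SIZE is 512 GiB): for addr = 0x7ffffff00000, i.e. 1 MiB below
the p4d boundary at 0x800000000000, and cpa->numpages = 1024, we get
end = 0x800000300000 but end_p4d = 0x800000000000, so cpa->numpages is
clamped to 0x100000 >> PAGE_SHIFT = 256 pages, and
__change_page_attr_set_clr loops to map the remaining 768.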
Signed-off-by: Arvind Sankar <[email protected]>
---
arch/x86/mm/pat/set_memory.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index d0b7b06253a5..a1003bc9fdf6 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1420,6 +1420,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
p4d_t *p4d;
pgd_t *pgd_entry;
long ret;
+ unsigned long end, end_p4d;
pgd_entry = cpa->pgd + pgd_index(addr);
@@ -1443,6 +1444,15 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
+ /*
+ * Ensure that the range fits inside one p4d entry. If a larger range
+ * was requested, __change_page_attr_set_clr will loop to finish it.
+ */
+ end = addr + (cpa->numpages << PAGE_SHIFT);
+ end_p4d = (addr + P4D_SIZE) & P4D_MASK;
+ if (end_p4d < end)
+ cpa->numpages = (end_p4d - addr) >> PAGE_SHIFT;
+
pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
--
2.24.1
On Tue, 3 Mar 2020 at 21:54, Arvind Sankar <[email protected]> wrote:
>
> [patch snipped]
>
I was trying to test these patches, and while they seem fine from a
regression point of view, I can't seem to reproduce this issue and
make it go away again by applying this patch.
Do you have any detailed instructions on how to reproduce this?
On Wed, Mar 04, 2020 at 09:17:44AM +0100, Ard Biesheuvel wrote:
> On Tue, 3 Mar 2020 at 21:54, Arvind Sankar <[email protected]> wrote:
> > [patch snipped]
>
> I was trying to test these patches, and while they seem fine from a
> regression point of view, I can't seem to reproduce this issue and
> make it go away again by applying this patch.
>
> Do you have any detailed instructions how to reproduce this?
>
The steps I'm following are:
- build x86_64 defconfig + enable EFI_PGT_DUMP (to show the incorrect
pagetable)
- run (QEMU is 4.2.0)
$ qemu-system-x86_64 -cpu Haswell -pflash qemu/OVMF_32.fd -m 3072 -nographic \
-kernel kernel64/arch/x86/boot/bzImage -append "earlyprintk=ttyS0,keep efi=debug nokaslr"
The EFI memory map I get is (abbreviated to regions of interest):
...
[ 0.253991] efi: mem10: [Conventional Memory| | | | | | | | | |WB|WT|WC|UC] range=[0x00000000053e7000-0x000000003fffbfff] (940MB)
[ 0.254424] efi: mem11: [Loader Data | | | | | | | | | |WB|WT|WC|UC] range=[0x000000003fffc000-0x000000003fffffff] (0MB)
[ 0.254991] efi: mem12: [Conventional Memory| | | | | | | | | |WB|WT|WC|UC] range=[0x0000000040000000-0x00000000bbf77fff] (1983MB)
...
The pagetable this produces is (abbreviated again):
...
[ 0.272980] 0x0000000003400000-0x0000000004800000 20M ro PSE x pmd
[ 0.273327] 0x0000000004800000-0x0000000005200000 10M RW PSE NX pmd
[ 0.273987] 0x0000000005200000-0x0000000005400000 2M RW NX pte
[ 0.274343] 0x0000000005400000-0x000000003fe00000 938M RW PSE NX pmd
[ 0.274725] 0x000000003fe00000-0x0000000040000000 2M RW NX pte
[ 0.275066] 0x0000000040000000-0x0000000080000000 1G RW PSE NX pmd
[ 0.275437] 0x0000000080000000-0x00000000bbe00000 958M pmd
...
Note how the 0x80000000-0xbbe00000 range is unmapped in the resulting
pagetable. The dump doesn't show physical addresses, but the
0x40000000-0x80000000 range is incorrectly mapped as well, since the
loop in populate_pmd just goes over that virtual address range twice:
while (end - start >= PMD_SIZE) {
...
pmd = pmd_offset(pud, start);
set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
cur_pages += PMD_SIZE >> PAGE_SHIFT;
}
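In other words (my reading of the code): populate_pmd gets called once
with start = 0x40000000 and end = 0xbbe00000, but the single pud entry
passed in only spans 0x40000000-0x80000000. Once start crosses
0x80000000, pmd_offset(pud, start) wraps back to the first slot of the
same PMD page (pmd_index is only 9 bits wide), so the pfns for the
region above 0x80000000 overwrite the entries just written for
0x40000000-0x80000000, and no mapping is ever installed above
0x80000000.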
On Wed, 4 Mar 2020 at 16:49, Arvind Sankar <[email protected]> wrote:
>
> [earlier discussion and reproduction steps snipped]
>
I've tried a couple of different ways, but I can't seem to get my
memory map organized in the way that will trigger the error.
On Wed, Mar 04, 2020 at 07:44:50PM +0100, Ard Biesheuvel wrote:
>
> I've tried a couple of different ways, but I can't seem to get my
> memory map organized in the way that will trigger the error.
What does yours look like? efi_merge_regions doesn't merge everything
that will eventually be mapped the same way, so if there are some
non-conventional memory regions scattered over the address space, it
might be breaking up the mappings to the point where this doesn't
trigger.
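(For reference, a paraphrased sketch of the merge criterion in
efi_merge_regions, from memory rather than verbatim: two descriptors
are only merged when their type and attribute match and they are
physically contiguous, i.e. roughly

	if (prev->type == md->type &&
	    prev->attribute == md->attribute &&
	    prev->phys_addr + (prev->num_pages << EFI_PAGE_SHIFT)
						== md->phys_addr)
		/* extend prev to cover md */

so any interleaved reserved/MMIO descriptor starts a new region.)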
On Wed, 4 Mar 2020 at 19:50, Arvind Sankar <[email protected]> wrote:
>
> On Wed, Mar 04, 2020 at 07:44:50PM +0100, Ard Biesheuvel wrote:
> >
> > I've tried a couple of different ways, but I can't seem to get my
> > memory map organized in the way that will trigger the error.
>
> What does yours look like? efi_merge_regions doesn't merge everything
> that will eventually be mapped the same way, so if there are some
> non-conventional memory regions scattered over the address space, it
> might be breaking up the mappings to the point where this doesn't
> trigger.
I have a region
[ 0.000000] efi: mem07: [Conventional Memory| | | | | | | | | |WB|WT|WC|UC] range=[0x0000000001400000-0x00000000b9855fff] (2948MB)
which gets covered correctly
[ 0.401766] 0x0000000000a00000-0x0000000040000000 1014M RW PSE NX pmd
[ 0.403436] 0x0000000040000000-0x0000000080000000 1G RW PSE NX pud
[ 0.404645] 0x0000000080000000-0x00000000b9800000 920M RW PSE NX pmd
[ 0.405844] 0x00000000b9800000-0x00000000b9a00000 2M RW NX pte
[ 0.407436] 0x00000000b9a00000-0x00000000baa00000 16M ro PSE x pmd
[ 0.408591] 0x00000000baa00000-0x00000000bbe00000 20M RW PSE NX pmd
[ 0.409751] 0x00000000bbe00000-0x00000000bc000000 2M RW NX pte
[ 0.410821] 0x00000000bc000000-0x00000000be600000 38M RW PSE NX pmd
However, the fact that you can provide a case where it does fail should
be sufficient justification for taking this patch. I was just trying to
give more than a regression-tested-by.
On Wed, Mar 04, 2020 at 08:04:04PM +0100, Ard Biesheuvel wrote:
> [earlier quotes snipped]
>
> I have a region
>
> [ 0.000000] efi: mem07: [Conventional Memory| | | | | | | | | |WB|WT|WC|UC] range=[0x0000000001400000-0x00000000b9855fff] (2948MB)
>
> which gets covered correctly
>
> [ 0.401766] 0x0000000000a00000-0x0000000040000000 1014M RW PSE NX pmd
> [ 0.403436] 0x0000000040000000-0x0000000080000000 1G RW PSE NX pud
> [ 0.404645] 0x0000000080000000-0x00000000b9800000 920M RW PSE NX pmd
> [ 0.405844] 0x00000000b9800000-0x00000000b9a00000 2M RW NX pte
> [ 0.407436] 0x00000000b9a00000-0x00000000baa00000 16M ro PSE x pmd
> [ 0.408591] 0x00000000baa00000-0x00000000bbe00000 20M RW PSE NX pmd
> [ 0.409751] 0x00000000bbe00000-0x00000000bc000000 2M RW NX pte
> [ 0.410821] 0x00000000bc000000-0x00000000be600000 38M RW PSE NX pmd
>
> However, the fact that you can provide a case where it does fail
> should be sufficient justification for taking this patch. I was just
> trying to give more than a regression-tested-by.
No, this case is exactly one that should break. But I think you're
running on a processor model that _does_ support GB pages, as shown by
the "pud" mapping there for the 1G-2G range.
At least for my version of qemu, -cpu Haswell does not enable the
pdpe1gb feature. Which cpu did you specify?
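(For reference: inside the guest, "grep pdpe1gb /proc/cpuinfo" shows
whether the feature ended up enabled, and if I remember the QEMU syntax
right, it can also be toggled explicitly with something like
"-cpu Haswell,pdpe1gb=off".)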
Thanks.
On Wed, 4 Mar 2020 at 20:10, Arvind Sankar <[email protected]> wrote:
>
> [earlier discussion snipped]
>
> No, this case is exactly one that should break. But I think you're
> running on a processor model that _does_ support GB pages, as shown by
> the "pud" mapping there for the 1G-2G range.
>
> At least for my version of qemu, -cpu Haswell does not enable the
> pdpe1gb feature. Which cpu did you specify?
>
The wrong one, obviously :-)
With Haswell, I get [before]
[ 0.368541] 0x0000000000900000-0x0000000000a00000 1M RW NX pte
[ 0.369118] 0x0000000000a00000-0x0000000080000000 2038M RW PSE NX pmd
[ 0.369592] 0x0000000080000000-0x00000000b9800000 920M pmd
[ 0.370177] 0x00000000b9800000-0x00000000b9856000 344K pte
[ 0.370649] 0x00000000b9856000-0x00000000b9a00000 1704K RW NX pte
[ 0.371066] 0x00000000b9a00000-0x00000000baa00000 16M ro PSE x pmd
and after
[ 0.349577] 0x0000000000a00000-0x0000000080000000 2038M RW PSE NX pmd
[ 0.350049] 0x0000000080000000-0x00000000b9800000 920M pmd
[ 0.350514] 0x00000000b9800000-0x00000000b9856000 344K pte
[ 0.351013] 0x00000000b9856000-0x00000000b9a00000 1704K RW NX pte
so I'm still doing something wrong, I think?
On Wed, Mar 04, 2020 at 08:22:36PM +0100, Ard Biesheuvel wrote:
> The wrong one, obviously :-)
>
> With Haswell, I get [before]
>
> [ 0.368541] 0x0000000000900000-0x0000000000a00000 1M RW NX pte
> [ 0.369118] 0x0000000000a00000-0x0000000080000000 2038M RW PSE NX pmd
> [ 0.369592] 0x0000000080000000-0x00000000b9800000 920M pmd
> [ 0.370177] 0x00000000b9800000-0x00000000b9856000 344K pte
^^ so this is showing the region that didn't get mapped, so you did
reproduce the issue.
> [ 0.370649] 0x00000000b9856000-0x00000000b9a00000 1704K RW NX pte
> [ 0.371066] 0x00000000b9a00000-0x00000000baa00000 16M ro PSE x pmd
>
> and after
>
> [ 0.349577] 0x0000000000a00000-0x0000000080000000 2038M RW PSE NX pmd
> [ 0.350049] 0x0000000080000000-0x00000000b9800000 920M pmd
> [ 0.350514] 0x00000000b9800000-0x00000000b9856000 344K pte
^^ but it didn't get fixed :( This region should now be mapped properly
with flags RW/NX.
> [ 0.351013] 0x00000000b9856000-0x00000000b9a00000 1704K RW NX pte
>
> so I'm still doing something wrong, I think?
You're *sure* the after is actually after? There seems to be no change
at all, the patch should have had some effect.
On Wed, 4 Mar 2020 at 20:54, Arvind Sankar <[email protected]> wrote:
>
> [earlier discussion snipped]
>
> You're *sure* the after is actually after? There seems to be no change
> at all, the patch should have had some effect.
Duh.
Yes, you are right. It was 'operator error'.
On Tue, 3 Mar 2020 at 21:54, Arvind Sankar <[email protected]> wrote:
>
> The first patch fixes a bug in populate_pud, which results in the
> requested mapping not being completely installed if the CPU does not
> support GB pages.
>
> The next three are small cleanups.
>
> Thanks.
>
> Arvind Sankar (4):
> x86/mm/pat: Handle no-GBPAGES case correctly in populate_pud
> x86/mm/pat: Ensure that populate_pud only handles one PUD
> x86/mm/pat: Propagate all errors out of populate_pud
> x86/mm/pat: Make num_pages consistent in populate_{pte,pud,pgd}
>
Acked-by: Ard Biesheuvel <[email protected]>
Tested-by: Ard Biesheuvel <[email protected]>
On Wed, Mar 04, 2020 at 10:51:01PM +0100, Ard Biesheuvel wrote:
> On Wed, 4 Mar 2020 at 20:54, Arvind Sankar <[email protected]> wrote:
> >
> > You're *sure* the after is actually after? There seems to be no change
> > at all, the patch should have had some effect.
>
> Duh.
>
> Yes, you are right. It was 'operator error'
Phew! Thanks for the test :)