2017-12-04 11:23:37

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

sme_populate_pgd() open-codes a lot of things that do not need to be
open-coded.

Let's rewrite it in a more stream-lined way.

This would also buy us boot-time switching between paging modes, once the
rest of the pieces are upstream.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---

The patch is only build tested. I don't have hardware. Tom, could you give it a try?

---
arch/x86/mm/mem_encrypt.c | 89 +++++++++++++++--------------------------------
1 file changed, 29 insertions(+), 60 deletions(-)

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index d9a9e9fc75dd..16038f7472ca 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -489,73 +489,42 @@ static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
unsigned long vaddr, pmdval_t pmd_val)
{
- pgd_t *pgd_p;
- p4d_t *p4d_p;
- pud_t *pud_p;
- pmd_t *pmd_p;
-
- pgd_p = pgd_base + pgd_index(vaddr);
- if (native_pgd_val(*pgd_p)) {
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
- p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- else
- pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- } else {
- pgd_t pgd;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p = pgtable_area;
- memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
- pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
-
- pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
- } else {
- pud_p = pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
- }
- native_set_pgd(pgd_p, pgd);
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_base + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ native_set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
}

- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p += p4d_index(vaddr);
- if (native_p4d_val(*p4d_p)) {
- pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
- } else {
- p4d_t p4d;
-
- pud_p = pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
- native_set_p4d(p4d_p, p4d);
- }
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ pud = pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ native_set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
}

- pud_p += pud_index(vaddr);
- if (native_pud_val(*pud_p)) {
- if (native_pud_val(*pud_p) & _PAGE_PSE)
- goto out;
-
- pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
- } else {
- pud_t pud;
-
- pmd_p = pgtable_area;
- memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
- pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
-
- pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
- native_set_pud(pud_p, pud);
+ pud = pud_offset(p4d, vaddr);
+ if (pud_none(*pud)) {
+ pmd = pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ native_set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
}
+ if (pud_large(*pud))
+ goto out;

- pmd_p += pmd_index(vaddr);
- if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
- native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_large(*pmd))
+ goto out;

+ native_set_pmd(pmd, native_make_pmd(pmd_val));
out:
return pgtable_area;
}
--
2.15.0


2017-12-04 14:19:27

by Tom Lendacky

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> sme_populate_pgd() open-codes a lot of things that are not needed to be
> open-coded.
>
> Let's rewrite it in a more stream-lined way.
>
> This would also buy us boot-time switching between support between
> paging modes, when rest of the pieces will be upstream.

Hi Kirill,

Unfortunately, some of these can't be changed. Functions such as
p4d_offset(), pud_offset(), etc., use non-identity mapped virtual addresses,
which cause failures at this point of the boot process. Also, calls such as __p4d(),
__pud(), etc., are part of the paravirt support and can't be used yet,
either. I can take a closer look at some of the others (p*d_none() and
p*d_large()) which make use of the native_ macros, but my worry would be
that these get changed in the future to the non-native calls and then
boot failures occur.

Thanks,
Tom

>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
>
> The patch is only build tested. I don't have hardware. Tom, could you give it a try?
>
> ---
> arch/x86/mm/mem_encrypt.c | 89 +++++++++++++++--------------------------------
> 1 file changed, 29 insertions(+), 60 deletions(-)
>
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index d9a9e9fc75dd..16038f7472ca 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -489,73 +489,42 @@ static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
> static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> unsigned long vaddr, pmdval_t pmd_val)
> {
> - pgd_t *pgd_p;
> - p4d_t *p4d_p;
> - pud_t *pud_p;
> - pmd_t *pmd_p;
> -
> - pgd_p = pgd_base + pgd_index(vaddr);
> - if (native_pgd_val(*pgd_p)) {
> - if (IS_ENABLED(CONFIG_X86_5LEVEL))
> - p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
> - else
> - pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
> - } else {
> - pgd_t pgd;
> -
> - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> - p4d_p = pgtable_area;
> - memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
> - pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
> -
> - pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
> - } else {
> - pud_p = pgtable_area;
> - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> -
> - pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
> - }
> - native_set_pgd(pgd_p, pgd);
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + pgd = pgd_base + pgd_index(vaddr);
> + if (pgd_none(*pgd)) {
> + p4d = pgtable_area;
> + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
> + pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
> + native_set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
> }
>
> - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> - p4d_p += p4d_index(vaddr);
> - if (native_p4d_val(*p4d_p)) {
> - pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
> - } else {
> - p4d_t p4d;
> -
> - pud_p = pgtable_area;
> - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> -
> - p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
> - native_set_p4d(p4d_p, p4d);
> - }
> + p4d = p4d_offset(pgd, vaddr);
> + if (p4d_none(*p4d)) {
> + pud = pgtable_area;
> + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
> + pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
> + native_set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
> }
>
> - pud_p += pud_index(vaddr);
> - if (native_pud_val(*pud_p)) {
> - if (native_pud_val(*pud_p) & _PAGE_PSE)
> - goto out;
> -
> - pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
> - } else {
> - pud_t pud;
> -
> - pmd_p = pgtable_area;
> - memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
> - pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
> -
> - pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
> - native_set_pud(pud_p, pud);
> + pud = pud_offset(p4d, vaddr);
> + if (pud_none(*pud)) {
> + pmd = pgtable_area;
> + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
> + pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
> + native_set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
> }
> + if (pud_large(*pud))
> + goto out;
>
> - pmd_p += pmd_index(vaddr);
> - if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
> - native_set_pmd(pmd_p, native_make_pmd(pmd_val));
> + pmd = pmd_offset(pud, vaddr);
> + if (pmd_large(*pmd))
> + goto out;
>
> + native_set_pmd(pmd, native_make_pmd(pmd_val));
> out:
> return pgtable_area;
> }
>

2017-12-04 14:58:00

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
> On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > sme_populate_pgd() open-codes a lot of things that are not needed to be
> > open-coded.
> >
> > Let's rewrite it in a more stream-lined way.
> >
> > This would also buy us boot-time switching between support between
> > paging modes, when rest of the pieces will be upstream.
>
> Hi Kirill,
>
> Unfortunately, some of these can't be changed. The use of p4d_offset(),
> pud_offset(), etc., use non-identity mapped virtual addresses which cause
> failures at this point of the boot process.

Wat? Virtual address is virtual address. p?d_offset() doesn't care about
what mapping you're using.

> Also, calls such as __p4d(), __pud(), etc., are part of the paravirt
> support and can't be used yet, either.

Yeah, I missed this. native_make_p?d() has to be used instead.

> I can take a closer look at some of the others (p*d_none() and
> p*d_large()) which make use of the native_ macros, but my worry would be
> that these get changed in the future to the non-native calls and then
> boot failures occur.

If you want to avoid paravirt altogether for the whole compilation unit, one
more option would be to put #undef CONFIG_PARAVIRT before all includes.
That's a hack, but it works. We already use this in the arch/x86/boot/compressed
code.

--
Kirill A. Shutemov

2017-12-04 16:00:53

by Tom Lendacky

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
>> On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
>>> sme_populate_pgd() open-codes a lot of things that are not needed to be
>>> open-coded.
>>>
>>> Let's rewrite it in a more stream-lined way.
>>>
>>> This would also buy us boot-time switching between support between
>>> paging modes, when rest of the pieces will be upstream.
>>
>> Hi Kirill,
>>
>> Unfortunately, some of these can't be changed. The use of p4d_offset(),
>> pud_offset(), etc., use non-identity mapped virtual addresses which cause
>> failures at this point of the boot process.
>
> Wat? Virtual address is virtual address. p?d_offset() doesn't care about
> what mapping you're using.

Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
which does a __va() returning a non-identity mapped address (0xffff88...).
Only identity mapped virtual addresses have been setup at this point, so
the use of that virtual address panics the kernel.

Thanks,
Tom

>
>> Also, calls such as __p4d(), __pud(), etc., are part of the paravirt
>> support and can't be used yet, either.
>
> Yeah, I missed this. native_make_p?d() has to be used instead.
>
>> I can take a closer look at some of the others (p*d_none() and
>> p*d_large()) which make use of the native_ macros, but my worry would be
>> that these get changed in the future to the non-native calls and then
>> boot failures occur.
>
> If you want to avoid paravirt altogher for whole compilation unit, one
> more option would be to put #undef CONFIG_PARAVIRT before all includes.
> That's hack, but it works. We already use this in arch/x86/boot/compressed
> code.
>

2017-12-04 16:35:24

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
> On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
> > On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
> > > On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > > > sme_populate_pgd() open-codes a lot of things that are not needed to be
> > > > open-coded.
> > > >
> > > > Let's rewrite it in a more stream-lined way.
> > > >
> > > > This would also buy us boot-time switching between support between
> > > > paging modes, when rest of the pieces will be upstream.
> > >
> > > Hi Kirill,
> > >
> > > Unfortunately, some of these can't be changed. The use of p4d_offset(),
> > > pud_offset(), etc., use non-identity mapped virtual addresses which cause
> > > failures at this point of the boot process.
> >
> > Wat? Virtual address is virtual address. p?d_offset() doesn't care about
> > what mapping you're using.
>
> Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
> which does a __va() returning a non-identity mapped address (0xffff88...).
> Only identity mapped virtual addresses have been setup at this point, so
> the use of that virtual address panics the kernel.

Stupid me. You are right.

What about something like this:

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index d9a9e9fc75dd..65e0d68f863f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -12,6 +12,23 @@

#define DISABLE_BRANCH_PROFILING

+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/*
+ * Special hack: we have to be careful, because no indirections are
+ * allowed here, and paravirt_ops is a kind of one. As it will only run in
+ * baremetal anyway, we just keep it from happening. (This list needs to
+ * be extended when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
+
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -489,73 +506,42 @@ static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
unsigned long vaddr, pmdval_t pmd_val)
{
- pgd_t *pgd_p;
- p4d_t *p4d_p;
- pud_t *pud_p;
- pmd_t *pmd_p;
-
- pgd_p = pgd_base + pgd_index(vaddr);
- if (native_pgd_val(*pgd_p)) {
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
- p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- else
- pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- } else {
- pgd_t pgd;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p = pgtable_area;
- memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
- pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
-
- pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
- } else {
- pud_p = pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
- }
- native_set_pgd(pgd_p, pgd);
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_base + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
}

- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p += p4d_index(vaddr);
- if (native_p4d_val(*p4d_p)) {
- pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
- } else {
- p4d_t p4d;
-
- pud_p = pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
- native_set_p4d(p4d_p, p4d);
- }
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ pud = pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
}

- pud_p += pud_index(vaddr);
- if (native_pud_val(*pud_p)) {
- if (native_pud_val(*pud_p) & _PAGE_PSE)
- goto out;
-
- pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
- } else {
- pud_t pud;
-
- pmd_p = pgtable_area;
- memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
- pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
-
- pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
- native_set_pud(pud_p, pud);
+ pud = pud_offset(p4d, vaddr);
+ if (pud_none(*pud)) {
+ pmd = pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
}
+ if (pud_large(*pud))
+ goto out;

- pmd_p += pmd_index(vaddr);
- if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
- native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_large(*pmd))
+ goto out;

+ set_pmd(pmd, __pmd(pmd_val));
out:
return pgtable_area;
}
--
Kirill A. Shutemov

2017-12-04 17:40:07

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On Mon, Dec 04, 2017 at 04:34:45PM +0000, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
> > On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
> > > On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
> > > > On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > > > > sme_populate_pgd() open-codes a lot of things that are not needed to be
> > > > > open-coded.
> > > > >
> > > > > Let's rewrite it in a more stream-lined way.
> > > > >
> > > > > This would also buy us boot-time switching between support between
> > > > > paging modes, when rest of the pieces will be upstream.
> > > >
> > > > Hi Kirill,
> > > >
> > > > Unfortunately, some of these can't be changed. The use of p4d_offset(),
> > > > pud_offset(), etc., use non-identity mapped virtual addresses which cause
> > > > failures at this point of the boot process.
> > >
> > > Wat? Virtual address is virtual address. p?d_offset() doesn't care about
> > > what mapping you're using.
> >
> > Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
> > which does a __va() returning a non-identity mapped address (0xffff88...).
> > Only identity mapped virtual addresses have been setup at this point, so
> > the use of that virtual address panics the kernel.
>
> Stupid me. You are right.
>
> What about something like this:

sme_pgtable_calc() also looks unnecessary complex.

Any objections on this:

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 65e0d68f863f..59b7d7ba9b37 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -548,8 +548,7 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,

static unsigned long __init sme_pgtable_calc(unsigned long len)
{
- unsigned long p4d_size, pud_size, pmd_size;
- unsigned long total;
+ unsigned long entries, tables;

/*
* Perform a relatively simplistic calculation of the pagetable
@@ -559,41 +558,25 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
* mappings. Incrementing the count for each covers the case where
* the addresses cross entries.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;

- total = p4d_size + pud_size + pmd_size;
+ entries = (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * PAGE_SIZE;
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * PAGE_SIZE;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * PAGE_SIZE;
+ entries += (DIV_ROUND_UP(len, PMD_SIZE) + 1) * PAGE_SIZE;

/*
* Now calculate the added pagetable structures needed to populate
* the new pagetables.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;

- total += p4d_size + pud_size + pmd_size;
+ tables = DIV_ROUND_UP(entries, PGDIR_SIZE) * PAGE_SIZE;
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * PAGE_SIZE;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * PAGE_SIZE;
+ tables += DIV_ROUND_UP(entries, PMD_SIZE) * PAGE_SIZE;

- return total;
+ return entries + tables;
}

void __init sme_encrypt_kernel(void)
--
Kirill A. Shutemov

2017-12-04 17:50:42

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On Mon, Dec 04, 2017 at 05:39:31PM +0000, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2017 at 04:34:45PM +0000, Kirill A. Shutemov wrote:
> > On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
> > > On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
> > > > On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
> > > > > On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > > > > > sme_populate_pgd() open-codes a lot of things that are not needed to be
> > > > > > open-coded.
> > > > > >
> > > > > > Let's rewrite it in a more stream-lined way.
> > > > > >
> > > > > > This would also buy us boot-time switching between support between
> > > > > > paging modes, when rest of the pieces will be upstream.
> > > > >
> > > > > Hi Kirill,
> > > > >
> > > > > Unfortunately, some of these can't be changed. The use of p4d_offset(),
> > > > > pud_offset(), etc., use non-identity mapped virtual addresses which cause
> > > > > failures at this point of the boot process.
> > > >
> > > > Wat? Virtual address is virtual address. p?d_offset() doesn't care about
> > > > what mapping you're using.
> > >
> > > Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
> > > which does a __va() returning a non-identity mapped address (0xffff88...).
> > > Only identity mapped virtual addresses have been setup at this point, so
> > > the use of that virtual address panics the kernel.
> >
> > Stupid me. You are right.
> >
> > What about something like this:
>
> sme_pgtable_calc() also looks unnecessary complex.
>
> Any objections on this:

Oops. Screwed up whitespaces.

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 65e0d68f863f..f7b6c7972884 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -548,8 +548,7 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,

static unsigned long __init sme_pgtable_calc(unsigned long len)
{
- unsigned long p4d_size, pud_size, pmd_size;
- unsigned long total;
+ unsigned long entries, tables;

/*
* Perform a relatively simplistic calculation of the pagetable
@@ -559,41 +558,25 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
* mappings. Incrementing the count for each covers the case where
* the addresses cross entries.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;

- total = p4d_size + pud_size + pmd_size;
+ entries = (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * PAGE_SIZE;
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * PAGE_SIZE;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * PAGE_SIZE;
+ entries += (DIV_ROUND_UP(len, PMD_SIZE) + 1) * PAGE_SIZE;

/*
* Now calculate the added pagetable structures needed to populate
* the new pagetables.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;

- total += p4d_size + pud_size + pmd_size;
+ tables = DIV_ROUND_UP(entries, PGDIR_SIZE) * PAGE_SIZE;
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * PAGE_SIZE;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * PAGE_SIZE;
+ tables += DIV_ROUND_UP(entries, PMD_SIZE) * PAGE_SIZE;

- return total;
+ return entries + tables;
}

void __init sme_encrypt_kernel(void)
--
Kirill A. Shutemov

2017-12-04 18:33:08

by Tom Lendacky

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On 12/4/2017 10:34 AM, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
>> On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
>>> On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
>>>> On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
>>>>> sme_populate_pgd() open-codes a lot of things that are not needed to be
>>>>> open-coded.
>>>>>
>>>>> Let's rewrite it in a more stream-lined way.
>>>>>
>>>>> This would also buy us boot-time switching between support between
>>>>> paging modes, when rest of the pieces will be upstream.
>>>>
>>>> Hi Kirill,
>>>>
>>>> Unfortunately, some of these can't be changed. The use of p4d_offset(),
>>>> pud_offset(), etc., use non-identity mapped virtual addresses which cause
>>>> failures at this point of the boot process.
>>>
>>> Wat? Virtual address is virtual address. p?d_offset() doesn't care about
>>> what mapping you're using.
>>
>> Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
>> which does a __va() returning a non-identity mapped address (0xffff88...).
>> Only identity mapped virtual addresses have been setup at this point, so
>> the use of that virtual address panics the kernel.
>
> Stupid me. You are right.
>
> What about something like this:
>
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index d9a9e9fc75dd..65e0d68f863f 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -12,6 +12,23 @@
>
> #define DISABLE_BRANCH_PROFILING
>
> +/*
> + * Since we're dealing with identity mappings, physical and virtual
> + * addresses are the same, so override these defines which are ultimately
> + * used by the headers in misc.h.
> + */
> +#define __pa(x) ((unsigned long)(x))
> +#define __va(x) ((void *)((unsigned long)(x)))

No, you can't do this. There are routines in this file that are called
after the kernel has switched to its standard virtual address map where
this definition of __va() will likely cause a failure.

> +
> +/*
> + * Special hack: we have to be careful, because no indirections are
> + * allowed here, and paravirt_ops is a kind of one. As it will only run in
> + * baremetal anyway, we just keep it from happening. (This list needs to
> + * be extended when new paravirt and debugging variants are added.)
> + */
> +#undef CONFIG_PARAVIRT
> +#undef CONFIG_PARAVIRT_SPINLOCKS

I'd really, really like to avoid doing something like this.

Thanks,
Tom

> +
> #include <linux/linkage.h>
> #include <linux/init.h>
> #include <linux/mm.h>
> @@ -489,73 +506,42 @@ static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
> static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> unsigned long vaddr, pmdval_t pmd_val)
> {
> - pgd_t *pgd_p;
> - p4d_t *p4d_p;
> - pud_t *pud_p;
> - pmd_t *pmd_p;
> -
> - pgd_p = pgd_base + pgd_index(vaddr);
> - if (native_pgd_val(*pgd_p)) {
> - if (IS_ENABLED(CONFIG_X86_5LEVEL))
> - p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
> - else
> - pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
> - } else {
> - pgd_t pgd;
> -
> - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> - p4d_p = pgtable_area;
> - memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
> - pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
> -
> - pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
> - } else {
> - pud_p = pgtable_area;
> - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> -
> - pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
> - }
> - native_set_pgd(pgd_p, pgd);
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + pgd = pgd_base + pgd_index(vaddr);
> + if (pgd_none(*pgd)) {
> + p4d = pgtable_area;
> + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
> + pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
> + set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
> }
>
> - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> - p4d_p += p4d_index(vaddr);
> - if (native_p4d_val(*p4d_p)) {
> - pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
> - } else {
> - p4d_t p4d;
> -
> - pud_p = pgtable_area;
> - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> -
> - p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
> - native_set_p4d(p4d_p, p4d);
> - }
> + p4d = p4d_offset(pgd, vaddr);
> + if (p4d_none(*p4d)) {
> + pud = pgtable_area;
> + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
> + pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
> + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
> }
>
> - pud_p += pud_index(vaddr);
> - if (native_pud_val(*pud_p)) {
> - if (native_pud_val(*pud_p) & _PAGE_PSE)
> - goto out;
> -
> - pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
> - } else {
> - pud_t pud;
> -
> - pmd_p = pgtable_area;
> - memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
> - pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
> -
> - pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
> - native_set_pud(pud_p, pud);
> + pud = pud_offset(p4d, vaddr);
> + if (pud_none(*pud)) {
> + pmd = pgtable_area;
> + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
> + pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
> + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
> }
> + if (pud_large(*pud))
> + goto out;
>
> - pmd_p += pmd_index(vaddr);
> - if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
> - native_set_pmd(pmd_p, native_make_pmd(pmd_val));
> + pmd = pmd_offset(pud, vaddr);
> + if (pmd_large(*pmd))
> + goto out;
>
> + set_pmd(pmd, __pmd(pmd_val));
> out:
> return pgtable_area;
> }
>

2017-12-04 18:50:32

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On Mon, Dec 04, 2017 at 12:33:01PM -0600, Tom Lendacky wrote:
> On 12/4/2017 10:34 AM, Kirill A. Shutemov wrote:
> > On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
> > > On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
> > > > On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
> > > > > On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > > > > > sme_populate_pgd() open-codes a lot of things that are not needed to be
> > > > > > open-coded.
> > > > > >
> > > > > > Let's rewrite it in a more stream-lined way.
> > > > > >
> > > > > > This would also buy us boot-time switching between support between
> > > > > > paging modes, when rest of the pieces will be upstream.
> > > > >
> > > > > Hi Kirill,
> > > > >
> > > > > Unfortunately, some of these can't be changed. The use of p4d_offset(),
> > > > > pud_offset(), etc., use non-identity mapped virtual addresses which cause
> > > > > failures at this point of the boot process.
> > > >
> > > > Wat? Virtual address is virtual address. p?d_offset() doesn't care about
> > > > what mapping you're using.
> > >
> > > Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
> > > which does a __va() returning a non-identity mapped address (0xffff88...).
> > > Only identity mapped virtual addresses have been setup at this point, so
> > > the use of that virtual address panics the kernel.
> >
> > Stupid me. You are right.
> >
> > What about something like this:
> >
> > diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> > index d9a9e9fc75dd..65e0d68f863f 100644
> > --- a/arch/x86/mm/mem_encrypt.c
> > +++ b/arch/x86/mm/mem_encrypt.c
> > @@ -12,6 +12,23 @@
> > #define DISABLE_BRANCH_PROFILING
> > +/*
> > + * Since we're dealing with identity mappings, physical and virtual
> > + * addresses are the same, so override these defines which are ultimately
> > + * used by the headers in misc.h.
> > + */
> > +#define __pa(x) ((unsigned long)(x))
> > +#define __va(x) ((void *)((unsigned long)(x)))
>
> No, you can't do this. There are routines in this file that are called
> after the kernel has switched to its standard virtual address map where
> this definition of __va() will likely cause a failure.

Let's then split it up into a separate compilation unit.

> > +/*
> > + * Special hack: we have to be careful, because no indirections are
> > + * allowed here, and paravirt_ops is a kind of one. As it will only run in
> > + * baremetal anyway, we just keep it from happening. (This list needs to
> > + * be extended when new paravirt and debugging variants are added.)
> > + */
> > +#undef CONFIG_PARAVIRT
> > +#undef CONFIG_PARAVIRT_SPINLOCKS
>
> I'd really, really like to avoid doing something like this.

Any other proposals?

Current code is way too hairy and hard to modify.

--
Kirill A. Shutemov

2017-12-08 14:37:53

by Tom Lendacky

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On 12/4/2017 11:39 AM, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2017 at 04:34:45PM +0000, Kirill A. Shutemov wrote:
>> On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
>>> On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
>>>> On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
>>>>> On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
>>>>>> sme_populate_pgd() open-codes a lot of things that do not need to be
>>>>>> open-coded.
>>>>>>
>>>>>> Let's rewrite it in a more streamlined way.
>>>>>>
>>>>>> This would also buy us boot-time switching between paging modes,
>>>>>> when the rest of the pieces will be upstream.
>>>>>
>>>>> Hi Kirill,
>>>>>
>>>>> Unfortunately, some of these can't be changed. The use of p4d_offset(),
>>>>> pud_offset(), etc., use non-identity mapped virtual addresses which cause
>>>>> failures at this point of the boot process.
>>>>
>>>> Wat? Virtual address is virtual address. p?d_offset() doesn't care about
>>>> what mapping you're using.
>>>
>>> Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
>>> which does a __va() returning a non-identity mapped address (0xffff88...).
>>> Only identity mapped virtual addresses have been setup at this point, so
>>> the use of that virtual address panics the kernel.
>>
>> Stupid me. You are right.
>>
>> What about something like this:
>
> sme_pgtable_calc() also looks unnecessary complex.

I have no objections to improving this (although I just submitted a patch
that modifies this area, so this will have to be updated now).

>
> Any objections on this:
>
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 65e0d68f863f..59b7d7ba9b37 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -548,8 +548,7 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
>
> static unsigned long __init sme_pgtable_calc(unsigned long len)
> {
> - unsigned long p4d_size, pud_size, pmd_size;
> - unsigned long total;
> + unsigned long entries, tables;
>
> /*
> * Perform a relatively simplistic calculation of the pagetable
> @@ -559,41 +558,25 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
> * mappings. Incrementing the count for each covers the case where
> * the addresses cross entries.
> */
> - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> - p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
> - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
> - pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
> - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> - } else {
> - p4d_size = 0;
> - pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
> - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> - }
> - pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
> - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
>
> - total = p4d_size + pud_size + pmd_size;
> + entries = (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * PAGE_SIZE;

I stayed away from using PAGE_SIZE directly because other areas/files used
the sizeof() * PTRS_PER_ and I was trying to be consistent. Not that the
size of a page table is ever likely to change, but maybe defining a macro
(similar to the one in mm/pgtable.c) would be best rather than using
PAGE_SIZE directly. Not required, just my opinion.

> + if (PTRS_PER_P4D > 1)
> + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * PAGE_SIZE;
> + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * PAGE_SIZE;
> + entries += (DIV_ROUND_UP(len, PMD_SIZE) + 1) * PAGE_SIZE;
>
> /*
> * Now calculate the added pagetable structures needed to populate
> * the new pagetables.
> */
> - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> - p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
> - pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
> - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> - } else {
> - p4d_size = 0;
> - pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> - }
> - pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
> - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
>
> - total += p4d_size + pud_size + pmd_size;
> + tables = DIV_ROUND_UP(entries, PGDIR_SIZE) * PAGE_SIZE;
> + if (PTRS_PER_P4D > 1)
> + tables += DIV_ROUND_UP(entries, P4D_SIZE) * PAGE_SIZE;
> + tables += DIV_ROUND_UP(entries, PUD_SIZE) * PAGE_SIZE;
> + tables += DIV_ROUND_UP(entries, PMD_SIZE) * PAGE_SIZE;
>
> - return total;
> + return entries + tables;
> }

It all looks reasonable, but I won't be able to test it for the next few
days.

Thanks,
Tom

>
> void __init sme_encrypt_kernel(void)
>

2017-12-08 14:44:06

by Tom Lendacky

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On 12/4/2017 12:50 PM, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2017 at 12:33:01PM -0600, Tom Lendacky wrote:
>> On 12/4/2017 10:34 AM, Kirill A. Shutemov wrote:
>>> On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
>>>> On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
>>>>> On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
>>>>>> On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > > > > > > sme_populate_pgd() open-codes a lot of things that do not need to be
> > > > > > > open-coded.
> > > > > > >
> > > > > > > Let's rewrite it in a more streamlined way.
> > > > > > >
> > > > > > > This would also buy us boot-time switching between paging modes,
> > > > > > > when the rest of the pieces will be upstream.
>>>>>>
>>>>>> Hi Kirill,
>>>>>>
>>>>>> Unfortunately, some of these can't be changed. The use of p4d_offset(),
>>>>>> pud_offset(), etc., use non-identity mapped virtual addresses which cause
>>>>>> failures at this point of the boot process.
>>>>>
>>>>> Wat? Virtual address is virtual address. p?d_offset() doesn't care about
>>>>> what mapping you're using.
>>>>
>>>> Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
>>>> which does a __va() returning a non-identity mapped address (0xffff88...).
>>>> Only identity mapped virtual addresses have been setup at this point, so
>>>> the use of that virtual address panics the kernel.
>>>
>>> Stupid me. You are right.
>>>
>>> What about something like this:
>>>
>>> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
>>> index d9a9e9fc75dd..65e0d68f863f 100644
>>> --- a/arch/x86/mm/mem_encrypt.c
>>> +++ b/arch/x86/mm/mem_encrypt.c
>>> @@ -12,6 +12,23 @@
>>> #define DISABLE_BRANCH_PROFILING
>>> +/*
>>> + * Since we're dealing with identity mappings, physical and virtual
>>> + * addresses are the same, so override these defines which are ultimately
>>> + * used by the headers in misc.h.
>>> + */
>>> +#define __pa(x) ((unsigned long)(x))
>>> +#define __va(x) ((void *)((unsigned long)(x)))
>>
>> No, you can't do this. There are routines in this file that are called
>> after the kernel has switched to its standard virtual address map where
>> this definition of __va() will likely cause a failure.
>
> Let's then split it up into a separate compilation unit.
>
>>> +/*
>>> + * Special hack: we have to be careful, because no indirections are
>>> + * allowed here, and paravirt_ops is a kind of one. As it will only run in
>>> + * baremetal anyway, we just keep it from happening. (This list needs to
>>> + * be extended when new paravirt and debugging variants are added.)
>>> + */
>>> +#undef CONFIG_PARAVIRT
>>> +#undef CONFIG_PARAVIRT_SPINLOCKS
>>
>> I'd really, really like to avoid doing something like this.
>
> Any other proposals?
>
> Current code is way too hairy and hard to modify.

I'd like to hear some other opinions, but if there are none, I don't see
an issue with splitting this up, I guess. I just dislike doing some of
these hacks, but if it makes things cleaner and simpler to understand,
then I guess I can't really object.

Thanks,
Tom

>

2017-12-12 11:41:13

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Rewrite sme_populate_pgd() in a more sensible way

On Fri, Dec 08, 2017 at 08:37:43AM -0600, Tom Lendacky wrote:
> On 12/4/2017 11:39 AM, Kirill A. Shutemov wrote:
> > On Mon, Dec 04, 2017 at 04:34:45PM +0000, Kirill A. Shutemov wrote:
> > > On Mon, Dec 04, 2017 at 04:00:26PM +0000, Tom Lendacky wrote:
> > > > On 12/4/2017 8:57 AM, Kirill A. Shutemov wrote:
> > > > > On Mon, Dec 04, 2017 at 08:19:11AM -0600, Tom Lendacky wrote:
> > > > > > On 12/4/2017 5:23 AM, Kirill A. Shutemov wrote:
> > > > > > > sme_populate_pgd() open-codes a lot of things that do not need to be
> > > > > > > open-coded.
> > > > > > >
> > > > > > > Let's rewrite it in a more streamlined way.
> > > > > > >
> > > > > > > This would also buy us boot-time switching between paging modes,
> > > > > > > when the rest of the pieces will be upstream.
> > > > > >
> > > > > > Hi Kirill,
> > > > > >
> > > > > > Unfortunately, some of these can't be changed. The use of p4d_offset(),
> > > > > > pud_offset(), etc., use non-identity mapped virtual addresses which cause
> > > > > > failures at this point of the boot process.
> > > > >
> > > > > Wat? Virtual address is virtual address. p?d_offset() doesn't care about
> > > > > what mapping you're using.
> > > >
> > > > Yes it does. For example, pmd_offset() issues a pud_page_addr() call,
> > > > which does a __va() returning a non-identity mapped address (0xffff88...).
> > > > Only identity mapped virtual addresses have been setup at this point, so
> > > > the use of that virtual address panics the kernel.
> > >
> > > Stupid me. You are right.
> > >
> > > What about something like this:
> >
> > sme_pgtable_calc() also looks unnecessary complex.
>
> I have no objections to improving this (although I just submitted a patch
> that modifies this area, so this will have to be updated now).

I'll post the patchset on top of your "SME: BSP/SME microcode update fix" patch.

> > Any objections on this:
> >
> > diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> > index 65e0d68f863f..59b7d7ba9b37 100644
> > --- a/arch/x86/mm/mem_encrypt.c
> > +++ b/arch/x86/mm/mem_encrypt.c
> > @@ -548,8 +548,7 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> > static unsigned long __init sme_pgtable_calc(unsigned long len)
> > {
> > - unsigned long p4d_size, pud_size, pmd_size;
> > - unsigned long total;
> > + unsigned long entries, tables;
> > /*
> > * Perform a relatively simplistic calculation of the pagetable
> > @@ -559,41 +558,25 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
> > * mappings. Incrementing the count for each covers the case where
> > * the addresses cross entries.
> > */
> > - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> > - p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
> > - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
> > - pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
> > - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> > - } else {
> > - p4d_size = 0;
> > - pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
> > - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> > - }
> > - pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
> > - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
> > - total = p4d_size + pud_size + pmd_size;
> > + entries = (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * PAGE_SIZE;
>
> I stayed away from using PAGE_SIZE directly because other areas/files used
> the sizeof() * PTRS_PER_ and I was trying to be consistent. Not that the
> size of a page table is ever likely to change, but maybe defining a macro
> (similar to the one in mm/pgtable.c) would be best rather than using
> PAGE_SIZE directly. Not required, just my opinion.

I've rewritten this with PTRS_PER_, although I don't think it matters much.

> > + if (PTRS_PER_P4D > 1)
> > + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * PAGE_SIZE;
> > + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * PAGE_SIZE;
> > + entries += (DIV_ROUND_UP(len, PMD_SIZE) + 1) * PAGE_SIZE;
> > /*
> > * Now calculate the added pagetable structures needed to populate
> > * the new pagetables.
> > */
> > - if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> > - p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> > - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
> > - pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
> > - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> > - } else {
> > - p4d_size = 0;
> > - pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> > - pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
> > - }
> > - pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
> > - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
> > - total += p4d_size + pud_size + pmd_size;
> > + tables = DIV_ROUND_UP(entries, PGDIR_SIZE) * PAGE_SIZE;
> > + if (PTRS_PER_P4D > 1)
> > + tables += DIV_ROUND_UP(entries, P4D_SIZE) * PAGE_SIZE;
> > + tables += DIV_ROUND_UP(entries, PUD_SIZE) * PAGE_SIZE;
> > + tables += DIV_ROUND_UP(entries, PMD_SIZE) * PAGE_SIZE;
> > - return total;
> > + return entries + tables;
> > }
>
> It all looks reasonable, but I won't be able to test it for the next few
> days.

No worries. Test when you'll get time for this.

--
Kirill A. Shutemov