In order for the kernel to be encrypted "in place" during boot, a workarea
outside of the kernel must be used. This SME workarea used during early
encryption of the kernel is situated on a 2MB boundary after the end of
the kernel text, data, etc. sections (_end). This works well during
initial boot of a compressed kernel because of the relocation used for
decompression of the kernel. But when performing a kexec boot, there's a
chance that the SME workarea may not be mapped by the kexec pagetables or
that some of the other data used by kexec could exist in this range.
Create a section for SME in vmlinux.lds.S. Position it after "_end", which
is after "__end_of_kernel_reserve", so that the memory will be reclaimed
during boot. Since this area is all zeroes, it compresses well. This new
section will be part of the kernel image, so kexec will account for it in
pagetable mappings and placement of data after the kernel.
Here's an example of a kernel size without and with the SME section:
without:
vmlinux: 36,501,616
bzImage: 6,497,344
100000000-47f37ffff : System RAM
1e4000000-1e47677d4 : Kernel code (0x7677d4)
1e47677d5-1e4e2e0bf : Kernel data (0x6c68ea)
1e5074000-1e5372fff : Kernel bss (0x2fefff)
with:
vmlinux: 44,419,408
bzImage: 6,503,136
880000000-c7ff7ffff : System RAM
8cf000000-8cf7677d4 : Kernel code (0x7677d4)
8cf7677d5-8cfe2e0bf : Kernel data (0x6c68ea)
8d0074000-8d0372fff : Kernel bss (0x2fefff)
Cc: Baoquan He <[email protected]>
Cc: Lianbo Jiang <[email protected]>
Signed-off-by: Tom Lendacky <[email protected]>
---
arch/x86/kernel/vmlinux.lds.S | 24 ++++++++++++++++++++++++
arch/x86/mm/mem_encrypt_identity.c | 22 ++++++++++++++++++++--
2 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ca2252ca6ad7..a7aa65b44c71 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -387,6 +387,30 @@ SECTIONS
. = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
_end = .;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * SME workarea section: Lives outside of the kernel proper (_text -
+ * _end) for performing in-place encryption of the kernel during boot.
+ *
+ * Resides after _end because even though the .brk section is after
+ * __end_of_kernel_reserve, the .brk section is later reserved as a
+ * part of the kernel. The SME workarea is used in very early boot code
+ * and not needed after that, so it is located after
+ * __end_of_kernel_reserve so that it will be discarded and become part
+ * of the available memory.
+ *
+ * Resides on a 2MB boundary to simplify the pagetable setup used for
+ * the encryption.
+ */
+ . = ALIGN(HPAGE_SIZE);
+ .sme : AT(ADDR(.sme) - LOAD_OFFSET) {
+ __sme_begin = .;
+ *(.sme)
+ . = ALIGN(HPAGE_SIZE);
+ __sme_end = .;
+ }
+#endif
+
STABS_DEBUG
DWARF_DEBUG
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 4aa9b1480866..c55c2ec8fb12 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -73,6 +73,19 @@ struct sme_populate_pgd_data {
unsigned long vaddr_end;
};
+/*
+ * This work area lives in the .sme section, which lives outside of
+ * the kernel proper. It is sized to hold the intermediate copy buffer
+ * and more than enough pagetable pages.
+ *
+ * By using this section, the kernel can be encrypted in place and we
+ * avoid any possibility of boot parameters or initramfs images being
+ * placed such that the in-place encryption logic overwrites them. This
+ * section is 2MB aligned to allow for simple pagetable setup using only
+ * PMD entries (see vmlinux.lds.S).
+ */
+static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.sme);
+
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
static char sme_cmdline_off[] __initdata = "off";
@@ -314,8 +327,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
}
#endif
- /* Set the encryption workarea to be immediately after the kernel */
- workarea_start = kernel_end;
+ /*
+ * We're running identity mapped, so we must obtain the address to the
+ * SME encryption workarea using rip-relative addressing.
+ */
+ asm ("lea sme_workarea(%%rip), %0"
+ : "=r" (workarea_start)
+ : "p" (sme_workarea));
/*
* Calculate required number of workarea bytes needed:
--
2.17.1
On Fri, Jun 14, 2019 at 09:15:19PM +0000, Lendacky, Thomas wrote:
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index ca2252ca6ad7..a7aa65b44c71 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -387,6 +387,30 @@ SECTIONS
> . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
> _end = .;
>
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> + /*
> + * SME workarea section: Lives outside of the kernel proper (_text -
> + * _end) for performing in-place encryption of the kernel during boot.
> + *
> + * Resides after _end because even though the .brk section is after
> + * __end_of_kernel_reserve, the .brk section is later reserved as a
> + * part of the kernel. It is used in very early boot code and not
> + * needed after that, so it is located after __end_of_kernel_reserve
> + * so that it will be discarded and become part of the available
> + * memory.
> + *
> + * Resides on a 2MB boundary to simplify the pagetable setup used for
> + * the encryption.
> + */
> + . = ALIGN(HPAGE_SIZE);
> + .sme : AT(ADDR(.sme) - LOAD_OFFSET) {
Should we call that section something more generic, such as
.early_scratch
or so?
Someone else might need something like that too, in the future...
Also, the DISCARDS sections do get freed at runtime so why not make it
part of the DISCARD section...?
> + __sme_begin = .;
> + *(.sme)
> + . = ALIGN(HPAGE_SIZE);
> + __sme_end = .;
> + }
> +#endif
> +
> STABS_DEBUG
> DWARF_DEBUG
>
> diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
> index 4aa9b1480866..c55c2ec8fb12 100644
> --- a/arch/x86/mm/mem_encrypt_identity.c
> +++ b/arch/x86/mm/mem_encrypt_identity.c
> @@ -73,6 +73,19 @@ struct sme_populate_pgd_data {
> unsigned long vaddr_end;
> };
>
> +/*
> + * This work area lives in the .sme section, which lives outside of
> + * the kernel proper. It is sized to hold the intermediate copy buffer
> + * and more than enough pagetable pages.
> + *
> + * By using this section, the kernel can be encrypted in place and we
replace that "we" with an impartial passive formulation.
Other than that, I like the commenting, very helpful!
Thx.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On 6/17/19 6:02 AM, Borislav Petkov wrote:
> On Fri, Jun 14, 2019 at 09:15:19PM +0000, Lendacky, Thomas wrote:
>> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>> index ca2252ca6ad7..a7aa65b44c71 100644
>> --- a/arch/x86/kernel/vmlinux.lds.S
>> +++ b/arch/x86/kernel/vmlinux.lds.S
>> @@ -387,6 +387,30 @@ SECTIONS
>> . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
>> _end = .;
>>
>> +#ifdef CONFIG_AMD_MEM_ENCRYPT
>> + /*
>> + * SME workarea section: Lives outside of the kernel proper (_text -
>> + * _end) for performing in-place encryption of the kernel during boot.
>> + *
>> + * Resides after _end because even though the .brk section is after
>> + * __end_of_kernel_reserve, the .brk section is later reserved as a
>> + * part of the kernel. It is used in very early boot code and not
>> + * needed after that, so it is located after __end_of_kernel_reserve
>> + * so that it will be discarded and become part of the available
>> + * memory.
>> + *
>> + * Resides on a 2MB boundary to simplify the pagetable setup used for
>> + * the encryption.
>> + */
>> + . = ALIGN(HPAGE_SIZE);
>> + .sme : AT(ADDR(.sme) - LOAD_OFFSET) {
>
> Should we call that section something more generic, such as
>
> .early_scratch
>
> or so?
>
> Someone else might need something like that too, in the future...
Whoever uses it in the future could rename it if desired. But I can do
that now. Is there a preferred name? I can leave it as .early_scratch
or .early_workarea.
>
> Also, the DISCARDS sections do get freed at runtime so why not make it
> part of the DISCARD section...?
I think it's easier to show the alignment requirements that SME has for
this section by having it be its own section.
>
>> + __sme_begin = .;
>> + *(.sme)
>> + . = ALIGN(HPAGE_SIZE);
>> + __sme_end = .;
>> + }
>> +#endif
>> +
>> STABS_DEBUG
>> DWARF_DEBUG
>>
>> diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
>> index 4aa9b1480866..c55c2ec8fb12 100644
>> --- a/arch/x86/mm/mem_encrypt_identity.c
>> +++ b/arch/x86/mm/mem_encrypt_identity.c
>> @@ -73,6 +73,19 @@ struct sme_populate_pgd_data {
>> unsigned long vaddr_end;
>> };
>>
>> +/*
>> + * This work area lives in the .sme section, which lives outside of
>> + * the kernel proper. It is sized to hold the intermediate copy buffer
>> + * and more than enough pagetable pages.
>> + *
>> + * By using this section, the kernel can be encrypted in place and we
>
> replace that "we" with an impartial passive formulation.
Ok.
>
> Other than that, I like the commenting, very helpful!
I'll send out a V3 with the comments addressed (after giving a bit of time
for name suggestions).
Thanks,
Tom
>
> Thx.
>
On Tue, Jun 18, 2019 at 01:49:13AM +0000, Lendacky, Thomas wrote:
> Whoever uses it in the future could rename it if desired. But I can do
> that now. Is there a preferred name? I can leave it as .early_scratch
> or .early_workarea.
So looking at readelf output of vmlinux, we already have .init.*
sections for stuff which gets freed after booting but I'm guessing we
can't have the SME scratch area in the middle because you need to be
able to say which range gets encrypted without encrypting the scratch
area itself...
But you could call it .init.scratch or so, so that it fits with the
existing naming convention for ranges which get freed after init.
> I think it's easier to show the alignment requirements that SME has for
> this section by having it be its own section.
Not only that, from ld.info:
" The special output section name '/DISCARD/' may be used to discard
input sections. Any input sections which are assigned to an output
section named '/DISCARD/' are not included in the output file."
but you want that section present in the output file.
Thx.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.