2024-05-28 09:59:13

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv11 10/19] x86/mm: Add callbacks to prepare encrypted memory for kexec

AMD SEV and Intel TDX guests allocate shared buffers for performing I/O.
This is done by allocating pages normally from the buddy allocator and
then converting them to shared using set_memory_decrypted().

On kexec, the second kernel is unaware of which memory has been
converted in this manner. It only sees E820_TYPE_RAM. Accessing shared
memory as private is fatal.

Therefore, the memory state must be reset to its original state before
starting the new kernel with kexec.

The process of converting shared memory back to private occurs in two
steps:

- enc_kexec_begin() stops new conversions.

- enc_kexec_finish() unshares all existing shared memory, reverting it
back to private.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Nikolay Borisov <[email protected]>
Reviewed-by: Kai Huang <[email protected]>
Tested-by: Tao Liu <[email protected]>
---
arch/x86/include/asm/x86_init.h | 9 +++++++++
arch/x86/kernel/crash.c | 12 ++++++++++++
arch/x86/kernel/reboot.c | 12 ++++++++++++
arch/x86/kernel/x86_init.c | 4 ++++
4 files changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 28ac3cb9b987..6cade48811cc 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -149,12 +149,21 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin Begin the two-step process of conversion shared memory back
+ * to private. It stops the new conversions from being started
+ * and waits in-flight conversions to finish, if possible.
+ * @enc_kexec_finish Finish the two-step process of conversion shared memory to
+ * private. All memory is private after the call.
+ * It called with all CPUs but one shutdown and interrupts
+ * disabled.
*/
struct x86_guest {
int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_begin)(bool crash);
+ void (*enc_kexec_finish)(void);
};

/**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f06501445cd9..74f6305eb9ec 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() get call after all but one CPU has been shut down
+ * and interrupts have been disabled. This only allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin(true);
+ x86_platform.guest.enc_kexec_finish();
+
crash_save_cpu(regs, safe_smp_processor_id());
}

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f3130f762784..097313147ad3 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)

void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin(false);
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}

static void __machine_emergency_restart(int emergency)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a7143bb7dd93..8a79fb505303 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,6 +138,8 @@ static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(bool crash) {}
+static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }

struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
},
};

--
2.43.0



2024-05-29 10:44:05

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCHv11 10/19] x86/mm: Add callbacks to prepare encrypted memory for kexec

On Tue, May 28, 2024 at 12:55:13PM +0300, Kirill A. Shutemov wrote:
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 28ac3cb9b987..6cade48811cc 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -149,12 +149,21 @@ struct x86_init_acpi {
> * @enc_status_change_finish Notify HV after the encryption status of a range is changed
> * @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
> * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
> + * @enc_kexec_begin Begin the two-step process of conversion shared memory back

s/conversion/converting/

> + * to private. It stops the new conversions from being started
> + * and waits in-flight conversions to finish, if possible.

Good.

Now add "The @crash parameter denotes whether the function is being
called in the crash shutdown path."

> + * @enc_kexec_finish Finish the two-step process of conversion shared memory to

s/conversion/converting/

> + * private. All memory is private after the call.

"... when the function returns."

> + * It called with all CPUs but one shutdown and interrupts
> + * disabled.

"It is called on only one CPU while the others are shut down and with
interrupts disabled."

> */
> struct x86_guest {
> int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
> int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
> bool (*enc_tlb_flush_required)(bool enc);
> bool (*enc_cache_flush_required)(void);
> + void (*enc_kexec_begin)(bool crash);
> + void (*enc_kexec_finish)(void);
> };
>
> /**
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index f06501445cd9..74f6305eb9ec 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
> #ifdef CONFIG_HPET_TIMER
> hpet_disable();
> #endif
> +
> + /*
> + * Non-crash kexec calls enc_kexec_begin() while scheduling is still
> + * active. This allows the callback to wait until all in-flight
> + * shared<->private conversions are complete. In a crash scenario,
> + * enc_kexec_begin() get call after all but one CPU has been shut down

"gets called" ... "have been shut down"

> + * and interrupts have been disabled. This only allows the callback to

only?

> + * detect a race with the conversion and report it.
> + */
> + x86_platform.guest.enc_kexec_begin(true);
> + x86_platform.guest.enc_kexec_finish();
> +

..

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2024-06-02 12:39:26

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv11.1 10/19] x86/mm: Add callbacks to prepare encrypted memory for kexec

AMD SEV and Intel TDX guests allocate shared buffers for performing I/O.
This is done by allocating pages normally from the buddy allocator and
then converting them to shared using set_memory_decrypted().

On kexec, the second kernel is unaware of which memory has been
converted in this manner. It only sees E820_TYPE_RAM. Accessing shared
memory as private is fatal.

Therefore, the memory state must be reset to its original state before
starting the new kernel with kexec.

The process of converting shared memory back to private occurs in two
steps:

- enc_kexec_begin() stops new conversions.

- enc_kexec_finish() unshares all existing shared memory, reverting it
back to private.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Nikolay Borisov <[email protected]>
Reviewed-by: Kai Huang <[email protected]>
Tested-by: Tao Liu <[email protected]>
---
arch/x86/include/asm/x86_init.h | 9 +++++++++
arch/x86/kernel/crash.c | 12 ++++++++++++
arch/x86/kernel/reboot.c | 12 ++++++++++++
arch/x86/kernel/x86_init.c | 4 ++++
4 files changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 28ac3cb9b987..6cade48811cc 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -149,12 +149,21 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin Begin the two-step process of conversion shared memory back
+ * to private. It stops the new conversions from being started
+ * and waits in-flight conversions to finish, if possible.
+ * @enc_kexec_finish Finish the two-step process of conversion shared memory to
+ * private. All memory is private after the call.
+ * It called with all CPUs but one shutdown and interrupts
+ * disabled.
*/
struct x86_guest {
int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_begin)(bool crash);
+ void (*enc_kexec_finish)(void);
};

/**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f06501445cd9..74f6305eb9ec 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() get call after all but one CPU has been shut down
+ * and interrupts have been disabled. This only allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin(true);
+ x86_platform.guest.enc_kexec_finish();
+
crash_save_cpu(regs, safe_smp_processor_id());
}

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f3130f762784..097313147ad3 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)

void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin(false);
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}

static void __machine_emergency_restart(int emergency)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a7143bb7dd93..8a79fb505303 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,6 +138,8 @@ static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(bool crash) {}
+static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }

struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
},
};

--
2.43.0


2024-06-02 12:42:31

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv11.1 10/19] x86/mm: Add callbacks to prepare encrypted memory for kexec

Please disregard this. I failed to fold changes :/

--
Kiryl Shutsemau / Kirill A. Shutemov

2024-06-02 12:44:32

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv11.2 10/19] x86/mm: Add callbacks to prepare encrypted memory for kexec

AMD SEV and Intel TDX guests allocate shared buffers for performing I/O.
This is done by allocating pages normally from the buddy allocator and
then converting them to shared using set_memory_decrypted().

On kexec, the second kernel is unaware of which memory has been
converted in this manner. It only sees E820_TYPE_RAM. Accessing shared
memory as private is fatal.

Therefore, the memory state must be reset to its original state before
starting the new kernel with kexec.

The process of converting shared memory back to private occurs in two
steps:

- enc_kexec_begin() stops new conversions.

- enc_kexec_finish() unshares all existing shared memory, reverting it
back to private.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Nikolay Borisov <[email protected]>
Reviewed-by: Kai Huang <[email protected]>
Tested-by: Tao Liu <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/x86_init.h | 12 ++++++++++++
arch/x86/kernel/crash.c | 12 ++++++++++++
arch/x86/kernel/reboot.c | 12 ++++++++++++
arch/x86/kernel/x86_init.c | 4 ++++
4 files changed, 40 insertions(+)

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 28ac3cb9b987..b0f313278967 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -149,12 +149,24 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin Begin the two-step process of converting shared memory back
+ * to private. It stops the new conversions from being started
+ * and waits in-flight conversions to finish, if possible.
+ * The @crash parameter denotes whether the function is being
+ * called in the crash shutdown path.
+ * @enc_kexec_finish Finish the two-step process of converting shared memory to
+ * private. All memory is private after the call when
+ * the function returns.
+ * It is called on only one CPU while the others are shut down
+ * and with interrupts disabled.
*/
struct x86_guest {
int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_begin)(bool crash);
+ void (*enc_kexec_finish)(void);
};

/**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f06501445cd9..fc52ea80cdc8 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() gets called after all but one CPU have been shut
+ * down and interrupts have been disabled. This allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin(true);
+ x86_platform.guest.enc_kexec_finish();
+
crash_save_cpu(regs, safe_smp_processor_id());
}

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f3130f762784..097313147ad3 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)

void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin(false);
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}

static void __machine_emergency_restart(int emergency)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a7143bb7dd93..8a79fb505303 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,6 +138,8 @@ static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(bool crash) {}
+static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }

struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
},
};

--
2.43.0


2024-06-04 16:18:41

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCHv11 10/19] x86/mm: Add callbacks to prepare encrypted memory for kexec

On 5/28/24 02:55, Kirill A. Shutemov wrote:
> + x86_platform.guest.enc_kexec_begin(true);
> + x86_platform.guest.enc_kexec_finish();

I really despise the random, unlabeled true/false/0/1 arguments to
functions like this.

I'll bring it up in the non-noop patch though.