2021-03-11 13:31:06

by Lenny Szubowicz

[permalink] [raw]
Subject: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

Turn off host updates to the registered kvmclock memory
locations when transitioning to a hibernated kernel in
resume_target_kernel().

This is accomplished for secondary vcpus by disabling host
clock updates for that vcpu when it is put offline. For the
primary vcpu, it's accomplished by using the existing call back
from save_processor_state() to kvm_save_sched_clock_state().

The registered kvmclock memory locations may differ between
the currently running kernel and the hibernated kernel, which
is being restored and resumed. Kernel memory corruption is thus
possible if the host clock updates are allowed to run while the
hibernated kernel is relocated to its original physical memory
locations.

This is similar to the problem solved for kexec by
commit 1e977aa12dd4 ("x86: KVM guest: disable clock before rebooting.")

Commit 95a3d4454bb1 ("x86/kvmclock: Switch kvmclock data to a
PER_CPU variable") innocently increased the exposure for this
problem by dynamically allocating the physical pages that are
used for host clock updates when the vcpu count exceeds 64.
This increases the likelihood that the registered kvmclock
locations will differ for vcpus above 64.

Reported-by: Xiaoyi Chen <[email protected]>
Tested-by: Mohamed Aboubakr <[email protected]>
Signed-off-by: Lenny Szubowicz <[email protected]>
---
arch/x86/kernel/kvmclock.c | 34 ++++++++++++++++++++++++++++++++--
1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index aa593743acf6..291ffca41afb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -187,8 +187,18 @@ static void kvm_register_clock(char *txt)
pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}

+/*
+ * Turn off host clock updates to the registered memory location when the
+ * cpu clock context is saved via save_processor_state(). Enables correct
+ * handling of the primary cpu clock when transitioning to a hibernated
+ * kernel in resume_target_kernel(), where the old and new registered
+ * memory locations may differ.
+ */
static void kvm_save_sched_clock_state(void)
{
+ native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
+ pr_info("kvm-clock: cpu %d, clock stopped", smp_processor_id());
}

static void kvm_restore_sched_clock_state(void)
@@ -311,9 +321,23 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return p ? 0 : -ENOMEM;
}

+/*
+ * Turn off host clock updates to the registered memory location when a
+ * cpu is placed offline. Enables correct handling of secondary cpu clocks
+ * when transitioning to a hibernated kernel in resume_target_kernel(),
+ * where the old and new registered memory locations may differ.
+ */
+static int kvmclock_cpu_offline(unsigned int cpu)
+{
+ native_write_msr(msr_kvm_system_time, 0, 0);
+ pr_info("kvm-clock: cpu %d, clock stopped", cpu);
+ return 0;
+}
+
void __init kvmclock_init(void)
{
u8 flags;
+ int cpuhp_prepare;

if (!kvm_para_available() || !kvmclock)
return;
@@ -325,8 +349,14 @@ void __init kvmclock_init(void)
return;
}

- if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
- kvmclock_setup_percpu, NULL) < 0) {
+ cpuhp_prepare = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN,
+ "kvmclock:setup_percpu",
+ kvmclock_setup_percpu, NULL);
+ if (cpuhp_prepare < 0)
+ return;
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvmclock:cpu_offline",
+ NULL, kvmclock_cpu_offline) < 0) {
+ cpuhp_remove_state(cpuhp_prepare);
return;
}

--
2.27.0


2021-03-17 13:33:32

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

Lenny Szubowicz <[email protected]> writes:

> Turn off host updates to the registered kvmclock memory
> locations when transitioning to a hibernated kernel in
> resume_target_kernel().
>
> This is accomplished for secondary vcpus by disabling host
> clock updates for that vcpu when it is put offline. For the
> primary vcpu, it's accomplished by using the existing call back
> from save_processor_state() to kvm_save_sched_clock_state().
>
> The registered kvmclock memory locations may differ between
> the currently running kernel and the hibernated kernel, which
> is being restored and resumed. Kernel memory corruption is thus
> possible if the host clock updates are allowed to run while the
> hibernated kernel is relocated to its original physical memory
> locations.
>
> This is similar to the problem solved for kexec by
> commit 1e977aa12dd4 ("x86: KVM guest: disable clock before rebooting.")
>
> Commit 95a3d4454bb1 ("x86/kvmclock: Switch kvmclock data to a
> PER_CPU variable") innocently increased the exposure for this
> problem by dynamically allocating the physical pages that are
> used for host clock updates when the vcpu count exceeds 64.
> This increases the likelihood that the registered kvmclock
> locations will differ for vcpus above 64.
>
> Reported-by: Xiaoyi Chen <[email protected]>
> Tested-by: Mohamed Aboubakr <[email protected]>
> Signed-off-by: Lenny Szubowicz <[email protected]>
> ---
> arch/x86/kernel/kvmclock.c | 34 ++++++++++++++++++++++++++++++++--
> 1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index aa593743acf6..291ffca41afb 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -187,8 +187,18 @@ static void kvm_register_clock(char *txt)
> pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
> }
>
> +/*
> + * Turn off host clock updates to the registered memory location when the
> + * cpu clock context is saved via save_processor_state(). Enables correct
> + * handling of the primary cpu clock when transitioning to a hibernated
> + * kernel in resume_target_kernel(), where the old and new registered
> + * memory locations may differ.
> + */
> static void kvm_save_sched_clock_state(void)
> {
> + native_write_msr(msr_kvm_system_time, 0, 0);
> + kvm_disable_steal_time();
> + pr_info("kvm-clock: cpu %d, clock stopped", smp_processor_id());
> }

Nitpick: should we rename kvm_save_sched_clock_state() to something more
generic, like kvm_disable_host_clock_updates() to indicate, that what it
does is not only sched clock related?

>
> static void kvm_restore_sched_clock_state(void)
> @@ -311,9 +321,23 @@ static int kvmclock_setup_percpu(unsigned int cpu)
> return p ? 0 : -ENOMEM;
> }
>
> +/*
> + * Turn off host clock updates to the registered memory location when a
> + * cpu is placed offline. Enables correct handling of secondary cpu clocks
> + * when transitioning to a hibernated kernel in resume_target_kernel(),
> + * where the old and new registered memory locations may differ.
> + */
> +static int kvmclock_cpu_offline(unsigned int cpu)
> +{
> + native_write_msr(msr_kvm_system_time, 0, 0);
> + pr_info("kvm-clock: cpu %d, clock stopped", cpu);

I'd say this pr_info() is superfluous: on a system with hundereds of
vCPUs users will get flooded with 'clock stopped' messages which don't
actually mean much: in case native_write_msr() fails the error gets
reported in dmesg anyway. I'd suggest we drop this and pr_info() in
kvm_save_sched_clock_state().

> + return 0;

Why don't we disable steal time accounting here? MSR_KVM_STEAL_TIME is
also per-cpu. Can we merge kvm_save_sched_clock_state() with
kvmclock_cpu_offline() maybe?

> +}
> +
> void __init kvmclock_init(void)
> {
> u8 flags;
> + int cpuhp_prepare;
>
> if (!kvm_para_available() || !kvmclock)
> return;
> @@ -325,8 +349,14 @@ void __init kvmclock_init(void)
> return;
> }
>
> - if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
> - kvmclock_setup_percpu, NULL) < 0) {
> + cpuhp_prepare = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN,
> + "kvmclock:setup_percpu",
> + kvmclock_setup_percpu, NULL);
> + if (cpuhp_prepare < 0)
> + return;
> + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvmclock:cpu_offline",
> + NULL, kvmclock_cpu_offline) < 0) {
> + cpuhp_remove_state(cpuhp_prepare);

In case we fail to set up kvmclock_cpu_offline() callback we get broken
hybernation but without kvmclock_setup_percpu() call we get a broken
guest (as kvmclock() may be the only reliable clocksource) so I'm not
exactly sure we're active in a best way with cpuhp_remove_state()
here. I don't feel strong though, I think it can stay but in that case
I'd add a pr_warn() at least.

> return;
> }

--
Vitaly

2021-03-26 02:28:55

by Lenny Szubowicz

[permalink] [raw]
Subject: Re: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

On 3/17/21 9:30 AM, Vitaly Kuznetsov wrote:
> Lenny Szubowicz <[email protected]> writes:
>
>> Turn off host updates to the registered kvmclock memory
>> locations when transitioning to a hibernated kernel in
>> resume_target_kernel().
>>
>> This is accomplished for secondary vcpus by disabling host
>> clock updates for that vcpu when it is put offline. For the
>> primary vcpu, it's accomplished by using the existing call back
>> from save_processor_state() to kvm_save_sched_clock_state().
>>
>> The registered kvmclock memory locations may differ between
>> the currently running kernel and the hibernated kernel, which
>> is being restored and resumed. Kernel memory corruption is thus
>> possible if the host clock updates are allowed to run while the
>> hibernated kernel is relocated to its original physical memory
>> locations.
>>
>> This is similar to the problem solved for kexec by
>> commit 1e977aa12dd4 ("x86: KVM guest: disable clock before rebooting.")
>>
>> Commit 95a3d4454bb1 ("x86/kvmclock: Switch kvmclock data to a
>> PER_CPU variable") innocently increased the exposure for this
>> problem by dynamically allocating the physical pages that are
>> used for host clock updates when the vcpu count exceeds 64.
>> This increases the likelihood that the registered kvmclock
>> locations will differ for vcpus above 64.
>>
>> Reported-by: Xiaoyi Chen <[email protected]>
>> Tested-by: Mohamed Aboubakr <[email protected]>
>> Signed-off-by: Lenny Szubowicz <[email protected]>
>> ---
>> arch/x86/kernel/kvmclock.c | 34 ++++++++++++++++++++++++++++++++--
>> 1 file changed, 32 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> index aa593743acf6..291ffca41afb 100644
>> --- a/arch/x86/kernel/kvmclock.c
>> +++ b/arch/x86/kernel/kvmclock.c
>> @@ -187,8 +187,18 @@ static void kvm_register_clock(char *txt)
>> pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
>> }
>>
>> +/*
>> + * Turn off host clock updates to the registered memory location when the
>> + * cpu clock context is saved via save_processor_state(). Enables correct
>> + * handling of the primary cpu clock when transitioning to a hibernated
>> + * kernel in resume_target_kernel(), where the old and new registered
>> + * memory locations may differ.
>> + */
>> static void kvm_save_sched_clock_state(void)
>> {
>> + native_write_msr(msr_kvm_system_time, 0, 0);
>> + kvm_disable_steal_time();
>> + pr_info("kvm-clock: cpu %d, clock stopped", smp_processor_id());
>> }
>
> Nitpick: should we rename kvm_save_sched_clock_state() to something more
> generic, like kvm_disable_host_clock_updates() to indicate, that what it
> does is not only sched clock related?

I see your rationale. But if I rename kvm_save_sched_clock_state()
then shouldn't I also rename kvm_restore_sched_clock_state().
The names appear to reflect the callback that invokes them,
from save_processor_state()/restore_state(), rather than what these
functions need to do.

x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;

For V2 of my patch, I kept these names as they were. But if you have a strong
desire for a different name, then I think both routines should be renamed
similarly, since they are meant to be a complimentary pair.

>
>>
>> static void kvm_restore_sched_clock_state(void)
>> @@ -311,9 +321,23 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>> return p ? 0 : -ENOMEM;
>> }
>>
>> +/*
>> + * Turn off host clock updates to the registered memory location when a
>> + * cpu is placed offline. Enables correct handling of secondary cpu clocks
>> + * when transitioning to a hibernated kernel in resume_target_kernel(),
>> + * where the old and new registered memory locations may differ.
>> + */
>> +static int kvmclock_cpu_offline(unsigned int cpu)
>> +{
>> + native_write_msr(msr_kvm_system_time, 0, 0);
>> + pr_info("kvm-clock: cpu %d, clock stopped", cpu);
>
> I'd say this pr_info() is superfluous: on a system with hundereds of
> vCPUs users will get flooded with 'clock stopped' messages which don't
> actually mean much: in case native_write_msr() fails the error gets
> reported in dmesg anyway. I'd suggest we drop this and pr_info() in
> kvm_save_sched_clock_state().
>

Agreed. I was essentially using it as a pr_debug(). Gone in V2.

>> + return 0;
>
> Why don't we disable steal time accounting here? MSR_KVM_STEAL_TIME is
> also per-cpu. Can we merge kvm_save_sched_clock_state() with
> kvmclock_cpu_offline() maybe?
>

kvm_cpu_down_prepare() in arch/x86/kernel/kvm.c already calls
kvm_disable_steal_time() when a vcpu is placed offline.
So there is no need to do that in kvmclock_cpu_offline().

In the case of the hibernation resume code path, resume_target_kernel()
in kernel/power/hibernate.c, the secondary cpus are placed offline,
but the primary is not. Instead, we are going to be switching contexts
of the primary cpu from the boot kernel to the kernel that was restored
from the hibernation image.

This is where save_processor_state()/restore_processor_state() and kvm_save_sched_clock_state()/restore_sched_clock_state() come into play
to stop the kvmclock of the boot kernel's primary cpu and restart
the kvmclock of restored hibernated kernel's primary cpu.

And in this case, no one is calling kvm_disable_steal_time(),
so kvm_save_sched_clock_state() is doing it. (This is very similar
to the reason why kvm_crash_shutdown() in kvmclock.c needs to call
kvm_disable_steal_time())

However, I'm now wondering if kvm_restore_sched_clock_state()
needs to add a call to the equivalent of kvm_register_steal_time(),
because otherwise no one will do that for the primary vcpu
on resume from hibernation.

>> +}
>> +
>> void __init kvmclock_init(void)
>> {
>> u8 flags;
>> + int cpuhp_prepare;
>>
>> if (!kvm_para_available() || !kvmclock)
>> return;
>> @@ -325,8 +349,14 @@ void __init kvmclock_init(void)
>> return;
>> }
>>
>> - if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
>> - kvmclock_setup_percpu, NULL) < 0) {
>> + cpuhp_prepare = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN,
>> + "kvmclock:setup_percpu",
>> + kvmclock_setup_percpu, NULL);
>> + if (cpuhp_prepare < 0)
>> + return;
>> + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvmclock:cpu_offline",
>> + NULL, kvmclock_cpu_offline) < 0) {
>> + cpuhp_remove_state(cpuhp_prepare);
>
> In case we fail to set up kvmclock_cpu_offline() callback we get broken
> hybernation but without kvmclock_setup_percpu() call we get a broken
> guest (as kvmclock() may be the only reliable clocksource) so I'm not
> exactly sure we're active in a best way with cpuhp_remove_state()
> here. I don't feel strong though, I think it can stay but in that case
> I'd add a pr_warn() at least.

Something is seriously broken if either of these cpuhp_setup_state()
calls fail. I first considered using the "down" callback of the
CPUHP_BP_PREPARE_DYN state, as in:

if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
kvmclock_setup_percpu, kvmclock_cpu_offline) < 0) {

This would have minimized the change, but I wasn't comfortable with how late
it would be called. Other examples in the kernel, including kvm.c, use
CPUHP_AP_ONLINE_DYN for cpu online/offline events.

But I do agree that a failure of either cpuhp_setup_state() should not
be silent. So in V2 I added a pr_err().

Thank you for your review comments.

-Lenny.

>
>> return;
>> }
>

2021-03-26 11:01:09

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

Lenny Szubowicz <[email protected]> writes:

> On 3/17/21 9:30 AM, Vitaly Kuznetsov wrote:
>> Lenny Szubowicz <[email protected]> writes:
>>
>>> Turn off host updates to the registered kvmclock memory
>>> locations when transitioning to a hibernated kernel in
>>> resume_target_kernel().
>>>
>>> This is accomplished for secondary vcpus by disabling host
>>> clock updates for that vcpu when it is put offline. For the
>>> primary vcpu, it's accomplished by using the existing call back
>>> from save_processor_state() to kvm_save_sched_clock_state().
>>>
>>> The registered kvmclock memory locations may differ between
>>> the currently running kernel and the hibernated kernel, which
>>> is being restored and resumed. Kernel memory corruption is thus
>>> possible if the host clock updates are allowed to run while the
>>> hibernated kernel is relocated to its original physical memory
>>> locations.
>>>
>>> This is similar to the problem solved for kexec by
>>> commit 1e977aa12dd4 ("x86: KVM guest: disable clock before rebooting.")
>>>
>>> Commit 95a3d4454bb1 ("x86/kvmclock: Switch kvmclock data to a
>>> PER_CPU variable") innocently increased the exposure for this
>>> problem by dynamically allocating the physical pages that are
>>> used for host clock updates when the vcpu count exceeds 64.
>>> This increases the likelihood that the registered kvmclock
>>> locations will differ for vcpus above 64.
>>>
>>> Reported-by: Xiaoyi Chen <[email protected]>
>>> Tested-by: Mohamed Aboubakr <[email protected]>
>>> Signed-off-by: Lenny Szubowicz <[email protected]>
>>> ---
>>> arch/x86/kernel/kvmclock.c | 34 ++++++++++++++++++++++++++++++++--
>>> 1 file changed, 32 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>>> index aa593743acf6..291ffca41afb 100644
>>> --- a/arch/x86/kernel/kvmclock.c
>>> +++ b/arch/x86/kernel/kvmclock.c
>>> @@ -187,8 +187,18 @@ static void kvm_register_clock(char *txt)
>>> pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
>>> }
>>>
>>> +/*
>>> + * Turn off host clock updates to the registered memory location when the
>>> + * cpu clock context is saved via save_processor_state(). Enables correct
>>> + * handling of the primary cpu clock when transitioning to a hibernated
>>> + * kernel in resume_target_kernel(), where the old and new registered
>>> + * memory locations may differ.
>>> + */
>>> static void kvm_save_sched_clock_state(void)
>>> {
>>> + native_write_msr(msr_kvm_system_time, 0, 0);
>>> + kvm_disable_steal_time();
>>> + pr_info("kvm-clock: cpu %d, clock stopped", smp_processor_id());
>>> }
>>
>> Nitpick: should we rename kvm_save_sched_clock_state() to something more
>> generic, like kvm_disable_host_clock_updates() to indicate, that what it
>> does is not only sched clock related?
>
> I see your rationale. But if I rename kvm_save_sched_clock_state()
> then shouldn't I also rename kvm_restore_sched_clock_state().
> The names appear to reflect the callback that invokes them,
> from save_processor_state()/restore_state(), rather than what these
> functions need to do.
>
> x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
> x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
>
> For V2 of my patch, I kept these names as they were. But if you have a strong
> desire for a different name, then I think both routines should be renamed
> similarly, since they are meant to be a complimentary pair.
>
>>
>>>
>>> static void kvm_restore_sched_clock_state(void)
>>> @@ -311,9 +321,23 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>>> return p ? 0 : -ENOMEM;
>>> }
>>>
>>> +/*
>>> + * Turn off host clock updates to the registered memory location when a
>>> + * cpu is placed offline. Enables correct handling of secondary cpu clocks
>>> + * when transitioning to a hibernated kernel in resume_target_kernel(),
>>> + * where the old and new registered memory locations may differ.
>>> + */
>>> +static int kvmclock_cpu_offline(unsigned int cpu)
>>> +{
>>> + native_write_msr(msr_kvm_system_time, 0, 0);
>>> + pr_info("kvm-clock: cpu %d, clock stopped", cpu);
>>
>> I'd say this pr_info() is superfluous: on a system with hundereds of
>> vCPUs users will get flooded with 'clock stopped' messages which don't
>> actually mean much: in case native_write_msr() fails the error gets
>> reported in dmesg anyway. I'd suggest we drop this and pr_info() in
>> kvm_save_sched_clock_state().
>>
>
> Agreed. I was essentially using it as a pr_debug(). Gone in V2.
>
>>> + return 0;
>>
>> Why don't we disable steal time accounting here? MSR_KVM_STEAL_TIME is
>> also per-cpu. Can we merge kvm_save_sched_clock_state() with
>> kvmclock_cpu_offline() maybe?
>>
>
> kvm_cpu_down_prepare() in arch/x86/kernel/kvm.c already calls
> kvm_disable_steal_time() when a vcpu is placed offline.
> So there is no need to do that in kvmclock_cpu_offline().
>
> In the case of the hibernation resume code path, resume_target_kernel()
> in kernel/power/hibernate.c, the secondary cpus are placed offline,
> but the primary is not. Instead, we are going to be switching contexts
> of the primary cpu from the boot kernel to the kernel that was restored
> from the hibernation image.
>
> This is where save_processor_state()/restore_processor_state() and kvm_save_sched_clock_state()/restore_sched_clock_state() come into play
> to stop the kvmclock of the boot kernel's primary cpu and restart
> the kvmclock of restored hibernated kernel's primary cpu.
>
> And in this case, no one is calling kvm_disable_steal_time(),
> so kvm_save_sched_clock_state() is doing it. (This is very similar
> to the reason why kvm_crash_shutdown() in kvmclock.c needs to call
> kvm_disable_steal_time())
>
> However, I'm now wondering if kvm_restore_sched_clock_state()
> needs to add a call to the equivalent of kvm_register_steal_time(),
> because otherwise no one will do that for the primary vcpu
> on resume from hibernation.

In case this is true, steal time accounting is not our only
problem. kvm_guest_cpu_init(), which is called from
smp_prepare_boot_cpu() hook also sets up Async PF an PV EOI and both
these features establish a shared guest-host memory region, in this
doesn't happen upon resume from hibernation we're in trouble.

smp_prepare_boot_cpu() hook is called very early from start_kernel() but
what happens when we switch to the context of the hibernated kernel?

I'm going to set up an environement and check what's going on.

>
>>> +}
>>> +
>>> void __init kvmclock_init(void)
>>> {
>>> u8 flags;
>>> + int cpuhp_prepare;
>>>
>>> if (!kvm_para_available() || !kvmclock)
>>> return;
>>> @@ -325,8 +349,14 @@ void __init kvmclock_init(void)
>>> return;
>>> }
>>>
>>> - if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
>>> - kvmclock_setup_percpu, NULL) < 0) {
>>> + cpuhp_prepare = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN,
>>> + "kvmclock:setup_percpu",
>>> + kvmclock_setup_percpu, NULL);
>>> + if (cpuhp_prepare < 0)
>>> + return;
>>> + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvmclock:cpu_offline",
>>> + NULL, kvmclock_cpu_offline) < 0) {
>>> + cpuhp_remove_state(cpuhp_prepare);
>>
>> In case we fail to set up kvmclock_cpu_offline() callback we get broken
>> hybernation but without kvmclock_setup_percpu() call we get a broken
>> guest (as kvmclock() may be the only reliable clocksource) so I'm not
>> exactly sure we're active in a best way with cpuhp_remove_state()
>> here. I don't feel strong though, I think it can stay but in that case
>> I'd add a pr_warn() at least.
>
> Something is seriously broken if either of these cpuhp_setup_state()
> calls fail. I first considered using the "down" callback of the
> CPUHP_BP_PREPARE_DYN state, as in:
>
> if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
> kvmclock_setup_percpu, kvmclock_cpu_offline) < 0) {
>
> This would have minimized the change, but I wasn't comfortable with how late
> it would be called. Other examples in the kernel, including kvm.c, use
> CPUHP_AP_ONLINE_DYN for cpu online/offline events.
>
> But I do agree that a failure of either cpuhp_setup_state() should not
> be silent. So in V2 I added a pr_err().
>
> Thank you for your review comments.
>
> -Lenny.
>
>>
>>> return;
>>> }
>>
>

--
Vitaly

2021-03-26 12:39:50

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

Vitaly Kuznetsov <[email protected]> writes:

> Lenny Szubowicz <[email protected]> writes:
>
>> On 3/17/21 9:30 AM, Vitaly Kuznetsov wrote:
>>> Lenny Szubowicz <[email protected]> writes:
>>>
>>>> Turn off host updates to the registered kvmclock memory
>>>> locations when transitioning to a hibernated kernel in
>>>> resume_target_kernel().
>>>>
>>>> This is accomplished for secondary vcpus by disabling host
>>>> clock updates for that vcpu when it is put offline. For the
>>>> primary vcpu, it's accomplished by using the existing call back
>>>> from save_processor_state() to kvm_save_sched_clock_state().
>>>>
>>>> The registered kvmclock memory locations may differ between
>>>> the currently running kernel and the hibernated kernel, which
>>>> is being restored and resumed. Kernel memory corruption is thus
>>>> possible if the host clock updates are allowed to run while the
>>>> hibernated kernel is relocated to its original physical memory
>>>> locations.
>>>>
>>>> This is similar to the problem solved for kexec by
>>>> commit 1e977aa12dd4 ("x86: KVM guest: disable clock before rebooting.")
>>>>
>>>> Commit 95a3d4454bb1 ("x86/kvmclock: Switch kvmclock data to a
>>>> PER_CPU variable") innocently increased the exposure for this
>>>> problem by dynamically allocating the physical pages that are
>>>> used for host clock updates when the vcpu count exceeds 64.
>>>> This increases the likelihood that the registered kvmclock
>>>> locations will differ for vcpus above 64.
>>>>
>>>> Reported-by: Xiaoyi Chen <[email protected]>
>>>> Tested-by: Mohamed Aboubakr <[email protected]>
>>>> Signed-off-by: Lenny Szubowicz <[email protected]>
>>>> ---
>>>> arch/x86/kernel/kvmclock.c | 34 ++++++++++++++++++++++++++++++++--
>>>> 1 file changed, 32 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>>>> index aa593743acf6..291ffca41afb 100644
>>>> --- a/arch/x86/kernel/kvmclock.c
>>>> +++ b/arch/x86/kernel/kvmclock.c
>>>> @@ -187,8 +187,18 @@ static void kvm_register_clock(char *txt)
>>>> pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
>>>> }
>>>>
>>>> +/*
>>>> + * Turn off host clock updates to the registered memory location when the
>>>> + * cpu clock context is saved via save_processor_state(). Enables correct
>>>> + * handling of the primary cpu clock when transitioning to a hibernated
>>>> + * kernel in resume_target_kernel(), where the old and new registered
>>>> + * memory locations may differ.
>>>> + */
>>>> static void kvm_save_sched_clock_state(void)
>>>> {
>>>> + native_write_msr(msr_kvm_system_time, 0, 0);
>>>> + kvm_disable_steal_time();
>>>> + pr_info("kvm-clock: cpu %d, clock stopped", smp_processor_id());
>>>> }
>>>
>>> Nitpick: should we rename kvm_save_sched_clock_state() to something more
>>> generic, like kvm_disable_host_clock_updates() to indicate, that what it
>>> does is not only sched clock related?
>>
>> I see your rationale. But if I rename kvm_save_sched_clock_state()
>> then shouldn't I also rename kvm_restore_sched_clock_state().
>> The names appear to reflect the callback that invokes them,
>> from save_processor_state()/restore_state(), rather than what these
>> functions need to do.
>>
>> x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
>> x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
>>
>> For V2 of my patch, I kept these names as they were. But if you have a strong
>> desire for a different name, then I think both routines should be renamed
>> similarly, since they are meant to be a complimentary pair.
>>
>>>
>>>>
>>>> static void kvm_restore_sched_clock_state(void)
>>>> @@ -311,9 +321,23 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>>>> return p ? 0 : -ENOMEM;
>>>> }
>>>>
>>>> +/*
>>>> + * Turn off host clock updates to the registered memory location when a
>>>> + * cpu is placed offline. Enables correct handling of secondary cpu clocks
>>>> + * when transitioning to a hibernated kernel in resume_target_kernel(),
>>>> + * where the old and new registered memory locations may differ.
>>>> + */
>>>> +static int kvmclock_cpu_offline(unsigned int cpu)
>>>> +{
>>>> + native_write_msr(msr_kvm_system_time, 0, 0);
>>>> + pr_info("kvm-clock: cpu %d, clock stopped", cpu);
>>>
>>> I'd say this pr_info() is superfluous: on a system with hundereds of
>>> vCPUs users will get flooded with 'clock stopped' messages which don't
>>> actually mean much: in case native_write_msr() fails the error gets
>>> reported in dmesg anyway. I'd suggest we drop this and pr_info() in
>>> kvm_save_sched_clock_state().
>>>
>>
>> Agreed. I was essentially using it as a pr_debug(). Gone in V2.
>>
>>>> + return 0;
>>>
>>> Why don't we disable steal time accounting here? MSR_KVM_STEAL_TIME is
>>> also per-cpu. Can we merge kvm_save_sched_clock_state() with
>>> kvmclock_cpu_offline() maybe?
>>>
>>
>> kvm_cpu_down_prepare() in arch/x86/kernel/kvm.c already calls
>> kvm_disable_steal_time() when a vcpu is placed offline.
>> So there is no need to do that in kvmclock_cpu_offline().
>>
>> In the case of the hibernation resume code path, resume_target_kernel()
>> in kernel/power/hibernate.c, the secondary cpus are placed offline,
>> but the primary is not. Instead, we are going to be switching contexts
>> of the primary cpu from the boot kernel to the kernel that was restored
>> from the hibernation image.
>>
>> This is where save_processor_state()/restore_processor_state() and kvm_save_sched_clock_state()/restore_sched_clock_state() come into play
>> to stop the kvmclock of the boot kernel's primary cpu and restart
>> the kvmclock of restored hibernated kernel's primary cpu.
>>
>> And in this case, no one is calling kvm_disable_steal_time(),
>> so kvm_save_sched_clock_state() is doing it. (This is very similar
>> to the reason why kvm_crash_shutdown() in kvmclock.c needs to call
>> kvm_disable_steal_time())
>>
>> However, I'm now wondering if kvm_restore_sched_clock_state()
>> needs to add a call to the equivalent of kvm_register_steal_time(),
>> because otherwise no one will do that for the primary vcpu
>> on resume from hibernation.
>
> In case this is true, steal time accounting is not our only
> problem. kvm_guest_cpu_init(), which is called from
> smp_prepare_boot_cpu() hook also sets up Async PF an PV EOI and both
> these features establish a shared guest-host memory region, in this
> doesn't happen upon resume from hibernation we're in trouble.
>
> smp_prepare_boot_cpu() hook is called very early from start_kernel() but
> what happens when we switch to the context of the hibernated kernel?
>
> I'm going to set up an environement and check what's going on.

According to the log we have a problem indeed:

[ 15.844263] ACPI: Preparing to enter system sleep state S4
[ 15.844309] PM: Saving platform NVS memory
[ 15.844311] Disabling non-boot CPUs ...
[ 15.844625] kvm-guest: Unregister pv shared memory for cpu 1
[ 15.846272] smpboot: CPU 1 is now offline
[ 15.847124] kvm-guest: Unregister pv shared memory for cpu 2
[ 15.848720] smpboot: CPU 2 is now offline
[ 15.849637] kvm-guest: Unregister pv shared memory for cpu 3
[ 15.851452] smpboot: CPU 3 is now offline
[ 15.853295] PM: hibernation: Creating image:
[ 15.865126] PM: hibernation: Need to copy 82214 pages
[18446743485.711482] kvm-clock: cpu 0, msr 8201001, primary cpu clock, resume
[18446743485.711610] PM: Restoring platform NVS memory
[18446743485.713922] Enabling non-boot CPUs ...
[18446743485.713997] x86: Booting SMP configuration:
[18446743485.713998] smpboot: Booting Node 0 Processor 1 APIC 0x1
[18446743485.714127] kvm-clock: cpu 1, msr 8201041, secondary cpu clock
[18446743485.714484] kvm-guest: KVM setup async PF for cpu 1
[18446743485.714489] kvm-guest: stealtime: cpu 1, msr 3ecac080
[18446743485.714816] CPU1 is up
[18446743485.714846] smpboot: Booting Node 0 Processor 2 APIC 0x2
[18446743485.714954] kvm-clock: cpu 2, msr 8201081, secondary cpu clock
[18446743485.715359] kvm-guest: KVM setup async PF for cpu 2
[18446743485.715364] kvm-guest: stealtime: cpu 2, msr 3ed2c080
[18446743485.715640] CPU2 is up
[18446743485.715672] smpboot: Booting Node 0 Processor 3 APIC 0x3
[18446743485.715867] kvm-clock: cpu 3, msr 82010c1, secondary cpu clock
[18446743485.716288] kvm-guest: KVM setup async PF for cpu 3
[18446743485.716293] kvm-guest: stealtime: cpu 3, msr 3edac080
[18446743485.716564] CPU3 is up
[18446743485.716732] ACPI: Waking up from system sleep state S4
[18446743485.728139] sd 0:0:0:0: [sda] Starting disk
[18446743485.750373] OOM killer enabled.
[18446743485.750909] Restarting tasks ... done.
[18446743485.754465] PM: hibernation: hibernation exit

(this is with your v2 included). There's nothing about CPU0 for
e.g. async PF + timestamps are really interesting. Seems we have issues
to fix) I'm playing with it right now.

--
Vitaly

2021-03-26 13:05:27

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

Vitaly Kuznetsov <[email protected]> writes:

..

> (this is with your v2 included). There's nothing about CPU0 for
> e.g. async PF + timestamps are really interesting. Seems we have issues
> to fix) I'm playing with it right now.

What if we do the following (instead of your patch):

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..c32392d6329d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -26,6 +26,7 @@
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/swait.h>
+#include <linux/syscore_ops.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -598,17 +599,21 @@ static void kvm_guest_cpu_offline(void)

static int kvm_cpu_online(unsigned int cpu)
{
- local_irq_disable();
+ unsigned long flags;
+
+ local_irq_save(flags);
kvm_guest_cpu_init();
- local_irq_enable();
+ local_irq_restore(flags);
return 0;
}

static int kvm_cpu_down_prepare(unsigned int cpu)
{
- local_irq_disable();
+ unsigned long flags;
+
+ local_irq_save(flags);
kvm_guest_cpu_offline();
- local_irq_enable();
+ local_irq_restore(flags);
return 0;
}
#endif
@@ -639,6 +644,23 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
native_flush_tlb_others(flushmask, info);
}

+static int kvm_suspend(void)
+{
+ kvm_guest_cpu_offline();
+
+ return 0;
+}
+
+static void kvm_resume(void)
+{
+ kvm_cpu_online(raw_smp_processor_id());
+}
+
+static struct syscore_ops kvm_syscore_ops = {
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
static void __init kvm_guest_init(void)
{
int i;
@@ -681,6 +703,8 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif

+ register_syscore_ops(&kvm_syscore_ops);
+
/*
* Hard lockup detection is enabled by default. Disable it, as guests
* can get false positives too easily, for example if the host is
--
2.30.2

This seems to work fine (according to the log, I haven't checked yet
that PV features are actually working):

[ 20.678081] PM: hibernation: Creating image:
[ 20.689925] PM: hibernation: Need to copy 82048 pages
[ 2.302411] kvm-clock: cpu 0, msr 2c201001, primary cpu clock, resume
[ 2.302487] PM: Restoring platform NVS memory
[ 2.302498] kvm-guest: KVM setup async PF for cpu 0
[ 2.302502] kvm-guest: stealtime: cpu 0, msr 3ec2c080
[ 2.304754] Enabling non-boot CPUs ...
[ 2.304823] x86: Booting SMP configuration:
[ 2.304824] smpboot: Booting Node 0 Processor 1 APIC 0x1
[ 2.304952] kvm-clock: cpu 1, msr 2c201041, secondary cpu clock
[ 2.305400] kvm-guest: KVM setup async PF for cpu 1
[ 2.305405] kvm-guest: stealtime: cpu 1, msr 3ecac080
[ 2.305786] CPU1 is up
[ 2.305818] smpboot: Booting Node 0 Processor 2 APIC 0x2
[ 2.305920] kvm-clock: cpu 2, msr 2c201081, secondary cpu clock
[ 2.306325] kvm-guest: KVM setup async PF for cpu 2
[ 2.306330] kvm-guest: stealtime: cpu 2, msr 3ed2c080
[ 2.306599] CPU2 is up
[ 2.306627] smpboot: Booting Node 0 Processor 3 APIC 0x3
[ 2.306729] kvm-clock: cpu 3, msr 2c2010c1, secondary cpu clock
[ 2.307092] kvm-guest: KVM setup async PF for cpu 3
[ 2.307097] kvm-guest: stealtime: cpu 3, msr 3edac080
[ 2.307383] CPU3 is up
[ 2.307560] ACPI: Waking up from system sleep state S4
[ 2.318778] sd 0:0:0:0: [sda] Starting disk
[ 2.342335] OOM killer enabled.
[ 2.342817] Restarting tasks ... done.
[ 2.346209] PM: hibernation: hibernation exit

--
Vitaly

2021-03-26 17:15:55

by Lenny Szubowicz

[permalink] [raw]
Subject: Re: [PATCH] x86/kvmclock: Stop kvmclocks for hibernate restore

On 3/26/21 9:01 AM, Vitaly Kuznetsov wrote:
> Vitaly Kuznetsov <[email protected]> writes:
>
> ..
>
>> (this is with your v2 included). There's nothing about CPU0 for
>> e.g. async PF + timestamps are really interesting. Seems we have issues
>> to fix) I'm playing with it right now.
>
> What if we do the following (instead of your patch):
>
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 78bb0fae3982..c32392d6329d 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -26,6 +26,7 @@
> #include <linux/kprobes.h>
> #include <linux/nmi.h>
> #include <linux/swait.h>
> +#include <linux/syscore_ops.h>
> #include <asm/timer.h>
> #include <asm/cpu.h>
> #include <asm/traps.h>
> @@ -598,17 +599,21 @@ static void kvm_guest_cpu_offline(void)
>
> static int kvm_cpu_online(unsigned int cpu)
> {
> - local_irq_disable();
> + unsigned long flags;
> +
> + local_irq_save(flags);
> kvm_guest_cpu_init();
> - local_irq_enable();
> + local_irq_restore(flags);
> return 0;
> }
>
> static int kvm_cpu_down_prepare(unsigned int cpu)
> {
> - local_irq_disable();
> + unsigned long flags;
> +
> + local_irq_save(flags);
> kvm_guest_cpu_offline();
> - local_irq_enable();
> + local_irq_restore(flags);
> return 0;
> }
> #endif
> @@ -639,6 +644,23 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
> native_flush_tlb_others(flushmask, info);
> }
>
> +static int kvm_suspend(void)
> +{
> + kvm_guest_cpu_offline();
> +
> + return 0;
> +}
> +
> +static void kvm_resume(void)
> +{
> + kvm_cpu_online(raw_smp_processor_id());
> +}
> +
> +static struct syscore_ops kvm_syscore_ops = {
> + .suspend = kvm_suspend,
> + .resume = kvm_resume,
> +};
> +
> static void __init kvm_guest_init(void)
> {
> int i;
> @@ -681,6 +703,8 @@ static void __init kvm_guest_init(void)
> kvm_guest_cpu_init();
> #endif
>
> + register_syscore_ops(&kvm_syscore_ops);
> +
> /*
> * Hard lockup detection is enabled by default. Disable it, as guests
> * can get false positives too easily, for example if the host is
>

Yes, I do like using register_syscore_ops for this. I will base my V3 on this. -Lenny.