2022-11-02 23:22:02

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH 10/44] KVM: VMX: Clean up eVMCS enabling if KVM initialization fails

To make it obvious that KVM doesn't have a lurking bug, cleanup eVMCS
enabling if kvm_init() fails even though the enabling doesn't strictly
need to be unwound. eVMCS enabling only toggles values that are fully
contained in the VMX module, i.e. it's technically ok to leave the values
as-is since they'll disappear entirely when the module is unloaded, but
doing proper cleanup is relatively simple, and having a chunk of code
that isn't unwound is confusing.

Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 137 +++++++++++++++++++++++------------------
1 file changed, 78 insertions(+), 59 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 05a747c9a9ff..b3fd4049de01 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -524,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

@@ -552,6 +554,71 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
return 0;
}

+static __init void hv_setup_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_direct_tlbflush
+ = hv_enable_direct_tlbflush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+static void hv_cleanup_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+ int cpu;
+
+ if (!static_branch_unlikely(&enable_evmcs))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_setup_evmcs(void) {}
+static void hv_cleanup_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
@@ -8435,29 +8502,8 @@ static void vmx_exit(void)

kvm_exit();

-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
+ hv_cleanup_evmcs();

- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
vmx_cleanup_l1d_flush();

allow_smaller_maxphyaddr = false;
@@ -8468,43 +8514,12 @@ static int __init vmx_init(void)
{
int r, cpu;

-#if IS_ENABLED(CONFIG_HYPERV)
- /*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
- */
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
-
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
-
- } else {
- enlightened_vmcs = false;
- }
-#endif
+ hv_setup_evmcs();

r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
- return r;
+ goto err_kvm_init;

/*
* Must be called after kvm_init() so enable_ept is properly set
@@ -8514,10 +8529,8 @@ static int __init vmx_init(void)
* mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ if (r)
+ goto err_l1d_flush;

vmx_setup_fb_clear_ctrl();

@@ -8542,5 +8555,11 @@ static int __init vmx_init(void)
allow_smaller_maxphyaddr = true;

return 0;
+
+err_l1d_flush:
+ vmx_exit();
+err_kvm_init:
+ hv_cleanup_evmcs();
+ return r;
}
module_init(vmx_init);
--
2.38.1.431.g37b22c650d-goog



2022-11-03 14:15:59

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 10/44] KVM: VMX: Clean up eVMCS enabling if KVM initialization fails

On Thu, Nov 3, 2022 at 3:01 PM Paolo Bonzini <[email protected]> wrote:
>
> On 11/3/22 00:18, Sean Christopherson wrote:
> > +static void hv_cleanup_evmcs(void)
>
> This needs to be __init.

Error: brain temporarily disconnected.

Paolo


2022-11-03 14:55:22

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 10/44] KVM: VMX: Clean up eVMCS enabling if KVM initialization fails

On 11/3/22 00:18, Sean Christopherson wrote:
> +static void hv_cleanup_evmcs(void)

This needs to be __init.

Paolo


2022-11-03 15:31:38

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 10/44] KVM: VMX: Clean up eVMCS enabling if KVM initialization fails

Sean Christopherson <[email protected]> writes:

> To make it obvious that KVM doesn't have a lurking bug, cleanup eVMCS
> enabling if kvm_init() fails even though the enabling doesn't strictly
> need to be unwound. eVMCS enabling only toggles values that are fully
> contained in the VMX module, i.e. it's technically ok to leave the values
> as-is since they'll disappear entirely when the module is unloaded, but
> doing proper cleanup is relatively simple, and having a chunk of code
> that isn't unwound is confusing.
>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/vmx/vmx.c | 137 +++++++++++++++++++++++------------------
> 1 file changed, 78 insertions(+), 59 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 05a747c9a9ff..b3fd4049de01 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -524,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
> static unsigned long host_idt_base;
>
> #if IS_ENABLED(CONFIG_HYPERV)
> +static struct kvm_x86_ops vmx_x86_ops __initdata;
> +
> static bool __read_mostly enlightened_vmcs = true;
> module_param(enlightened_vmcs, bool, 0444);
>
> @@ -552,6 +554,71 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
> return 0;
> }
>
> +static __init void hv_setup_evmcs(void)
> +{
> + int cpu;
> +
> + if (!enlightened_vmcs)
> + return;
> +
> + /*
> + * Enlightened VMCS usage should be recommended and the host needs
> + * to support eVMCS v1 or above.
> + */
> + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
> + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
> + KVM_EVMCS_VERSION) {
> +
> + /* Check that we have assist pages on all online CPUs */
> + for_each_online_cpu(cpu) {
> + if (!hv_get_vp_assist_page(cpu)) {
> + enlightened_vmcs = false;
> + break;
> + }
> + }
> +
> + if (enlightened_vmcs) {
> + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
> + static_branch_enable(&enable_evmcs);
> + }
> +
> + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
> + vmx_x86_ops.enable_direct_tlbflush
> + = hv_enable_direct_tlbflush;
> +
> + } else {
> + enlightened_vmcs = false;
> + }
> +}
> +static void hv_cleanup_evmcs(void)
> +{
> + struct hv_vp_assist_page *vp_ap;
> + int cpu;
> +
> + if (!static_branch_unlikely(&enable_evmcs))
> + return;
> +
> + /*
> + * Reset everything to support using non-enlightened VMCS access later
> + * (e.g. when we reload the module with enlightened_vmcs=0)
> + */
> + for_each_online_cpu(cpu) {
> + vp_ap = hv_get_vp_assist_page(cpu);
> +
> + if (!vp_ap)
> + continue;
> +
> + vp_ap->nested_control.features.directhypercall = 0;
> + vp_ap->current_nested_vmcs = 0;
> + vp_ap->enlighten_vmentry = 0;
> + }

Unrelated to your patch but while looking at this code I got curious
about why don't we need a protection against CPU offlining here. Turns
out that even when we offline a CPU, its VP assist page remains
allocated (see hv_cpu_die()), we just write '0' to the MSR and thus
accessing the page is safe. The consequent hv_cpu_init(), however, does
not restore VP assist page when it's already allocated:

# rdmsr -p 24 0x40000073
10212f001
# echo 0 > /sys/devices/system/cpu/cpu24/online
# echo 1 > /sys/devices/system/cpu/cpu24/online
# rdmsr -p 24 0x40000073
0

The culprit is commit e5d9b714fe402 ("x86/hyperv: fix root partition
faults when writing to VP assist page MSR"). A patch is inbound.

'hv_root_partition' case is different though. We do memunmap() and reset
VP assist page to zero so it is theoretically possible we're going to
clash. Unless I'm missing some obvious reason why module unload can't
coincide with CPU offlining, we may be better off surrounding this with
cpus_read_lock()/cpus_read_unlock().

> +
> + static_branch_disable(&enable_evmcs);
> +}
> +
> +#else /* IS_ENABLED(CONFIG_HYPERV) */
> +static void hv_setup_evmcs(void) {}
> +static void hv_cleanup_evmcs(void) {}
> #endif /* IS_ENABLED(CONFIG_HYPERV) */
>
> /*
> @@ -8435,29 +8502,8 @@ static void vmx_exit(void)
>
> kvm_exit();
>
> -#if IS_ENABLED(CONFIG_HYPERV)
> - if (static_branch_unlikely(&enable_evmcs)) {
> - int cpu;
> - struct hv_vp_assist_page *vp_ap;
> - /*
> - * Reset everything to support using non-enlightened VMCS
> - * access later (e.g. when we reload the module with
> - * enlightened_vmcs=0)
> - */
> - for_each_online_cpu(cpu) {
> - vp_ap = hv_get_vp_assist_page(cpu);
> + hv_cleanup_evmcs();
>
> - if (!vp_ap)
> - continue;
> -
> - vp_ap->nested_control.features.directhypercall = 0;
> - vp_ap->current_nested_vmcs = 0;
> - vp_ap->enlighten_vmentry = 0;
> - }
> -
> - static_branch_disable(&enable_evmcs);
> - }
> -#endif
> vmx_cleanup_l1d_flush();
>
> allow_smaller_maxphyaddr = false;
> @@ -8468,43 +8514,12 @@ static int __init vmx_init(void)
> {
> int r, cpu;
>
> -#if IS_ENABLED(CONFIG_HYPERV)
> - /*
> - * Enlightened VMCS usage should be recommended and the host needs
> - * to support eVMCS v1 or above. We can also disable eVMCS support
> - * with module parameter.
> - */
> - if (enlightened_vmcs &&
> - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
> - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
> - KVM_EVMCS_VERSION) {
> -
> - /* Check that we have assist pages on all online CPUs */
> - for_each_online_cpu(cpu) {
> - if (!hv_get_vp_assist_page(cpu)) {
> - enlightened_vmcs = false;
> - break;
> - }
> - }
> -
> - if (enlightened_vmcs) {
> - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
> - static_branch_enable(&enable_evmcs);
> - }
> -
> - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
> - vmx_x86_ops.enable_direct_tlbflush
> - = hv_enable_direct_tlbflush;
> -
> - } else {
> - enlightened_vmcs = false;
> - }
> -#endif
> + hv_setup_evmcs();
>
> r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
> __alignof__(struct vcpu_vmx), THIS_MODULE);
> if (r)
> - return r;
> + goto err_kvm_init;
>
> /*
> * Must be called after kvm_init() so enable_ept is properly set
> @@ -8514,10 +8529,8 @@ static int __init vmx_init(void)
> * mitigation mode.
> */
> r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
> - if (r) {
> - vmx_exit();
> - return r;
> - }
> + if (r)
> + goto err_l1d_flush;
>
> vmx_setup_fb_clear_ctrl();
>
> @@ -8542,5 +8555,11 @@ static int __init vmx_init(void)
> allow_smaller_maxphyaddr = true;
>
> return 0;
> +
> +err_l1d_flush:
> + vmx_exit();
> +err_kvm_init:
> + hv_cleanup_evmcs();
> + return r;
> }
> module_init(vmx_init);

--
Vitaly


2022-11-11 02:34:05

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 10/44] KVM: VMX: Clean up eVMCS enabling if KVM initialization fails

On Thu, Nov 03, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <[email protected]> writes:
> > + /*
> > + * Reset everything to support using non-enlightened VMCS access later
> > + * (e.g. when we reload the module with enlightened_vmcs=0)
> > + */
> > + for_each_online_cpu(cpu) {
> > + vp_ap = hv_get_vp_assist_page(cpu);
> > +
> > + if (!vp_ap)
> > + continue;
> > +
> > + vp_ap->nested_control.features.directhypercall = 0;
> > + vp_ap->current_nested_vmcs = 0;
> > + vp_ap->enlighten_vmentry = 0;
> > + }
>
> Unrelated to your patch but while looking at this code I got curious
> about why don't we need a protection against CPU offlining here. Turns
> out that even when we offline a CPU, its VP assist page remains
> allocated (see hv_cpu_die()), we just write '0' to the MSR and thus

Heh, "die". Hyper-V is quite dramatic.

> accessing the page is safe. The consequent hv_cpu_init(), however, does
> not restore VP assist page when it's already allocated:
>
> # rdmsr -p 24 0x40000073
> 10212f001
> # echo 0 > /sys/devices/system/cpu/cpu24/online
> # echo 1 > /sys/devices/system/cpu/cpu24/online
> # rdmsr -p 24 0x40000073
> 0
>
> The culprit is commit e5d9b714fe402 ("x86/hyperv: fix root partition
> faults when writing to VP assist page MSR"). A patch is inbound.
>
> 'hv_root_partition' case is different though. We do memunmap() and reset
> VP assist page to zero so it is theoretically possible we're going to
> clash. Unless I'm missing some obvious reason why module unload can't
> coincide with CPU offlining, we may be better off surrounding this with
> cpus_read_lock()/cpus_read_unlock().

I finally see what you're concerned about. If a CPU goes offline and its assist
page is unmapped, zeroing out the nested/eVMCS stuff will fault.

I think the real problem is that the purging of the eVMCS is in the wrong place.
Move the clearing to vmx_hardware_disable() and then the CPU hotplug bug goes
away once KVM disables hotplug during hardware enabling/disable later in the series.
There's no need to wait until module exit, e.g. it's not like it costs much to
clear a few variables, and IIUC the state is used only when KVM is actively using
VMX/eVMCS.

However, I believe there's a second bug. KVM's CPU online hook is called before
Hyper-V's online hook (CPUHP_AP_ONLINE_DYN). Before this series, which moves KVM's
hook from STARTING to ONLINE, KVM's hook is waaaay before Hyper-V's. That means
that hv_cpu_init()'s allocation of the VP assist page will come _after_ KVM's
check in vmx_hardware_enable()

/*
* This can happen if we hot-added a CPU but failed to allocate
* VP assist page for it.
*/
if (static_branch_unlikely(&enable_evmcs) &&
!hv_get_vp_assist_page(cpu))
return -EFAULT;

I.e. CPU hotplug will never work if KVM is running VMs as a Hyper-V guest. I bet
you can repro by doing a SUSPEND+RESUME.

Can you try to see if that's actually a bug? If so, the only sane fix seems to
be to add a dedicated ONLINE action for Hyper-V. Per patch

KVM: Rename and move CPUHP_AP_KVM_STARTING to ONLINE section

from this series, CPUHP_AP_KVM_ONLINE needs to be before CPUHP_AP_SCHED_WAIT_EMPTY
to ensure there are no tasks, i.e. no vCPUs, running on the to-be-unplugged CPU.

Back to the original bug, proposed fix is below. The other advantage of moving
the reset to hardware disabling is that the "cleanup" is just disabling the static
key, and at that point can simply be deleted as there's no need to disable the
static key when kvm-intel is unloaded since kvm-intel owns the key. I.e. this
patch (that we're replying to) would get replaced with a patch to delete the
disabling of the static key.

--
From: Sean Christopherson <[email protected]>
Date: Thu, 10 Nov 2022 17:28:08 -0800
Subject: [PATCH] KVM: VMX: Reset eVMCS controls in VP assist page during
hardware disabling

Reset the eVMCS controls in the per-CPU VP assist page during hardware
disabling instead of waiting until kvm-intel's module exit. The controls
are activated if and only if KVM creates a VM, i.e. don't need to be
reset if hardware is never enabled.

Doing the reset during hardware disabling will naturally fix a potential
NULL pointer deref bug once KVM disables CPU hotplug while enabling and
disabling hardware (which is necessary to fix a variety of bugs). If the
kernel is running as the root partition, the VP assist page is unmapped
during CPU hot unplug, and so KVM's clearing of the eVMCS controls needs
to occur with CPU hot(un)plug disabled, otherwise KVM could attempt to
write to a CPU's VP assist page after it's unmapped.

Reported-by: Vitaly Kuznetsov <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 50 +++++++++++++++++++++++++-----------------
1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aca88524fd1e..ae13aa3e8a1d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -552,6 +552,33 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
return 0;
}

+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!static_branch_unlikely(&enable_evmcs))
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
@@ -2497,6 +2524,8 @@ static void vmx_hardware_disable(void)
if (cpu_vmxoff())
kvm_spurious_fault();

+ hv_reset_evmcs();
+
intel_pt_handle_vmx(0);
}

@@ -8463,27 +8492,8 @@ static void vmx_exit(void)
kvm_exit();

#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
+ if (static_branch_unlikely(&enable_evmcs))
static_branch_disable(&enable_evmcs);
- }
#endif
vmx_cleanup_l1d_flush();


base-commit: 5f47ba6894477dfbdc5416467a25fb7acb47d404
--


2022-11-15 10:30:48

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 10/44] KVM: VMX: Clean up eVMCS enabling if KVM initialization fails

Sean Christopherson <[email protected]> writes:

> On Thu, Nov 03, 2022, Vitaly Kuznetsov wrote:
>> Sean Christopherson <[email protected]> writes:
>> > + /*
>> > + * Reset everything to support using non-enlightened VMCS access later
>> > + * (e.g. when we reload the module with enlightened_vmcs=0)
>> > + */
>> > + for_each_online_cpu(cpu) {
>> > + vp_ap = hv_get_vp_assist_page(cpu);
>> > +
>> > + if (!vp_ap)
>> > + continue;
>> > +
>> > + vp_ap->nested_control.features.directhypercall = 0;
>> > + vp_ap->current_nested_vmcs = 0;
>> > + vp_ap->enlighten_vmentry = 0;
>> > + }
>>
>> Unrelated to your patch but while looking at this code I got curious
>> about why don't we need a protection against CPU offlining here. Turns
>> out that even when we offline a CPU, its VP assist page remains
>> allocated (see hv_cpu_die()), we just write '0' to the MSR and thus
>
> Heh, "die". Hyper-V is quite dramatic.
>
>> accessing the page is safe. The consequent hv_cpu_init(), however, does
>> not restore VP assist page when it's already allocated:
>>
>> # rdmsr -p 24 0x40000073
>> 10212f001
>> # echo 0 > /sys/devices/system/cpu/cpu24/online
>> # echo 1 > /sys/devices/system/cpu/cpu24/online
>> # rdmsr -p 24 0x40000073
>> 0
>>
>> The culprit is commit e5d9b714fe402 ("x86/hyperv: fix root partition
>> faults when writing to VP assist page MSR"). A patch is inbound.
>>
>> 'hv_root_partition' case is different though. We do memunmap() and reset
>> VP assist page to zero so it is theoretically possible we're going to
>> clash. Unless I'm missing some obvious reason why module unload can't
>> coincide with CPU offlining, we may be better off surrounding this with
>> cpus_read_lock()/cpus_read_unlock().
>
> I finally see what you're concerned about. If a CPU goes offline and its assist
> page is unmapped, zeroing out the nested/eVMCS stuff will fault.
>
> I think the real problem is that the purging of the eVMCS is in the wrong place.
> Move the clearing to vmx_hardware_disable() and then the CPU hotplug bug goes
> away once KVM disables hotplug during hardware enabling/disable later in the series.
> There's no need to wait until module exit, e.g. it's not like it costs much to
> clear a few variables, and IIUC the state is used only when KVM is actively using
> VMX/eVMCS.
>
> However, I believe there's a second bug. KVM's CPU online hook is called before
> Hyper-V's online hook (CPUHP_AP_ONLINE_DYN). Before this series, which moves KVM's
> hook from STARTING to ONLINE, KVM's hook is waaaay before Hyper-V's. That means
> that hv_cpu_init()'s allocation of the VP assist page will come _after_ KVM's
> check in vmx_hardware_enable()
>
> /*
> * This can happen if we hot-added a CPU but failed to allocate
> * VP assist page for it.
> */
> if (static_branch_unlikely(&enable_evmcs) &&
> !hv_get_vp_assist_page(cpu))
> return -EFAULT;
>
> I.e. CPU hotplug will never work if KVM is running VMs as a Hyper-V guest. I bet
> you can repro by doing a SUSPEND+RESUME.
>
> Can you try to see if that's actually a bug? If so, the only sane fix seems to
> be to add a dedicated ONLINE action for Hyper-V.

It seems we can't get away without a dedicated stage for Hyper-V anyway,
e.g. see our discussion with Michael:

https://lore.kernel.org/linux-hyperv/[email protected]/

All these issues are more or less "theoretical" as there's no real CPU
hotplug on Hyper-V/Azure. Yes, it is possible to trigger problems by
doing CPU offline/online but I don't see how this may come handy outside
of testing envs.

> Per patch
>
> KVM: Rename and move CPUHP_AP_KVM_STARTING to ONLINE section
>
> from this series, CPUHP_AP_KVM_ONLINE needs to be before CPUHP_AP_SCHED_WAIT_EMPTY
> to ensure there are no tasks, i.e. no vCPUs, running on the to-be-unplugged CPU.
>
> Back to the original bug, proposed fix is below. The other advantage of moving
> the reset to hardware disabling is that the "cleanup" is just disabling the static
> key, and at that point can simply be deleted as there's no need to disable the
> static key when kvm-intel is unloaded since kvm-intel owns the key. I.e. this
> patch (that we're replying to) would get replaced with a patch to delete the
> disabling of the static key.
>

From a quick glance looks good to me, I'll try to find some time to work
on this issue. I will likely end up proposing a dedicated CPU hotplug
stage for Hyper-V (which needs to happen before KVM's
CPUHP_AP_KVM_ONLINE on CPU hotplug and after on unplug) anyway.

Thanks for looking into this!

> --
> From: Sean Christopherson <[email protected]>
> Date: Thu, 10 Nov 2022 17:28:08 -0800
> Subject: [PATCH] KVM: VMX: Reset eVMCS controls in VP assist page during
> hardware disabling
>
> Reset the eVMCS controls in the per-CPU VP assist page during hardware
> disabling instead of waiting until kvm-intel's module exit. The controls
> are activated if and only if KVM creates a VM, i.e. don't need to be
> reset if hardware is never enabled.
>
> Doing the reset during hardware disabling will naturally fix a potential
> NULL pointer deref bug once KVM disables CPU hotplug while enabling and
> disabling hardware (which is necessary to fix a variety of bugs). If the
> kernel is running as the root partition, the VP assist page is unmapped
> during CPU hot unplug, and so KVM's clearing of the eVMCS controls needs
> to occur with CPU hot(un)plug disabled, otherwise KVM could attempt to
> write to a CPU's VP assist page after it's unmapped.
>
> Reported-by: Vitaly Kuznetsov <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/vmx/vmx.c | 50 +++++++++++++++++++++++++-----------------
> 1 file changed, 30 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index aca88524fd1e..ae13aa3e8a1d 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -552,6 +552,33 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
> return 0;
> }
>
> +static void hv_reset_evmcs(void)
> +{
> + struct hv_vp_assist_page *vp_ap;
> +
> + if (!static_branch_unlikely(&enable_evmcs))
> + return;
> +
> + /*
> + * KVM should enable eVMCS if and only if all CPUs have a VP assist
> + * page, and should reject CPU onlining if eVMCS is enabled the CPU
> + * doesn't have a VP assist page allocated.
> + */
> + vp_ap = hv_get_vp_assist_page(smp_processor_id());
> + if (WARN_ON_ONCE(!vp_ap))
> + return;
> +
> + /*
> + * Reset everything to support using non-enlightened VMCS access later
> + * (e.g. when we reload the module with enlightened_vmcs=0)
> + */
> + vp_ap->nested_control.features.directhypercall = 0;
> + vp_ap->current_nested_vmcs = 0;
> + vp_ap->enlighten_vmentry = 0;
> +}
> +
> +#else /* IS_ENABLED(CONFIG_HYPERV) */
> +static void hv_reset_evmcs(void) {}
> #endif /* IS_ENABLED(CONFIG_HYPERV) */
>
> /*
> @@ -2497,6 +2524,8 @@ static void vmx_hardware_disable(void)
> if (cpu_vmxoff())
> kvm_spurious_fault();
>
> + hv_reset_evmcs();
> +
> intel_pt_handle_vmx(0);
> }
>
> @@ -8463,27 +8492,8 @@ static void vmx_exit(void)
> kvm_exit();
>
> #if IS_ENABLED(CONFIG_HYPERV)
> - if (static_branch_unlikely(&enable_evmcs)) {
> - int cpu;
> - struct hv_vp_assist_page *vp_ap;
> - /*
> - * Reset everything to support using non-enlightened VMCS
> - * access later (e.g. when we reload the module with
> - * enlightened_vmcs=0)
> - */
> - for_each_online_cpu(cpu) {
> - vp_ap = hv_get_vp_assist_page(cpu);
> -
> - if (!vp_ap)
> - continue;
> -
> - vp_ap->nested_control.features.directhypercall = 0;
> - vp_ap->current_nested_vmcs = 0;
> - vp_ap->enlighten_vmentry = 0;
> - }
> -
> + if (static_branch_unlikely(&enable_evmcs))
> static_branch_disable(&enable_evmcs);
> - }
> #endif
> vmx_cleanup_l1d_flush();
>
>
> base-commit: 5f47ba6894477dfbdc5416467a25fb7acb47d404

--
Vitaly