2022-06-08 05:49:55

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

Restrict the nVMX MSRs based on KVM's config, not based on the guest's
current config. Using the guest's config to audit the new config
prevents userspace from restoring the original config (KVM's config) if
at any point in the past the guest's config was restricted in any way.

Fixes: 62cc6b9dc61e ("KVM: nVMX: support restore of VMX capability MSRs")
Cc: [email protected]
Cc: David Matlack <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/nested.c | 100 ++++++++++++++++++++------------------
1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 00c7b00c017a..fca30e79b3a0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1223,7 +1223,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
/* reserved */
BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
- u64 vmx_basic = vmx->nested.msrs.basic;
+ u64 vmx_basic = vmcs_config.nested.basic;

if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
return -EINVAL;
@@ -1246,36 +1246,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
return 0;
}

+static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
+ u32 **low, u32 **high)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ *low = &msrs->pinbased_ctls_low;
+ *high = &msrs->pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ *low = &msrs->procbased_ctls_low;
+ *high = &msrs->procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ *low = &msrs->exit_ctls_low;
+ *high = &msrs->exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ *low = &msrs->entry_ctls_low;
+ *high = &msrs->entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *low = &msrs->secondary_ctls_low;
+ *high = &msrs->secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+}
+
static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
- u64 supported;
u32 *lowp, *highp;
+ u64 supported;

- switch (msr_index) {
- case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
- lowp = &vmx->nested.msrs.pinbased_ctls_low;
- highp = &vmx->nested.msrs.pinbased_ctls_high;
- break;
- case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- lowp = &vmx->nested.msrs.procbased_ctls_low;
- highp = &vmx->nested.msrs.procbased_ctls_high;
- break;
- case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- lowp = &vmx->nested.msrs.exit_ctls_low;
- highp = &vmx->nested.msrs.exit_ctls_high;
- break;
- case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- lowp = &vmx->nested.msrs.entry_ctls_low;
- highp = &vmx->nested.msrs.entry_ctls_high;
- break;
- case MSR_IA32_VMX_PROCBASED_CTLS2:
- lowp = &vmx->nested.msrs.secondary_ctls_low;
- highp = &vmx->nested.msrs.secondary_ctls_high;
- break;
- default:
- BUG();
- }
+ vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

supported = vmx_control_msr(*lowp, *highp);

@@ -1287,6 +1293,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
return -EINVAL;

+ vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
*lowp = data;
*highp = data >> 32;
return 0;
@@ -1300,10 +1307,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
/* reserved */
GENMASK_ULL(13, 9) | BIT_ULL(31);
- u64 vmx_misc;
-
- vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
- vmx->nested.msrs.misc_high);
+ u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
+ vmcs_config.nested.misc_high);

if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
return -EINVAL;
@@ -1331,10 +1336,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
- u64 vmx_ept_vpid_cap;
-
- vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
- vmx->nested.msrs.vpid_caps);
+ u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
+ vmcs_config.nested.vpid_caps);

/* Every bit is either reserved or a feature bit. */
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
@@ -1345,20 +1348,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
return 0;
}

+static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ return &msrs->cr0_fixed0;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return &msrs->cr4_fixed0;
+ default:
+ BUG();
+ }
+}
+
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
- u64 *msr;
-
- switch (msr_index) {
- case MSR_IA32_VMX_CR0_FIXED0:
- msr = &vmx->nested.msrs.cr0_fixed0;
- break;
- case MSR_IA32_VMX_CR4_FIXED0:
- msr = &vmx->nested.msrs.cr4_fixed0;
- break;
- default:
- BUG();
- }
+ const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

/*
* 1 bits (which indicates bits which "must-be-1" during VMX operation)
@@ -1367,7 +1371,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
if (!is_bitwise_subset(data, *msr, -1ULL))
return -EINVAL;

- *msr = data;
+ *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
return 0;
}

@@ -1428,7 +1432,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmx->nested.msrs.vmcs_enum = data;
return 0;
case MSR_IA32_VMX_VMFUNC:
- if (data & ~vmx->nested.msrs.vmfunc_controls)
+ if (data & ~vmcs_config.nested.vmfunc_controls)
return -EINVAL;
vmx->nested.msrs.vmfunc_controls = data;
return 0;
--
2.36.1.255.ge46751e96f-goog


2022-10-31 16:50:57

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

Hi Sean & Paolo,

On Tue, Jun 07, 2022 at 09:35:54PM +0000, Sean Christopherson wrote:
> Restrict the nVMX MSRs based on KVM's config, not based on the guest's
> current config. Using the guest's config to audit the new config
> prevents userspace from restoring the original config (KVM's config) if
> at any point in the past the guest's config was restricted in any way.

May I ask for an example here, to explain why we use the KVM config
here, instead of the guest's? I mean, the guest's config can be
adjusted after cpuid updates by vmx_vcpu_after_set_cpuid(). Yet the
msr settings in vmcs_config.nested might be outdated by then.

Another question is about the setting of secondary_ctls_high in
nested_vmx_setup_ctls_msrs(). I saw there's a comment saying:
"Do not include those that depend on CPUID bits, they are
added later by vmx_vcpu_after_set_cpuid.".

But since cpuid updates can adjust the vmx->nested.msrs.secondary_ctls_high,
do we really need to clear those flags for secondary_ctls_high in this
global config? Could we just set
msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl?

If yes, code(in nested_vmx_setup_ctls_msrs()) such as
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT;
or
if (cpu_has_vmx_vmfunc()) {
msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VMFUNC;
and other similar ones may also be uncessary.

B.R.
Yu

>
> Fixes: 62cc6b9dc61e ("KVM: nVMX: support restore of VMX capability MSRs")
> Cc: [email protected]
> Cc: David Matlack <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/vmx/nested.c | 100 ++++++++++++++++++++------------------
> 1 file changed, 52 insertions(+), 48 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 00c7b00c017a..fca30e79b3a0 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -1223,7 +1223,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
> BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
> /* reserved */
> BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
> - u64 vmx_basic = vmx->nested.msrs.basic;
> + u64 vmx_basic = vmcs_config.nested.basic;
>
> if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
> return -EINVAL;
> @@ -1246,36 +1246,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
> return 0;
> }
>
> +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
> + u32 **low, u32 **high)
> +{
> + switch (msr_index) {
> + case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> + *low = &msrs->pinbased_ctls_low;
> + *high = &msrs->pinbased_ctls_high;
> + break;
> + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
> + *low = &msrs->procbased_ctls_low;
> + *high = &msrs->procbased_ctls_high;
> + break;
> + case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> + *low = &msrs->exit_ctls_low;
> + *high = &msrs->exit_ctls_high;
> + break;
> + case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> + *low = &msrs->entry_ctls_low;
> + *high = &msrs->entry_ctls_high;
> + break;
> + case MSR_IA32_VMX_PROCBASED_CTLS2:
> + *low = &msrs->secondary_ctls_low;
> + *high = &msrs->secondary_ctls_high;
> + break;
> + default:
> + BUG();
> + }
> +}
> +
> static int
> vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
> {
> - u64 supported;
> u32 *lowp, *highp;
> + u64 supported;
>
> - switch (msr_index) {
> - case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> - lowp = &vmx->nested.msrs.pinbased_ctls_low;
> - highp = &vmx->nested.msrs.pinbased_ctls_high;
> - break;
> - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
> - lowp = &vmx->nested.msrs.procbased_ctls_low;
> - highp = &vmx->nested.msrs.procbased_ctls_high;
> - break;
> - case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> - lowp = &vmx->nested.msrs.exit_ctls_low;
> - highp = &vmx->nested.msrs.exit_ctls_high;
> - break;
> - case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> - lowp = &vmx->nested.msrs.entry_ctls_low;
> - highp = &vmx->nested.msrs.entry_ctls_high;
> - break;
> - case MSR_IA32_VMX_PROCBASED_CTLS2:
> - lowp = &vmx->nested.msrs.secondary_ctls_low;
> - highp = &vmx->nested.msrs.secondary_ctls_high;
> - break;
> - default:
> - BUG();
> - }
> + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
>
> supported = vmx_control_msr(*lowp, *highp);
>
> @@ -1287,6 +1293,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
> if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
> return -EINVAL;
>
> + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
> *lowp = data;
> *highp = data >> 32;
> return 0;
> @@ -1300,10 +1307,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
> BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
> /* reserved */
> GENMASK_ULL(13, 9) | BIT_ULL(31);
> - u64 vmx_misc;
> -
> - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
> - vmx->nested.msrs.misc_high);
> + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
> + vmcs_config.nested.misc_high);
>
> if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
> return -EINVAL;
> @@ -1331,10 +1336,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
>
> static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
> {
> - u64 vmx_ept_vpid_cap;
> -
> - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
> - vmx->nested.msrs.vpid_caps);
> + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
> + vmcs_config.nested.vpid_caps);
>
> /* Every bit is either reserved or a feature bit. */
> if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
> @@ -1345,20 +1348,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
> return 0;
> }
>
> +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
> +{
> + switch (msr_index) {
> + case MSR_IA32_VMX_CR0_FIXED0:
> + return &msrs->cr0_fixed0;
> + case MSR_IA32_VMX_CR4_FIXED0:
> + return &msrs->cr4_fixed0;
> + default:
> + BUG();
> + }
> +}
> +
> static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
> {
> - u64 *msr;
> -
> - switch (msr_index) {
> - case MSR_IA32_VMX_CR0_FIXED0:
> - msr = &vmx->nested.msrs.cr0_fixed0;
> - break;
> - case MSR_IA32_VMX_CR4_FIXED0:
> - msr = &vmx->nested.msrs.cr4_fixed0;
> - break;
> - default:
> - BUG();
> - }
> + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
>
> /*
> * 1 bits (which indicates bits which "must-be-1" during VMX operation)
> @@ -1367,7 +1371,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
> if (!is_bitwise_subset(data, *msr, -1ULL))
> return -EINVAL;
>
> - *msr = data;
> + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
> return 0;
> }
>
> @@ -1428,7 +1432,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
> vmx->nested.msrs.vmcs_enum = data;
> return 0;
> case MSR_IA32_VMX_VMFUNC:
> - if (data & ~vmx->nested.msrs.vmfunc_controls)
> + if (data & ~vmcs_config.nested.vmfunc_controls)
> return -EINVAL;
> vmx->nested.msrs.vmfunc_controls = data;
> return 0;
> --
> 2.36.1.255.ge46751e96f-goog
>

2022-10-31 17:27:18

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Tue, Nov 01, 2022, Yu Zhang wrote:
> Hi Sean & Paolo,
>
> On Tue, Jun 07, 2022 at 09:35:54PM +0000, Sean Christopherson wrote:
> > Restrict the nVMX MSRs based on KVM's config, not based on the guest's
> > current config. Using the guest's config to audit the new config
> > prevents userspace from restoring the original config (KVM's config) if
> > at any point in the past the guest's config was restricted in any way.
>
> May I ask for an example here, to explain why we use the KVM config
> here, instead of the guest's? I mean, the guest's config can be
> adjusted after cpuid updates by vmx_vcpu_after_set_cpuid(). Yet the
> msr settings in vmcs_config.nested might be outdated by then.

vmcs_config.nested never becomes out-of-date, it's read-only after __init (not
currently marked as such, that will be remedied soon).

The auditing performed by KVM is purely to guard against userspace enabling
features that KVM doesn't support. KVM is not responsible for ensuring that the
vCPU's CPUID model match the VMX MSR model.

An example would be if userspace loaded the VMX MSRs with a default model, and
then enabled features one-by-one. In practice this doesn't happen because it's
more performant to gather all features and do a single KVM_SET_MSRS, but it's a
legitimate approach that KVM should allow.

> Another question is about the setting of secondary_ctls_high in
> nested_vmx_setup_ctls_msrs(). I saw there's a comment saying:
> "Do not include those that depend on CPUID bits, they are
> added later by vmx_vcpu_after_set_cpuid.".

That's a stale comment, see the very next commit, 8805875aa473 ("Revert "KVM: nVMX:
Do not expose MPX VMX controls when guest MPX disabled""), as well as the slightly
later commit 9389d5774aca ("Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL
VM-{Entry,Exit} control"").

> But since cpuid updates can adjust the vmx->nested.msrs.secondary_ctls_high,
> do we really need to clear those flags for secondary_ctls_high in this
> global config?

As above, the comment is stale, KVM should not manipulate the VMX MSRs in response
to guest CPUID changes. The one exception to this is reserved CR0/CR4 bits. We
discussed quirking that behavior, but ultimately decided not to because (a) no
userspace actually cares and and (b) KVM would effectively need to make up behavior
if userspace allowed the guest to load CR4 bits via VM-Enter or VM-Exit that are
disallowed by CPUID, e.g. L1 could end up running with a CR4 that is supposed to
be impossible according to CPUID.

> Could we just set
> msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl?

KVM already does that in upstream (with further sanitization). See commit
bcdf201f8a4d ("KVM: nVMX: Use sanitized allowed-1 bits for VMX control MSRs").

> If yes, code(in nested_vmx_setup_ctls_msrs()) such as
> if (enable_ept) {
> /* nested EPT: emulate EPT also to L1 */
> msrs->secondary_ctls_high |=
> SECONDARY_EXEC_ENABLE_EPT;

This can't be completely removed, though unless I'm missing something, it can and
should be shifted to the sanitization code, e.g.

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8f67a9c4a287..0c41d5808413 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6800,6 +6800,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)

msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
msrs->secondary_ctls_high &=
+ SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
@@ -6820,9 +6821,6 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_SHADOW_VMCS;

if (enable_ept) {
- /* nested EPT: emulate EPT also to L1 */
- msrs->secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_EPT;
msrs->ept_caps =
VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPT_PAGE_WALK_5_BIT |


> or
> if (cpu_has_vmx_vmfunc()) {
> msrs->secondary_ctls_high |=
> SECONDARY_EXEC_ENABLE_VMFUNC;

This one is still required. KVM never enables VMFUNC for itself, i.e. it won't
be set in KVM's VMCS configuration.

> and other similar ones may also be uncessary.



2022-11-01 10:26:15

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Mon, Oct 31, 2022 at 05:11:10PM +0000, Sean Christopherson wrote:
> On Tue, Nov 01, 2022, Yu Zhang wrote:
> > Hi Sean & Paolo,
> >
> > On Tue, Jun 07, 2022 at 09:35:54PM +0000, Sean Christopherson wrote:
> > > Restrict the nVMX MSRs based on KVM's config, not based on the guest's
> > > current config. Using the guest's config to audit the new config
> > > prevents userspace from restoring the original config (KVM's config) if
> > > at any point in the past the guest's config was restricted in any way.
> >
> > May I ask for an example here, to explain why we use the KVM config
> > here, instead of the guest's? I mean, the guest's config can be
> > adjusted after cpuid updates by vmx_vcpu_after_set_cpuid(). Yet the
> > msr settings in vmcs_config.nested might be outdated by then.

Thanks a lot for your explanation, Sean. Questions are embedded below:

>
> vmcs_config.nested never becomes out-of-date, it's read-only after __init (not
> currently marked as such, that will be remedied soon).
>
> The auditing performed by KVM is purely to guard against userspace enabling
> features that KVM doesn't support. KVM is not responsible for ensuring that the
> vCPU's CPUID model match the VMX MSR model.

Do you mean the VMX MSR model shall not be changed after the cpuid updates?
And for VMX MSR model, do you mean the vmx->nested.msrs or the ones in
vmcs_config->nested?

What I observed is that vmx->nested.msrs.secondary_ctls_high will be changed
in vmx_adjust_secondary_exec_control(), which can be triggered after cpuid is
set.

Since KVM's config(vmcs_config->nested.secondary_ctls_high) is done during init
by nested_vmx_setup_ctls_msrs(), which only kept a subset of the flags from the
vmcs_confg->cpu_based_2nd_exec_ctrl, the vmx_restore_control_msr() could fail
later, when userspace VMM tries to enable a feature(the only one I witnessed is
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE) by setting MSR_IA32_VMX_PROCBASED_CTLS2.
Because the vmx->nested.msrs.secondary_ctls_high is updated by cpuid, but this
bit is not taken from vmcs_conf->cpu_based_2nd_exec_ctrl by nested_vmx_setup_ctls_msrs()
for vmcs_config->nested.secondary_ctls_high.

The failure can be fixed, simply by adding SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE in
nested_vmx_setup_ctls_msrs(), e.g.

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c62352dda6a..fa255391718c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6791,13 +6791,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

- /*
- * secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by
- * vmx_vcpu_after_set_cpuid.
- */
msrs->secondary_ctls_low = 0;
-
msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
@@ -6810,7 +6804,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

/*
* We can emulate "VMCS shadowing," even if the hardware

But then I wonder, why do we need the bitwise and operation here first:
msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

And then reset many of the remaining flags based on configurations such as
enable_ept, enable_vpid, enable_unrestricted_guest etc... But maybe we need
go through this case by case.

>
> An example would be if userspace loaded the VMX MSRs with a default model, and
> then enabled features one-by-one. In practice this doesn't happen because it's
> more performant to gather all features and do a single KVM_SET_MSRS, but it's a
> legitimate approach that KVM should allow.
>
> > Another question is about the setting of secondary_ctls_high in
> > nested_vmx_setup_ctls_msrs(). I saw there's a comment saying:
> > "Do not include those that depend on CPUID bits, they are
> > added later by vmx_vcpu_after_set_cpuid.".
>
> That's a stale comment, see the very next commit, 8805875aa473 ("Revert "KVM: nVMX:
> Do not expose MPX VMX controls when guest MPX disabled""), as well as the slightly
> later commit 9389d5774aca ("Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL
> VM-{Entry,Exit} control"").
>

So the comment can be and shall be removed, right?

> > But since cpuid updates can adjust the vmx->nested.msrs.secondary_ctls_high,
> > do we really need to clear those flags for secondary_ctls_high in this
> > global config?
>
> As above, the comment is stale, KVM should not manipulate the VMX MSRs in response
> to guest CPUID changes. The one exception to this is reserved CR0/CR4 bits. We
> discussed quirking that behavior, but ultimately decided not to because (a) no
> userspace actually cares and and (b) KVM would effectively need to make up behavior
> if userspace allowed the guest to load CR4 bits via VM-Enter or VM-Exit that are
> disallowed by CPUID, e.g. L1 could end up running with a CR4 that is supposed to
> be impossible according to CPUID.
>
> > Could we just set
> > msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl?
>
> KVM already does that in upstream (with further sanitization). See commit
> bcdf201f8a4d ("KVM: nVMX: Use sanitized allowed-1 bits for VMX control MSRs").
>
> > If yes, code(in nested_vmx_setup_ctls_msrs()) such as
> > if (enable_ept) {
> > /* nested EPT: emulate EPT also to L1 */
> > msrs->secondary_ctls_high |=
> > SECONDARY_EXEC_ENABLE_EPT;
>
> This can't be completely removed, though unless I'm missing something, it can and
> should be shifted to the sanitization code, e.g.
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 8f67a9c4a287..0c41d5808413 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -6800,6 +6800,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
>
> msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
> msrs->secondary_ctls_high &=
> + SECONDARY_EXEC_ENABLE_EPT |
> SECONDARY_EXEC_DESC |
> SECONDARY_EXEC_ENABLE_RDTSCP |
> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
> @@ -6820,9 +6821,6 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
> SECONDARY_EXEC_SHADOW_VMCS;
>
> if (enable_ept) {
> - /* nested EPT: emulate EPT also to L1 */
> - msrs->secondary_ctls_high |=
> - SECONDARY_EXEC_ENABLE_EPT;
> msrs->ept_caps =
> VMX_EPT_PAGE_WALK_4_BIT |
> VMX_EPT_PAGE_WALK_5_BIT |
>
>
> > or
> > if (cpu_has_vmx_vmfunc()) {
> > msrs->secondary_ctls_high |=
> > SECONDARY_EXEC_ENABLE_VMFUNC;
>
> This one is still required. KVM never enables VMFUNC for itself, i.e. it won't
> be set in KVM's VMCS configuration.
>

My understanding is that, for VMFUNC, altough KVM does not support it,
SECONDARY_EXEC_ENABLE_VMFUNC is still set in the secondary proc-based
vm execution ctrol. KVM just uses different handlers for VMFUNC exits
from L1(to inject the #UD) and L2(to emulate the eptp switching). So
it doesn't matter if we do not clear this bit for vmcs_config->nested.
procbased_ctls_high.

I may probably missed something. But I hope my questions are clear
enough (though I also doubt it...) :)

B.R.
Yu

2022-11-01 18:13:25

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Tue, Nov 01, 2022, Yu Zhang wrote:
> On Mon, Oct 31, 2022 at 05:11:10PM +0000, Sean Christopherson wrote:
> > vmcs_config.nested never becomes out-of-date, it's read-only after __init (not
> > currently marked as such, that will be remedied soon).
> >
> > The auditing performed by KVM is purely to guard against userspace enabling
> > features that KVM doesn't support. KVM is not responsible for ensuring that the
> > vCPU's CPUID model match the VMX MSR model.
>
> Do you mean the VMX MSR model shall not be changed after the cpuid updates?

No, I mean that the virtual CPU model (CPUID + VMX MSRs) that is presented to the
guest is the responsibility of host userspace. KVM only cares about not enabling
bits/features that KVM doesn't supported.

> And for VMX MSR model, do you mean the vmx->nested.msrs or the ones in
> vmcs_config->nested?

vmx->nested.msrs. vmcs_config->nested is effectively the VMX equivalent of
KVM_GET_SUPPORTED_CPUID.

> What I observed is that vmx->nested.msrs.secondary_ctls_high will be changed
> in vmx_adjust_secondary_exec_control(), which can be triggered after cpuid is
> set.

Ugh, that path got overlooked when we yanked out KVM's manipulaton of VMX MSRs
in response to guest CPUID changes. I wonder if we can get away with changing
KVM's behavior to only ensure a feature isn't exposed to L2 when it's not exposed
to L1.

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6b4266e949a3..cfc35d559d91 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4523,8 +4523,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* Update the nested MSR settings so that a nested VMM can/can't set
* controls for features that are/aren't exposed to the guest.
*/
- if (nested) {
- if (enabled)
+ if (nested && !enabled)
+ if (exiting)
vmx->nested.msrs.secondary_ctls_high |= control;
else
vmx->nested.msrs.secondary_ctls_high &= ~control;


> Since KVM's config(vmcs_config->nested.secondary_ctls_high) is done during init
> by nested_vmx_setup_ctls_msrs(), which only kept a subset of the flags from the
> vmcs_confg->cpu_based_2nd_exec_ctrl, the vmx_restore_control_msr() could fail
> later, when userspace VMM tries to enable a feature(the only one I witnessed is
> SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE) by setting MSR_IA32_VMX_PROCBASED_CTLS2.
> Because the vmx->nested.msrs.secondary_ctls_high is updated by cpuid, but this
> bit is not taken from vmcs_conf->cpu_based_2nd_exec_ctrl by nested_vmx_setup_ctls_msrs()
> for vmcs_config->nested.secondary_ctls_high.
>
> The failure can be fixed, simply by adding SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE in
> nested_vmx_setup_ctls_msrs(), e.g.

Assuming KVM actually supports user wait/pause in L2, this is an orthogonal bug
to the CPUID-based manipulation above. KVM simply neglects to advertise to userspace
that ENABLE_USR_WAIT_PAUSE is supported for nested virtualization.

If KVM doesn't correctly support virtualizing user wait/pause for L2, then the
correct location to fix this is in vmx_secondary_exec_control().

> > > Another question is about the setting of secondary_ctls_high in
> > > nested_vmx_setup_ctls_msrs(). I saw there's a comment saying:
> > > "Do not include those that depend on CPUID bits, they are
> > > added later by vmx_vcpu_after_set_cpuid.".
> >
> > That's a stale comment, see the very next commit, 8805875aa473 ("Revert "KVM: nVMX:
> > Do not expose MPX VMX controls when guest MPX disabled""), as well as the slightly
> > later commit 9389d5774aca ("Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL
> > VM-{Entry,Exit} control"").
> >
>
> So the comment can be and shall be removed, right?

Yep.

> > > if (cpu_has_vmx_vmfunc()) {
> > > msrs->secondary_ctls_high |=
> > > SECONDARY_EXEC_ENABLE_VMFUNC;
> >
> > This one is still required. KVM never enables VMFUNC for itself, i.e. it won't
> > be set in KVM's VMCS configuration.
> >
>
> My understanding is that, for VMFUNC, altough KVM does not support it,
> SECONDARY_EXEC_ENABLE_VMFUNC is still set in the secondary proc-based
> vm execution ctrol. KVM just uses different handlers for VMFUNC exits
> from L1(to inject the #UD) and L2(to emulate the eptp switching). So
> it doesn't matter if we do not clear this bit for vmcs_config->nested.
> procbased_ctls_high.

Ah, you're right, I didn't realize KVM enables VMFUNC in L1. Enabling VMFUNC for
L1 is silly though, it's trivial to clear the feature in vmx_secondary_exec_control().

That said, enabling VMFUNC in vmcs01 is an orthogonal topic, and it _is_ indeed
easier to keep the feature in the reference config. Now that the nested config
is derived from the non-nested config, nested_vmx_setup_ctls_msrs() can do:

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 61a2e551640a..751cc67aa1a9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6811,7 +6811,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_VMFUNC;

/*
* We can emulate "VMCS shadowing," even if the hardware
@@ -6842,17 +6843,12 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
}
}

- if (cpu_has_vmx_vmfunc()) {
- msrs->secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_VMFUNC;
- /*
- * Advertise EPTP switching unconditionally
- * since we emulate it
- */
- if (enable_ept)
- msrs->vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
- }
+ /*
+ * Advertise EPTP switching if VMFUNC and EPT are supported, KVM
+ * emulates the actual EPTP switch in software.
+ */
+ if (cpu_has_vmx_vmfunc() && enable_ept)
+ msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;

/*
* Old versions of KVM use the single-context version without

2022-11-02 09:07:14

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Tue, Nov 01, 2022 at 05:58:21PM +0000, Sean Christopherson wrote:
> On Tue, Nov 01, 2022, Yu Zhang wrote:
> > On Mon, Oct 31, 2022 at 05:11:10PM +0000, Sean Christopherson wrote:
> > > vmcs_config.nested never becomes out-of-date, it's read-only after __init (not
> > > currently marked as such, that will be remedied soon).
> > >
> > > The auditing performed by KVM is purely to guard against userspace enabling
> > > features that KVM doesn't support. KVM is not responsible for ensuring that the
> > > vCPU's CPUID model match the VMX MSR model.
> >
> > Do you mean the VMX MSR model shall not be changed after the cpuid updates?
>
> No, I mean that the virtual CPU model (CPUID + VMX MSRs) that is presented to the
> guest is the responsibility of host userspace. KVM only cares about not enabling
> bits/features that KVM doesn't supported.
>

Oh, I see. We need to guarantee the userspace VMM can not successfully
set a feature in vmx msr, if KVM does not support it.

> > And for VMX MSR model, do you mean the vmx->nested.msrs or the ones in
> > vmcs_config->nested?
>
> vmx->nested.msrs. vmcs_config->nested is effectively the VMX equivalent of
> KVM_GET_SUPPORTED_CPUID.
>
> > What I observed is that vmx->nested.msrs.secondary_ctls_high will be changed
> > in vmx_adjust_secondary_exec_control(), which can be triggered after cpuid is
> > set.
>
> Ugh, that path got overlooked when we yanked out KVM's manipulaton of VMX MSRs
> in response to guest CPUID changes. I wonder if we can get away with changing
> KVM's behavior to only ensure a feature isn't exposed to L2 when it's not exposed
> to L1.
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 6b4266e949a3..cfc35d559d91 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4523,8 +4523,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
> * Update the nested MSR settings so that a nested VMM can/can't set
> * controls for features that are/aren't exposed to the guest.
> */
> - if (nested) {
> - if (enabled)
> + if (nested && !enabled)
> + if (exiting)
> vmx->nested.msrs.secondary_ctls_high |= control;
> else
> vmx->nested.msrs.secondary_ctls_high &= ~control;
>

Indeed, this change can make sure a feature won't be exposed to L2, if L1
does not have it. But for the feature bits that L1 has, yet cleared from
the vmcs_conf->nested.msrs.secondary_ctls_high in nested_vmx_setup_ctls_msrs(),
there's no chance for userspace vmm to reset it again.

Well, I am not suggesting to give userspace vmm such permission(which I believe
is incorrect). And IIUC, vmcs_conf->nested.msrs.secondary_ctls_high shall serve
as a template to initialize vmx->nested.msrs.secondary_ctls_high. So maybe we
shall not mask off some features in nested_vmx_setup_ctls_msrs() at the beginning.

> > Since KVM's config(vmcs_config->nested.secondary_ctls_high) is done during init
> > by nested_vmx_setup_ctls_msrs(), which only kept a subset of the flags from the
> > vmcs_confg->cpu_based_2nd_exec_ctrl, the vmx_restore_control_msr() could fail
> > later, when userspace VMM tries to enable a feature(the only one I witnessed is
> > SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE) by setting MSR_IA32_VMX_PROCBASED_CTLS2.
> > Because the vmx->nested.msrs.secondary_ctls_high is updated by cpuid, but this
> > bit is not taken from vmcs_conf->cpu_based_2nd_exec_ctrl by nested_vmx_setup_ctls_msrs()
> > for vmcs_config->nested.secondary_ctls_high.
> >
> > The failure can be fixed, simply by adding SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE in
> > nested_vmx_setup_ctls_msrs(), e.g.
>
> Assuming KVM actually supports user wait/pause in L2, this is an orthogonal bug
> to the CPUID-based manipulation above. KVM simply neglects to advertise to userspace
> that ENABLE_USR_WAIT_PAUSE is supported for nested virtualization.
>
> If KVM doesn't correctly support virtualizing user wait/pause for L2, then the
> correct location to fix this is in vmx_secondary_exec_control().
>

Sorry, why vmx_secondary_exec_control()? Could we just change
nested_vmx_setup_ctls_msrs() like below:

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c62352dda6a..fa255391718c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6791,13 +6791,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

- /*
- * secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by
- * vmx_vcpu_after_set_cpuid.
- */
msrs->secondary_ctls_low = 0;
-
msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
@@ -6810,7 +6804,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

/*
* We can emulate "VMCS shadowing," even if the hardware

Note: I did not use "if (cpu_has_vmx_waitpkg())" here, it looks like to
take off one's pants to fart(no offense, just a Chinese old saying meaning
unnecessary acts.:)).

> > > > Another question is about the setting of secondary_ctls_high in
> > > > nested_vmx_setup_ctls_msrs(). I saw there's a comment saying:
> > > > "Do not include those that depend on CPUID bits, they are
> > > > added later by vmx_vcpu_after_set_cpuid.".
> > >
> > > That's a stale comment, see the very next commit, 8805875aa473 ("Revert "KVM: nVMX:
> > > Do not expose MPX VMX controls when guest MPX disabled""), as well as the slightly
> > > later commit 9389d5774aca ("Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL
> > > VM-{Entry,Exit} control"").
> > >
> >
> > So the comment can be and shall be removed, right?
>
> Yep.
>
> > > > if (cpu_has_vmx_vmfunc()) {
> > > > msrs->secondary_ctls_high |=
> > > > SECONDARY_EXEC_ENABLE_VMFUNC;
> > >
> > > This one is still required. KVM never enables VMFUNC for itself, i.e. it won't
> > > be set in KVM's VMCS configuration.
> > >
> >
> > My understanding is that, for VMFUNC, altough KVM does not support it,
> > SECONDARY_EXEC_ENABLE_VMFUNC is still set in the secondary proc-based
> > vm execution ctrol. KVM just uses different handlers for VMFUNC exits
> > from L1(to inject the #UD) and L2(to emulate the eptp switching). So
> > it doesn't matter if we do not clear this bit for vmcs_config->nested.
> > procbased_ctls_high.
>
> Ah, you're right, I didn't realize KVM enables VMFUNC in L1. Enabling VMFUNC for
> L1 is silly though, it's trivial to clear the feature in vmx_secondary_exec_control().
>
> That said, enabling VMFUNC in vmcs01 is an orthogonal topic, and it _is_ indeed
> easier to keep the feature in the reference config. Now that the nested config
> is derived from the non-nested config, nested_vmx_setup_ctls_msrs() can do:

Agreed. BTW, do you know why KVM took pains to do so? I mean, emulation for
L2's vmfunc does not rely on the existance of vmfunc, right? So, for L2,
we can just set vmcs02's SECONDARY_EXEC_ENABLE_VMFUNC based on vmcs12. And
for L1, we can just disable it by clearing it in vmx_secondary_exec_control(),
and remove the #UD injection logic from KVM?

B.R.
Yu

2022-11-03 17:05:58

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Wed, Nov 02, 2022, Yu Zhang wrote:
> On Tue, Nov 01, 2022 at 05:58:21PM +0000, Sean Christopherson wrote:
> > On Tue, Nov 01, 2022, Yu Zhang wrote:
> > > What I observed is that vmx->nested.msrs.secondary_ctls_high will be changed
> > > in vmx_adjust_secondary_exec_control(), which can be triggered after cpuid is
> > > set.
> >
> > Ugh, that path got overlooked when we yanked out KVM's manipulaton of VMX MSRs
> > in response to guest CPUID changes. I wonder if we can get away with changing
> > KVM's behavior to only ensure a feature isn't exposed to L2 when it's not exposed
> > to L1.
> >
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 6b4266e949a3..cfc35d559d91 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -4523,8 +4523,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
> > * Update the nested MSR settings so that a nested VMM can/can't set
> > * controls for features that are/aren't exposed to the guest.
> > */
> > - if (nested) {
> > - if (enabled)
> > + if (nested && !enabled)
> > + if (exiting)
> > vmx->nested.msrs.secondary_ctls_high |= control;
> > else
> > vmx->nested.msrs.secondary_ctls_high &= ~control;
> >
>
> Indeed, this change can make sure a feature won't be exposed to L2, if L1
> does not have it.

No, that's not the goal of the change. KVM already hides features in the VMX MSRs
if the base feature is not supported in L1 according to guest CPUID. The problem
is that, currently, KVM also _forces_ features to be enabled in the VMX MSRs when
the base feature IS supported in L1 (CPUID).

Ideally, KVM should NEVER manipulate VMX MSRs in response to guest CPUID changes.
That's what I was referring to earlier by commits:

8805875aa473 ("Revert "KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled"")
9389d5774aca ("Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL"")

E.g. if userspace sets VMX MSRs and then sets guest CPUID, KVM will override the
nVMX CPU model defined by userspace. The scenario where userspace hides a "base"
feature but exposes the feature in the VMX MSRs is nonsensical, which is why I
think KVM can likely get away with force-disabling features.

The reverse is completely legitimate though: hiding a feature in VMX MSRs even if
the base feature is supported for L1, i.e. disallowing L1 from enabling the feature
in L2, is something that real VMMs will actually do, e.g. if the user doesn't trust
that KVM correctly handles all aspects of nested virtualization for the feature.

In other words, the behavior you're observing, where vmx->nested.msrs.secondary_ctls_high
is changed by vmx_adjust_secondary_exec_control(), is a completely separate bug
than the one below.

> But for the feature bits that L1 has, yet cleared from the
> vmcs_conf->nested.msrs.secondary_ctls_high in nested_vmx_setup_ctls_msrs(),
> there's no chance for userspace vmm to reset it again.
>
> Well, I am not suggesting to give userspace vmm such permission(which I believe
> is incorrect). And IIUC, vmcs_conf->nested.msrs.secondary_ctls_high shall serve
> as a template to initialize vmx->nested.msrs.secondary_ctls_high. So maybe we
> shall not mask off some features in nested_vmx_setup_ctls_msrs() at the beginning.
>
> > > Since KVM's config(vmcs_config->nested.secondary_ctls_high) is done during init
> > > by nested_vmx_setup_ctls_msrs(), which only kept a subset of the flags from the
> > > vmcs_confg->cpu_based_2nd_exec_ctrl, the vmx_restore_control_msr() could fail
> > > later, when userspace VMM tries to enable a feature(the only one I witnessed is
> > > SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE) by setting MSR_IA32_VMX_PROCBASED_CTLS2.
> > > Because the vmx->nested.msrs.secondary_ctls_high is updated by cpuid, but this
> > > bit is not taken from vmcs_conf->cpu_based_2nd_exec_ctrl by nested_vmx_setup_ctls_msrs()
> > > for vmcs_config->nested.secondary_ctls_high.
> > >
> > > The failure can be fixed, simply by adding SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE in
> > > nested_vmx_setup_ctls_msrs(), e.g.
> >
> > Assuming KVM actually supports user wait/pause in L2, this is an orthogonal bug
> > to the CPUID-based manipulation above. KVM simply neglects to advertise to userspace
> > that ENABLE_USR_WAIT_PAUSE is supported for nested virtualization.
> >
> > If KVM doesn't correctly support virtualizing user wait/pause for L2, then the
> > correct location to fix this is in vmx_secondary_exec_control().
> >
>
> Sorry, why vmx_secondary_exec_control()?

You missed the qualifier:

If KVM doesn't correctly support virtualizing user wait/pause for L2

If KVM should NOT be exposing ENABLE_USR_WAIT_PAUSE to the L1 VMM, then NOT
propagating the feature to msrs->secondary_ctls_low is correct. And if that's
the case, then vmx_secondary_exec_control() needs to be modified so that it does
NOT set ENABLE_USR_WAIT_PAUSE in vmx->nested.msrs.secondary_ctls_high.

> Could we just change nested_vmx_setup_ctls_msrs() like below:

If KVM correctly virtualizes the feature in a nested scenario, yes. I haven't
looked into ENABLE_USR_WAIT_PAUSE enough to know whether or not KVM gets the
nested virtualization pieces correct, hence the above qualifier.

> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 0c62352dda6a..fa255391718c 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -6791,13 +6791,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
> msrs->procbased_ctls_low &=
> ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
>
> - /*
> - * secondary cpu-based controls. Do not include those that
> - * depend on CPUID bits, they are added later by
> - * vmx_vcpu_after_set_cpuid.
> - */
> msrs->secondary_ctls_low = 0;
> -
> msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
> msrs->secondary_ctls_high &=
> SECONDARY_EXEC_DESC |
> @@ -6810,7 +6804,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
> SECONDARY_EXEC_ENABLE_INVPCID |
> SECONDARY_EXEC_RDSEED_EXITING |
> SECONDARY_EXEC_XSAVES |
> - SECONDARY_EXEC_TSC_SCALING;
> + SECONDARY_EXEC_TSC_SCALING |
> + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
>
> /*
> * We can emulate "VMCS shadowing," even if the hardware

2022-11-07 08:44:48

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Thu, Nov 03, 2022 at 04:53:11PM +0000, Sean Christopherson wrote:
> > >
> > > Ugh, that path got overlooked when we yanked out KVM's manipulaton of VMX MSRs
> > > in response to guest CPUID changes. I wonder if we can get away with changing
> > > KVM's behavior to only ensure a feature isn't exposed to L2 when it's not exposed
> > > to L1.
> > >
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index 6b4266e949a3..cfc35d559d91 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -4523,8 +4523,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
> > > * Update the nested MSR settings so that a nested VMM can/can't set
> > > * controls for features that are/aren't exposed to the guest.
> > > */
> > > - if (nested) {
> > > - if (enabled)
> > > + if (nested && !enabled)
> > > + if (exiting)
> > > vmx->nested.msrs.secondary_ctls_high |= control;
> > > else
> > > vmx->nested.msrs.secondary_ctls_high &= ~control;
> > >
> >
> > Indeed, this change can make sure a feature won't be exposed to L2, if L1
> > does not have it.
>
> No, that's not the goal of the change. KVM already hides features in the VMX MSRs
> if the base feature is not supported in L1 according to guest CPUID. The problem
> is that, currently, KVM also _forces_ features to be enabled in the VMX MSRs when
> the base feature IS supported in L1 (CPUID).
>
> Ideally, KVM should NEVER manipulate VMX MSRs in response to guest CPUID changes.
> That's what I was referring to earlier by commits:
>
> 8805875aa473 ("Revert "KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled"")
> 9389d5774aca ("Revert "KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL"")
>
> E.g. if userspace sets VMX MSRs and then sets guest CPUID, KVM will override the
> nVMX CPU model defined by userspace. The scenario where userspace hides a "base"
> feature but exposes the feature in the VMX MSRs is nonsensical, which is why I
> think KVM can likely get away with force-disabling features.
>
> The reverse is completely legitimate though: hiding a feature in VMX MSRs even if
> the base feature is supported for L1, i.e. disallowing L1 from enabling the feature
> in L2, is something that real VMMs will actually do, e.g. if the user doesn't trust
> that KVM correctly handles all aspects of nested virtualization for the feature.

Thanks Sean. Let me try to rephrase my understandings of your statement(
and pls feel free to correct me):

1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes.
2> What makes sense is, if a feature is
a. disabled by guest CPUID, it shall not be exposed in guest VMX MSR;
b. enabled by guest CPUID, it could be either exposed or hidden in
guest VMX MSR.
3> So your previous change is to guarantee 2.a, and userspace VMM can choose
to follow follow either choices in 2.b(depending on whether it believes this
feature is correctly supported by KVM in nested).

Is above understanding correct?

But what if userspace VMM sets guest CPUID first, disabling a feature, and
then sets the guest VMX MSR bit with this feature enabled? Does KVM need to
check guest CPUID again, in vmx_restore_control_msr()?

I do not think above scenario is what QEMU does - QEMU checks guest CPUID
first with kvm_arch_get_supported_cpuid() before trying to set guest VMX MSR.
But I am not sure if this is mandatory step for all userspace VMM.

>
> In other words, the behavior you're observing, where vmx->nested.msrs.secondary_ctls_high
> is changed by vmx_adjust_secondary_exec_control(), is a completely separate bug
> than the one below.
>
> > >
> > > Assuming KVM actually supports user wait/pause in L2, this is an orthogonal bug
> > > to the CPUID-based manipulation above. KVM simply neglects to advertise to userspace
> > > that ENABLE_USR_WAIT_PAUSE is supported for nested virtualization.
> > >
> > > If KVM doesn't correctly support virtualizing user wait/pause for L2, then the
> > > correct location to fix this is in vmx_secondary_exec_control().
> > >
> >
> > Sorry, why vmx_secondary_exec_control()?
>
> You missed the qualifier:
>
> If KVM doesn't correctly support virtualizing user wait/pause for L2
>
> If KVM should NOT be exposing ENABLE_USR_WAIT_PAUSE to the L1 VMM, then NOT
> propagating the feature to msrs->secondary_ctls_low is correct. And if that's
> the case, then vmx_secondary_exec_control() needs to be modified so that it does
> NOT set ENABLE_USR_WAIT_PAUSE in vmx->nested.msrs.secondary_ctls_high.
>
> > Could we just change nested_vmx_setup_ctls_msrs() like below:
>
> If KVM correctly virtualizes the feature in a nested scenario, yes. I haven't
> looked into ENABLE_USR_WAIT_PAUSE enough to know whether or not KVM gets the
> nested virtualization pieces correct, hence the above qualifier.
>

Got it. I'll check that. Thanks!


B.R.
Yu

2022-11-07 15:31:16

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Mon, Nov 07, 2022, Yu Zhang wrote:
> On Thu, Nov 03, 2022 at 04:53:11PM +0000, Sean Christopherson wrote:
> > Ideally, KVM should NEVER manipulate VMX MSRs in response to guest CPUID changes.
> > That's what I was referring to earlier by commits:

...

> Thanks Sean. Let me try to rephrase my understandings of your statement(
> and pls feel free to correct me):
>
> 1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
> disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes.
> 2> What makes sense is, if a feature is
> a. disabled by guest CPUID, it shall not be exposed in guest VMX MSR;
> b. enabled by guest CPUID, it could be either exposed or hidden in
> guest VMX MSR.
> 3> So your previous change is to guarantee 2.a, and userspace VMM can choose
> to follow follow either choices in 2.b(depending on whether it believes this
> feature is correctly supported by KVM in nested).
>
> Is above understanding correct?

Not quite. Again, in an ideal world, KVM would not modify the VMX MSRs based on
guest CPUID. But it's possible userspace is relying on KVM to hide a feature from
L2 if it's hidden from L1, so to avoid breaking an otherwise valide userspace config,
it's worth enforcing that in KVM.

> But what if userspace VMM sets guest CPUID first, disabling a feature, and
> then sets the guest VMX MSR bit with this feature enabled? Does KVM need to
> check guest CPUID again, in vmx_restore_control_msr()?

No. It's not KVM's responsibility to provide a sane, valid vCPU model. So long
as KVM is not endangered by a bad config, userspace can do whatever it wants.

2022-11-08 11:14:29

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Mon, Nov 07, 2022 at 03:06:51PM +0000, Sean Christopherson wrote:
> On Mon, Nov 07, 2022, Yu Zhang wrote:
> > On Thu, Nov 03, 2022 at 04:53:11PM +0000, Sean Christopherson wrote:
> > > Ideally, KVM should NEVER manipulate VMX MSRs in response to guest CPUID changes.
> > > That's what I was referring to earlier by commits:
>
> ...
>
> > Thanks Sean. Let me try to rephrase my understandings of your statement(
> > and pls feel free to correct me):
> >
> > 1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
> > disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes.
> > 2> What makes sense is, if a feature is
> > a. disabled by guest CPUID, it shall not be exposed in guest VMX MSR;
> > b. enabled by guest CPUID, it could be either exposed or hidden in
> > guest VMX MSR.
> > 3> So your previous change is to guarantee 2.a, and userspace VMM can choose
> > to follow follow either choices in 2.b(depending on whether it believes this
> > feature is correctly supported by KVM in nested).
> >
> > Is above understanding correct?
>
> Not quite. Again, in an ideal world, KVM would not modify the VMX MSRs based on
> guest CPUID. But it's possible userspace is relying on KVM to hide a feature from
> L2 if it's hidden from L1, so to avoid breaking an otherwise valide userspace config,
> it's worth enforcing that in KVM.
>

Sorry, maybe I should understand this way:

In theroy, KVM shall not modify guest VMX MSRs in response to the guest CPUID
updates. Therefore we shall not enforce the exposure of a feature in guest VMX
MSR, just because it is enabled in guest CPUID (e.g., userspace VMM can choose
to hide such feature so long as it believes KVM can not provide correct nested
support for this feature).

But in reverse, it is not reasonable for userspace VMM to expose a feature in
guest VMX MSR settings, if such feature is disabled in this guest's CPUID. So
KVM shall help to make sure such feature is hidden when guest CPUID changes.


BTW, I found my previous understanding of what vmx_adjust_secondary_exec_control()
currently does was also wrong. It could also be used for EXITING controls. And
for such flags(e.g., SECONDARY_EXEC_RDRAND_EXITING), values for the nested settings
(vmx->nested.msrs.secondary_ctls_high) and for the L1 execution controls(*exec_control)
could be opposite. So the statement:
"1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes."
is wrong.

Hopefully we are gonna change vmx_adjust_secondary_exec_control() soon...

B.R.
Yu

2022-11-08 19:16:26

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Tue, Nov 08, 2022, Yu Zhang wrote:
> On Mon, Nov 07, 2022 at 03:06:51PM +0000, Sean Christopherson wrote:
> > On Mon, Nov 07, 2022, Yu Zhang wrote:
> > > On Thu, Nov 03, 2022 at 04:53:11PM +0000, Sean Christopherson wrote:
> > > > Ideally, KVM should NEVER manipulate VMX MSRs in response to guest CPUID changes.
> > > > That's what I was referring to earlier by commits:
> >
> > ...
> >
> > > Thanks Sean. Let me try to rephrase my understandings of your statement(
> > > and pls feel free to correct me):
> > >
> > > 1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
> > > disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes.
> > > 2> What makes sense is, if a feature is
> > > a. disabled by guest CPUID, it shall not be exposed in guest VMX MSR;
> > > b. enabled by guest CPUID, it could be either exposed or hidden in
> > > guest VMX MSR.
> > > 3> So your previous change is to guarantee 2.a, and userspace VMM can choose
> > > to follow follow either choices in 2.b(depending on whether it believes this
> > > feature is correctly supported by KVM in nested).
> > >
> > > Is above understanding correct?
> >
> > Not quite. Again, in an ideal world, KVM would not modify the VMX MSRs based on
> > guest CPUID. But it's possible userspace is relying on KVM to hide a feature from
> > L2 if it's hidden from L1, so to avoid breaking an otherwise valide userspace config,
> > it's worth enforcing that in KVM.
> >
>
> Sorry, maybe I should understand this way:
>
> In theroy, KVM shall not modify guest VMX MSRs in response to the guest CPUID
> updates. Therefore we shall not enforce the exposure of a feature in guest VMX
> MSR, just because it is enabled in guest CPUID (e.g., userspace VMM can choose
> to hide such feature so long as it believes KVM can not provide correct nested
> support for this feature).
>
> But in reverse, it is not reasonable for userspace VMM to expose a feature in
> guest VMX MSR settings, if such feature is disabled in this guest's CPUID. So
> KVM shall help to make sure such feature is hidden when guest CPUID changes.

No. Again, KVM _should never_ manipulate VMX MSRs in response to CPUID changes.
Keeping the existing behavior would be done purely to maintain backwards
compability with existing userspace, not because it's strictly the right thing to do.

E.g. as a strawman, a weird userspace could do KVM_SET_MSRS => KVM_SET_CPUID =>
KVM_SET_CPUID, where the first KVM_SET_CPUID reset to a base config and the second
KVM_SET_CPUID incorporates "optional" features. In that case, clearing bits in
the VMX MSRs on the first KVM_SET_CPUID would do the wrong thing if the second
KVM_SET_CPUID enabled the relevant features.

AFAIK, no userspace actually does something odd like that, whereas there are VMMs
that do KVM_SET_MSRS before KVM_SET_CPUID, e.g. disable a feature in VMX MSRs but
later enable the feature in CPUID for L1. And so disabling features is likely
safe-ish, but enabling feature most definitely can cause problems for userspace.

Hrm, actually, there are likely older VMMs that never set VMX MSRs, and so dropping
the "enable features" code might not be safe either. Grr. The obvious solution
would be to add a quirk, but maybe we can avoid a quirk by skipping KVM's
misguided updates if userspace has set the MSR. That should work for a userspace
that deliberately sets the MSR during setup, and for a userspace that blindly
migrates the MSR since the migrated value should already be correct/sane.

E.g.

diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 45162c1bcd8f..671479cd7721 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -51,6 +51,7 @@ struct nested_vmx_msrs {
u64 cr4_fixed1;
u64 vmcs_enum;
u64 vmfunc_controls;
+ bool secondary_set_by_userspace;
};

struct vmcs_config {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 62e3967cf131..3f691ed169d8 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1257,6 +1257,9 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
return -EINVAL;

+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS2)
+ vmx->nested.msrs.secondary_set_by_userspace = true;
+
vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
*lowp = data;
*highp = data >> 32;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ab89755dce66..8aadaae5b81e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4523,7 +4523,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* Update the nested MSR settings so that a nested VMM can/can't set
* controls for features that are/aren't exposed to the guest.
*/
- if (nested) {
+ if (nested && !vmx->nested.msrs.secondary_set_by_userspace) {
if (enabled)
vmx->nested.msrs.secondary_ctls_high |= control;
else


> BTW, I found my previous understanding of what vmx_adjust_secondary_exec_control()
> currently does was also wrong. It could also be used for EXITING controls. And
> for such flags(e.g., SECONDARY_EXEC_RDRAND_EXITING), values for the nested settings
> (vmx->nested.msrs.secondary_ctls_high) and for the L1 execution controls(*exec_control)
> could be opposite. So the statement:
> "1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
> disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes."
> is wrong.

No, it's correct. The EXITING controls are just inverted feature flags. E.g. if
RDRAND is disabled in CPUID, KVM sets the EXITING control so that KVM intercepts
RDRAND in order to inject #UD.

[EXIT_REASON_RDRAND] = kvm_handle_invalid_op,

2022-11-10 09:41:43

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

>
> No. Again, KVM _should never_ manipulate VMX MSRs in response to CPUID changes.
> Keeping the existing behavior would be done purely to maintain backwards
> compability with existing userspace, not because it's strictly the right thing to do.
>
> E.g. as a strawman, a weird userspace could do KVM_SET_MSRS => KVM_SET_CPUID =>
> KVM_SET_CPUID, where the first KVM_SET_CPUID reset to a base config and the second
> KVM_SET_CPUID incorporates "optional" features. In that case, clearing bits in
> the VMX MSRs on the first KVM_SET_CPUID would do the wrong thing if the second
> KVM_SET_CPUID enabled the relevant features.
>
> AFAIK, no userspace actually does something odd like that, whereas there are VMMs
> that do KVM_SET_MSRS before KVM_SET_CPUID, e.g. disable a feature in VMX MSRs but
> later enable the feature in CPUID for L1. And so disabling features is likely
> safe-ish, but enabling feature most definitely can cause problems for userspace.
>
> Hrm, actually, there are likely older VMMs that never set VMX MSRs, and so dropping
> the "enable features" code might not be safe either. Grr. The obvious solution
> would be to add a quirk, but maybe we can avoid a quirk by skipping KVM's
> misguided updates if userspace has set the MSR. That should work for a userspace
> that deliberately sets the MSR during setup, and for a userspace that blindly
> migrates the MSR since the migrated value should already be correct/sane.
>
Oh. Just saw your new selftest code, and fininally get your point(I hope
so...). Thanks!

> > BTW, I found my previous understanding of what vmx_adjust_secondary_exec_control()
> > currently does was also wrong. It could also be used for EXITING controls. And
> > for such flags(e.g., SECONDARY_EXEC_RDRAND_EXITING), values for the nested settings
> > (vmx->nested.msrs.secondary_ctls_high) and for the L1 execution controls(*exec_control)
> > could be opposite. So the statement:
> > "1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
> > disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes."
> > is wrong.
>
> No, it's correct. The EXITING controls are just inverted feature flags. E.g. if
> RDRAND is disabled in CPUID, KVM sets the EXITING control so that KVM intercepts
> RDRAND in order to inject #UD.
>
> [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
>

Well, suppose
- cpu_has_vmx_rdrand() is true;
- meanwhile guest_cpuid_has(vcpu, X86_FEATURE_RDRAND) is false.

And then, what vmx_adjust_secondary_exec_control() currently does is:
1> keep the SECONDARY_EXEC_RDRAND_EXITING set in L1 secondary proc-
based execution control.
2> and then clear the SECONDARY_EXEC_RDRAND_EXITING in the high bits
of IA32_VMX_PROCBASED_CTLS2 MSR for nested by
vmx->nested.msrs.secondary_ctls_high &= ~control;
That means for L1 VMM, SECONDARY_EXEC_RDRAND_EXITING must be cleared
in its(VMCS12's) secondary proc-based VM-execution control, even when
rdrand is disabled in L1's and L2's CPUID.

I wonder, for native environment, if an instruction is not supported,
will the allowed 1-setting for its corresponding exiting feature in
IA32_VMX_PROCBASED_CTLS2 MSR be set, or be cleared? Maybe it should
be cleared, and executing such instruction in non-root will just get
a #UD directly instead of triggering a VM-Exit?

Note: I do not think this will cause any problem, just curious if L1
VMM can observe a behavior that's not supposed to be in native scenario(
only because what we are doing in KVM).

B.R.
Yu


2022-11-10 17:36:51

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 05/15] KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value

On Thu, Nov 10, 2022, Yu Zhang wrote:
> > > BTW, I found my previous understanding of what vmx_adjust_secondary_exec_control()
> > > currently does was also wrong. It could also be used for EXITING controls. And
> > > for such flags(e.g., SECONDARY_EXEC_RDRAND_EXITING), values for the nested settings
> > > (vmx->nested.msrs.secondary_ctls_high) and for the L1 execution controls(*exec_control)
> > > could be opposite. So the statement:
> > > "1> For now, what vmx_adjust_secondary_exec_control() does, is to enable/
> > > disable a feature in VMX MSR(and nVMX MSR) based on cpuid changes."
> > > is wrong.
> >
> > No, it's correct. The EXITING controls are just inverted feature flags. E.g. if
> > RDRAND is disabled in CPUID, KVM sets the EXITING control so that KVM intercepts
> > RDRAND in order to inject #UD.
> >
> > [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
> >
>
> Well, suppose
> - cpu_has_vmx_rdrand() is true;
> - meanwhile guest_cpuid_has(vcpu, X86_FEATURE_RDRAND) is false.
>
> And then, what vmx_adjust_secondary_exec_control() currently does is:
> 1> keep the SECONDARY_EXEC_RDRAND_EXITING set in L1 secondary proc-
> based execution control.
> 2> and then clear the SECONDARY_EXEC_RDRAND_EXITING in the high bits
> of IA32_VMX_PROCBASED_CTLS2 MSR for nested by
> vmx->nested.msrs.secondary_ctls_high &= ~control;
> That means for L1 VMM, SECONDARY_EXEC_RDRAND_EXITING must be cleared
> in its(VMCS12's) secondary proc-based VM-execution control, even when
> rdrand is disabled in L1's and L2's CPUID.

Again, it is _userspace's_ responsibility to provide a sane, consistent CPU model
to the guest.

> I wonder, for native environment, if an instruction is not supported,
> will the allowed 1-setting for its corresponding exiting feature in
> IA32_VMX_PROCBASED_CTLS2 MSR be set, or be cleared? Maybe it should
> be cleared, and executing such instruction in non-root will just get
> a #UD directly instead of triggering a VM-Exit?

For any reasonable interpretation of the SDM, it's a moot point. The SDM doesn't
call out these scenarios for instructions like RDTSCP because they're nonsensical,
but for other instructions that can be trapped by the hypervisor and can take a
#UD when they're supported, the #UD is prioritized of the VM-Exit. E.g. VMX
instructions have pseudocode like:

IF not in VMX operation
THEN #UD;
ELSIF in VMX non-root operation
THEN VM exit;

In other words, if the CPU doesn't recognize an instruction, it will generate a
#UD without getting to the (presumed) microcode flow that checks for VM-Exit.