Add a framework to manage and cache KVM-governed features, i.e. CPUID-based
features that require explicit KVM enabling and/or need to be
queried semi-frequently by KVM. The idea originally came up in the
context of the architectural LBRs series as a way to avoid querying
guest CPUID in hot paths without needing a dedicated flag, but as
evidenced by the shortlog, the most common usage is to handle the
ever-growing list of SVM features that are exposed to L1.
Note, I don't like the name "governed", but it was the least awful thing I
could come up with. Suggestions most definitely welcome.
This series is lightly tested. I am posting somewhat speculatively to get
early feedback on the idea.
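For context, the usage pattern the series converges on looks roughly like the
sketch below (condensed from the patches, illustrative only, not a literal
excerpt):

  /* At CPUID update time, cache whether the guest may use the feature ... */
  static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
  {
          if (kvm_cpu_cap_has(X86_FEATURE_NRIPS))
                  kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
  }

  /* ... and in hot paths, test the cached bit instead of walking guest CPUID. */
  if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
          vmcb02->control.next_rip = svm->nested.ctl.next_rip;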
Sean Christopherson (12):
KVM: x86: Add a framework for enabling KVM-governed x86 features
KVM: x86/mmu: Use KVM-governed feature framework to track "GBPAGES
enabled"
KVM: VMX: Recompute "XSAVES enabled" only after CPUID update
KVM: VMX: Rename XSAVES control to follow KVM's preferred "ENABLE_XYZ"
KVM: x86: Use KVM-governed feature framework to track "XSAVES enabled"
KVM: nSVM: Use KVM-governed feature framework to track "NRIPS enabled"
KVM: nSVM: Use KVM-governed feature framework to track "TSC scaling
enabled"
KVM: nSVM: Use KVM-governed feature framework to track "vVM{SAVE,LOAD}
enabled"
KVM: nSVM: Use KVM-governed feature framework to track "LBRv enabled"
KVM: nSVM: Use KVM-governed feature framework to track "Pause Filter
enabled"
KVM: nSVM: Use KVM-governed feature framework to track "vGIF enabled"
KVM: x86: Disallow guest CPUID lookups when IRQs are disabled
arch/x86/include/asm/kvm_host.h | 11 ++++++
arch/x86/include/asm/vmx.h | 2 +-
arch/x86/kvm/cpuid.c | 31 +++++++++++++++++
arch/x86/kvm/cpuid.h | 51 ++++++++++++++++++++++++++++
arch/x86/kvm/governed_features.h | 19 +++++++++++
arch/x86/kvm/mmu/mmu.c | 20 ++---------
arch/x86/kvm/svm/nested.c | 48 ++++++++++++++++-----------
arch/x86/kvm/svm/svm.c | 57 +++++++++++++++++++++-----------
arch/x86/kvm/svm/svm.h | 13 ++------
arch/x86/kvm/vmx/capabilities.h | 2 +-
arch/x86/kvm/vmx/hyperv.h | 2 +-
arch/x86/kvm/vmx/nested.c | 6 ++--
arch/x86/kvm/vmx/nested.h | 2 +-
arch/x86/kvm/vmx/vmx.c | 48 +++++++++++++--------------
arch/x86/kvm/vmx/vmx.h | 2 +-
arch/x86/kvm/x86.c | 4 +--
16 files changed, 217 insertions(+), 101 deletions(-)
create mode 100644 arch/x86/kvm/governed_features.h
base-commit: 62ef199250cd46fb66fe98267137b7f64e0b41b4
--
2.39.2.637.g21b0678d19-goog
Introduce yet another X86_FEATURE flag framework to manage and cache KVM
governed features (for lack of a better term). "Governed" in this case
means that KVM has some level of involvement and/or vested interest in
whether or not an X86_FEATURE can be used by the guest. The intent of the
framework is twofold: to simplify caching of guest CPUID flags that KVM
needs to frequently query, and to add clarity to such caching, e.g. it
isn't immediately obvious that SVM's bundle of flags for "optional nested]
SVM features" track whether or not a flag is exposed to L1.
Signed-off-by: Sean Christopherson <[email protected]>
---
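For reference, once a feature is added to governed_features.h (e.g. the
GBPAGES entry introduced by a later patch), the two includes in cpuid.h
expand roughly as follows (illustrative expansion, not literal preprocessor
output):

  enum kvm_governed_features {
          KVM_GOVERNED_X86_FEATURE_GBPAGES,
          KVM_NR_GOVERNED_FEATURES
  };

  static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
  {
          switch (x86_feature) {
          case X86_FEATURE_GBPAGES: return KVM_GOVERNED_X86_FEATURE_GBPAGES;
          default:
                  return -1;
          }
  }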
arch/x86/include/asm/kvm_host.h | 11 +++++++
arch/x86/kvm/cpuid.c | 2 ++
arch/x86/kvm/cpuid.h | 51 ++++++++++++++++++++++++++++++++
arch/x86/kvm/governed_features.h | 9 ++++++
4 files changed, 73 insertions(+)
create mode 100644 arch/x86/kvm/governed_features.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 792a6037047a..cd660de02f7b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -835,6 +835,17 @@ struct kvm_vcpu_arch {
struct kvm_cpuid_entry2 *cpuid_entries;
struct kvm_hypervisor_cpuid kvm_cpuid;
+ /*
+ * Track whether or not the guest is allowed to use features that are
+ * governed by KVM, where "governed" means KVM needs to manage state
+ * and/or explicitly enable the feature in hardware. Typically, but
+ * not always, governed features can be used by the guest if and only
+ * if both KVM and userspace want to expose the feature to the guest.
+ */
+ struct {
+ u32 enabled;
+ } governed_features;
+
u64 reserved_gpa_bits;
int maxphyaddr;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8f8edeaf8177..013fdc27fc8f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ vcpu->arch.governed_features.enabled = 0;
+
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b1658c0de847..f61a2106ba90 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -232,4 +232,55 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
}
+enum kvm_governed_features {
+#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
+#include "governed_features.h"
+ KVM_NR_GOVERNED_FEATURES
+};
+
+static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
+{
+ switch (x86_feature) {
+#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
+#include "governed_features.h"
+ default:
+ return -1;
+ }
+}
+
+static __always_inline int kvm_is_governed_feature(unsigned int x86_feature)
+{
+ return kvm_governed_feature_index(x86_feature) >= 0;
+}
+
+static __always_inline u32 kvm_governed_feature_bit(unsigned int x86_feature)
+{
+ int index = kvm_governed_feature_index(x86_feature);
+
+ BUILD_BUG_ON(index < 0);
+ return BIT(index);
+}
+
+static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES >
+ sizeof(vcpu->arch.governed_features.enabled) * BITS_PER_BYTE);
+
+ vcpu->arch.governed_features.enabled |= kvm_governed_feature_bit(x86_feature);
+}
+
+static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ if (guest_cpuid_has(vcpu, x86_feature))
+ kvm_governed_feature_set(vcpu, x86_feature);
+}
+
+static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ return vcpu->arch.governed_features.enabled & kvm_governed_feature_bit(x86_feature);
+}
+
#endif
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
new file mode 100644
index 000000000000..40ce8e6608cd
--- /dev/null
+++ b/arch/x86/kvm/governed_features.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
+BUILD_BUG()
+#endif
+
+#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
+
+#undef KVM_GOVERNED_X86_FEATURE
+#undef KVM_GOVERNED_FEATURE
--
2.39.2.637.g21b0678d19-goog
Use the governed feature framework to track whether or not the guest can
use 1GiB pages, and drop the one-off helper that wraps the surprisingly
non-trivial logic surrounding 1GiB page usage in the guest.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/cpuid.c | 16 ++++++++++++++++
arch/x86/kvm/governed_features.h | 2 ++
arch/x86/kvm/mmu/mmu.c | 20 +++-----------------
3 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 013fdc27fc8f..3b604499c35c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -337,6 +337,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.governed_features.enabled = 0;
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor guest CPUID as KVM has full visibility and
+ * can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ if (!tdp_enabled)
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_GBPAGES);
+ else if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
+
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index 40ce8e6608cd..b29c15d5e038 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -5,5 +5,7 @@ BUILD_BUG()
#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
+KVM_GOVERNED_X86_FEATURE(GBPAGES)
+
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c91ee2927dd7..36e4561554ca 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4728,28 +4728,13 @@ __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
}
}
-static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
-{
- /*
- * If TDP is enabled, let the guest use GBPAGES if they're supported in
- * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
- * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
- * walk for performance and complexity reasons. Not to mention KVM
- * _can't_ solve the problem because GVA->GPA walks aren't visible to
- * KVM once a TDP translation is installed. Mimic hardware behavior so
- * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
- */
- return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
-}
-
static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use_gbpages(vcpu),
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
@@ -4826,7 +4811,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use_gbpages(vcpu), is_pse, is_amd);
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
if (!shadow_me_mask)
return;
--
2.39.2.637.g21b0678d19-goog
Recompute whether or not XSAVES is enabled for the guest only if the
guest's CPUID model changes instead of redoing the computation every time
KVM generates vmcs01's secondary execution controls. The boot_cpu_has()
and cpu_has_vmx_xsaves() checks should never change after KVM is loaded,
and if they do, the kernel/KVM is hosed.
Opportunistically add a comment explaining _why_ XSAVES is effectively
exposed to the guest if and only if XSAVE is also exposed to the guest.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 47abd9101e68..b6fdb311a7d8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4620,19 +4620,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (cpu_has_vmx_xsaves()) {
- /* Exposing XSAVES only when XSAVE is exposed */
- bool xsaves_enabled =
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
- vcpu->arch.xsaves_enabled = xsaves_enabled;
-
+ if (cpu_has_vmx_xsaves())
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_XSAVES,
- xsaves_enabled, false);
- }
+ vcpu->arch.xsaves_enabled, false);
/*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
@@ -7709,8 +7700,15 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
- vcpu->arch.xsaves_enabled = false;
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ vcpu->arch.xsaves_enabled = cpu_has_vmx_xsaves() &&
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
--
2.39.2.637.g21b0678d19-goog
Rename the XSAVES secondary execution control to follow KVM's preferred
"ENABLE_XYZ" style so that XSAVES-related logic can use common macros that
depend on that style.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/include/asm/vmx.h | 2 +-
arch/x86/kvm/vmx/capabilities.h | 2 +-
arch/x86/kvm/vmx/hyperv.h | 2 +-
arch/x86/kvm/vmx/nested.c | 6 +++---
arch/x86/kvm/vmx/nested.h | 2 +-
arch/x86/kvm/vmx/vmx.c | 2 +-
arch/x86/kvm/vmx/vmx.h | 2 +-
7 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498dc600bd5c..aeb319665502 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,7 +69,7 @@
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
-#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES)
+#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 45162c1bcd8f..3c3875b3dedd 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -252,7 +252,7 @@ static inline bool cpu_has_vmx_pml(void)
static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_ENABLE_XSAVES;
}
static inline bool cpu_has_vmx_waitpkg(void)
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 78d17667e7ec..51fe0251cb67 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -87,7 +87,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_DESC | \
SECONDARY_EXEC_ENABLE_RDTSCP | \
SECONDARY_EXEC_ENABLE_INVPCID | \
- SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
SECONDARY_EXEC_RDSEED_EXITING | \
SECONDARY_EXEC_RDRAND_EXITING | \
SECONDARY_EXEC_TSC_SCALING | \
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7c4f5ca405c7..1d19fcf02a8e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2301,7 +2301,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
- SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -6321,7 +6321,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
* If if it were, XSS would have to be checked against
* the XSS exit bitmap in vmcs12.
*/
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
case EXIT_REASON_UMWAIT:
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
@@ -6882,7 +6882,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_XSAVES |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 96952263b029..b4b9d51438c6 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -168,7 +168,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
{
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
}
static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b6fdb311a7d8..14ce195eee5a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4622,7 +4622,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (cpu_has_vmx_xsaves())
vmx_adjust_secondary_exec_control(vmx, &exec_control,
- SECONDARY_EXEC_XSAVES,
+ SECONDARY_EXEC_ENABLE_XSAVES,
vcpu->arch.xsaves_enabled, false);
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 2acdc54bc34b..2db14e0f4081 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -574,7 +574,7 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_APIC_REGISTER_VIRT | \
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
SECONDARY_EXEC_SHADOW_VMCS | \
- SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
SECONDARY_EXEC_RDSEED_EXITING | \
SECONDARY_EXEC_RDRAND_EXITING | \
SECONDARY_EXEC_ENABLE_PML | \
--
2.39.2.637.g21b0678d19-goog
Use the governed feature framework to track if XSAVES is "enabled", i.e.
if XSAVES can be used by the guest. Add a comment in the SVM code to
explain the very unintuitive logic of deliberately NOT checking if XSAVES
is enumerated in the guest CPUID model.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 1 +
arch/x86/kvm/svm/svm.c | 17 ++++++++++++---
arch/x86/kvm/vmx/vmx.c | 36 ++++++++++++++++----------------
arch/x86/kvm/x86.c | 4 ++--
4 files changed, 35 insertions(+), 23 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index b29c15d5e038..b896a64e4ac3 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -6,6 +6,7 @@ BUILD_BUG()
#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
KVM_GOVERNED_X86_FEATURE(GBPAGES)
+KVM_GOVERNED_X86_FEATURE(XSAVES)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b43775490074..d89e516449ad 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4128,9 +4128,20 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_cpuid_entry2 *best;
- vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVES);
+ /*
+ * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
+ * can only disable all variants by disallowing CR4.OSXSAVE from
+ * being set. As a result, if the host has XSAVE and XSAVES, and the
+ * guest has XSAVE enabled, the guest can execute XSAVES without
+ * faulting. Treat XSAVES as enabled in this case regardless of
+ * whether it's advertised to the guest so that KVM context switches
+ * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
+ * the guest read/write access to the host's XSS.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
/* Update nrips enabled cache */
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 14ce195eee5a..c64a12756016 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4551,16 +4551,19 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* based on a single guest CPUID bit, with a dedicated feature bit. This also
* verifies that the control is actually supported by KVM and hardware.
*/
-#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
-({ \
- bool __enabled; \
- \
- if (cpu_has_vmx_##name()) { \
- __enabled = guest_cpuid_has(&(vmx)->vcpu, \
- X86_FEATURE_##feat_name); \
- vmx_adjust_secondary_exec_control(vmx, exec_control, \
- SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
- } \
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
+ __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
+ else \
+ __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
})
/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
@@ -4620,10 +4623,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (cpu_has_vmx_xsaves())
- vmx_adjust_secondary_exec_control(vmx, &exec_control,
- SECONDARY_EXEC_ENABLE_XSAVES,
- vcpu->arch.xsaves_enabled, false);
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
/*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
@@ -4642,6 +4642,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_RDTSCP,
rdpid_or_rdtscp_enabled, false);
}
+
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -7705,10 +7706,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
* set if and only if XSAVE is supported.
*/
- vcpu->arch.xsaves_enabled = cpu_has_vmx_xsaves() &&
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+ if (cpu_has_vmx_xsaves() && boot_cpu_has(X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f706621c35b8..541982de5762 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -988,7 +988,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- if (vcpu->arch.xsaves_enabled &&
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1023,7 +1023,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- if (vcpu->arch.xsaves_enabled &&
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, host_xss);
}
--
2.39.2.637.g21b0678d19-goog
Track "NRIPS exposed to L1" via a governed feature flag instead of using
a dedicated bit/flag in vcpu_svm.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 1 +
arch/x86/kvm/svm/nested.c | 6 +++---
arch/x86/kvm/svm/svm.c | 5 ++---
arch/x86/kvm/svm/svm.h | 1 -
4 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index b896a64e4ac3..359914112615 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -7,6 +7,7 @@ BUILD_BUG()
KVM_GOVERNED_X86_FEATURE(GBPAGES)
KVM_GOVERNED_X86_FEATURE(XSAVES)
+KVM_GOVERNED_X86_FEATURE(NRIPS)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 05d38944a6c0..0641cb943450 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -694,7 +694,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* what a nrips=0 CPU would do (L1 is responsible for advancing RIP
* prior to injecting the event).
*/
- if (svm->nrips_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
else if (boot_cpu_has(X86_FEATURE_NRIPS))
vmcb02->control.next_rip = vmcb12_rip;
@@ -704,7 +704,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->soft_int_injected = true;
svm->soft_int_csbase = vmcb12_csbase;
svm->soft_int_old_rip = vmcb12_rip;
- if (svm->nrips_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
svm->soft_int_next_rip = svm->nested.ctl.next_rip;
else
svm->soft_int_next_rip = vmcb12_rip;
@@ -1004,7 +1004,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
nested_save_pending_event_to_vmcb12(svm, vmcb12);
- if (svm->nrips_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d89e516449ad..cdffc6db8bc5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4143,9 +4143,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
- /* Update nrips enabled cache */
- svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
+ if (kvm_cpu_cap_has(X86_FEATURE_NRIPS))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 839809972da1..bd6ee6945bdd 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -258,7 +258,6 @@ struct vcpu_svm {
bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
- bool nrips_enabled : 1;
bool tsc_scaling_enabled : 1;
bool v_vmload_vmsave_enabled : 1;
bool lbrv_enabled : 1;
--
2.39.2.637.g21b0678d19-goog
Track "TSC scaling exposed to L1" via a governed feature flag instead of
using a dedicated bit/flag in vcpu_svm.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 1 +
arch/x86/kvm/svm/nested.c | 4 ++--
arch/x86/kvm/svm/svm.c | 12 ++++++++----
arch/x86/kvm/svm/svm.h | 1 -
4 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index 359914112615..0335576a80a8 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -8,6 +8,7 @@ BUILD_BUG()
KVM_GOVERNED_X86_FEATURE(GBPAGES)
KVM_GOVERNED_X86_FEATURE(XSAVES)
KVM_GOVERNED_X86_FEATURE(NRIPS)
+KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 0641cb943450..30e00c4e07c7 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -673,7 +673,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
- WARN_ON(!svm->tsc_scaling_enabled);
+ WARN_ON(!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR));
nested_svm_update_tsc_ratio_msr(vcpu);
}
@@ -1043,7 +1043,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
- WARN_ON(!svm->tsc_scaling_enabled);
+ WARN_ON(!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR));
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cdffc6db8bc5..dd4aead5462c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2737,7 +2737,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
- if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ if (!msr_info->host_initiated &&
+ !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
return 1;
msr_info->data = svm->tsc_ratio_msr;
break;
@@ -2879,7 +2880,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!svm->tsc_scaling_enabled) {
+ if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
if (!msr->host_initiated)
return 1;
@@ -2901,7 +2902,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_ratio_msr = data;
- if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
nested_svm_update_tsc_ratio_msr(vcpu);
break;
@@ -4146,7 +4148,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (kvm_cpu_cap_has(X86_FEATURE_NRIPS))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
- svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+ if (tsc_scaling)
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
+
svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index bd6ee6945bdd..a523cfcdd12e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -258,7 +258,6 @@ struct vcpu_svm {
bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
- bool tsc_scaling_enabled : 1;
bool v_vmload_vmsave_enabled : 1;
bool lbrv_enabled : 1;
bool pause_filter_enabled : 1;
--
2.39.2.637.g21b0678d19-goog
Track "virtual VMSAVE/VMLOAD exposed to L1" via a governed feature flag
instead of using a dedicated bit/flag in vcpu_svm.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 1 +
arch/x86/kvm/svm/nested.c | 2 +-
arch/x86/kvm/svm/svm.c | 5 ++---
arch/x86/kvm/svm/svm.h | 1 -
4 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index 0335576a80a8..b66b9d550f33 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -9,6 +9,7 @@ KVM_GOVERNED_X86_FEATURE(GBPAGES)
KVM_GOVERNED_X86_FEATURE(XSAVES)
KVM_GOVERNED_X86_FEATURE(NRIPS)
KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
+KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 30e00c4e07c7..6a96058c0e48 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
{
- if (!svm->v_vmload_vmsave_enabled)
+ if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
return true;
if (!nested_npt_enabled(svm))
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index dd4aead5462c..b3f0271c73b9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1162,8 +1162,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
-
- svm->v_vmload_vmsave_enabled = false;
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -4153,7 +4151,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
- svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (vls && !guest_cpuid_is_intel(vcpu))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a523cfcdd12e..1e3e7462b1d7 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -258,7 +258,6 @@ struct vcpu_svm {
bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
- bool v_vmload_vmsave_enabled : 1;
bool lbrv_enabled : 1;
bool pause_filter_enabled : 1;
bool pause_threshold_enabled : 1;
--
2.39.2.637.g21b0678d19-goog
Track "LBR virtualization exposed to L1" via a governed feature flag
instead of using a dedicated bit/flag in vcpu_svm.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 1 +
arch/x86/kvm/svm/nested.c | 23 +++++++++++++----------
arch/x86/kvm/svm/svm.c | 7 +++++--
arch/x86/kvm/svm/svm.h | 1 -
4 files changed, 19 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index b66b9d550f33..16c58d61bdf6 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -10,6 +10,7 @@ KVM_GOVERNED_X86_FEATURE(XSAVES)
KVM_GOVERNED_X86_FEATURE(NRIPS)
KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
+KVM_GOVERNED_X86_FEATURE(LBRV)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6a96058c0e48..9e210b03e635 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -540,6 +540,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
bool new_vmcb12 = false;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
nested_vmcb02_compute_g_pat(svm);
@@ -565,18 +566,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
vmcb_mark_dirty(vmcb02, VMCB_DT);
}
- kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+ kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, svm->nested.save.efer);
+ svm_set_efer(vcpu, svm->nested.save.efer);
- svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
- svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
+ svm_set_cr0(vcpu, svm->nested.save.cr0);
+ svm_set_cr4(vcpu, svm->nested.save.cr4);
svm->vcpu.arch.cr2 = vmcb12->save.cr2;
- kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
- kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
- kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
+ kvm_rax_write(vcpu, vmcb12->save.rax);
+ kvm_rsp_write(vcpu, vmcb12->save.rsp);
+ kvm_rip_write(vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
vmcb02->save.rax = vmcb12->save.rax;
@@ -590,7 +591,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
vmcb_mark_dirty(vmcb02, VMCB_DR);
}
- if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
/*
* Reserved bits of DEBUGCTL are ignored. Be consistent with
* svm_set_msr's definition of reserved bits.
@@ -712,7 +714,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.virt_ext = vmcb01->control.virt_ext &
LBR_CTL_ENABLE_MASK;
- if (svm->lbrv_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_LBRV))
vmcb02->control.virt_ext |=
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
@@ -1021,7 +1023,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_switch_vmcb(svm, &svm->vmcb01);
- if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
} else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b3f0271c73b9..42591c77f98a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -994,9 +994,11 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu)
bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
LBR_CTL_ENABLE_MASK);
- if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+ if (unlikely(is_guest_mode(vcpu) &&
+ guest_can_use(vcpu, X86_FEATURE_LBRV))) {
if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
enable_lbrv = true;
+ }
if (enable_lbrv == current_enable_lbrv)
return;
@@ -4149,7 +4151,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (tsc_scaling)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
- svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
+ if (lbrv)
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
if (vls && !guest_cpuid_is_intel(vcpu))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 1e3e7462b1d7..60817ff346b0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -258,7 +258,6 @@ struct vcpu_svm {
bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
- bool lbrv_enabled : 1;
bool pause_filter_enabled : 1;
bool pause_threshold_enabled : 1;
bool vgif_enabled : 1;
--
2.39.2.637.g21b0678d19-goog
Track "Pause Filtering is exposed to L1" via governed feature flags
instead of using dedicated bits/flags in vcpu_svm.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 2 ++
arch/x86/kvm/svm/nested.c | 10 ++++++++--
arch/x86/kvm/svm/svm.c | 8 ++++----
arch/x86/kvm/svm/svm.h | 2 --
4 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index 16c58d61bdf6..93c7d840e546 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -11,6 +11,8 @@ KVM_GOVERNED_X86_FEATURE(NRIPS)
KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
KVM_GOVERNED_X86_FEATURE(LBRV)
+KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
+KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 9e210b03e635..c38f17ba818e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -721,8 +721,14 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
- pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
+ if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER))
+ pause_count12 = svm->nested.ctl.pause_filter_count;
+ else
+ pause_count12 = 0;
+ if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD))
+ pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
+ else
+ pause_thresh12 = 0;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
/* use guest values since host doesn't intercept PAUSE */
vmcb02->control.pause_filter_count = pause_count12;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 42591c77f98a..b18bd0b33942 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4157,11 +4157,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (vls && !guest_cpuid_is_intel(vcpu))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+ if (kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
- svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+ if (kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 60817ff346b0..c05eea319d28 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -258,8 +258,6 @@ struct vcpu_svm {
bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
- bool pause_filter_enabled : 1;
- bool pause_threshold_enabled : 1;
bool vgif_enabled : 1;
u32 ldr_reg;
--
2.39.2.637.g21b0678d19-goog
Now that KVM has a framework for caching guest CPUID feature flags, add
a "rule" that IRQs must be enabled when doing guest CPUID lookups, and
enforce the rule via a lockdep assertion. CPUID lookups are slow, and
within KVM, IRQs are only ever disabled in hot paths, e.g. the core run
loop, fast page fault handling, etc. I.e. querying guest CPUID with IRQs
disabled, especially in the run loop, should be avoided.
Signed-off-by: Sean Christopherson <[email protected]>
---
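The kind of pattern the assertion is meant to flag, and the preferred
alternative, look roughly like this (hypothetical sketch, the helper names
are made up; only guest_cpuid_has()/guest_can_use() are real):

  static bool bad_check_in_hot_path(struct kvm_vcpu *vcpu)
  {
          /* Called with IRQs disabled, e.g. from the VM-Enter/VM-Exit run
           * loop: walks the guest CPUID entries, lockdep now splats. */
          return guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
  }

  static bool good_check_in_hot_path(struct kvm_vcpu *vcpu)
  {
          /* Reads the bit cached by kvm_governed_feature_check_and_set()
           * during the last CPUID update; no CPUID walk needed. */
          return guest_can_use(vcpu, X86_FEATURE_XSAVES);
  }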
arch/x86/kvm/cpuid.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 3b604499c35c..0f34774129d8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include "linux/lockdep.h"
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -90,6 +91,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *e;
int i;
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+ * if this rule is violated, this assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
for (i = 0; i < nent; i++) {
e = &entries[i];
--
2.39.2.637.g21b0678d19-goog
Track "virtual GIF exposed to L1" via a governed feature flag instead of
using a dedicated bit/flag in vcpu_svm.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/governed_features.h | 1 +
arch/x86/kvm/svm/nested.c | 3 ++-
arch/x86/kvm/svm/svm.c | 3 ++-
arch/x86/kvm/svm/svm.h | 7 +++----
4 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index 93c7d840e546..b49fdabb88c4 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -13,6 +13,7 @@ KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
KVM_GOVERNED_X86_FEATURE(LBRV)
KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
+KVM_GOVERNED_X86_FEATURE(VGIF)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c38f17ba818e..c73c2acaf4c0 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -648,7 +648,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
*/
- if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ if (guest_can_use(vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
else
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b18bd0b33942..11068e8eb969 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4163,7 +4163,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
- svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ if (vgif)
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
svm_recalc_instruction_intercepts(vcpu, svm);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index c05eea319d28..be5419975694 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -22,6 +22,7 @@
#include <asm/svm.h>
#include <asm/sev-common.h>
+#include "cpuid.h"
#include "kvm_cache_regs.h"
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
@@ -257,9 +258,6 @@ struct vcpu_svm {
unsigned long soft_int_next_rip;
bool soft_int_injected;
- /* optional nested SVM features that are enabled for this guest */
- bool vgif_enabled : 1;
-
u32 ldr_reg;
u32 dfr_reg;
struct page *avic_backing_page;
@@ -484,7 +482,8 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
- return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
}
static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
--
2.39.2.637.g21b0678d19-goog
On Fri, Feb 17, 2023 at 03:10:15PM -0800, Sean Christopherson wrote:
> Use the governed feature framework to track if XSAVES is "enabled", i.e.
> if XSAVES can be used by the guest. Add a comment in the SVM code to
> explain the very unintuitive logic of deliberately NOT checking if XSAVES
> is enumerated in the guest CPUID model.
>
> No functional change intended.
xsaves_enabled in struct kvm_vcpu_arch is no longer used. But instead of
just deleting it, maybe we could move 'bool load_eoi_exitmap_pending' to
its place, so 7 bytes can be saved for each struct kvm_vcpu_arch:
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cd660de02f7b..0eef5469c165 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -740,7 +740,6 @@ struct kvm_vcpu_arch {
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
- bool load_eoi_exitmap_pending;
DECLARE_BITMAP(ioapic_handled_vectors, 256);
unsigned long apic_attention;
int32_t apic_arb_prio;
@@ -750,7 +749,7 @@ struct kvm_vcpu_arch {
u64 smi_count;
bool at_instruction_boundary;
bool tpr_access_reporting;
- bool xsaves_enabled;
+ bool load_eoi_exitmap_pending;
bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
B.R.
Yu
On Fri, Feb 17, 2023 at 03:10:18PM -0800, Sean Christopherson wrote:
> Track "virtual VMSAVE/VMLOAD exposed to L1" via a governed feature flag
> instead of using a dedicated bit/flag in vcpu_svm.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/governed_features.h | 1 +
> arch/x86/kvm/svm/nested.c | 2 +-
> arch/x86/kvm/svm/svm.c | 5 ++---
> arch/x86/kvm/svm/svm.h | 1 -
> 4 files changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
> index 0335576a80a8..b66b9d550f33 100644
> --- a/arch/x86/kvm/governed_features.h
> +++ b/arch/x86/kvm/governed_features.h
> @@ -9,6 +9,7 @@ KVM_GOVERNED_X86_FEATURE(GBPAGES)
> KVM_GOVERNED_X86_FEATURE(XSAVES)
> KVM_GOVERNED_X86_FEATURE(NRIPS)
> KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
> +KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
>
> #undef KVM_GOVERNED_X86_FEATURE
> #undef KVM_GOVERNED_FEATURE
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 30e00c4e07c7..6a96058c0e48 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
>
> static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
> {
> - if (!svm->v_vmload_vmsave_enabled)
> + if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
> return true;
>
> if (!nested_npt_enabled(svm))
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index dd4aead5462c..b3f0271c73b9 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1162,8 +1162,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
>
> set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
> set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
> -
> - svm->v_vmload_vmsave_enabled = false;
> } else {
> /*
> * If hardware supports Virtual VMLOAD VMSAVE then enable it
> @@ -4153,7 +4151,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>
> svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
>
> - svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
> + if (vls && !guest_cpuid_is_intel(vcpu))
> + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
Sorry, why guest_cpuid_is_intel(vcpu)? Is it because an AMD host with virtual
VMSAVE/VMLOAD capability will always expose this feature for all AMD guests?
If yes, how about adding some comments? Thanks!
B.R.
Yu
> Sorry, why guest_cpuid_is_intel(vcpu)? Is it becasue that a AMD host with virtual
> VMSAVE/VMLOAD capability will always expose this feature for all AMD guests?
Oh, sorry. I missed the guest_cpuid_has() in kvm_governed_feature_check_and_set().
So please just ignore my 2nd question.
As to the check of guest_cpuid_is_intel(), is it necessary?
B.R.
Yu
Sean Christopherson <[email protected]> writes:
> Introduce yet another X86_FEATURE flag framework to manage and cache KVM
> governed features (for lack of a better term). "Governed" in this case
> means that KVM has some level of involvement and/or vested interest in
> whether or not an X86_FEATURE can be used by the guest. The intent of the
> framework is twofold: to simplify caching of guest CPUID flags that KVM
> needs to frequently query, and to add clarity to such caching, e.g. it
> isn't immediately obvious that SVM's bundle of flags for "optional nested]
Nit: unneeded ']'
> SVM features" track whether or not a flag is exposed to L1.
>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 11 +++++++
> arch/x86/kvm/cpuid.c | 2 ++
> arch/x86/kvm/cpuid.h | 51 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/governed_features.h | 9 ++++++
> 4 files changed, 73 insertions(+)
> create mode 100644 arch/x86/kvm/governed_features.h
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 792a6037047a..cd660de02f7b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -835,6 +835,17 @@ struct kvm_vcpu_arch {
> struct kvm_cpuid_entry2 *cpuid_entries;
> struct kvm_hypervisor_cpuid kvm_cpuid;
>
> + /*
> + * Track whether or not the guest is allowed to use features that are
> + * governed by KVM, where "governed" means KVM needs to manage state
> + * and/or explicitly enable the feature in hardware. Typically, but
> + * not always, governed features can be used by the guest if and only
> + * if both KVM and userspace want to expose the feature to the guest.
> + */
> + struct {
> + u32 enabled;
> + } governed_features;
> +
> u64 reserved_gpa_bits;
> int maxphyaddr;
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 8f8edeaf8177..013fdc27fc8f 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -335,6 +335,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> struct kvm_lapic *apic = vcpu->arch.apic;
> struct kvm_cpuid_entry2 *best;
>
> + vcpu->arch.governed_features.enabled = 0;
> +
> best = kvm_find_cpuid_entry(vcpu, 1);
> if (best && apic) {
> if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index b1658c0de847..f61a2106ba90 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -232,4 +232,55 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
> return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
> }
>
> +enum kvm_governed_features {
> +#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
> +#include "governed_features.h"
> + KVM_NR_GOVERNED_FEATURES
> +};
> +
> +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
> +{
> + switch (x86_feature) {
> +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
> +#include "governed_features.h"
> + default:
> + return -1;
> + }
> +}
> +
> +static __always_inline int kvm_is_governed_feature(unsigned int x86_feature)
> +{
> + return kvm_governed_feature_index(x86_feature) >= 0;
> +}
> +
> +static __always_inline u32 kvm_governed_feature_bit(unsigned int x86_feature)
> +{
> + int index = kvm_governed_feature_index(x86_feature);
> +
> + BUILD_BUG_ON(index < 0);
> + return BIT(index);
> +}
> +
> +static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
> + unsigned int x86_feature)
> +{
> + BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES >
> + sizeof(vcpu->arch.governed_features.enabled) * BITS_PER_BYTE);
> +
> + vcpu->arch.governed_features.enabled |= kvm_governed_feature_bit(x86_feature);
> +}
> +
> +static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
> + unsigned int x86_feature)
> +{
> + if (guest_cpuid_has(vcpu, x86_feature))
> + kvm_governed_feature_set(vcpu, x86_feature);
> +}
> +
> +static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
> + unsigned int x86_feature)
> +{
> + return vcpu->arch.governed_features.enabled & kvm_governed_feature_bit(x86_feature);
> +}
> +
> #endif
> diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
> new file mode 100644
> index 000000000000..40ce8e6608cd
> --- /dev/null
> +++ b/arch/x86/kvm/governed_features.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
> +BUILD_BUG()
> +#endif
> +
> +#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
> +
> +#undef KVM_GOVERNED_X86_FEATURE
> +#undef KVM_GOVERNED_FEATURE
--
Vitaly
On Tue, Feb 21, 2023, Yu Zhang wrote:
> > Sorry, why guest_cpuid_is_intel(vcpu)? Is it because an AMD host with virtual
> > VMSAVE/VMLOAD capability will always expose this feature for all AMD guests?
>
> Oh, sorry. I missed the guest_cpuid_has() in kvm_governed_feature_check_and_set().
> So please just ignore my 2nd question.
>
> As to the check of guest_cpuid_is_intel(), is it necessary?
Yes? The comment in init_vmcb_after_set_cpuid() says:
/*
* We must intercept SYSENTER_EIP and SYSENTER_ESP
* accesses because the processor only stores 32 bits.
* For the same reason we cannot use virtual VMLOAD/VMSAVE.
*/
but I'm struggling to connect the dots to SYSENTER. I suspect the comment is
misleading and has nothing to do with 32-bit vs. 64-bit (or I'm reading it wrong) and
should be something like:
/*
* Disable virtual VMLOAD/VMSAVE and intercept VMLOAD/VMSAVE if the
* guest CPU is Intel in order to inject #UD.
*/
In other words, a non-SVM guest shouldn't be allowed to execute VMLOAD/VMSAVE.
On Tue, Feb 21, 2023 at 03:48:07PM -0800, Sean Christopherson wrote:
> On Tue, Feb 21, 2023, Yu Zhang wrote:
> > > Sorry, why guest_cpuid_is_intel(vcpu)? Is it because an AMD host with virtual
> > > VMSAVE/VMLOAD capability will always expose this feature for all AMD guests?
> >
> > Oh, sorry. I missed the guest_cpuid_has() in kvm_governed_feature_check_and_set().
> > So please just ignore my 2nd question.
> >
> > As to the check of guest_cpuid_is_intel(), is it necessary?
>
> Yes? The comment in init_vmcb_after_set_cpuid() says:
>
> /*
> * We must intercept SYSENTER_EIP and SYSENTER_ESP
> * accesses because the processor only stores 32 bits.
> * For the same reason we cannot use virtual VMLOAD/VMSAVE.
> */
>
> but I'm struggling to connect the dots to SYSENTER. I suspect the comment is
> misleading and has nothing to do with 32-bit vs. 64-bit (or I'm reading it wrong) and
> should be something like:
>
> /*
> * Disable virtual VMLOAD/VMSAVE and intercept VMLOAD/VMSAVE if the
> * guest CPU is Intel in order to inject #UD.
> */
>
> In other words, a non-SVM guest shouldn't be allowed to execute VMLOAD/VMSAVE.
Yes. That interpretation makes sense. And vmload/vmsave shall be intercepted
if the guest CPU is Intel, and #UD shall be injected. I guess this is done indirectly
by checking that EFER_SVME is not set in EFER in nested_svm_check_permissions()?
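For reference, the check I have in mind is roughly the following (paraphrased
from memory, details may differ):

        static int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
        {
                if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
                        kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }

                if (to_svm(vcpu)->vmcb->save.cpl) {
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }

                return 0;
        }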
And as to X86_FEATURE_V_VMSAVE_VMLOAD, should guest_cpuid_has() return true
at all for an Intel guest?
B.R.
Yu
+Maxim
On Wed, Feb 22, 2023, Yu Zhang wrote:
> On Tue, Feb 21, 2023 at 03:48:07PM -0800, Sean Christopherson wrote:
> > On Tue, Feb 21, 2023, Yu Zhang wrote:
> > > > Sorry, why guest_cpuid_is_intel(vcpu)? Is it because an AMD host with virtual
> > > > VMSAVE/VMLOAD capability will always expose this feature for all AMD guests?
> > >
> > > Oh, sorry. I missed the guest_cpuid_has() in kvm_governed_feature_check_and_set().
> > > So please just ignore my 2nd question.
> > >
> > > As to the check of guest_cpuid_is_intel(), is it necessary?
> >
> > Yes? The comment in init_vmcb_after_set_cpuid() says:
> >
> > /*
> > * We must intercept SYSENTER_EIP and SYSENTER_ESP
> > * accesses because the processor only stores 32 bits.
> > * For the same reason we cannot use virtual VMLOAD/VMSAVE.
> > */
> >
> > but I'm struggling to connect the dots to SYSENTER. I suspect the comment is
> > misleading and has nothing to do with 32-bit vs. 64-bit (or I'm reading it wrong) and
> > should be something like:
> >
> > /*
> > * Disable virtual VMLOAD/VMSAVE and intercept VMLOAD/VMSAVE if the
> > * guest CPU is Intel in order to inject #UD.
> > */
> >
> > In other words, a non-SVM guest shouldn't be allowed to execute VMLOAD/VMSAVE.
>
> Yes. That interpretation makes sense. And vmload/vmsave shall be intercepted
> if the guest CPU is Intel, and #UD shall be injected. I guess this is done indirectly
> by checking that EFER_SVME is not set in EFER in nested_svm_check_permissions()?
Nope, my interpretation is wrong. vmload_vmsave_interception() clears the upper
bits of SYSENTER_{EIP,ESP}
if (vmload) {
svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
svm->sysenter_eip_hi = 0;
svm->sysenter_esp_hi = 0;
} else {
svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
}
From commit adc2a23734ac ("KVM: nSVM: improve SYSENTER emulation on AMD"):
3. Disable vmload/vmsave virtualization if vendor=GenuineIntel.
(It is somewhat insane to set vendor=GenuineIntel and still enable
SVM for the guest but well whatever).
Then zero the high 32 bit parts when kvm intercepts and emulates vmload.
Presumably AMD hardware loads only the lower 32 bits, which would leave garbage
in the upper bits and even leak state from L1 to L2 (again ignoring the fact that
exposing SVM to an Intel vCPU is bonkers).
I'll opportunistically massage the comment to make it more explicit about why
VMLOAD needs to be intercepted.
That said, clearing the bits for this seems wrong. That would corrupt the MSRs
for 64-bit Intel guests. The "target" of the fix was 32-bit L2s, i.e. I doubt
anything would notice.
This patch fixes nested migration of 32 bit nested guests, that was
broken because incorrect cached values of SYSENTER msrs were stored in
the migration stream if L1 changed these msrs with
vmload prior to L2 entry.
Maxim, would anything actually break if KVM let L1 load 64-bit values for the
SYSENTER MSRs?
> And as to X86_FEATURE_V_VMSAVE_VMLOAD, should guest_cpuid_has() return true
> at all for an Intel guest?
Yes, because guest CPUID is userspace controlled. Except for emulating architectural
side effects, e.g. size of XSAVE area, KVM doesn't sanitize guest CPUID.
On Tue, Feb 21, 2023, Yu Zhang wrote:
> On Fri, Feb 17, 2023 at 03:10:15PM -0800, Sean Christopherson wrote:
> > Use the governed feature framework to track if XSAVES is "enabled", i.e.
> > if XSAVES can be used by the guest. Add a comment in the SVM code to
> > explain the very unintuitive logic of deliberately NOT checking if XSAVES
> > is enumerated in the guest CPUID model.
> >
> > No functional change intended.
>
> xsaves_enabled in struct kvm_vcpu_arch is no longer used. But instead of
> just deleting it, maybe we could move 'bool load_eoi_exitmap_pending' to
> its place, so 7 bytes can be saved for each struct kvm_vcpu_arch:
I prefer leaving load_eoi_exitmap_pending where it is so that it's co-located with
ioapic_handled_vectors. I agree wasting 7 bytes is unfortunate, but I don't want
to take an ad hoc approach to shrinking per-vCPU structs. See the link below for
more discussion.
https://lore.kernel.org/all/[email protected]
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index cd660de02f7b..0eef5469c165 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -740,7 +740,6 @@ struct kvm_vcpu_arch {
> u64 efer;
> u64 apic_base;
> struct kvm_lapic *apic; /* kernel irqchip context */
> - bool load_eoi_exitmap_pending;
> DECLARE_BITMAP(ioapic_handled_vectors, 256);
> unsigned long apic_attention;
> int32_t apic_arb_prio;
> @@ -750,7 +749,7 @@ struct kvm_vcpu_arch {
> u64 smi_count;
> bool at_instruction_boundary;
> bool tpr_access_reporting;
> - bool xsaves_enabled;
> + bool load_eoi_exitmap_pending;
> bool xfd_no_write_intercept;
> u64 ia32_xss;
> u64 microcode_version;
>
> B.R.
> Yu
>
On Wed, Feb 22, 2023 at 08:39:01AM -0800, Sean Christopherson wrote:
> +Maxim
>
> On Wed, Feb 22, 2023, Yu Zhang wrote:
> > On Tue, Feb 21, 2023 at 03:48:07PM -0800, Sean Christopherson wrote:
> > > On Tue, Feb 21, 2023, Yu Zhang wrote:
> > > > > Sorry, why guest_cpuid_is_intel(vcpu)? Is it because an AMD host with virtual
> > > > > VMSAVE/VMLOAD capability will always expose this feature for all AMD guests?
> > > >
> > > > Oh, sorry. I missed the guest_cpuid_has() in kvm_governed_feature_check_and_set().
> > > > So please just ignore my 2nd question.
> > > >
> > > > As to the check of guest_cpuid_is_intel(), is it necessary?
> > >
> > > Yes? The comment in init_vmcb_after_set_cpuid() says:
> > >
> > > /*
> > > * We must intercept SYSENTER_EIP and SYSENTER_ESP
> > > * accesses because the processor only stores 32 bits.
> > > * For the same reason we cannot use virtual VMLOAD/VMSAVE.
> > > */
> > >
> > > but I'm struggling to connect the dots to SYSENTER. I suspect the comment is
> > > misleading and has nothing to do with 32-bit vs. 64-bit (or I'm reading it wrong) and
> > > should be something like:
> > >
> > > /*
> > > * Disable virtual VMLOAD/VMSAVE and intercept VMLOAD/VMSAVE if the
> > > * guest CPU is Intel in order to inject #UD.
> > > */
> > >
> > > In other words, a non-SVM guest shouldn't be allowed to execute VMLOAD/VMSAVE.
> >
> > Yes. That interpretation makes sense. And vmload/vmsave shall be intercepted
> > if the guest CPU is Intel, and #UD shall be injected. I guess this is done indirectly
> > by checking that EFER_SVME is not set in EFER in nested_svm_check_permissions()?
>
> Nope, my interpretation is wrong. vmload_vmsave_interception() clears the upper
> bits of SYSENTER_{EIP,ESP}
>
> if (vmload) {
> svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
> svm->sysenter_eip_hi = 0;
> svm->sysenter_esp_hi = 0;
> } else {
> svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
> }
>
> From commit adc2a23734ac ("KVM: nSVM: improve SYSENTER emulation on AMD"):
>
> 3. Disable vmload/vmsave virtualization if vendor=GenuineIntel.
> (It is somewhat insane to set vendor=GenuineIntel and still enable
> SVM for the guest but well whatever).
> Then zero the high 32 bit parts when kvm intercepts and emulates vmload.
>
> Presumably AMD hardware loads only the lower 32 bits, which would leave garbage
> in the upper bits and even leak state from L1 to L2 (again ignoring the fact that
> exposing SVM to an Intel vCPU is bonkers).
Is it because L1 is a VM migrated from an Intel platform to an AMD one?
So w/o commit adc2a23734ac ("KVM: nSVM: improve SYSENTER emulation on AMD"):
1> L1 could be a "GenuineIntel" with SVM capability (bizarre as it is), running
in 64-bit mode.
2> With no interception of MSR writes to the SYSENTER_EIP/ESP, L1 may set its
SYSENTER_EIP/ESP to a 64-bit value successfully (though sysenter/sysexit may
fail).
3> L2 could be in 32-bit mode. And if virtual vmload/vmsave is enabled for L1,
only lower 32 bits of those MSRs will be loaded, leaking the higher 32 bits.
Is the above scenario the reason for Maxim's fix?
But why is it related to nested migration?
> I'll opportunistically massage the comment to make it more explicit about why
> VMLOAD needs to be intercepted.
>
> That said, clearing the bits for this seems wrong. That would corrupt the MSRs
> for 64-bit Intel guests. The "target" of the fix was 32-bit L2s, i.e. I doubt
> anything would notice.
>
> This patch fixes nested migration of 32 bit nested guests, that was
> broken because incorrect cached values of SYSENTER msrs were stored in
> the migration stream if L1 changed these msrs with
> vmload prior to L2 entry.
>
> Maxim, would anything actually break if KVM let L1 load 64-bit values for the
> SYSENTER MSRs?
>
> > And as to X86_FEATURE_V_VMSAVE_VMLOAD, should guest_cpuid_has() return true
> > at all for an Intel guest?
>
> Yes, because guest CPUID is userspace controlled. Except for emulating architectural
> side effects, e.g. size of XSAVE area, KVM doesn't sanitize guest CPUID.
>
B.R.
Yu
On Wed, Feb 22, 2023 at 10:56:04AM -0800, Sean Christopherson wrote:
> On Tue, Feb 21, 2023, Yu Zhang wrote:
> > On Fri, Feb 17, 2023 at 03:10:15PM -0800, Sean Christopherson wrote:
> > > Use the governed feature framework to track if XSAVES is "enabled", i.e.
> > > if XSAVES can be used by the guest. Add a comment in the SVM code to
> > > explain the very unintuitive logic of deliberately NOT checking if XSAVES
> > > is enumerated in the guest CPUID model.
> > >
> > > No functional change intended.
> >
> > xsaves_enabled in struct kvm_vcpu_arch is no longer used. But instead of
> > just deleting it, maybe we could move 'bool load_eoi_exitmap_pending' to
> > its place, so 7 bytes can be saved for each struct kvm_vcpu_arch:
>
> I prefer leaving load_eoi_exitmap_pending where it is so that it's co-located with
> ioapic_handled_vectors. I agree wasting 7 bytes is unfortunate, but I don't want
> to take an ad hoc approach to shrinking per-vCPU structs. See the link below for
> more discussion.
>
> https://lore.kernel.org/all/[email protected]
Fair enough. :)
Thanks
Yu
On Fri, Feb 24, 2023, Yu Zhang wrote:
> On Wed, Feb 22, 2023 at 08:39:01AM -0800, Sean Christopherson wrote:
> > +Maxim
> >
> > On Wed, Feb 22, 2023, Yu Zhang wrote:
> > > On Tue, Feb 21, 2023 at 03:48:07PM -0800, Sean Christopherson wrote:
> > Nope, my interpretation is wrong. vmload_vmsave_interception() clears the upper
> > bits of SYSENTER_{EIP,ESP}
> >
> > if (vmload) {
> > svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
> > svm->sysenter_eip_hi = 0;
> > svm->sysenter_esp_hi = 0;
> > } else {
> > svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
> > }
> >
> > From commit adc2a23734ac ("KVM: nSVM: improve SYSENTER emulation on AMD"):
> >
> > 3. Disable vmload/vmsave virtualization if vendor=GenuineIntel.
> > (It is somewhat insane to set vendor=GenuineIntel and still enable
> > SVM for the guest but well whatever).
> > Then zero the high 32 bit parts when kvm intercepts and emulates vmload.
> >
> > Presumably AMD hardware loads only the lower 32 bits, which would leave garbage
> > in the upper bits and even leak state from L1 to L2 (again ignoring the fact that
> > exposing SVM to an Intel vCPU is bonkers).
> Is it because L1 is a VM migrated from an Intel platform to an AMD one?
I believe so.
> So w/o commit adc2a23734ac ("KVM: nSVM: improve SYSENTER emulation on AMD"):
> 1> L1 could be a "GenuineIntel" with SVM capability (bizarre as it is), running
> in 64-bit mode.
> 2> With no interception of MSR writes to the SYSENTER_EIP/ESP, L1 may set its
> SYSENTER_EIP/ESP to a 64-bit value successfully (though sysenter/sysexit may
> fail).
Yes, though the MSRs don't need to be passed through, KVM emulates the full 64 bits
if the guest CPUID model is Intel.
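I.e. the SYSENTER MSR write emulation does something along these lines (a
sketch from memory, the exact field names may differ):

        case MSR_IA32_SYSENTER_EIP:
                svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
                /* Emulate the full 64 bits only if the vCPU model is Intel. */
                svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;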
> 3> L2 could be in 32-bit mode. And if virtual vmload/vmsave is enabled for L1,
> only lower 32 bits of those MSRs will be loaded, leaking the higher 32 bits.
>
> Is the above scenario the reason for Maxim's fix?
Yes, that's my understanding.
> But why is it related to nested migration?
I understand why it's related, but I don't understand why we bothered to add "support"
for this.
In theory, if L1 is migrated by L0 while L1 is running an L2 that uses SYSENTER,
problems will occur. I'm a bit lost as to how this matters in practice, as KVM
doesn't support cross-vendor nested virtualization, and if L1 can be enlightened
to the point where it can switch from VMX=>SVM during migration, what's the point
of doing a migration?
On 2/18/2023 7:10 AM, Sean Christopherson wrote:
> Introduce yet another X86_FEATURE flag framework to manage and cache KVM
> governed features (for lack of a better term). "Governed" in this case
> means that KVM has some level of involvement and/or vested interest in
> whether or not an X86_FEATURE can be used by the guest. The intent of the
> framework is twofold: to simplify caching of guest CPUID flags that KVM
> needs to frequently query, and to add clarity to such caching, e.g. it
> isn't immediately obvious that SVM's bundle of flags for "optional nested]
spare ]
> SVM features" track whether or not a flag is exposed to L1.
>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 11 +++++++
> arch/x86/kvm/cpuid.c | 2 ++
> arch/x86/kvm/cpuid.h | 51 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/governed_features.h | 9 ++++++
> 4 files changed, 73 insertions(+)
> create mode 100644 arch/x86/kvm/governed_features.h
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 792a6037047a..cd660de02f7b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -835,6 +835,17 @@ struct kvm_vcpu_arch {
> struct kvm_cpuid_entry2 *cpuid_entries;
> struct kvm_hypervisor_cpuid kvm_cpuid;
>
> + /*
> + * Track whether or not the guest is allowed to use features that are
> + * governed by KVM, where "governed" means KVM needs to manage state
> + * and/or explicitly enable the feature in hardware. Typically, but
> + * not always, governed features can be used by the guest if and only
> + * if both KVM and userspace want to expose the feature to the guest.
> + */
> + struct {
> + u32 enabled;
Although there are some guidelines/preconditions for using the framework,
is it possible that the u32 will run out quickly once people start to
use the framework?
Of course, I noticed there is a build bug check on the length; it should
be OK to increase the length when needed.
> + } governed_features;
> +
> u64 reserved_gpa_bits;
> int maxphyaddr;
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 8f8edeaf8177..013fdc27fc8f 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -335,6 +335,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> struct kvm_lapic *apic = vcpu->arch.apic;
> struct kvm_cpuid_entry2 *best;
>
> + vcpu->arch.governed_features.enabled = 0;
> +
> best = kvm_find_cpuid_entry(vcpu, 1);
> if (best && apic) {
> if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index b1658c0de847..f61a2106ba90 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -232,4 +232,55 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
> return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
> }
>
> +enum kvm_governed_features {
> +#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
> +#include "governed_features.h"
> + KVM_NR_GOVERNED_FEATURES
> +};
> +
> +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
> +{
> + switch (x86_feature) {
> +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
> +#include "governed_features.h"
> + default:
> + return -1;
> + }
> +}
> +
> +static __always_inline int kvm_is_governed_feature(unsigned int x86_feature)
Is it better to use bool instead of int?
> +{
> + return kvm_governed_feature_index(x86_feature) >= 0;
> +}
> +
> +static __always_inline u32 kvm_governed_feature_bit(unsigned int x86_feature)
> +{
> + int index = kvm_governed_feature_index(x86_feature);
> +
> + BUILD_BUG_ON(index < 0);
> + return BIT(index);
> +}
> +
> +static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
> + unsigned int x86_feature)
> +{
> + BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES >
> + sizeof(vcpu->arch.governed_features.enabled) * BITS_PER_BYTE);
> +
> + vcpu->arch.governed_features.enabled |= kvm_governed_feature_bit(x86_feature);
> +}
> +
> +static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
> + unsigned int x86_feature)
> +{
> + if (guest_cpuid_has(vcpu, x86_feature))
> + kvm_governed_feature_set(vcpu, x86_feature);
> +}
> +
> +static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
> + unsigned int x86_feature)
> +{
> + return vcpu->arch.governed_features.enabled & kvm_governed_feature_bit(x86_feature);
> +}
> +
> #endif
> diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
> new file mode 100644
> index 000000000000..40ce8e6608cd
> --- /dev/null
> +++ b/arch/x86/kvm/governed_features.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
> +BUILD_BUG()
> +#endif
> +
> +#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
> +
> +#undef KVM_GOVERNED_X86_FEATURE
> +#undef KVM_GOVERNED_FEATURE
On Wed, Feb 22, 2023, Sean Christopherson wrote:
> +Maxim
>
> On Wed, Feb 22, 2023, Yu Zhang wrote:
> I'll opportunistically massage the comment to make it more explicit about why
> VMLOAD needs to be intercepted.
>
> That said, clearing the bits for this seems wrong. That would corrupt the MSRs
> for 64-bit Intel guests. The "target" of the fix was 32-bit L2s, i.e. I doubt
> anything would notice.
>
> This patch fixes nested migration of 32 bit nested guests, that was
> broken because incorrect cached values of SYSENTER msrs were stored in
> the migration stream if L1 changed these msrs with
> vmload prior to L2 entry.
Aha! Finally figured out what this code is doing. KVM intercepts VMLOAD so that
KVM can correctly model the VMLOAD behavior of dropping bits 63:32, i.e. to clear
svm->sysenter_eip_hi and svm->sysenter_esp_hi.
So the code is correct. I'll add this comment:
/*
* Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
On Thu, Jun 29, 2023, Binbin Wu wrote:
>
>
> On 2/18/2023 7:10 AM, Sean Christopherson wrote:
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 792a6037047a..cd660de02f7b 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -835,6 +835,17 @@ struct kvm_vcpu_arch {
> > struct kvm_cpuid_entry2 *cpuid_entries;
> > struct kvm_hypervisor_cpuid kvm_cpuid;
> > + /*
> > + * Track whether or not the guest is allowed to use features that are
> > + * governed by KVM, where "governed" means KVM needs to manage state
> > + * and/or explicitly enable the feature in hardware. Typically, but
> > + * not always, governed features can be used by the guest if and only
> > + * if both KVM and userspace want to expose the feature to the guest.
> > + */
> > + struct {
> > + u32 enabled;
> Although there are some guidelines/preconditions for using the framework,
> is it possible that the u32 will run out quickly once people start to use
> the framework?
It's definitely possible. And there's no reason to limit this to a u32, I really
have no idea why I did that.
Ah, it's because "struct kvm_vcpu_arch" is defined in arch/x86/include/asm/kvm_host.h,
and I didn't want to expose governed_features.h in arch/x86/include/asm. Hrm,
that's really annoying.
Aha! A better workaround for that conundrum would be to define an explicit "max"
and use that, with a FIXME to call out that this really should use
KVM_NR_GOVERNED_FEATURES directly. I have aspirations of moving kvm_host.h to
arch/<arch>/kvm, at which point this can be cleaned up by declaring "enum
kvm_governed_features" in kvm_host.h (though it'll likely be named something
like kvm_arch.h at that point).
/*
* FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
* when "struct kvm_vcpu_arch" is no longer defined in an
* arch/x86/include/asm header. The max is mostly arbitrary, i.e.
* can be increased as necessary.
*/
#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG
/*
* Track whether or not the guest is allowed to use features that are
* governed by KVM, where "governed" means KVM needs to manage state
* and/or explicitly enable the feature in hardware. Typically, but
* not always, governed features can be used by the guest if and only
* if both KVM and userspace want to expose the feature to the guest.
*/
struct {
DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES);
} governed_features;
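The accessors would then switch to the bitmap helpers, e.g. (sketch):

        static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
                                                              unsigned int x86_feature)
        {
                BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);

                __set_bit(kvm_governed_feature_index(x86_feature),
                          vcpu->arch.governed_features.enabled);
        }

        static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
                                                  unsigned int x86_feature)
        {
                return test_bit(kvm_governed_feature_index(x86_feature),
                                vcpu->arch.governed_features.enabled);
        }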
> Of course, I noticed there is a build bug check on the length; it should be
> OK to increase the length when needed.
> > +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
> > +{
> > + switch (x86_feature) {
> > +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
> > +#include "governed_features.h"
> > + default:
> > + return -1;
> > + }
> > +}
> > +
> > +static __always_inline int kvm_is_governed_feature(unsigned int x86_feature)
> Is it better to use bool instead of int?
Yes, this definitely should return a bool.
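I.e. the trivial fixup:

        static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature)
        {
                return kvm_governed_feature_index(x86_feature) >= 0;
        }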
Thanks!
On Fri, Feb 17, 2023 at 03:10:11PM -0800, Sean Christopherson wrote:
>+static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
>+ unsigned int x86_feature)
>+{
>+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES >
>+ sizeof(vcpu->arch.governed_features.enabled) * BITS_PER_BYTE);
>+
>+ vcpu->arch.governed_features.enabled |= kvm_governed_feature_bit(x86_feature);
>+}
>+
>+static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
>+ unsigned int x86_feature)
>+{
>+ if (guest_cpuid_has(vcpu, x86_feature))
Most callers in this series are conditional on either boot_cpu_has() or some
local variables. Can we convert them to kvm_cpu_cap_has() and incorporate them
within this function? i.e.,
if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
The benefits of doing so are
1. callers needn't repeat
if (kvm_cpu_cap_has(x86_feature))
kvm_governed_feature_check_and_set(x86_feature)
2. this fits the idea better that guests can use a governed feature only if host
supports it _and_ QEMU exposes it to the guest.
>+ kvm_governed_feature_set(vcpu, x86_feature);
>+}
>+
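i.e., the helper would become something like (a sketch of the suggestion):

        static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
                                                                        unsigned int x86_feature)
        {
                if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
                        kvm_governed_feature_set(vcpu, x86_feature);
        }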
On Thu, Jun 29, 2023 at 09:50:34AM -0700, Sean Christopherson wrote:
> On Wed, Feb 22, 2023, Sean Christopherson wrote:
> > +Maxim
> >
> > On Wed, Feb 22, 2023, Yu Zhang wrote:
> > I'll opportunistically massage the comment to make it more explicit about why
> > VMLOAD needs to be intercepted.
> >
> > That said, clearing the bits for this seems wrong. That would corrupt the MSRs
> > for 64-bit Intel guests. The "target" of the fix was 32-bit L2s, i.e. I doubt
> > anything would notice.
> >
> > This patch fixes nested migration of 32 bit nested guests, that was
> > broken because incorrect cached values of SYSENTER msrs were stored in
> > the migration stream if L1 changed these msrs with
> > vmload prior to L2 entry.
>
> Aha! Finally figured out what this code is doing. KVM intercepts VMLOAD so that
> KVM can correctly model the VMLOAD behavior of dropping bits 63:32, i.e. to clear
> svm->sysenter_eip_hi and svm->sysenter_esp_hi.
>
> So the code is correct. I'll add this comment:
>
> /*
> * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
> * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
> * SVM on Intel is bonkers and extremely unlikely to work).
> */
>
Oh.. Because L2 will never be a 64-bit Intel guest, and the emulation of vmload
shall follow the APM's requirement (to clear the upper 32 bits)?
Thanks a lot for bringing me back to this discussion... I had totally forgotten about it. :)
B.R.
Yu
Thanks a lot for this explanation, Sean!
On Fri, Jun 30, 2023, Chao Gao wrote:
> On Fri, Feb 17, 2023 at 03:10:11PM -0800, Sean Christopherson wrote:
> >+static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
> >+ unsigned int x86_feature)
> >+{
> >+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES >
> >+ sizeof(vcpu->arch.governed_features.enabled) * BITS_PER_BYTE);
> >+
> >+ vcpu->arch.governed_features.enabled |= kvm_governed_feature_bit(x86_feature);
> >+}
> >+
> >+static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
> >+ unsigned int x86_feature)
> >+{
> >+ if (guest_cpuid_has(vcpu, x86_feature))
>
> Most callers in this series are conditional on either boot_cpu_has() or some
> local variables. Can we convert them to kvm_cpu_cap_has() and incorporate them
> within this function? i.e.,
>
> if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
Hmm, I was going to say "no", as most callers don't check kvm_cpu_cap_has() verbatim,
but it doesn't have to be that way. The majority of SVM features factor in module
params, but KVM should set the kvm_cpu capability if and only if a feature is supported
in hardware *and* enabled by its module param.
And arguably that's kinda sorta a bug fix, because this
if (lbrv)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
technically should be
if (lbrv && nested)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
Heh, and it's kinda sorta a bug fix for XSAVES on VMX, because this
if (cpu_has_vmx_xsaves() && boot_cpu_has(X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
should technically be
if (kvm_cpu_cap_has(X86_FEATURE_XSAVES) &&
boot_cpu_has(X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
> The benefits of doing so are
> 1. callers needn't repeat
>
> if (kvm_cpu_cap_has(x86_feature))
> kvm_governed_feature_check_and_set(x86_feature)
>
> 2. this fits the idea better that guests can use a governed feature only if host
> supports it _and_ QEMU exposes it to the guest.
Agreed, especially since we'll still have kvm_governed_feature_set() for the
extra special cases.
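E.g. for LBRV, the capability setup would then look something like this (a
sketch; the exact placement in svm_set_cpu_caps() is an assumption):

        /* Advertise LBRV only if hardware has it and the module params allow it. */
        if (nested && lbrv && boot_cpu_has(X86_FEATURE_LBRV))
                kvm_cpu_cap_set(X86_FEATURE_LBRV);

and the call site collapses to a single line:

        kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);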
Thanks for the input!