Allow a non-zero value for non-present and removed SPTEs, so that TDX can
set the "suppress #VE" bit. This series is taken from
https://patchew.org/linux/[email protected]/
with review comments addressed:
- do not dereference an address from the VMCS to include #VE info
in the dump
- fail hard if the #VE info page cannot be allocated
Paolo
Isaku Yamahata (2):
KVM: x86/mmu: Add Suppress VE bit to EPT
shadow_mmio_mask/shadow_present_mask
KVM: VMX: Introduce test mode related to EPT violation VE
Paolo Bonzini (1):
KVM, x86: add architectural support code for #VE
Sean Christopherson (4):
KVM: Allow page-sized MMU caches to be initialized with custom 64-bit
values
KVM: x86/mmu: Replace hardcoded value 0 for the initial value for SPTE
KVM: x86/mmu: Allow non-zero value for non-present SPTE and removed
SPTE
KVM: x86/mmu: Track shadow MMIO value on a per-VM basis
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/include/asm/vmx.h | 13 ++++++++
arch/x86/kvm/Kconfig | 13 ++++++++
arch/x86/kvm/mmu/mmu.c | 21 ++++++++-----
arch/x86/kvm/mmu/paging_tmpl.h | 14 ++++-----
arch/x86/kvm/mmu/spte.c | 24 ++++++++-------
arch/x86/kvm/mmu/spte.h | 24 ++++++++++++---
arch/x86/kvm/mmu/tdp_mmu.c | 18 +++++------
arch/x86/kvm/vmx/vmcs.h | 5 ++++
arch/x86/kvm/vmx/vmx.c | 53 ++++++++++++++++++++++++++++++++-
arch/x86/kvm/vmx/vmx.h | 6 +++-
include/linux/kvm_types.h | 1 +
virt/kvm/kvm_main.c | 16 ++++++++--
13 files changed, 167 insertions(+), 43 deletions(-)
--
2.43.0
From: Sean Christopherson <[email protected]>
TDX will use a different shadow PTE value for MMIO than VMX does. Add a
member to struct kvm_arch and track the MMIO value per-VM instead of in a
global variable. By using the per-VM EPT entry value for MMIO, the existing
VMX logic keeps working. Introduce a separate setter function so that a
guest TD can use a different value later.
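As a minimal sketch (the function name and its placement are illustrative
assumptions; this patch itself only adds the field and its default
initialization), such a per-VM setter could look like:

void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
{
	/* Hypothetical setter so a TD can use its own MMIO SPTE value. */
	kvm->arch.shadow_mmio_value = mmio_value;
}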
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
Message-Id: <229a18434e5d83f45b1fcd7bf1544d79db1becb6.1705965635.git.isaku.yamahata@intel.com>
Reviewed-by: Xiaoyao Li <[email protected]>
Reviewed-by: Binbin Wu <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/mmu/mmu.c | 7 ++++---
arch/x86/kvm/mmu/spte.c | 4 ++--
arch/x86/kvm/mmu/spte.h | 4 ++--
arch/x86/kvm/mmu/tdp_mmu.c | 6 +++---
5 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 01c69840647e..9f92bdb78504 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1313,6 +1313,8 @@ struct kvm_arch {
*/
spinlock_t mmu_unsync_pages_lock;
+ u64 shadow_mmio_value;
+
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fbfdc606f1f1..45b6d8f9e359 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2462,7 +2462,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
- } else if (is_mmio_spte(pte)) {
+ } else if (is_mmio_spte(kvm, pte)) {
mmu_spte_clear_no_track(spte);
}
return 0;
@@ -4144,7 +4144,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (WARN_ON_ONCE(reserved))
return -EINVAL;
- if (is_mmio_spte(spte)) {
+ if (is_mmio_spte(vcpu->kvm, spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned int access = get_mmio_spte_access(spte);
@@ -4760,7 +4760,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
- if (unlikely(is_mmio_spte(*sptep))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
mmu_spte_clear_no_track(sptep);
return true;
@@ -6267,6 +6267,7 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
void kvm_mmu_init_vm(struct kvm *kvm)
{
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 0a0e83859c27..a5e014d7bc62 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!shadow_mmio_value);
+ WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
access &= shadow_mmio_access_mask;
- spte |= shadow_mmio_value | access;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 8056b7853a79..5dd5405fa07a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -265,9 +265,9 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
-static inline bool is_mmio_spte(u64 spte)
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
likely(enable_mmio_caching);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f5401967897a..5fd618abc243 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,8 +495,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
!is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
@@ -1028,7 +1028,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
--
2.43.0
From: Isaku Yamahata <[email protected]>
To make it possible for TDX and VMX to share the same values of
shadow_mmio_mask and shadow_present_mask, add the Suppress-VE bit to both
masks. TDX will require shadow_mmio_mask and shadow_present_mask to include
VMX_EPT_SUPPRESS_VE_BIT for shared GPAs, so that an EPT violation is
triggered when a TD accesses a shared GPA. For VMX, the bit does not matter
for MMIO because the MMIO SPTE value is defined so as to cause an EPT
misconfiguration.
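Concretely, with this patch the EPT masks end up as follows (a worked
example read off the hunks below):

	/* exec-only supported: "present" has no RWX bits, only bit 63 */
	shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT;
	/* exec-only not supported */
	shadow_present_mask = VMX_EPT_READABLE_MASK | VMX_EPT_SUPPRESS_VE_BIT;
	/* MMIO SPTEs are matched on the RWX bits plus the new bit */
	shadow_mmio_mask    = VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT;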
Signed-off-by: Isaku Yamahata <[email protected]>
Message-Id: <97cc616b3563cd8277be91aaeb3e14bce23c3649.1705965635.git.isaku.yamahata@intel.com>
Reviewed-by: Xiaoyao Li <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/vmx.h | 1 +
arch/x86/kvm/mmu/spte.c | 6 ++++--
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 4dba17363008..ac6da0a5f5e6 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -514,6 +514,7 @@ enum vmcs_field {
#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_ACCESS_BIT (1ull << 8)
#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
VMX_EPT_WRITABLE_MASK | \
VMX_EPT_EXECUTABLE_MASK)
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 768aaeddf5fa..0a0e83859c27 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -413,7 +413,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
/*
* EPT overrides the host MTRRs, and so KVM must program the desired
* memtype directly into the SPTEs. Note, this mask is just the mask
@@ -430,7 +432,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
* of an EPT paging-structure entry is 110b (write/execute).
*/
kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
- VMX_EPT_RWX_MASK, 0);
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
--
2.43.0
From: Sean Christopherson <[email protected]>
For a TD guest, the current way of emulating MMIO no longer works, as KVM
cannot access the TD guest's private memory to do the emulation. Instead,
the TD guest expects to receive a #VE when it accesses MMIO, upon which it
can explicitly make a hypercall to KVM to obtain the expected information.
To achieve this, the TDX module always enables "EPT-violation #VE" in the
VMCS controls. Accordingly, for the MMIO SPTE of a shared GPA:
1. KVM needs to set the "suppress #VE" bit in the non-present SPTE so that
   an EPT violation happens when the TD accesses the MMIO range.
2. On the EPT violation, KVM installs the MMIO SPTE with the "suppress #VE"
   bit cleared, so the TD guest receives a #VE instead of the EPT
   misconfiguration used in the VMX case.
For a shared GPA that is not populated yet, an EPT violation needs to be
triggered when the TD guest accesses it, so the non-present SPTE value for
shared GPAs must also set the "suppress #VE" bit.
Add "suppress #VE" bit (bit 63) to SHADOW_NONPRESENT_VALUE and
REMOVED_SPTE. Unconditionally set the "suppress #VE" bit (which is bit 63)
for both AMD and Intel as: 1) AMD hardware doesn't use this bit when
present bit is off; 2) for normal VMX guest, KVM never enables the
"EPT-violation #VE" in VMCS control and "suppress #VE" bit is ignored by
hardware.
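Putting the two paragraphs above together, a rough sketch of the shared-GPA
MMIO flow (the guest-side hypercall details are outside the scope of this
patch):

	/*
	 * TD guest                          KVM
	 * --------                          ---
	 * access MMIO in a shared GPA
	 *   -> EPT violation (non-present
	 *      SPTE has "suppress #VE" set)
	 *                                   install MMIO SPTE with
	 *                                   "suppress #VE" cleared
	 * re-execute the access -> #VE
	 * #VE handler issues a hypercall
	 * describing the MMIO access
	 *                                   emulate the MMIO access
	 */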
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
Reviewed-by: Binbin Wu <[email protected]>
Reviewed-by: Xiaoyao Li <[email protected]>
Message-Id: <a99cb866897c7083430dce7f24c63b17d7121134.1705965635.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/mmu/paging_tmpl.h | 12 ++++++------
arch/x86/kvm/mmu/spte.c | 14 +++++++-------
arch/x86/kvm/mmu/spte.h | 16 +++++++++++++++-
3 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index bebd73cd61bb..9aac3aa93d88 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -933,13 +933,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
return 0;
/*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
*/
- if ((!pte_access && !shadow_present_mask) ||
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
return 1;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 6c7ab3aa6aa7..768aaeddf5fa 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -144,19 +144,19 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 spte = SPTE_MMU_PRESENT_MASK;
bool wrprot = false;
- WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
- /*
- * For the EPT case, shadow_present_mask is 0 if hardware
- * supports exec-only page table entries. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
spte |= shadow_present_mask;
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 0f4ec2859474..8056b7853a79 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -149,7 +149,21 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
@@ -192,7 +206,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
* both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
- * vulnerability. Use only low bits to avoid 64-bit immediates.
+ * vulnerability.
*
* Only used by the TDP MMU.
*/
--
2.43.0
Dump the contents of the #VE info data structure and assert that #VE does
not happen, but do not yet do anything with it.
No functional change intended, separated for clarity only.
Extracted from a patch by Isaku Yamahata <[email protected]>.
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/vmx.h | 12 ++++++++++++
arch/x86/kvm/vmx/vmx.c | 4 ++++
2 files changed, 16 insertions(+)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index ac6da0a5f5e6..d77a31039f24 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -71,6 +71,7 @@
#define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING)
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -226,6 +227,8 @@ enum vmcs_field {
VMREAD_BITMAP_HIGH = 0x00002027,
VMWRITE_BITMAP = 0x00002028,
VMWRITE_BITMAP_HIGH = 0x00002029,
+ VE_INFORMATION_ADDRESS = 0x0000202A,
+ VE_INFORMATION_ADDRESS_HIGH = 0x0000202B,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
ENCLS_EXITING_BITMAP = 0x0000202E,
@@ -631,4 +634,13 @@ enum vmx_l1d_flush_state {
extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+struct vmx_ve_information {
+ u32 exit_reason;
+ u32 delivery;
+ u64 exit_qualification;
+ u64 guest_linear_address;
+ u64 guest_physical_address;
+ u16 eptp_index;
+};
+
#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6780313914f8..d780eee9b697 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6408,6 +6408,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ pr_err("VE info address = 0x%016llx\n",
+ vmcs_read64(VE_INFORMATION_ADDRESS));
+ }
}
/*
--
2.43.0
From: Sean Christopherson <[email protected]>
TDX support will need the "suppress #VE" bit (bit 63) set as the initial
value of an SPTE. To reduce the size of the code change, introduce a new
macro SHADOW_NONPRESENT_VALUE for the initial value of a shadow page table
entry (SPTE) and replace the hard-coded value 0 with it. Initialize shadow
page tables with this value.
The plan is to unconditionally set the "suppress #VE" bit for both AMD and
Intel, because: 1) AMD hardware uses bit 63 as NX for present SPTEs and
ignores it for non-present SPTEs; 2) for conventional VMX guests, KVM never
enables "EPT-violation #VE" in the VMCS controls, so the "suppress #VE" bit
is ignored by hardware.
No functional change intended.
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
Message-Id: <acdf09bf60cad12c495005bf3495c54f6b3069c9.1705965635.git.isaku.yamahata@intel.com>
[Remove unnecessary CONFIG_X86_64 check. - Paolo]
Reviewed-by: Xiaoyao Li <[email protected]>
Reviewed-by: Binbin Wu <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 14 +++++++++-----
arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
arch/x86/kvm/mmu/spte.h | 4 +++-
arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++------
4 files changed, 19 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 08900a0563f9..fbfdc606f1f1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -567,9 +567,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
if (!is_shadow_present_pte(old_spte))
return old_spte;
@@ -603,7 +603,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
*/
static void mmu_spte_clear_no_track(u64 *sptep)
{
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -1897,7 +1897,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- if (!sp->spt[i])
+ /* sp->spt[i] has initial value of shadow page table allocation */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
return 0;
return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@ -6120,7 +6121,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
- vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 4d4e98fe4f35..bebd73cd61bb 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(!sp->spt[i]))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index f5c600c52f83..0f4ec2859474 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -149,6 +149,8 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+#define SHADOW_NONPRESENT_VALUE 0ULL
+
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
@@ -194,7 +196,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE 0x5a0ULL
+#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c6192a52bd31..f5401967897a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -603,7 +603,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present to non-present. Use
* the raw write helper to avoid an unnecessary check on volatile bits.
*/
- __kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
return 0;
}
@@ -740,8 +740,8 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
if (!shared)
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
- else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
goto retry;
}
}
@@ -808,8 +808,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
+ SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
return true;
}
@@ -843,7 +843,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
/*
* Zappings SPTEs in invalid roots doesn't require a TLB flush,
--
2.43.0
From: Isaku Yamahata <[email protected]>
To support TDX, KVM is enhanced to operate with #VE. For TDX, KVM uses the
"suppress #VE" bit in EPT entries selectively, in order to be able to trap
non-present conditions. However, #VE isn't used for VMX, and it is a bug if
it happens. To be defensive and test that the VMX case isn't broken,
introduce the config option KVM_INTEL_PROVE_VE; when it is enabled, an
unexpected #VE bugs the VM.
Suggested-by: Paolo Bonzini <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
Message-Id: <d6db6ba836605c0412e166359ba5c46a63c22f86.1705965635.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/Kconfig | 13 ++++++++++
arch/x86/kvm/vmx/vmcs.h | 5 ++++
arch/x86/kvm/vmx/vmx.c | 53 ++++++++++++++++++++++++++++++++++++++---
arch/x86/kvm/vmx/vmx.h | 6 ++++-
4 files changed, 73 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 0ebdd088f28b..d64fb2b3eb69 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -95,6 +95,19 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_PROVE_VE
+ bool "Check that guests do not receive #VE exceptions"
+ default KVM_PROVE_MMU || DEBUG_KERNEL
+ depends on KVM_INTEL
+ help
+
+ Checks that KVM's page table management code will not incorrectly
+ let guests receive a virtualization exception. Virtualization
+ exceptions will be trapped by the hypervisor rather than injected
+ in the guest.
+
+ If unsure, say N.
+
config X86_SGX_KVM
bool "Software Guard eXtensions (SGX) Virtualization"
depends on X86_SGX && KVM_INTEL
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
return is_exception_n(intr_info, NM_VECTOR);
}
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d780eee9b697..f4644f61d770 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -869,6 +869,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
+ /*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
/*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
@@ -2602,6 +2608,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2626,6 +2635,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
@@ -4588,6 +4598,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4711,8 +4722,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls())
+ if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
+ return -EIO;
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -6409,8 +6427,22 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
- pr_err("VE info address = 0x%016llx\n",
- vmcs_read64(VE_INFORMATION_ADDRESS));
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
}
}
@@ -7466,6 +7498,7 @@ void vmx_vcpu_free(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
@@ -7559,6 +7592,20 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
__pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 65786dbe7d60..0da79a386825 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -362,6 +362,9 @@ struct vcpu_vmx {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
};
struct kvm_vmx {
@@ -574,7 +577,8 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
--
2.43.0
On Tue, May 07, 2024 at 11:44:54AM -0400,
Paolo Bonzini <[email protected]> wrote:
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index c6192a52bd31..f5401967897a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -603,7 +603,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
> * here since the SPTE is going from non-present to non-present. Use
> * the raw write helper to avoid an unnecessary check on volatile bits.
> */
> - __kvm_tdp_mmu_write_spte(iter->sptep, 0);
> + __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
>
> return 0;
> }
> @@ -740,8 +740,8 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
> continue;
>
> if (!shared)
> - tdp_mmu_iter_set_spte(kvm, &iter, 0);
> - else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
> + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> + else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
> goto retry;
> }
> }
> @@ -808,8 +808,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
> return false;
>
> - tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
> - sp->gfn, sp->role.level + 1);
> + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> + SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
>
> return true;
> }
> @@ -843,7 +843,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> !is_last_spte(iter.old_spte, iter.level))
> continue;
>
> - tdp_mmu_iter_set_spte(kvm, &iter, 0);
> + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
>
> /*
> * Zappings SPTEs in invalid roots doesn't require a TLB flush,
> --
> 2.43.0
I missed one conversion. Here is the fix. I found this while reviewing the
TDX TDP MMU changes at [1].
[1] https://lore.kernel.org/kvm/[email protected]/
Paolo, how do you want me to proceed? I can send an updated patch or you can
directly fix the patch in kvm-coco-queue. I'm fine with either way.
From 7910130c0a3f2c5d814d6f14d663b4b692a2c7e4 Mon Sep 17 00:00:00 2001
Message-ID: <7910130c0a3f2c5d814d6f14d663b4b692a2c7e4.1715793643.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <[email protected]>
Date: Wed, 15 May 2024 10:19:08 -0700
Subject: [PATCH] fixup! KVM: x86/mmu: Replace hardcoded value 0 for the
initial value for SPTE
---
arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1259dd63defc..36539c1b36cd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -626,7 +626,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* SPTEs.
*/
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- 0, iter->level, true);
+ SHADOW_NONPRESENT_VALUE, iter->level, true);
return 0;
}
base-commit: 698ca1e403579ca00e16a5b28ae4d576d9f1b20e
--
2.43.2
--
Isaku Yamahata <[email protected]>
On 5/15/24 19:32, Isaku Yamahata wrote:
> Paolo, how do you want me to proceed? I can send an updated patch or you can
> directly fix the patch in kvm-coco-queue. I'm fine with either way.
I'll fix it, thanks!
Paolo
> From 7910130c0a3f2c5d814d6f14d663b4b692a2c7e4 Mon Sep 17 00:00:00 2001
> Message-ID:<7910130c0a3f2c5d814d6f14d663b4b692a2c7e4.1715793643.git.isaku.yamahata@intel.com>
> From: Isaku Yamahata<[email protected]>
> Date: Wed, 15 May 2024 10:19:08 -0700
> Subject: [PATCH] fixup! KVM: x86/mmu: Replace hardcoded value 0 for the
> initial value for SPTE
>
> ---
> arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 1259dd63defc..36539c1b36cd 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -626,7 +626,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
> * SPTEs.
> */
> handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
> - 0, iter->level, true);
> + SHADOW_NONPRESENT_VALUE, iter->level, true);
>
> return 0;
> }
>
> base-commit: 698ca1e403579ca00e16a5b28ae4d576d9f1b20e
On Tue, May 07, 2024, Paolo Bonzini wrote:
> @@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
> if (is_invalid_opcode(intr_info))
> return handle_ud(vcpu);
>
> + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> + return -EIO;
I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
there's another bug lurking.
https://lore.kernel.org/all/[email protected]
------------[ cut here ]------------
WARNING: CPU: 6 PID: 68167 at arch/x86/kvm/vmx/vmx.c:5217 handle_exception_nmi+0xd4/0x5b0 [kvm_intel]
Modules linked in: kvm_intel kvm vfat fat dummy bridge stp llc spidev cdc_ncm cdc_eem cdc_ether usbnet mii xhci_pci xhci_hcd ehci_pci ehci_hcd gq(O) sha3_generic
CPU: 6 PID: 68167 Comm: qemu Tainted: G S O 6.9.0-smp--a3fee713d124-sigh #308
Hardware name: Google Interlaken/interlaken, BIOS 0.20231025.0-0 10/25/2023
RIP: 0010:handle_exception_nmi+0xd4/0x5b0 [kvm_intel]
Code: 03 00 80 75 4e 48 89 df be 07 00 00 00 e8 24 79 e7 ff b8 01 00 00 00 eb bd 48 8b 0b b8 fb ff ff ff 80 b9 11 9f 00 00 00 75 ac <0f> 0b 48 8b 3b 66 c7 87 11 9f 00 00 01 01 be 01 03 00 00 e8 f4 66
RSP: 0018:ff201f9afeebfb38 EFLAGS: 00010246
RAX: 00000000fffffffb RBX: ff201f5bea710000 RCX: ff43efc142e18000
RDX: 4813020000000002 RSI: 0000000000000000 RDI: ff201f5bea710000
RBP: ff201f9afeebfb70 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: ffffffffc0a3cd40 R12: 0000000080000300
R13: 0000000000000000 R14: 0000000080000314 R15: 0000000080000314
FS: 00007f65328006c0(0000) GS:ff201f993df00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 00000040b5712002 CR4: 0000000000773ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
vmx_handle_exit+0x565/0x7e0 [kvm_intel]
vcpu_run+0x188b/0x22b0 [kvm]
kvm_arch_vcpu_ioctl_run+0x358/0x680 [kvm]
kvm_vcpu_ioctl+0x4ca/0x5b0 [kvm]
__se_sys_ioctl+0x7b/0xd0
__x64_sys_ioctl+0x21/0x30
x64_sys_call+0x15ac/0x2e40
do_syscall_64+0x85/0x160
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f653422bfbb
</TASK>
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff85101206>] copy_process+0x366/0x13b0
softirqs last enabled at (0): [<ffffffff85101206>] copy_process+0x366/0x13b0
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace 0000000000000000 ]---
On Wed, May 15, 2024, Sean Christopherson wrote:
> On Tue, May 07, 2024, Paolo Bonzini wrote:
> > @@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
> > if (is_invalid_opcode(intr_info))
> > return handle_ud(vcpu);
> >
> > + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> > + return -EIO;
>
> I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
> the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
> still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
>
> I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
> there's another bug lurking.
*sigh*
AFAICT, I'm hitting a hardware issue. The #VE occurs when the CPU does an A/D
assist on an entry in the L2's PML4 (L2 GPA 0x109fff8). EPT A/D bits are disabled,
and KVM has write-protected the GPA (hooray for shadowing EPT entries). The CPU
tries to write the PML4 entry to do the A/D assist and generates what appears to
be a spurious #VE.
Isaku, please forward this to the necessary folks at Intel. I doubt whatever
is broken will block TDX, but it would be nice to get a root cause so we at least
know whether or not TDX is a ticking time bomb.
A branch with fixes (nested support for PROVE_VE is broken) and debug hooks can
be found here:
https://github.com/sean-jc/linux vmx/prove_ve_fixes
The failing KUT is nVMX's ept_access_test_not_present. It is 100% reproducible
on my system (in isolation and in sequence).
./x86/run x86/vmx.flat -smp 1 -cpu max,host-phys-bits,+vmx -m 2560 -append ept_access_test_not_present
I ruled out KVM TLB flushing bugs by doing a full INVEPT before every entry to L2.
I (more or less) ruled out KVM SPTE bugs by printing the failing translation
before every entry to L2, and adding KVM_MMU_WARN_ON() checks on the paths that
write SPTEs to assert that the SPTE value won't generate a #VE.
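Roughly this shape, illustrative only (not the exact debug code):

	/* Assert no SPTE is written that could generate a #VE. */
	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte) &&
			!is_mmio_spte(kvm, spte) &&
			!(spte & SHADOW_NONPRESENT_VALUE));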
I ruled out a completely bogus EPT Violation by simply resuming the guest without
clearing the #VE info's busy field, and verifying by tracepoints that the same
EPT violation occurs (and gets fixed by KVM).
Unless I botched the SPTE printing, which doesn't seem to be the case as the
printed SPTEs match KVM's tracepoints, I'm out of ideas.
Basic system info:
processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 106
model name : Intel(R) Xeon(R) Platinum 8373C CPU @ 2.60GHz
stepping : 6
microcode : 0xd0003b9
cpu MHz : 2600.000
cache size : 55296 KB
physical id : 0
siblings : 72
core id : 1
cpu cores : 36
address sizes : 46 bits physical, 57 bits virtual
Relevant addresses printed from the test:
PTE[4] @ 109fff8 = 9fed0007
PTE[3] @ 9fed0ff0 = 9fed1007
PTE[2] @ 9fed1000 = 9fed2007
VA PTE @ 9fed2000 = 8000000007
Created EPT @ 9feca008 = 11d2007
Created EPT @ 11d2000 = 11d3007
Created EPT @ 11d3000 = 11d4007
L1 hva = 40000000, hpa = 40000000, L2 gva = ffffffff80000000, gpa = 8000000000
And the splat from KVM, with extra printing of the exploding translation, and a
dump of the VMCS.
kvm: VM-Enter 109fff8, spte[4] = 0x8000000000000000
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x86100040f9e008f7
kvm: VM-Enter 109fff8, spte[4] = 0x80100040becb6807, spte[3] = 0x80100040911ed807, spte[2] = 0x82100040f9e008f5
------------[ cut here ]------------
WARNING: CPU: 93 PID: 16309 at arch/x86/kvm/vmx/vmx.c:5217 handle_exception_nmi+0x418/0x5d0 [kvm_intel]
Modules linked in: kvm_intel kvm vfat fat dummy bridge stp llc spidev cdc_ncm cdc_eem cdc_ether usbnet mii xhci_pci xhci_hcd ehci_pci ehci_hcd gq(O) sha3_generic [last unloaded: kvm]
CPU: 93 PID: 16309 Comm: qemu Tainted: G S W O 6.9.0-smp--317ea923d74d-vmenter #319
Hardware name: Google Interlaken/interlaken, BIOS 0.20231025.0-0 10/25/2023
RIP: 0010:handle_exception_nmi+0x418/0x5d0 [kvm_intel]
Code: 48 89 75 c8 44 0f 79 75 c8 2e 0f 86 bf 01 00 00 48 89 df be 01 00 00 00 4c 89 fa e8 f2 78 ed ff b8 01 00 00 00 e9 74 ff ff ff <0f> 0b 4c 8b b3 b8 22 00 00 41 8b 36 83 fe 30 74 09 f6 05 5a ac 01
RSP: 0018:ff3c22846acebb38 EFLAGS: 00010246
RAX: 0000000000000001 RBX: ff3c2284dff2c580 RCX: ff3c22845cba9000
RDX: 4813020000000002 RSI: 0000000000000000 RDI: ff3c2284dff2c580
RBP: ff3c22846acebb70 R08: ff3c2284a3b3a180 R09: 0000000000000001
R10: 0000000000000005 R11: ffffffffc0978d80 R12: 0000000080000300
R13: 0000000000000000 R14: 0000000080000314 R15: 0000000080000314
FS: 00007fc71fc006c0(0000) GS:ff3c22c2bf880000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000012c9fc005 CR4: 0000000000773ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
vmx_handle_exit+0x565/0x7e0 [kvm_intel]
vcpu_run+0x188b/0x22b0 [kvm]
kvm_arch_vcpu_ioctl_run+0x358/0x680 [kvm]
kvm_vcpu_ioctl+0x4ca/0x5b0 [kvm]
__se_sys_ioctl+0x7b/0xd0
__x64_sys_ioctl+0x21/0x30
x64_sys_call+0x15ac/0x2e40
do_syscall_64+0x85/0x160
? clear_bhb_loop+0x45/0xa0
? clear_bhb_loop+0x45/0xa0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fc7c5e2bfbb
Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00
RSP: 002b:00007fc71fbffbf0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 000000000000ae80 RCX: 00007fc7c5e2bfbb
RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000000c
RBP: 000055557d2ef5f0 R08: 00007fc7c600e1c8 R09: 00007fc7c67ab0b0
R10: 0000000000000123 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
</TASK>
---[ end trace 0000000000000000 ]---
kvm_intel: VMCS 0000000034d8de8f, last attempted VM-entry on CPU 93
kvm_intel: *** Guest State ***
kvm_intel: CR0: actual=0x0000000080010031, shadow=0x0000000080010031, gh_mask=fffffffffffefff7
kvm_intel: CR4: actual=0x0000000000002060, shadow=0x0000000000002020, gh_mask=fffffffffffef871
kvm_intel: CR3 = 0x000000000109f000
kvm_intel: PDPTR0 = 0x0000000000000000 PDPTR1 = 0x0000000000000000
kvm_intel: PDPTR2 = 0x0000000000000000 PDPTR3 = 0x0000000000000000
kvm_intel: RSP = 0x000000009fec6f20 RIP = 0x0000000000410d39
kvm_intel: RFLAGS=0x00010097 DR7 = 0x0000000000000400
kvm_intel: Sysenter RSP=000000009fec8000 CS:RIP=0008:00000000004001d8
kvm_intel: CS: sel=0x0008, attr=0x0a09b, limit=0xffffffff, base=0x0000000000000000
kvm_intel: DS: sel=0x0010, attr=0x0c093, limit=0xffffffff, base=0x0000000000000000
kvm_intel: SS: sel=0x0010, attr=0x0c093, limit=0xffffffff, base=0x0000000000000000
kvm_intel: ES: sel=0x0010, attr=0x0c093, limit=0xffffffff, base=0x0000000000000000
kvm_intel: FS: sel=0x0010, attr=0x0c093, limit=0xffffffff, base=0x0000000000000000
kvm_intel: GS: sel=0x0010, attr=0x0c093, limit=0xffffffff, base=0x00000000005390f0
kvm_intel: GDTR: limit=0x0000106f, base=0x000000000042aee0
kvm_intel: LDTR: sel=0x0000, attr=0x00082, limit=0x0000ffff, base=0x0000000000000000
kvm_intel: IDTR: limit=0x00000fff, base=0x000000000054aa60
kvm_intel: TR: sel=0x0080, attr=0x0008b, limit=0x0000ffff, base=0x00000000005442c0
kvm_intel: EFER= 0x0000000000000500
kvm_intel: PAT = 0x0007040600070406
kvm_intel: DebugCtl = 0x0000000000000000 DebugExceptions = 0x0000000000000000
kvm_intel: Interruptibility = 00000000 ActivityState = 00000000
kvm_intel: MSR guest autoload:
kvm_intel: 0: msr=0x00000600 value=0x0000000000000000
kvm_intel: *** Host State ***
kvm_intel: RIP = 0xffffffffc098e6c0 RSP = 0xff3c22846aceba38
kvm_intel: CS=0010 SS=0018 DS=0000 ES=0000 FS=0000 GS=0000 TR=0040
kvm_intel: FSBase=00007fc71fc006c0 GSBase=ff3c22c2bf880000 TRBase=fffffe5926d88000
kvm_intel: GDTBase=fffffe5926d86000 IDTBase=fffffe0000000000
kvm_intel: CR0=0000000080050033 CR3=000000012c9fc005 CR4=0000000000773ef0
kvm_intel: Sysenter RSP=fffffe5926d88000 CS:RIP=0010:ffffffffb7801fa0
kvm_intel: EFER= 0x0000000000000d01
kvm_intel: PAT = 0x0407050600070106
kvm_intel: MSR host autoload:
kvm_intel: 0: msr=0x00000600 value=0xfffffe5926da0000
kvm_intel: *** Control State ***
kvm_intel: CPUBased=0xa5986dfa SecondaryExec=0x02040462 TertiaryExec=0x0000000000000000
kvm_intel: PinBased=0x0000007f EntryControls=0000d3ff ExitControls=002befff
kvm_intel: ExceptionBitmap=00160042 PFECmask=00000000 PFECmatch=00000000
kvm_intel: VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
kvm_intel: VMExit: intr_info=80000314 errcode=0000fff8 ilen=00000003
kvm_intel: reason=00000000 qualification=0000000000000000
kvm_intel: IDTVectoring: info=00000000 errcode=00000000
kvm_intel: TSC Offset = 0xffcd4eeccb7b3279
kvm_intel: TSC Multiplier = 0x0001000000000000
kvm_intel: EPT pointer = 0x0000000114fd601e
kvm_intel: PLE Gap=00000000 Window=00000000
kvm_intel: Virtual processor ID = 0x0001
kvm_intel: VE info address = 0x0000000135a04000
kvm_intel: ve_info: 0x00000030 0xffffffff 0x00000000000006ab 0xffffffff80000000 0x000000000109fff8 0x0000
kvm: #VE 109fff8, spte[4] = 0x8010000136b61807, spte[3] = 0x8010000136b60807, spte[2] = 0x82100001950008f5
kvm: VM-Enter 109fff8, spte[4] = 0x8010000136b61807, spte[3] = 0x8010000136b60807, spte[2] = 0x82100001950008f5
kvm: VM-Enter 109fff8, spte[4] = 0x8010000136b61807, spte[3] = 0x8010000136b60807, spte[2] = 0x80100001b5790807, spte[1] = 0x861000019509f877
On Thu, May 16, 2024 at 06:40:02PM -0700,
Sean Christopherson <[email protected]> wrote:
> On Wed, May 15, 2024, Sean Christopherson wrote:
> > On Tue, May 07, 2024, Paolo Bonzini wrote:
> > > @@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
> > > if (is_invalid_opcode(intr_info))
> > > return handle_ud(vcpu);
> > >
> > > + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> > > + return -EIO;
> >
> > I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
> > the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
> > still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
> >
> > I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
> > there's another bug lurking.
>
> *sigh*
>
> AFAICT, I'm hitting a hardware issue. The #VE occurs when the CPU does an A/D
> assist on an entry in the L2's PML4 (L2 GPA 0x109fff8). EPT A/D bits are disabled,
> and KVM has write-protected the GPA (hooray for shadowing EPT entries). The CPU
> tries to write the PML4 entry to do the A/D assist and generates what appears to
> be a spurious #VE.
>
> Isaku, please forward this to the necessary folks at Intel. I doubt whatever
> is broken will block TDX, but it would be nice to get a root cause so we at least
> know whether or not TDX is a ticking time bomb.
Sure, let me forward it.
I tested it lightly myself. but I couldn't reproduce it.
--
Isaku Yamahata <[email protected]>
On 5/16/24 01:38, Sean Christopherson wrote:
> On Tue, May 07, 2024, Paolo Bonzini wrote:
>> @@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>> if (is_invalid_opcode(intr_info))
>> return handle_ud(vcpu);
>>
>> + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
>> + return -EIO;
>
> I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
> the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
> still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
>
> I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
> there's another bug lurking.
>
> https://lore.kernel.org/all/[email protected]
I cannot reproduce it on a Skylake (Xeon Gold 5120), with or without
Isaku's fix, with either ./runtests.sh or your reproducer line.
However I can reproduce it only if eptad=0 and with the following line:
./x86/run x86/vmx.flat -smp 1 -cpu max,host-phys-bits,+vmx -m 2560 \
-append 'ept_access_test_not_present ept_access_test_read_only'
Paolo
On Fri, May 17, 2024, Isaku Yamahata wrote:
> On Thu, May 16, 2024 at 06:40:02PM -0700,
> Sean Christopherson <[email protected]> wrote:
>
> > On Wed, May 15, 2024, Sean Christopherson wrote:
> > > On Tue, May 07, 2024, Paolo Bonzini wrote:
> > > > @@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
> > > > if (is_invalid_opcode(intr_info))
> > > > return handle_ud(vcpu);
> > > >
> > > > + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> > > > + return -EIO;
> > >
> > > I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
> > > the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
> > > still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
> > >
> > > I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
> > > there's another bug lurking.
> >
> > *sigh*
> >
> > AFAICT, I'm hitting a hardware issue. The #VE occurs when the CPU does an A/D
> > assist on an entry in the L2's PML4 (L2 GPA 0x109fff8). EPT A/D bits are disabled,
> > and KVM has write-protected the GPA (hooray for shadowing EPT entries). The CPU
> > tries to write the PML4 entry to do the A/D assist and generates what appears to
> > be a spurious #VE.
> >
> > Isaku, please forward this to the necessary folks at Intel. I doubt whatever
> > is broken will block TDX, but it would be nice to get a root cause so we at least
> > know whether or not TDX is a ticking time bomb.
>
> Sure, let me forward it.
> I tested it lightly myself. but I couldn't reproduce it.
This repros on a CLX and SKX, but not my client RPL box. I verified the same
A/D-assist write-protection EPT Violation occurs on RPL, and that PROVE_VE is
enabled, so I don't think RPL is simply getting lucky.
Unless I'm missing something, this really does look like a CPU issue.
On Fri, May 17, 2024, Paolo Bonzini wrote:
> On 5/16/24 01:38, Sean Christopherson wrote:
> > On Tue, May 07, 2024, Paolo Bonzini wrote:
> > > @@ -5200,6 +5215,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
> > > if (is_invalid_opcode(intr_info))
> > > return handle_ud(vcpu);
> > > + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> > > + return -EIO;
> >
> > I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
> > the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
> > still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
> >
> > I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
> > there's another bug lurking.
> >
> > https://lore.kernel.org/all/[email protected]
>
> I cannot reproduce it on a Skylake (Xeon Gold 5120), with or without Isaku's
> fix, with either ./runtests.sh or your reproducer line.
>
> However I can reproduce it only if eptad=0 and with the following line:
>
> ./x86/run x86/vmx.flat -smp 1 -cpu max,host-phys-bits,+vmx -m 2560 \
> -append 'ept_access_test_not_present ept_access_test_read_only'
FWIW, I tried that on RPL, still no failure.
On 5/17/24 18:38, Sean Christopherson wrote:
>>> I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
>>> the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
>>> still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
>>>
>>> I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
>>> there's another bug lurking.
>>>
>>> https://lore.kernel.org/all/[email protected]
>> I cannot reproduce it on a Skylake (Xeon Gold 5120), with or without Isaku's
>> fix, with either ./runtests.sh or your reproducer line.
>>
>> However I can reproduce it only if eptad=0 and with the following line:
>>
>> ./x86/run x86/vmx.flat -smp 1 -cpu max,host-phys-bits,+vmx -m 2560 \
>> -append 'ept_access_test_not_present ept_access_test_read_only'
>
> FWIW, I tried that on RPL, still no failure.
Ok, so it does look like a CPU issue. Even with the fixes you
identified, I don't see any other solution than adding scary text in
Kconfig, defaulting it to "n", and adding an also-very-scary
pr_err_once("...") the first time VMPTRLD is executed with
CONFIG_KVM_INTEL_PROVE_VE.
Paolo
On Fri, May 17, 2024, Paolo Bonzini wrote:
> On 5/17/24 18:38, Sean Christopherson wrote:
> > > > I've hit this three times now when running KVM-Unit-Tests (I'm pretty sure it's
> > > > the EPT test, unsurprisingly). And unless I screwed up my testing, I verified it
> > > > still fires with Isaku's fix[*], though I'm suddenly having problems repro'ing.
> > > >
> > > > I'll update tomorrow as to whether I botched my testing of Isaku's fix, or if
> > > > there's another bug lurking.
> > > >
> > > > https://lore.kernel.org/all/[email protected]
> > > I cannot reproduce it on a Skylake (Xeon Gold 5120), with or without Isaku's
> > > fix, with either ./runtests.sh or your reproducer line.
> > >
> > > However I can reproduce it only if eptad=0 and with the following line:
> > >
> > > ./x86/run x86/vmx.flat -smp 1 -cpu max,host-phys-bits,+vmx -m 2560 \
> > > -append 'ept_access_test_not_present ept_access_test_read_only'
> >
> > FWIW, I tried that on RPL, still no failure.
>
> Ok, so it does look like a CPU issue. Even with the fixes you identified, I
> don't see any other solution than adding scary text in Kconfig, defaulting
> it to "n", and adding an also-very-scary pr_err_once("...") the first time
> VMPTRLD is executed with CONFIG_KVM_INTEL_PROVE_VE.
I don't think we need to make it super scary, at least not yet. KVM just needs
to not kill the VM, which thanks to the BUSY flag is trivial: just resume the guest.
Then the failure is "just" a WARN, which won't be anywhere near as problematic for
KVM developers. I doubt syzbot will hit this, purely because syzbot runs almost
exclusively in VMs, i.e. won't have #VE support.
If we don't have a resolution by rc6 or so, then maybe consider doing something
more drastic?
I agree that it should be off by default though. And the help text should be
more clear that this intended only for developers and testing environments.
I have a handful of patches, including one to not kill the VM. I'll try to post
them later today, mostly just need to write changelogs.
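E.g., roughly this (untested sketch, just to illustrate the "don't kill the
VM" idea):

	if (is_ve_fault(intr_info)) {
		/*
		 * The #VE info page's busy field is deliberately not
		 * cleared, so the CPU will deliver an EPT violation
		 * instead of another #VE if the guest hits the same
		 * fault again; warn and resume the guest.
		 */
		WARN_ON_ONCE(1);
		return 1;
	}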
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 75082c4a9ac4..5c22186671e9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -98,15 +98,15 @@ config KVM_INTEL
config KVM_INTEL_PROVE_VE
bool "Check that guests do not receive #VE exceptions"
- default KVM_PROVE_MMU || DEBUG_KERNEL
- depends on KVM_INTEL
+ depends on KVM_INTEL && KVM_PROVE_MMU
help
-
Checks that KVM's page table management code will not incorrectly
let guests receive a virtualization exception. Virtualization
exceptions will be trapped by the hypervisor rather than injected
in the guest.
+ This should never be enabled in a production environment.
+
If unsure, say N.
config X86_SGX_KVM
On Fri, May 17, 2024 at 8:18 PM Sean Christopherson <[email protected]> wrote:
> > Ok, so it does look like a CPU issue. Even with the fixes you identified, I
> > don't see any other solution than adding scary text in Kconfig, defaulting
> > it to "n", and adding an also-very-scary pr_err_once("...") the first time
> > VMPTRLD is executed with CONFIG_KVM_INTEL_PROVE_VE.
>
> I don't think we need to make it super scary, at least not yet. KVM just needs
> to not kill the VM, which thanks to the BUSY flag is trivial: just resume the guest.
> Then the failure is "just" a WARN, which won't be anywhere near as problematic for
> KVM developers.
>
> If we don't have a resolution by rc6 or so, then maybe consider doing something
> more drastic?
>
> I agree that it should be off by default though. And the help text should be
> more clear that this intended only for developers and testing environments.
>
> I have a handful of patches, including one to not kill the VM. I'll try to post
> them later today, mostly just need to write changelogs.
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 75082c4a9ac4..5c22186671e9 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -98,15 +98,15 @@ config KVM_INTEL
>
> config KVM_INTEL_PROVE_VE
> bool "Check that guests do not receive #VE exceptions"
> - default KVM_PROVE_MMU || DEBUG_KERNEL
> - depends on KVM_INTEL
> + depends on KVM_INTEL && KVM_PROVE_MMU
> help
"depends on KVM_PROVE_MMU" is wrong, I think. I'd like to keep it
enabled without slowing down too much the VMs, for example.
On the other hand "default DEBUG_KERNEL" is definitely too heavy
with these CPU issues.
Paolo