v4:
2/3: Use WARN_ONCE to avoid a logging DoS
v3:
https://lkml.org/lkml/2017/7/10/684
3/3: Add missing nested_release_page_clean() and check the
eptp as mentioned in SDM 24.6.14
v2:
https://lkml.org/lkml/2017/7/6/813
1/3: Patch to enable vmfunc on the host but cause a #UD if
L1 tries to use it directly. (new)
2/3: Expose vmfunc to the nested hypervisor, but no vm functions
are exposed and L0 emulates a vmfunc vmexit to L1.
3/3: Force a vmfunc vmexit when L2 tries to use vmfunc and emulate
eptp switching. Unconditionally expose EPTP switching to the
L1 hypervisor since L0 fakes eptp switching via an MMU reload.
These patches expose eptp switching/vmfunc to the nested hypervisor.
vmfunc is enabled in the secondary controls for the host and exposed
to the nested hypervisor; if the nested hypervisor decides to use
eptp switching, L0 emulates it.
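For reference, L2 triggers the emulated function with EAX selecting the
VM function (0 for EPTP switching) and ECX the EPTP-list index. A purely
illustrative guest-side snippet, not part of these patches:
/* illustrative only: invoke VMFUNC from a (nested) guest */
static inline void vmfunc_eptp_switch(unsigned int index)
{
	/* VMFUNC is 0f 01 d4; older assemblers lack the mnemonic */
	asm volatile(".byte 0x0f, 0x01, 0xd4"
		     : : "a" (0), "c" (index) : "memory");
}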
v1:
https://lkml.org/lkml/2017/6/29/958
Bandan Das (3):
KVM: vmx: Enable VMFUNCs
KVM: nVMX: Enable VMFUNC for the L1 hypervisor
KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
arch/x86/include/asm/vmx.h | 9 ++++
arch/x86/kvm/vmx.c | 125 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 132 insertions(+), 2 deletions(-)
--
2.9.4
Expose VMFUNC in MSRs and VMCS fields. No actual VMFUNCs are enabled.
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Bandan Das <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
---
arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 51 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a483b49..fe8f5fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -240,6 +240,7 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 posted_intr_desc_addr;
+ u64 vm_function_control;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
@@ -481,6 +482,7 @@ struct nested_vmx {
u64 nested_vmx_cr4_fixed0;
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
+ u64 nested_vmx_vmfunc_controls;
};
#define POSTED_INTR_ON 0
@@ -763,6 +765,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+ FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -1394,6 +1397,11 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2780,6 +2788,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ if (cpu_has_vmx_vmfunc()) {
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ vmx->nested.nested_vmx_vmfunc_controls = 0;
+ }
+
/*
* Old versions of KVM use the single-context version without
* checking for support, so declare that it is supported even
@@ -3149,6 +3163,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = vmx->nested.nested_vmx_ept_caps |
((u64)vmx->nested.nested_vmx_vpid_caps << 32);
break;
+ case MSR_IA32_VMX_VMFUNC:
+ *pdata = vmx->nested.nested_vmx_vmfunc_controls;
+ break;
default:
return 1;
}
@@ -7752,7 +7769,29 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception(vcpu, UD_VECTOR);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+ /*
+ * VMFUNC is only supported for nested guests, but we always enable the
+ * secondary control for simplicity; for non-nested mode, fake that we
+ * didn't by injecting #UD.
+ */
+ if (!is_guest_mode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if ((vmcs12->vm_function_control & (1 << function)) == 0)
+ goto fail;
+ WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+fail:
+ nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_readl(EXIT_QUALIFICATION));
return 1;
}
@@ -10053,7 +10092,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_ENABLE_VMFUNC);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
@@ -10061,6 +10101,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control |= vmcs12_exec_ctrl;
}
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
vmcs_write64(EOI_EXIT_BITMAP0,
vmcs12->eoi_exit_bitmap0);
@@ -10310,6 +10354,11 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx->nested.nested_vmx_entry_ctls_high))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ ~vmx->nested.nested_vmx_vmfunc_controls))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
--
2.9.4
When L2 uses vmfunc, L0 utilizes the associated vmexit to
emulate a switching of the ept pointer by reloading the
guest MMU.
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Bandan Das <[email protected]>
---
arch/x86/include/asm/vmx.h | 6 +++++
arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da5375e..5f63a2e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -115,6 +115,10 @@
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
+#define VMFUNC_EPTP_ENTRIES 512
+
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
{
return vmx_basic & GENMASK_ULL(30, 0);
@@ -200,6 +204,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP2_HIGH = 0x00002021,
EOI_EXIT_BITMAP3 = 0x00002022,
EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ EPTP_LIST_ADDRESS = 0x00002024,
+ EPTP_LIST_ADDRESS_HIGH = 0x00002025,
VMREAD_BITMAP = 0x00002026,
VMWRITE_BITMAP = 0x00002028,
XSS_EXIT_BITMAP = 0x0000202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe8f5fc..0a969fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -246,6 +246,7 @@ struct __packed vmcs12 {
u64 eoi_exit_bitmap1;
u64 eoi_exit_bitmap2;
u64 eoi_exit_bitmap3;
+ u64 eptp_list_address;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+ FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
}
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ VMX_VMFUNC_EPTP_SWITCHING);
+}
+
static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
if (cpu_has_vmx_vmfunc()) {
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VMFUNC;
- vmx->nested.nested_vmx_vmfunc_controls = 0;
+ /*
+ * Advertise EPTP switching unconditionally
+ * since we emulate it
+ */
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
}
/*
@@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12;
u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+ u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+ struct page *page = NULL;
+ u64 *l1_eptp_list, address;
/*
* VMFUNC is only supported for nested guests, but we always enable the
@@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
}
vmcs12 = get_vmcs12(vcpu);
- if ((vmcs12->vm_function_control & (1 << function)) == 0)
+ if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
+ WARN_ON_ONCE(function))
+ goto fail;
+
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !nested_cpu_has_eptp_switching(vmcs12))
+ goto fail;
+
+ if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
+ goto fail;
+
+ page = nested_get_page(vcpu, vmcs12->eptp_list_address);
+ if (!page)
goto fail;
- WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+ l1_eptp_list = kmap(page);
+ address = l1_eptp_list[index];
+ if (!address)
+ goto fail;
+ /*
+ * If the (L2) guest does a vmfunc to the currently
+ * active ept pointer, we don't have to do anything else
+ */
+ if (vmcs12->ept_pointer != address) {
+ if (address >> cpuid_maxphyaddr(vcpu) ||
+ !IS_ALIGNED(address, 4096))
+ goto fail;
+ kvm_mmu_unload(vcpu);
+ vmcs12->ept_pointer = address;
+ kvm_mmu_reload(vcpu);
+ kunmap(page);
+ nested_release_page_clean(page);
+ }
+ return kvm_skip_emulated_instruction(vcpu);
fail:
+ if (page) {
+ kunmap(page);
+ nested_release_page_clean(page);
+ }
nested_vmx_vmexit(vcpu, vmx->exit_reason,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
--
2.9.4
Enable VMFUNC in the secondary execution controls. This simplifies the
changes necessary to expose it to nested hypervisors. VMFUNCs still
cause #UD when invoked.
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Bandan Das <[email protected]>
---
arch/x86/include/asm/vmx.h | 3 +++
arch/x86/kvm/vmx.c | 22 +++++++++++++++++++++-
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 35cd06f..da5375e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,6 +72,7 @@
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_RDRAND 0x00000800
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_RDSEED 0x00010000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
@@ -187,6 +188,8 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH = 0x00002015,
POSTED_INTR_DESC_ADDR = 0x00002016,
POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ VM_FUNCTION_CONTROL = 0x00002018,
+ VM_FUNCTION_CONTROL_HIGH = 0x00002019,
EPT_POINTER = 0x0000201a,
EPT_POINTER_HIGH = 0x0000201b,
EOI_EXIT_BITMAP0 = 0x0000201c,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b9..a483b49 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1314,6 +1314,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
SECONDARY_EXEC_TSC_SCALING;
}
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -3575,7 +3581,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_PML |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_VMFUNC;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -5233,6 +5240,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
@@ -7740,6 +7750,12 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7790,6 +7806,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_VMFUNC] = handle_vmfunc,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
@@ -8111,6 +8128,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_PML_FULL:
/* We emulate PML support to L1. */
return false;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return false;
default:
return true;
}
--
2.9.4
On 10.07.2017 22:49, Bandan Das wrote:
> When L2 uses vmfunc, L0 utilizes the associated vmexit to
> emulate a switching of the ept pointer by reloading the
> guest MMU.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> Signed-off-by: Bandan Das <[email protected]>
> ---
> arch/x86/include/asm/vmx.h | 6 +++++
> arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 61 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index da5375e..5f63a2e 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -115,6 +115,10 @@
> #define VMX_MISC_SAVE_EFER_LMA 0x00000020
> #define VMX_MISC_ACTIVITY_HLT 0x00000040
>
> +/* VMFUNC functions */
> +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
> +#define VMFUNC_EPTP_ENTRIES 512
> +
> static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
> {
> return vmx_basic & GENMASK_ULL(30, 0);
> @@ -200,6 +204,8 @@ enum vmcs_field {
> EOI_EXIT_BITMAP2_HIGH = 0x00002021,
> EOI_EXIT_BITMAP3 = 0x00002022,
> EOI_EXIT_BITMAP3_HIGH = 0x00002023,
> + EPTP_LIST_ADDRESS = 0x00002024,
> + EPTP_LIST_ADDRESS_HIGH = 0x00002025,
> VMREAD_BITMAP = 0x00002026,
> VMWRITE_BITMAP = 0x00002028,
> XSS_EXIT_BITMAP = 0x0000202C,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index fe8f5fc..0a969fb 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
> u64 eoi_exit_bitmap1;
> u64 eoi_exit_bitmap2;
> u64 eoi_exit_bitmap3;
> + u64 eptp_list_address;
> u64 xss_exit_bitmap;
> u64 guest_physical_address;
> u64 vmcs_link_pointer;
> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
> FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
> FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
> FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
> + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
> FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
> FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
> FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
> return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
> }
>
> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
> +{
> + return nested_cpu_has_vmfunc(vmcs12) &&
> + (vmcs12->vm_function_control &
I wonder if it makes sense to rename vm_function_control to
- vmfunc_control
- vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
- vmfunc_ctrl
> + VMX_VMFUNC_EPTP_SWITCHING);
> +}
> +
> static inline bool is_nmi(u32 intr_info)
> {
> return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
> if (cpu_has_vmx_vmfunc()) {
> vmx->nested.nested_vmx_secondary_ctls_high |=
> SECONDARY_EXEC_ENABLE_VMFUNC;
> - vmx->nested.nested_vmx_vmfunc_controls = 0;
> + /*
> + * Advertise EPTP switching unconditionally
> + * since we emulate it
> + */
> + vmx->nested.nested_vmx_vmfunc_controls =
> + VMX_VMFUNC_EPTP_SWITCHING;> }
>
> /*
> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> struct vmcs12 *vmcs12;
> u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
> + u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
> + struct page *page = NULL;
> + u64 *l1_eptp_list, address;
>
> /*
> * VMFUNC is only supported for nested guests, but we always enable the
> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> }
>
> vmcs12 = get_vmcs12(vcpu);
> - if ((vmcs12->vm_function_control & (1 << function)) == 0)
> + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> + WARN_ON_ONCE(function))
"... instruction causes a VM exit if the bit at position EAX is 0 in the
VM-function controls (the selected VM function is
not enabled)."
So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
completely.
> + goto fail;
> +
> + if (!nested_cpu_has_ept(vmcs12) ||
> + !nested_cpu_has_eptp_switching(vmcs12))
> + goto fail;
> +
> + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
> + goto fail;
I can find the definition for a vmexit in case of index >=
VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
Can you give me a hint?
> +
> + page = nested_get_page(vcpu, vmcs12->eptp_list_address);
> + if (!page)
> goto fail;
> - WARN_ONCE(1, "VMCS12 VM function control should have been zero");
> +
> + l1_eptp_list = kmap(page);
> + address = l1_eptp_list[index];
> + if (!address)
> + goto fail;
Can you move that check to the other address checks below? (or rework if
this makes sense, see below)
> + /*
> + * If the (L2) guest does a vmfunc to the currently
> + * active ept pointer, we don't have to do anything else
> + */
> + if (vmcs12->ept_pointer != address) {
> + if (address >> cpuid_maxphyaddr(vcpu) ||
> + !IS_ALIGNED(address, 4096))
Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
(triggering a KVM_REQ_TRIPLE_FAULT)
> + goto fail;
> + kvm_mmu_unload(vcpu);
> + vmcs12->ept_pointer = address;
> + kvm_mmu_reload(vcpu);
I was thinking about something like this:
kvm_mmu_unload(vcpu);
old = vmcs12->ept_pointer;
vmcs12->ept_pointer = address;
if (kvm_mmu_reload(vcpu)) {
/* pointer invalid, restore previous state */
kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
vmcs12->ept_pointer = old;
kvm_mmu_reload(vcpu);
goto fail;
}
Then you can inherit the checks from mmu_check_root().
Wonder why I can't spot checks for cpuid_maxphyaddr() /
IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
checks should be identical.
> + kunmap(page);
> + nested_release_page_clean(page);
shouldn't the kunmap + nested_release_page_clean go outside the if clause?
> + }
> + return kvm_skip_emulated_instruction(vcpu);
>
> fail:
> + if (page) {
> + kunmap(page);
> + nested_release_page_clean(page);
> + }
> nested_vmx_vmexit(vcpu, vmx->exit_reason,
> vmcs_read32(VM_EXIT_INTR_INFO),
> vmcs_readl(EXIT_QUALIFICATION));
>
David and mmu code are not yet best friends. So sorry if I am missing
something.
--
Thanks,
David
On 11/07/2017 09:51, David Hildenbrand wrote:
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> + return nested_cpu_has_vmfunc(vmcs12) &&
>> + (vmcs12->vm_function_control &
>
> I wonder if it makes sense to rename vm_function_control to
> - vmfunc_control
> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
> - vmfunc_ctrl
Blame Intel for this. :) They use full English names for VMCS fields (so
"VM-function control") and uppercase names for MSRs (so "IA32_VMX_VMFUNC").
"nested_vmx_vmfunc_controls" could be renamed to "nested_vmx_vmfunc" for
consistency, but I like the longer name too.
Paolo
[David did a great review, so I'll just point out things I noticed.]
2017-07-11 09:51+0200, David Hildenbrand:
> On 10.07.2017 22:49, Bandan Das wrote:
> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
> > emulate a switching of the ept pointer by reloading the
> > guest MMU.
> >
> > Signed-off-by: Paolo Bonzini <[email protected]>
> > Signed-off-by: Bandan Das <[email protected]>
> > ---
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> > }
> >
> > vmcs12 = get_vmcs12(vcpu);
> > - if ((vmcs12->vm_function_control & (1 << function)) == 0)
> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> > + WARN_ON_ONCE(function))
>
> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> VM-function controls (the selected VM function is
> not enabled)."
>
> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> completely.
It assumes that vm_function_control is not > 1, which is (should be)
guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
is 1.
> > + goto fail;
The rest of the code assumes that the function is
VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
Writing it as
WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
would be clearer and I'd prefer to move the part that handles
VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
going to add more than one VM FUNC. :])
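Something like this untested sketch, reusing most of the checks from this
patch (note the function number is 0, while VMX_VMFUNC_EPTP_SWITCHING is
bit 0 of the VM-function controls); hoisting the kunmap also addresses
David's point about the if clause:
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
	u64 *l1_eptp_list, address;
	struct page *page;

	if (!nested_cpu_has_eptp_switching(vmcs12) ||
	    !nested_cpu_has_ept(vmcs12))
		return 1;

	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;

	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
	if (!page)
		return 1;

	l1_eptp_list = kmap(page);
	address = l1_eptp_list[index];
	kunmap(page);
	nested_release_page_clean(page);

	/* switching to the already-active pointer is a nop */
	if (vmcs12->ept_pointer != address) {
		if (address >> cpuid_maxphyaddr(vcpu) ||
		    !IS_ALIGNED(address, 4096))
			return 1;
		kvm_mmu_unload(vcpu);
		vmcs12->ept_pointer = address;
		kvm_mmu_reload(vcpu);
	}
	return 0;
}
and in handle_vmfunc(), after the vm_function_control check:
	switch (function) {
	case 0:		/* EPTP switching */
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);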
> > + if (!nested_cpu_has_ept(vmcs12) ||
> > + !nested_cpu_has_eptp_switching(vmcs12))
> > + goto fail;
This brings me to a missing vm-entry check:
If “EPTP switching” VM-function control is 1, the “enable EPT”
VM-execution control must also be 1. In addition, the EPTP-list address
must satisfy the following checks:
• Bits 11:0 of the address must be 0.
• The address must not set any bits beyond the processor’s
physical-address width.
so this one could be
if (!nested_cpu_has_eptp_switching(vmcs12) ||
WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
after adding the check.
David Hildenbrand <[email protected]> writes:
> On 10.07.2017 22:49, Bandan Das wrote:
>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> emulate a switching of the ept pointer by reloading the
>> guest MMU.
>>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> Signed-off-by: Bandan Das <[email protected]>
>> ---
>> arch/x86/include/asm/vmx.h | 6 +++++
>> arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++---
>> 2 files changed, 61 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index da5375e..5f63a2e 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -115,6 +115,10 @@
>> #define VMX_MISC_SAVE_EFER_LMA 0x00000020
>> #define VMX_MISC_ACTIVITY_HLT 0x00000040
>>
>> +/* VMFUNC functions */
>> +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
>> +#define VMFUNC_EPTP_ENTRIES 512
>> +
>> static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>> {
>> return vmx_basic & GENMASK_ULL(30, 0);
>> @@ -200,6 +204,8 @@ enum vmcs_field {
>> EOI_EXIT_BITMAP2_HIGH = 0x00002021,
>> EOI_EXIT_BITMAP3 = 0x00002022,
>> EOI_EXIT_BITMAP3_HIGH = 0x00002023,
>> + EPTP_LIST_ADDRESS = 0x00002024,
>> + EPTP_LIST_ADDRESS_HIGH = 0x00002025,
>> VMREAD_BITMAP = 0x00002026,
>> VMWRITE_BITMAP = 0x00002028,
>> XSS_EXIT_BITMAP = 0x0000202C,
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index fe8f5fc..0a969fb 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>> u64 eoi_exit_bitmap1;
>> u64 eoi_exit_bitmap2;
>> u64 eoi_exit_bitmap3;
>> + u64 eptp_list_address;
>> u64 xss_exit_bitmap;
>> u64 guest_physical_address;
>> u64 vmcs_link_pointer;
>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>> FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>> FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>> FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>> + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>> FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>> FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>> FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>> return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>> }
>>
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> + return nested_cpu_has_vmfunc(vmcs12) &&
>> + (vmcs12->vm_function_control &
>
> I wonder if it makes sense to rename vm_function_control to
> - vmfunc_control
> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
> - vmfunc_ctrl
I tend to follow the SDM names because it's easy to look for them.
>> + VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>> static inline bool is_nmi(u32 intr_info)
>> {
>> return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>> if (cpu_has_vmx_vmfunc()) {
>> vmx->nested.nested_vmx_secondary_ctls_high |=
>> SECONDARY_EXEC_ENABLE_VMFUNC;
>> - vmx->nested.nested_vmx_vmfunc_controls = 0;
>> + /*
>> + * Advertise EPTP switching unconditionally
>> + * since we emulate it
>> + */
>> + vmx->nested.nested_vmx_vmfunc_controls =
>> + VMX_VMFUNC_EPTP_SWITCHING;> }
>>
>> /*
>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>> struct vmcs12 *vmcs12;
>> u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>> + u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> + struct page *page = NULL;
>> + u64 *l1_eptp_list, address;
>>
>> /*
>> * VMFUNC is only supported for nested guests, but we always enable the
>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> }
>>
>> vmcs12 = get_vmcs12(vcpu);
>> - if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> + WARN_ON_ONCE(function))
>
> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> VM-function controls (the selected VM function is
> not enabled)."
>
> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> completely.
It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
not misused.
>> + goto fail;
>> +
>> + if (!nested_cpu_has_ept(vmcs12) ||
>> + !nested_cpu_has_eptp_switching(vmcs12))
>> + goto fail;
>> +
>> + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>> + goto fail;
>
> I can find the definition for a vmexit in case of index >=
> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>
> Can you give me a hint?
I don't think there is. Since we are basically emulating eptp switching
for L2, this is a good check to have.
>> +
>> + page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> + if (!page)
>> goto fail;
>> - WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>> +
>> + l1_eptp_list = kmap(page);
>> + address = l1_eptp_list[index];
>> + if (!address)
>> + goto fail;
>
> Can you move that check to the other address checks below? (or rework if
> this makes sense, see below)
>
>> + /*
>> + * If the (L2) guest does a vmfunc to the currently
>> + * active ept pointer, we don't have to do anything else
>> + */
>> + if (vmcs12->ept_pointer != address) {
>> + if (address >> cpuid_maxphyaddr(vcpu) ||
>> + !IS_ALIGNED(address, 4096))
>
> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
> (triggering a KVM_REQ_TRIPLE_FAULT)
If there's a triple fault, I think it's a good idea to inject it
back. Basically, there's no need to take care of damage control
that L1 is intentionally doing.
>> + goto fail;
>> + kvm_mmu_unload(vcpu);
>> + vmcs12->ept_pointer = address;
>> + kvm_mmu_reload(vcpu);
>
> I was thinking about something like this:
>
> kvm_mmu_unload(vcpu);
> old = vmcs12->ept_pointer;
> vmcs12->ept_pointer = address;
> if (kvm_mmu_reload(vcpu)) {
> /* pointer invalid, restore previous state */
> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> vmcs12->ept_pointer = old;
> kvm_mmu_reload(vcpu);
> goto fail;
> }
>
> Then you can inherit the checks from mmu_check_root().
>
>
> Wonder why I can't spot checks for cpuid_maxphyaddr() /
> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
> checks should be identical.
I think the reason is vmcs12->ept_pointer is never used directly. It's
used to create a shadow table but nevertheless, the check should be there.
>
>> + kunmap(page);
>> + nested_release_page_clean(page);
>
> shouldn't the kunmap + nested_release_page_clean go outside the if clause?
:) Indeed, thanks for the catch.
Bandan
>> + }
>> + return kvm_skip_emulated_instruction(vcpu);
>>
>> fail:
>> + if (page) {
>> + kunmap(page);
>> + nested_release_page_clean(page);
>> + }
>> nested_vmx_vmexit(vcpu, vmx->exit_reason,
>> vmcs_read32(VM_EXIT_INTR_INFO),
>> vmcs_readl(EXIT_QUALIFICATION));
>>
>
> David and mmu code are not yet best friends. So sorry if I am missing
> something.
Radim Krčmář <[email protected]> writes:
> [David did a great review, so I'll just point out things I noticed.]
>
> 2017-07-11 09:51+0200, David Hildenbrand:
>> On 10.07.2017 22:49, Bandan Das wrote:
>> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> > emulate a switching of the ept pointer by reloading the
>> > guest MMU.
>> >
>> > Signed-off-by: Paolo Bonzini <[email protected]>
>> > Signed-off-by: Bandan Das <[email protected]>
>> > ---
>> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> > }
>> >
>> > vmcs12 = get_vmcs12(vcpu);
>> > - if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> > + WARN_ON_ONCE(function))
>>
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>>
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It assumes that vm_function_control is not > 1, which is (should be)
> guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
> is 1.
>
>> > + goto fail;
>
> The rest of the code assumes that the function is
> VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
>
> Writing it as
>
> WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
>
> would be clearer and I'd prefer to move the part that handles
> VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
> going to add more than one VM FUNC. :])
IMO, for now, this should be fine because we are not even passing through the
hardware's eptp switching. Even if there are other vm functions, they
won't be available for the nested case, so they can't cause any conflict.
>> > + if (!nested_cpu_has_ept(vmcs12) ||
>> > + !nested_cpu_has_eptp_switching(vmcs12))
>> > + goto fail;
>
> This brings me to a missing vm-entry check:
>
> If “EPTP switching” VM-function control is 1, the “enable EPT”
> VM-execution control must also be 1. In addition, the EPTP-list address
> must satisfy the following checks:
> • Bits 11:0 of the address must be 0.
> • The address must not set any bits beyond the processor’s
> physical-address width.
>
> so this one could be
>
> if (!nested_cpu_has_eptp_switching(vmcs12) ||
> WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
I will reverse the order here but the vm entry check is unnecessary because
the check on the list address is already being done in this function.
> after adding the check.
On Tue, Jul 11, 2017 at 10:58 AM, Bandan Das <[email protected]> wrote:
> David Hildenbrand <[email protected]> writes:
>
>> On 10.07.2017 22:49, Bandan Das wrote:
>>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>>> emulate a switching of the ept pointer by reloading the
>>> guest MMU.
>>>
>>> Signed-off-by: Paolo Bonzini <[email protected]>
>>> Signed-off-by: Bandan Das <[email protected]>
>>> ---
>>> arch/x86/include/asm/vmx.h | 6 +++++
>>> arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++---
>>> 2 files changed, 61 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>> index da5375e..5f63a2e 100644
>>> --- a/arch/x86/include/asm/vmx.h
>>> +++ b/arch/x86/include/asm/vmx.h
>>> @@ -115,6 +115,10 @@
>>> #define VMX_MISC_SAVE_EFER_LMA 0x00000020
>>> #define VMX_MISC_ACTIVITY_HLT 0x00000040
>>>
>>> +/* VMFUNC functions */
>>> +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
>>> +#define VMFUNC_EPTP_ENTRIES 512
>>> +
>>> static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>> {
>>> return vmx_basic & GENMASK_ULL(30, 0);
>>> @@ -200,6 +204,8 @@ enum vmcs_field {
>>> EOI_EXIT_BITMAP2_HIGH = 0x00002021,
>>> EOI_EXIT_BITMAP3 = 0x00002022,
>>> EOI_EXIT_BITMAP3_HIGH = 0x00002023,
>>> + EPTP_LIST_ADDRESS = 0x00002024,
>>> + EPTP_LIST_ADDRESS_HIGH = 0x00002025,
>>> VMREAD_BITMAP = 0x00002026,
>>> VMWRITE_BITMAP = 0x00002028,
>>> XSS_EXIT_BITMAP = 0x0000202C,
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index fe8f5fc..0a969fb 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>>> u64 eoi_exit_bitmap1;
>>> u64 eoi_exit_bitmap2;
>>> u64 eoi_exit_bitmap3;
>>> + u64 eptp_list_address;
>>> u64 xss_exit_bitmap;
>>> u64 guest_physical_address;
>>> u64 vmcs_link_pointer;
>>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>>> FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>>> FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>>> FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>>> + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>>> FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>>> FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>>> FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>>> return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>>> }
>>>
>>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>>> +{
>>> + return nested_cpu_has_vmfunc(vmcs12) &&
>>> + (vmcs12->vm_function_control &
>>
>> I wonder if it makes sense to rename vm_function_control to
>> - vmfunc_control
>> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
>> - vmfunc_ctrl
>
> I tend to follow the SDM names because it's easy to look for them.
>
>>> + VMX_VMFUNC_EPTP_SWITCHING);
>>> +}
>>> +
>>> static inline bool is_nmi(u32 intr_info)
>>> {
>>> return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>> if (cpu_has_vmx_vmfunc()) {
>>> vmx->nested.nested_vmx_secondary_ctls_high |=
>>> SECONDARY_EXEC_ENABLE_VMFUNC;
>>> - vmx->nested.nested_vmx_vmfunc_controls = 0;
>>> + /*
>>> + * Advertise EPTP switching unconditionally
>>> + * since we emulate it
>>> + */
>>> + vmx->nested.nested_vmx_vmfunc_controls =
>>> + VMX_VMFUNC_EPTP_SWITCHING;> }
>>>
>>> /*
>>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> struct vmcs12 *vmcs12;
>>> u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>>> + u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>>> + struct page *page = NULL;
>>> + u64 *l1_eptp_list, address;
>>>
>>> /*
>>> * VMFUNC is only supported for nested guests, but we always enable the
>>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>> }
>>>
>>> vmcs12 = get_vmcs12(vcpu);
>>> - if ((vmcs12->vm_function_control & (1 << function)) == 0)
>>> + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>>> + WARN_ON_ONCE(function))
>>
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>>
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
> not misused.
>
>>> + goto fail;
>>> +
>>> + if (!nested_cpu_has_ept(vmcs12) ||
>>> + !nested_cpu_has_eptp_switching(vmcs12))
>>> + goto fail;
>>> +
>>> + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>>> + goto fail;
>>
>> I can find the definition for a vmexit in case of index >=
>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>
>> Can you give me a hint?
>
> I don't think there is. Since we are basically emulating eptp switching
> for L2, this is a good check to have.
There is nothing wrong with a hypervisor using physical page 0 for
whatever purpose it likes, including an EPTP list.
>>> +
>>> + page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>>> + if (!page)
>>> goto fail;
>>> - WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>>> +
>>> + l1_eptp_list = kmap(page);
>>> + address = l1_eptp_list[index];
>>> + if (!address)
>>> + goto fail;
>>
>> Can you move that check to the other address checks below? (or rework if
>> this makes sense, see below)
>>
>>> + /*
>>> + * If the (L2) guest does a vmfunc to the currently
>>> + * active ept pointer, we don't have to do anything else
>>> + */
>>> + if (vmcs12->ept_pointer != address) {
>>> + if (address >> cpuid_maxphyaddr(vcpu) ||
>>> + !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> + goto fail;
>>> + kvm_mmu_unload(vcpu);
>>> + vmcs12->ept_pointer = address;
>>> + kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>> /* pointer invalid, restore previous state */
>> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> vmcs12->ept_pointer = old;
>> kvm_mmu_reload(vcpu);
>> goto fail;
>> }
>>
>> Then you can inherit the checks from mmu_check_root().
>>
>>
>> Wonder why I can't spot checks for cpuid_maxphyaddr() /
>> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
>> checks should be identical.
>
> I think the reason is vmcs12->ept_pointer is never used directly. It's
> used to create a shadow table but nevertheless, the check should be there.
>
>>
>>> + kunmap(page);
>>> + nested_release_page_clean(page);
>>
>> shouldn't the kunmap + nested_release_page_clean go outside the if clause?
>
> :) Indeed, thanks for the catch.
>
> Bandan
>
>>> + }
>>> + return kvm_skip_emulated_instruction(vcpu);
>>>
>>> fail:
>>> + if (page) {
>>> + kunmap(page);
>>> + nested_release_page_clean(page);
>>> + }
>>> nested_vmx_vmexit(vcpu, vmx->exit_reason,
>>> vmcs_read32(VM_EXIT_INTR_INFO),
>>> vmcs_readl(EXIT_QUALIFICATION));
>>>
>>
>> David and mmu code are not yet best friends. So sorry if I am missing
>> something.
Bandan Das <[email protected]> writes:
....
>>> + /*
>>> + * If the (L2) guest does a vmfunc to the currently
>>> + * active ept pointer, we don't have to do anything else
>>> + */
>>> + if (vmcs12->ept_pointer != address) {
>>> + if (address >> cpuid_maxphyaddr(vcpu) ||
>>> + !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> + goto fail;
>>> + kvm_mmu_unload(vcpu);
>>> + vmcs12->ept_pointer = address;
>>> + kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>> /* pointer invalid, restore previous state */
>> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> vmcs12->ept_pointer = old;
>> kvm_mmu_reload(vcpu);
>> goto fail;
>> }
>>
>> Then you can inherit the checks from mmu_check_root().
Actually, thinking about this a bit more, I agree with you. Any fault
with a vmfunc operation should end with a vmfunc vmexit, so this
is a good thing to have. Thank you for this idea! :)
Bandan
Jim Mattson <[email protected]> writes:
...
>>> I can find the definition for a vmexit in case of index >=
>>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>>
>>> Can you give me a hint?
>>
>> I don't think there is. Since we are basically emulating eptp switching
>> for L2, this is a good check to have.
>
> There is nothing wrong with a hypervisor using physical page 0 for
> whatever purpose it likes, including an EPTP list.
Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
address most likely means it forgot to initialize it. Whatever damage it does will
still end up with a vmfunc vmexit anyway.
Bandan
2017-07-11 14:05-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
>
> > [David did a great review, so I'll just point out things I noticed.]
> >
> > 2017-07-11 09:51+0200, David Hildenbrand:
> >> On 10.07.2017 22:49, Bandan Das wrote:
> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
> >> > emulate a switching of the ept pointer by reloading the
> >> > guest MMU.
> >> >
> >> > Signed-off-by: Paolo Bonzini <[email protected]>
> >> > Signed-off-by: Bandan Das <[email protected]>
> >> > ---
> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> >> > }
> >> >
> >> > vmcs12 = get_vmcs12(vcpu);
> >> > - if ((vmcs12->vm_function_control & (1 << function)) == 0)
> >> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> >> > + WARN_ON_ONCE(function))
> >>
> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> >> VM-function controls (the selected VM function is
> >> not enabled)."
> >>
> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> >> completely.
> >
> > It assumes that vm_function_control is not > 1, which is (should be)
> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
> > is 1.
> >
> >> > + goto fail;
> >
> > The rest of the code assumes that the function is
> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
> >
> > Writing it as
> >
> > WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
> >
> > would be clearer and I'd prefer to move the part that handles
> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
> > going to add more than one VM FUNC. :])
>
> IMO, for now, this should be fine because we are not even passing through the
> hardware's eptp switching. Even if there are other vm functions, they
> won't be available for the nested case, so they can't cause any conflict.
Yeah, it is fine function-wise, I was just pointing out that it looks
ugly to me.
Btw. have you looked at what we'd need to do for the hardware pass-through?
I'd expect big changes to MMU. :)
> >> > + if (!nested_cpu_has_ept(vmcs12) ||
> >> > + !nested_cpu_has_eptp_switching(vmcs12))
> >> > + goto fail;
> >
> > This brings me to a missing vm-entry check:
> >
> > If “EPTP switching” VM-function control is 1, the “enable EPT”
> > VM-execution control must also be 1. In addition, the EPTP-list address
> > must satisfy the following checks:
> > • Bits 11:0 of the address must be 0.
> > • The address must not set any bits beyond the processor’s
> > physical-address width.
> >
> > so this one could be
> >
> > if (!nested_cpu_has_eptp_switching(vmcs12) ||
> > WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
>
> I will reverse the order here but the vm entry check is unnecessary because
> the check on the list address is already being done in this function.
Here it is too late; the nested VM-entry should have failed, never letting
this situation happen. We want an equivalent of
if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
in nested controls checks, right next to the reserved fields check.
And then also the EPTP-list check. All of them only checked when
nested_cpu_has_vmfunc(vmcs12).
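i.e. a sketch (the vm_function_control subset check is already in 2/3;
the list-address check mirrors the one handle_vmfunc does):
	if (nested_cpu_has_vmfunc(vmcs12)) {
		if (vmcs12->vm_function_control &
		    ~vmx->nested.nested_vmx_vmfunc_controls)
			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

		if (nested_cpu_has_eptp_switching(vmcs12) &&
		    (!nested_cpu_has_ept(vmcs12) ||
		     (vmcs12->eptp_list_address >> cpuid_maxphyaddr(vcpu)) ||
		     !IS_ALIGNED(vmcs12->eptp_list_address, 4096)))
			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
	}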
2017-07-11 14:35-0400, Bandan Das:
> Jim Mattson <[email protected]> writes:
> ...
> >>> I can find the definition for a vmexit in case of index >=
> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >>>
> >>> Can you give me a hint?
> >>
> >> I don't think there is. Since we are basically emulating eptp switching
> >> for L2, this is a good check to have.
> >
> > There is nothing wrong with a hypervisor using physical page 0 for
> > whatever purpose it likes, including an EPTP list.
>
> Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
> address most likely means it forgot to initialize it. Whatever damage it does will
> still end up with a vmfunc vmexit anyway.
Most likely, but not certainly. I also don't see a reason to diverge from the
spec here.
2017-07-11 14:24-0400, Bandan Das:
> Bandan Das <[email protected]> writes:
> > If there's a triple fault, I think it's a good idea to inject it
> > back. Basically, there's no need to take care of damage control
> > that L1 is intentionally doing.
> >
> >>> + goto fail;
> >>> + kvm_mmu_unload(vcpu);
> >>> + vmcs12->ept_pointer = address;
> >>> + kvm_mmu_reload(vcpu);
> >>
> >> I was thinking about something like this:
> >>
> >> kvm_mmu_unload(vcpu);
> >> old = vmcs12->ept_pointer;
> >> vmcs12->ept_pointer = address;
> >> if (kvm_mmu_reload(vcpu)) {
> >> /* pointer invalid, restore previous state */
> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> vmcs12->ept_pointer = old;
> >> kvm_mmu_reload(vcpu);
> >> goto fail;
> >> }
> >>
> >> Then you can inherit the checks from mmu_check_root().
>
> Actually, thinking about this a bit more, I agree with you. Any fault
> with a vmfunc operation should end with a vmfunc vmexit, so this
> is a good thing to have. Thank you for this idea! :)
SDM says
IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
if in EPTP) THEN VMexit;
and no other mentions of a VM exit, so I think that the VM exit happens
only under these conditions:
— The EPT memory type (bits 2:0) must be a value supported by the
processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
Appendix A.10).
— Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
an EPT page-walk length of 4; see Section 28.2.2.
— Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
as 0, indicating that the processor does not support accessed and
dirty flags for EPT.
— Reserved bits 11:7 and 63:N (where N is the processor’s
physical-address width) must all be 0.
And it looks like we need parts of nested_ept_init_mmu_context() to
properly handle VMX_EPT_AD_ENABLE_BIT.
The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if
we just invalidate the MMU.
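Roughly (an untested sketch; nested_eptp_valid() is a made-up name, and the
VMX_EPT_AD_* bits are the usual ones from asm/vmx.h):
static bool nested_eptp_valid(struct kvm_vcpu *vcpu, u64 eptp)
{
	u64 ept_caps = to_vmx(vcpu)->nested.nested_vmx_ept_caps;
	u64 mt = eptp & 0x7;			/* bits 2:0: memory type */

	if (mt != 0 && mt != 6)			/* only UC(0)/WB(6) exist */
		return false;
	if (((eptp >> 3) & 0x7) != 3)		/* page-walk length must be 4 */
		return false;
	if ((eptp & VMX_EPT_AD_ENABLE_BIT) &&	/* AD needs hardware support */
	    !(ept_caps & VMX_EPT_AD_BIT))
		return false;
	if ((eptp >> 7) & 0x1f)			/* reserved bits 11:7 */
		return false;
	if (eptp >> cpuid_maxphyaddr(vcpu))	/* beyond MAXPHYADDR */
		return false;
	return true;
}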
Radim Krčmář <[email protected]> writes:
> 2017-07-11 14:05-0400, Bandan Das:
>> Radim Krčmář <[email protected]> writes:
>>
>> > [David did a great review, so I'll just point out things I noticed.]
>> >
>> > 2017-07-11 09:51+0200, David Hildenbrand:
>> >> On 10.07.2017 22:49, Bandan Das wrote:
>> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> >> > emulate a switching of the ept pointer by reloading the
>> >> > guest MMU.
>> >> >
>> >> > Signed-off-by: Paolo Bonzini <[email protected]>
>> >> > Signed-off-by: Bandan Das <[email protected]>
>> >> > ---
>> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> >> > }
>> >> >
>> >> > vmcs12 = get_vmcs12(vcpu);
>> >> > - if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> >> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> >> > + WARN_ON_ONCE(function))
>> >>
>> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> >> VM-function controls (the selected VM function is
>> >> not enabled)."
>> >>
>> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> >> completely.
>> >
>> > It assumes that vm_function_control is not > 1, which is (should be)
>> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
>> > is 1.
>> >
>> >> > + goto fail;
>> >
>> > The rest of the code assumes that the function is
>> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
>> >
>> > Writing it as
>> >
>> > WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
>> >
>> > would be clearer and I'd prefer to move the part that handles
>> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
>> > going to add more than one VM FUNC. :])
>>
>> IMO, for now, this should be fine because we are not even passing through the
>> hardware's eptp switching. Even if there are other vm functions, they
>> won't be available for the nested case, so they can't cause any conflict.
>
> Yeah, it is fine function-wise, I was just pointing out that it looks
> ugly to me.
Ok, lemme switch this to a switch statement style handler function. That way,
future additions would be easier.
> Btw. have you looked at what we'd need to do for the hardware pass-through?
> I'd expect big changes to MMU. :)
Yes, the first version was actually using vmfunc 0 directly. Well, not exactly:
the first time would go through this path and then the next time the processor
would handle it directly. Paolo pointed out an issue that I was ready to fix but he wasn't
comfortable with the idea. I actually agree with him, it's too much untested code
for something that would be rarely used.
>> >> > + if (!nested_cpu_has_ept(vmcs12) ||
>> >> > + !nested_cpu_has_eptp_switching(vmcs12))
>> >> > + goto fail;
>> >
>> > This brings me to a missing vm-entry check:
>> >
>> > If “EPTP switching” VM-function control is 1, the “enable EPT”
>> > VM-execution control must also be 1. In addition, the EPTP-list address
>> > must satisfy the following checks:
>> > • Bits 11:0 of the address must be 0.
>> > • The address must not set any bits beyond the processor’s
>> > physical-address width.
>> >
>> > so this one could be
>> >
>> > if (!nested_cpu_has_eptp_switching(vmcs12) ||
>> > WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
>>
>> I will reverse the order here but the vm entry check is unnecessary because
>> the check on the list address is already being done in this function.
>
> Here it is too late; the nested VM-entry should have failed, never letting
> this situation happen. We want an equivalent of
>
> if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12))
> return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>
> in nested controls checks, right next to the reserved fields check.
> And then also the EPTP-list check. All of them only checked when
> nested_cpu_has_vmfunc(vmcs12).
Actually, I misread 25.5.5.3. There are two checks. Here, the list entry
needs to be checked so that eptp won't cause a vmentry failure. The vmentry
needs to check the eptp list address itself. I will add that check for the
list address in the next version.
Bandan
Radim Krčmář <[email protected]> writes:
> 2017-07-11 14:35-0400, Bandan Das:
>> Jim Mattson <[email protected]> writes:
>> ...
>> >>> I can find the definition for a vmexit in case of index >=
>> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>> >>>
>> >>> Can you give me a hint?
>> >>
>> >> I don't think there is. Since we are basically emulating eptp switching
>> >> for L2, this is a good check to have.
>> >
>> > There is nothing wrong with a hypervisor using physical page 0 for
>> > whatever purpose it likes, including an EPTP list.
>>
>> Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
>> address most likely means it forgot to initialize it. Whatever damage it does will
>> still end up with a vmfunc vmexit anyway.
>
> Most likely, but not certainly. I also don't see a reason to diverge from the
> spec here.
Actually, this is a specific case where I would like to diverge from the spec.
But then again, it's L1 shooting itself in the foot and this would be a rarely
used code path, so, I am fine removing it.
Radim Krčmář <[email protected]> writes:
> 2017-07-11 14:24-0400, Bandan Das:
>> Bandan Das <[email protected]> writes:
>> > If there's a triple fault, I think it's a good idea to inject it
>> > back. Basically, there's no need to take care of damage control
>> > that L1 is intentionally doing.
>> >
>> >>> + goto fail;
>> >>> + kvm_mmu_unload(vcpu);
>> >>> + vmcs12->ept_pointer = address;
>> >>> + kvm_mmu_reload(vcpu);
>> >>
>> >> I was thinking about something like this:
>> >>
>> >> kvm_mmu_unload(vcpu);
>> >> old = vmcs12->ept_pointer;
>> >> vmcs12->ept_pointer = address;
>> >> if (kvm_mmu_reload(vcpu)) {
>> >> /* pointer invalid, restore previous state */
>> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> vmcs12->ept_pointer = old;
>> >> kvm_mmu_reload(vcpu);
>> >> goto fail;
>> >> }
>> >>
>> >> Then you can inherit the checks from mmu_check_root().
>>
>> Actually, thinking about this a bit more, I agree with you. Any fault
>> with a vmfunc operation should end with a vmfunc vmexit, so this
>> is a good thing to have. Thank you for this idea! :)
>
> SDM says
>
> IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> if in EPTP) THEN VMexit;
This section here:
As noted in Section 25.5.5.2, an execution of the
EPTP-switching VM function that causes a VM exit (as specified
above), uses the basic exit reason 59, indicating “VMFUNC”.
The length of the VMFUNC instruction is saved into the
VM-exit instruction-length field. No additional VM-exit
information is provided.
Although it adds "(as specified above)", from testing, any vmexit that
happens as a result of the execution of the vmfunc instruction always
has exit reason 59.
IMO, the case David pointed out comes under "as a result of the
execution of the vmfunc instruction", so I would prefer exiting
with reason 59.
> and no other mentions of a VM exit, so I think that the VM exit happens
> only under these conditions:
>
> — The EPT memory type (bits 2:0) must be a value supported by the
> processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
> Appendix A.10).
> — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
> an EPT page-walk length of 4; see Section 28.2.2.
> — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
> bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
> as 0, indicating that the processor does not support accessed and
> dirty flags for EPT.
> — Reserved bits 11:7 and 63:N (where N is the processor’s
> physical-address width) must all be 0.
>
> And it looks like we need parts of nested_ept_init_mmu_context() to
> properly handle VMX_EPT_AD_ENABLE_BIT.
I completely ignored AD and the #VE sections. I will add a TODO item
in the comment section.
> The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if
> we just invalidate the MMU.
2017-07-11 15:50-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
> > 2017-07-11 14:24-0400, Bandan Das:
> >> Bandan Das <[email protected]> writes:
> >> > If there's a triple fault, I think it's a good idea to inject it
> >> > back. Basically, there's no need to take care of damage control
> >> > that L1 is intentionally doing.
> >> >
> >> >>> + goto fail;
> >> >>> + kvm_mmu_unload(vcpu);
> >> >>> + vmcs12->ept_pointer = address;
> >> >>> + kvm_mmu_reload(vcpu);
> >> >>
> >> >> I was thinking about something like this:
> >> >>
> >> >> kvm_mmu_unload(vcpu);
> >> >> old = vmcs12->ept_pointer;
> >> >> vmcs12->ept_pointer = address;
> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> /* pointer invalid, restore previous state */
> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> vmcs12->ept_pointer = old;
> >> >> kvm_mmu_reload(vcpu);
> >> >> goto fail;
> >> >> }
> >> >>
> >> >> Then you can inherit the checks from mmu_check_root().
> >>
> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> is a good thing to have. Thank you for this idea! :)
> >
> > SDM says
> >
> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> > if in EPTP) THEN VMexit;
>
> This section here:
> As noted in Section 25.5.5.2, an execution of the
> EPTP-switching VM function that causes a VM exit (as specified
> above), uses the basic exit reason 59, indicating “VMFUNC”.
> The length of the VMFUNC instruction is saved into the
> VM-exit instruction-length field. No additional VM-exit
> information is provided.
>
> Although, it adds (as specified above), from testing, any vmexit that
> happens as a result of the execution of the vmfunc instruction always
> has exit reason 59.
>
> IMO, the case David pointed out comes under "as a result of the
> execution of the vmfunc instruction", so I would prefer exiting
> with reason 59.
Right, the exit reason is 59 for reasons that trigger a VM exit
(i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
unrelated stuff.
If the EPTP value is correct, then the switch should succeed.
If the EPTP is correct, but bogus, then the guest should get
EPT_MISCONFIG VM exit on its first access (when reading the
instruction). Source: I added
vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
shortly before a VMLAUNCH on L0. :)
I think that we might be emulating this case incorrectly and throwing
triple faults when it should be VM exits in vcpu_run().
> > and no other mentions of a VM exit, so I think that the VM exit happens
> > only under these conditions:
> >
> > — The EPT memory type (bits 2:0) must be a value supported by the
> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
> > Appendix A.10).
> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
> > an EPT page-walk length of 4; see Section 28.2.2.
> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
> > as 0, indicating that the processor does not support accessed and
> > dirty flags for EPT.
> > — Reserved bits 11:7 and 63:N (where N is the processor’s
> > physical-address width) must all be 0.
> >
> > And it looks like we need parts of nested_ept_init_mmu_context() to
> > properly handle VMX_EPT_AD_ENABLE_BIT.
>
> I completely ignored AD and the #VE sections. I will add a TODO item
> in the comment section.
AFAIK, we don't support #VE, but AD would be nice to handle from the
beginning. (I think that calling nested_ept_init_mmu_context() as-is
isn't that bad.)
2017-07-11 15:38-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
>
> > 2017-07-11 14:35-0400, Bandan Das:
> >> Jim Mattson <[email protected]> writes:
> >> ...
> >> >>> I can find the definition for a vmexit in case of index >=
> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >> >>>
> >> >>> Can you give me a hint?
> >> >>
> >> >> I don't think there is. Since, we are basically emulating eptp switching
> >> >> for L2, this is a good check to have.
> >> >
> >> > There is nothing wrong with a hypervisor using physical page 0 for
> >> > whatever purpose it likes, including an EPTP list.
> >>
> >> Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
> >> address most likely means it forgot to initialize it. Whatever damage it does will
> >> still end up with vmfunc vmexit anyway.
> >
> > Most likely, but not certainly. I also don't see a reason to diverge from the
> > spec here.
>
> Actually, this is a specific case where I would like to diverge from the spec.
> But then again, it's L1 shooting itself in the foot and this would be a rarely
> used code path, so, I am fine removing it.
Thanks, we're not here to judge the guest, but to provide a bare-metal
experience. :)
Radim Krčmář <[email protected]> writes:
> 2017-07-11 15:50-0400, Bandan Das:
>> Radim Krčmář <[email protected]> writes:
>> > 2017-07-11 14:24-0400, Bandan Das:
>> >> Bandan Das <[email protected]> writes:
>> >> > If there's a triple fault, I think it's a good idea to inject it
>> >> > back. Basically, there's no need to take care of damage control
>> >> > that L1 is intentionally doing.
>> >> >
>> >> >>> + goto fail;
>> >> >>> + kvm_mmu_unload(vcpu);
>> >> >>> + vmcs12->ept_pointer = address;
>> >> >>> + kvm_mmu_reload(vcpu);
>> >> >>
>> >> >> I was thinking about something like this:
>> >> >>
>> >> >> kvm_mmu_unload(vcpu);
>> >> >> old = vmcs12->ept_pointer;
>> >> >> vmcs12->ept_pointer = address;
>> >> >> if (kvm_mmu_reload(vcpu)) {
>> >> >> /* pointer invalid, restore previous state */
>> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> >> vmcs12->ept_pointer = old;
>> >> >> kvm_mmu_reload(vcpu);
>> >> >> goto fail;
>> >> >> }
>> >> >>
>> >> >> Then you can inherit the checks from mmu_check_root().
>> >>
>> >> Actually, thinking about this a bit more, I agree with you. Any fault
>> >> with a vmfunc operation should end with a vmfunc vmexit, so this
>> >> is a good thing to have. Thank you for this idea! :)
>> >
>> > SDM says
>> >
>> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>> > if in EPTP) THEN VMexit;
>>
>> This section here:
>> As noted in Section 25.5.5.2, an execution of the
>> EPTP-switching VM function that causes a VM exit (as specified
>> above), uses the basic exit reason 59, indicating “VMFUNC”.
>> The length of the VMFUNC instruction is saved into the
>> VM-exit instruction-length field. No additional VM-exit
>> information is provided.
>>
>> Although, it adds (as specified above), from testing, any vmexit that
>> happens as a result of the execution of the vmfunc instruction always
>> has exit reason 59.
>>
>> IMO, the case David pointed out comes under "as a result of the
>> execution of the vmfunc instruction", so I would prefer exiting
>> with reason 59.
>
> Right, the exit reason is 59 for reasons that trigger a VM exit
> (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> unrelated stuff.
>
> If the EPTP value is correct, then the switch should succeed.
> If the EPTP is correct, but bogus, then the guest should get
> EPT_MISCONFIG VM exit on its first access (when reading the
> instruction). Source: I added
My point is that we are using kvm_mmu_reload() to emulate eptp
switching. If that emulation of vmfunc fails, it should exit with reason
59.
> vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>
> shortly before a VMLAUNCH on L0. :)
What happens if this ept pointer is actually in the eptp list and the guest
switches to it using vmfunc ? I think it will exit with reason 59.
> I think that we might be emulating this case incorrectly and throwing
> triple faults when it should be VM exits in vcpu_run().
No, I agree with not throwing a triple fault. We should clear it out.
But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
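A minimal sketch of that idea, assuming the existing nested_vmx_vmexit()
helper in vmx.c and the EXIT_REASON_VMFUNC (59) definition this series
adds; the wrapper name is illustrative, not the final patch:

    /*
     * Sketch: reflect a failed emulated EPTP switch back to L1 as a
     * VMFUNC vmexit, basic exit reason 59.  The VMFUNC instruction
     * length is already in the current VMCS, so nested_vmx_vmexit()
     * copies everything L1 needs; the SDM says no additional exit
     * information is provided for VMFUNC.
     */
    static int nested_vmx_fail_vmfunc(struct kvm_vcpu *vcpu)
    {
            nested_vmx_vmexit(vcpu, EXIT_REASON_VMFUNC,
                              vmcs_read32(VM_EXIT_INTR_INFO),
                              vmcs_readl(EXIT_QUALIFICATION));
            return 1;
    }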
>> > and no other mentions of a VM exit, so I think that the VM exit happens
>> > only under these conditions:
>> >
>> > — The EPT memory type (bits 2:0) must be a value supported by the
>> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>> > Appendix A.10).
>> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>> > an EPT page-walk length of 4; see Section 28.2.2.
>> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>> > as 0, indicating that the processor does not support accessed and
>> > dirty flags for EPT.
>> > — Reserved bits 11:7 and 63:N (where N is the processor’s
>> > physical-address width) must all be 0.
>> >
>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>
>> I completely ignored AD and the #VE sections. I will add a TODO item
>> in the comment section.
>
> AFAIK, we don't support #VE, but AD would be nice to handle from the
Nevertheless, it's good to have the nested hypervisor be able to use it
just like vmfunc.
> beginning. (I think that calling nested_ept_init_mmu_context() as-is
> isn't that bad.)
Ok, I will take a look.
Radim Krčmář <[email protected]> writes:
> 2017-07-11 15:38-0400, Bandan Das:
>> Radim Krčmář <[email protected]> writes:
>>
>> > 2017-07-11 14:35-0400, Bandan Das:
>> >> Jim Mattson <[email protected]> writes:
>> >> ...
>> >> >>> I can find the definition for a vmexit in case of index >=
>> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>> >> >>>
>> >> >>> Can you give me a hint?
>> >> >>
>> >> >> I don't think there is. Since, we are basically emulating eptp switching
>> >> >> for L2, this is a good check to have.
>> >> >
>> >> > There is nothing wrong with a hypervisor using physical page 0 for
>> >> > whatever purpose it likes, including an EPTP list.
>> >>
>> >> Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
>> >> address most likely means it forgot to initialize it. Whatever damage it does will
>> >> still end up with vmfunc vmexit anyway.
>> >
>> > Most likely, but not certainly. I also don't see a reason to diverge from the
>> > spec here.
>>
>> Actually, this is a specific case where I would like to diverge from the spec.
>> But then again, it's L1 shooting itself in the foot and this would be a rarely
>> used code path, so, I am fine removing it.
>
> Thanks, we're not here to judge the guest, but to provide a bare-metal
> experience. :)
There are certain cases where we do. For example, when L2 instruction emulation
fails we decide to kill L2 instead of injecting the error to L1 and let it handle
that. Anyway, that's a different topic, I was just trying to point out there
are cases where kvm makes a somewhat policy decision...
2017-07-11 16:34-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
>
> > 2017-07-11 15:50-0400, Bandan Das:
> >> Radim Krčmář <[email protected]> writes:
> >> > 2017-07-11 14:24-0400, Bandan Das:
> >> >> Bandan Das <[email protected]> writes:
> >> >> > If there's a triple fault, I think it's a good idea to inject it
> >> >> > back. Basically, there's no need to take care of damage control
> >> >> > that L1 is intentionally doing.
> >> >> >
> >> >> >>> + goto fail;
> >> >> >>> + kvm_mmu_unload(vcpu);
> >> >> >>> + vmcs12->ept_pointer = address;
> >> >> >>> + kvm_mmu_reload(vcpu);
> >> >> >>
> >> >> >> I was thinking about something like this:
> >> >> >>
> >> >> >> kvm_mmu_unload(vcpu);
> >> >> >> old = vmcs12->ept_pointer;
> >> >> >> vmcs12->ept_pointer = address;
> >> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> >> /* pointer invalid, restore previous state */
> >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> >> vmcs12->ept_pointer = old;
> >> >> >> kvm_mmu_reload(vcpu);
> >> >> >> goto fail;
> >> >> >> }
> >> >> >>
> >> >> >> Then you can inherit the checks from mmu_check_root().
> >> >>
> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> >> is a good thing to have. Thank you for this idea! :)
> >> >
> >> > SDM says
> >> >
> >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >> > if in EPTP) THEN VMexit;
> >>
> >> This section here:
> >> As noted in Section 25.5.5.2, an execution of the
> >> EPTP-switching VM function that causes a VM exit (as specified
> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
> >> The length of the VMFUNC instruction is saved into the
> >> VM-exit instruction-length field. No additional VM-exit
> >> information is provided.
> >>
> >> Although, it adds (as specified above), from testing, any vmexit that
> >> happens as a result of the execution of the vmfunc instruction always
> >> has exit reason 59.
> >>
> >> IMO, the case David pointed out comes under "as a result of the
> >> execution of the vmfunc instruction", so I would prefer exiting
> >> with reason 59.
> >
> > Right, the exit reason is 59 for reasons that trigger a VM exit
> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> > unrelated stuff.
> >
> > If the EPTP value is correct, then the switch should succeed.
> > If the EPTP is correct, but bogus, then the guest should get
> > EPT_MISCONFIG VM exit on its first access (when reading the
> > instruction). Source: I added
>
> My point is that we are using kvm_mmu_reload() to emulate eptp
> switching. If that emulation of vmfunc fails, it should exit with reason
> 59.
Yeah, we just disagree on what is a vmfunc failure.
> > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
> >
> > shortly before a VMLAUNCH on L0. :)
>
> What happens if this ept pointer is actually in the eptp list and the guest
> switches to it using vmfunc ? I think it will exit with reason 59.
I think otherwise, because it doesn't cause a VM entry failure on
bare-metal (and SDM says that we get a VM exit only if there would be a
VM entry failure).
I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
(Experiment pending :])
> > I think that we might be emulating this case incorrectly and throwing
> > triple faults when it should be VM exits in vcpu_run().
>
> No, I agree with not throwing a triple fault. We should clear it out.
> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
Here we disagree. I think that it's a bug to do the VM exit, so we can
just keep the original bug -- we want to eventually fix it and it's no
worse till then.
Radim Krčmář <[email protected]> writes:
> 2017-07-11 16:34-0400, Bandan Das:
>> Radim Krčmář <[email protected]> writes:
>>
>> > 2017-07-11 15:50-0400, Bandan Das:
>> >> Radim Krčmář <[email protected]> writes:
>> >> > 2017-07-11 14:24-0400, Bandan Das:
>> >> >> Bandan Das <[email protected]> writes:
>> >> >> > If there's a triple fault, I think it's a good idea to inject it
>> >> >> > back. Basically, there's no need to take care of damage control
>> >> >> > that L1 is intentionally doing.
>> >> >> >
>> >> >> >>> + goto fail;
>> >> >> >>> + kvm_mmu_unload(vcpu);
>> >> >> >>> + vmcs12->ept_pointer = address;
>> >> >> >>> + kvm_mmu_reload(vcpu);
>> >> >> >>
>> >> >> >> I was thinking about something like this:
>> >> >> >>
>> >> >> >> kvm_mmu_unload(vcpu);
>> >> >> >> old = vmcs12->ept_pointer;
>> >> >> >> vmcs12->ept_pointer = address;
>> >> >> >> if (kvm_mmu_reload(vcpu)) {
>> >> >> >> /* pointer invalid, restore previous state */
>> >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> >> >> vmcs12->ept_pointer = old;
>> >> >> >> kvm_mmu_reload(vcpu);
>> >> >> >> goto fail;
>> >> >> >> }
>> >> >> >>
>> >> >> >> Then you can inherit the checks from mmu_check_root().
>> >> >>
>> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
>> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
>> >> >> is a good thing to have. Thank you for this idea! :)
>> >> >
>> >> > SDM says
>> >> >
>> >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>> >> > if in EPTP) THEN VMexit;
>> >>
>> >> This section here:
>> >> As noted in Section 25.5.5.2, an execution of the
>> >> EPTP-switching VM function that causes a VM exit (as specified
>> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
>> >> The length of the VMFUNC instruction is saved into the
>> >> VM-exit instruction-length field. No additional VM-exit
>> >> information is provided.
>> >>
>> >> Although, it adds (as specified above), from testing, any vmexit that
>> >> happens as a result of the execution of the vmfunc instruction always
>> >> has exit reason 59.
>> >>
>> >> IMO, the case David pointed out comes under "as a result of the
>> >> execution of the vmfunc instruction", so I would prefer exiting
>> >> with reason 59.
>> >
>> > Right, the exit reason is 59 for reasons that trigger a VM exit
>> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
>> > unrelated stuff.
>> >
>> > If the EPTP value is correct, then the switch should succeed.
>> > If the EPTP is correct, but bogus, then the guest should get
>> > EPT_MISCONFIG VM exit on its first access (when reading the
>> > instruction). Source: I added
>>
>> My point is that we are using kvm_mmu_reload() to emulate eptp
>> switching. If that emulation of vmfunc fails, it should exit with reason
>> 59.
>
> Yeah, we just disagree on what is a vmfunc failure.
>
>> > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>> >
>> > shortly before a VMLAUNCH on L0. :)
>>
>> What happens if this ept pointer is actually in the eptp list and the guest
>> switches to it using vmfunc ? I think it will exit with reason 59.
>
> I think otherwise, because it doesn't cause a VM entry failure on
> bare-metal (and SDM says that we get a VM exit only if there would be a
> VM entry failure).
> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
> (Experiment pending :])
>
>> > I think that we might be emulating this case incorrectly and throwing
>> > triple faults when it should be VM exits in vcpu_run().
>>
>> No, I agree with not throwing a triple fault. We should clear it out.
>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
>
> Here we disagree. I think that it's a bug to do the VM exit, so we can
Why do you think it's a bug ? The eptp switching function really didn't
succeed as far as our emulation goes when kvm_mmu_reload() fails.
And as such, the generic vmexit failure event should be a vmfunc vmexit.
We cannot strictly follow the spec here; the spec doesn't even mention a way
to emulate eptp switching. If setting up the switching succeeded and the
new root pointer is invalid or whatever, I really don't care what happens
next but this is not the case. We fail to get a new root pointer and without
that, we can't even make a switch!
> just keep the original bug -- we want to eventually fix it and it's no
> worse till then.
Anyway, can you please confirm again what is the behavior that you
are expecting if kvm_mmu_reload fails ? This would be a rarely used
branch and I am actually fine diverging from what I think is right if
I can get the reviewers to agree on a common thing.
(Thanks for giving this a closer look, Radim. I really appreciate it.)
Bandan
2017-07-11 17:08-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
>
> > 2017-07-11 16:34-0400, Bandan Das:
> >> Radim Krčmář <[email protected]> writes:
> >>
> >> > 2017-07-11 15:50-0400, Bandan Das:
> >> >> Radim Krčmář <[email protected]> writes:
> >> >> > 2017-07-11 14:24-0400, Bandan Das:
> >> >> >> Bandan Das <[email protected]> writes:
> >> >> >> > If there's a triple fault, I think it's a good idea to inject it
> >> >> >> > back. Basically, there's no need to take care of damage control
> >> >> >> > that L1 is intentionally doing.
> >> >> >> >
> >> >> >> >>> + goto fail;
> >> >> >> >>> + kvm_mmu_unload(vcpu);
> >> >> >> >>> + vmcs12->ept_pointer = address;
> >> >> >> >>> + kvm_mmu_reload(vcpu);
> >> >> >> >>
> >> >> >> >> I was thinking about something like this:
> >> >> >> >>
> >> >> >> >> kvm_mmu_unload(vcpu);
> >> >> >> >> old = vmcs12->ept_pointer;
> >> >> >> >> vmcs12->ept_pointer = address;
> >> >> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> >> >> /* pointer invalid, restore previous state */
> >> >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> >> >> vmcs12->ept_pointer = old;
> >> >> >> >> kvm_mmu_reload(vcpu);
> >> >> >> >> goto fail;
> >> >> >> >> }
> >> >> >> >>
> >> >> >> >> Then you can inherit the checks from mmu_check_root().
> >> >> >>
> >> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> >> >> is a good thing to have. Thank you for this idea! :)
> >> >> >
> >> >> > SDM says
> >> >> >
> >> >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >> >> > if in EPTP) THEN VMexit;
> >> >>
> >> >> This section here:
> >> >> As noted in Section 25.5.5.2, an execution of the
> >> >> EPTP-switching VM function that causes a VM exit (as specified
> >> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
> >> >> The length of the VMFUNC instruction is saved into the
> >> >> VM-exit instruction-length field. No additional VM-exit
> >> >> information is provided.
> >> >>
> >> >> Although, it adds (as specified above), from testing, any vmexit that
> >> >> happens as a result of the execution of the vmfunc instruction always
> >> >> has exit reason 59.
> >> >>
> >> >> IMO, the case David pointed out comes under "as a result of the
> >> >> execution of the vmfunc instruction", so I would prefer exiting
> >> >> with reason 59.
> >> >
> >> > Right, the exit reason is 59 for reasons that trigger a VM exit
> >> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> >> > unrelated stuff.
> >> >
> >> > If the EPTP value is correct, then the switch should succeed.
> >> > If the EPTP is correct, but bogus, then the guest should get
> >> > EPT_MISCONFIG VM exit on its first access (when reading the
> >> > instruction). Source: I added
> >>
> >> My point is that we are using kvm_mmu_reload() to emulate eptp
> >> switching. If that emulation of vmfunc fails, it should exit with reason
> >> 59.
>>
>> Yeah, we just disagree on what is a vmfunc failure.
>>
>>> > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>>> >
>>> > shortly before a VMLAUNCH on L0. :)
>>>
>>> What happens if this ept pointer is actually in the eptp list and the guest
>>> switches to it using vmfunc ? I think it will exit with reason 59.
>>
>> I think otherwise, because it doesn't cause a VM entry failure on
>> bare-metal (and SDM says that we get a VM exit only if there would be a
>> VM entry failure).
>> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
>> (Experiment pending :])
>>
>>> > I think that we might be emulating this case incorrectly and throwing
>>> > triple faults when it should be VM exits in vcpu_run().
>>>
>>> No, I agree with not throwing a triple fault. We should clear it out.
>>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
>>
>> Here we disagree. I think that it's a bug to do the VM exit, so we can
>
> Why do you think it's a bug ?
SDM defines a different behavior and hardware doesn't do that either.
There are only two reasons for a VMFUNC VM exit from EPTP switching:
1) ECX > 0
2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER
KVM can fail for other reasons because of its bugs, but that should be
notified to the guest in another way. Rebooting the guest is kind of
acceptable in that case.
> The eptp switching function really didn't
> succeed as far as our emulation goes when kvm_mmu_reload() fails.
> And as such, the generic vmexit failure event should be a vmfunc vmexit.
I interpret it as two separate events -- at first, the vmfunc succeeds
and when it later tries to access memory through the new EPTP (valid,
but not pointing into backed memory), it results in a EPT_MISCONFIG VM
exit.
>> We cannot strictly follow the spec here; the spec doesn't even mention a way
> to emulate eptp switching. If setting up the switching succeeded and the
> new root pointer is invalid or whatever, I really don't care what happens
> next but this is not the case. We fail to get a new root pointer and without
> that, we can't even make a switch!
We just make it behave exactly how the spec says that it behaves. We do
have a value (we read 'address') to put into VMCS.EPT_POINTER, which is
all we need for the emulation.
The function doesn't dereference that pointer, it just looks at its
value to decide whether it is valid or not. (btw. we should check that
properly, because we cannot depend on VM entry failure pass-through like
the normal case.)
The dereference done in kvm_mmu_reload() should happen after EPTP
switching finishes, because the spec doesn't mention a VM exit for any
reason other than an invalid EPT_POINTER value.
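For illustration, a sketch of checking the tentative EPTP "properly"
against the four SDM conditions quoted above; the helper name and the
open-coded bit masks are illustrative, with the capability bits taken
from the nested_vmx_ept_caps field this series already uses:

    static bool nested_eptp_valid(struct kvm_vcpu *vcpu, u64 eptp)
    {
            u64 ept_caps = to_vmx(vcpu)->nested.nested_vmx_ept_caps;

            /* bits 2:0: memory type must be UC (0) or WB (6) and supported */
            switch (eptp & 0x7) {
            case 0: /* UC, IA32_VMX_EPT_VPID_CAP bit 8 */
                    if (!(ept_caps & (1ULL << 8)))
                            return false;
                    break;
            case 6: /* WB, IA32_VMX_EPT_VPID_CAP bit 14 */
                    if (!(ept_caps & (1ULL << 14)))
                            return false;
                    break;
            default:
                    return false;
            }

            /* bits 5:3: page-walk length must encode 4 levels (value 3) */
            if (((eptp >> 3) & 0x7) != 3)
                    return false;

            /* bit 6: A/D enable requires IA32_VMX_EPT_VPID_CAP bit 21 */
            if ((eptp & (1ULL << 6)) && !(ept_caps & (1ULL << 21)))
                    return false;

            /* reserved bits 11:7 and 63:N must be zero */
            if ((eptp & GENMASK_ULL(11, 7)) ||
                (eptp >> cpuid_maxphyaddr(vcpu)))
                    return false;

            return true;
    }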
>> just keep the original bug -- we want to eventually fix it and it's no
>> worse till then.
>
> Anyway, can you please confirm again what is the behavior that you
> are expecting if kvm_mmu_reload fails ? This would be a rarely used
> branch and I am actually fine diverging from what I think is right if
> I can get the reviewers to agree on a common thing.
kvm_mmu_reload() fails when mmu_check_root() is false, which means that
the pointed physical address is not backed. We've hit this corner-case
in the past -- Jim said that the chipset returns all 1s if a read is not
claimed.
So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the
pointer pointed to a memory of all 1s, which would likely result in
EPT_MISCONFIG when the guest does a memory access.
It is a mishandled corner case, but turning it into a VM exit would only
confuse an OS that receives the impossible VM exit and potentially
confuse a reader of the KVM logic.
I think that not using kvm_mmu_reload() directly in EPTP switching is
best. The bug is not really something we care about.
2017-07-11 16:45-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
>
> > 2017-07-11 15:38-0400, Bandan Das:
> >> Radim Krčmář <[email protected]> writes:
> >>
> >> > 2017-07-11 14:35-0400, Bandan Das:
> >> >> Jim Mattson <[email protected]> writes:
> >> >> ...
> >> >> >>> I can find the definition for a vmexit in case of index >=
> >> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >> >> >>>
> >> >> >>> Can you give me a hint?
> >> >> >>
> >> >> >> I don't think there is. Since, we are basically emulating eptp switching
> >> >> >> for L2, this is a good check to have.
> >> >> >
> >> >> > There is nothing wrong with a hypervisor using physical page 0 for
> >> >> > whatever purpose it likes, including an EPTP list.
> >> >>
> >> >> Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
> >> >> address most likely means it forgot to initialize it. Whatever damage it does will
> >> >> still end up with vmfunc vmexit anyway.
> >> >
> >> > Most likely, but not certainly. I also don't see a reason to diverge from the
> >> > spec here.
> >>
> >> Actually, this is a specific case where I would like to diverge from the spec.
> >> But then again, it's L1 shooting itself in the foot and this would be a rarely
> >> used code path, so, I am fine removing it.
> >
> > Thanks, we're not here to judge the guest, but to provide a bare-metal
> > experience. :)
>
> There are certain cases where we do. For example, when L2 instruction emulation
> fails we decide to kill L2 instead of injecting the error to L1 and let it handle
> that. Anyway, that's a different topic, I was just trying to point out there
> are cases where kvm makes a somewhat policy decision...
Emulation failure is a KVM bug and we are too lazy to implement the
bare-metal behavior correctly, but avoiding the EPTP list bug is
actually easier than introducing it. You can make KVM simpler and
improve bare-metal emulation at the same time.
Radim Krčmář <[email protected]> writes:
...
>> > Thanks, we're not here to judge the guest, but to provide a bare-metal
>> > experience. :)
>>
>> There are certain cases where we do. For example, when L2 instruction emulation
>> fails we decide to kill L2 instead of injecting the error to L1 and let it handle
>> that. Anyway, that's a different topic, I was just trying to point out there
>> are cases where kvm makes a somewhat policy decision...
>
> Emulation failure is a KVM bug and we are too lazy to implement the
> bare-metal behavior correctly, but avoiding the EPTP list bug is
> actually easier than introducing it. You can make KVM simpler and
> improve bare-metal emulation at the same time.
We are just talking past each other here trying to impose points of view.
Checking for 0 makes KVM simpler. As I said before, a 0 list_address means
that the hypervisor forgot to initialize it. Feel free to show me examples
where the hypervisor does indeed use a 0 address for the eptp list address or
anything vm specific. You disagreed and I am fine with it.
Radim Krčmář <[email protected]> writes:
...
>> Why do you think it's a bug ?
>
> SDM defines a different behavior and hardware doesn't do that either.
> There are only two reasons for a VMFUNC VM exit from EPTP switching:
>
> 1) ECX > 0
> 2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER
>
> KVM can fail for other reasons because of its bugs, but that should be
> notified to the guest in another way. Rebooting the guest is kind of
> acceptable in that case.
>
>> The eptp switching function really didn't
>> succeed as far as our emulation goes when kvm_mmu_reload() fails.
>> And as such, the generic vmexit failure event should be a vmfunc vmexit.
>
> I interpret it as two separate events -- at first, the vmfunc succeeds
> and when it later tries to access memory through the new EPTP (valid,
> but not pointing into backed memory), it results in a EPT_MISCONFIG VM
> exit.
>
>> We cannot strictly follow the spec here; the spec doesn't even mention a way
>> to emulate eptp switching. If setting up the switching succeeded and the
>> new root pointer is invalid or whatever, I really don't care what happens
>> next but this is not the case. We fail to get a new root pointer and without
>> that, we can't even make a switch!
>
> We just make it behave exactly how the spec says that it behaves. We do
> have a value (we read 'address') to put into VMCS.EPT_POINTER, which is
> all we need for the emulation.
> The function doesn't dereference that pointer, it just looks at its
> value to decide whether it is valid or not. (btw. we should check that
> properly, because we cannot depend on VM entry failure pass-through like
> the normal case.)
>
> The dereference done in kvm_mmu_reload() should happen after EPTP
> switching finishes, because the spec doesn't mention a VM exit for any
> reason other than an invalid EPT_POINTER value.
>
>>> just keep the original bug -- we want to eventually fix it and it's no
>>> worse till then.
>>
>> Anyway, can you please confirm again what is the behavior that you
>> are expecting if kvm_mmu_reload fails ? This would be a rarely used
>> branch and I am actually fine diverging from what I think is right if
>> I can get the reviewers to agree on a common thing.
>
> kvm_mmu_reload() fails when mmu_check_root() is false, which means that
> the pointed physical address is not backed. We've hit this corner-case
> in the past -- Jim said that the chipset returns all 1s if a read is not
> claimed.
>
> So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the
> pointer pointed to a memory of all 1s, which would likely result in
> EPT_MISCONFIG when the guest does a memory access.
As much as I would like to disagree with you, I have already spent way more
time on this than I want. Please let's just leave it here, then ? The mmu unload
will make sure there's an invalid root hpa and whatever happens next, happens.
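In code, that amounts to something like the following sketch, assuming
'address' was already read from the EPTP list and validated; the TODO
marks the deliberately unresolved reload question:

    if (vmcs12->ept_pointer != address) {
            kvm_mmu_unload(vcpu);
            /*
             * The unload leaves an invalid root_hpa, so if the reload
             * below fails, the next access faults through the stale
             * root instead of killing the guest here.
             */
            vmcs12->ept_pointer = address;
            /* TODO: decide what to do if this reload fails */
            kvm_mmu_reload(vcpu);
    }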
> It is a mishandled corner case, but turning it into a VM exit would only
> confuse an OS that receives the impossible VM exit and potentially
> confuse a reader of the KVM logic.
>
> I think that not using kvm_mmu_reload() directly in EPTP switching is
> best. The bug is not really something we care about.
2017-07-12 14:11-0400, Bandan Das:
> As much as I would like to disagree with you, I have already spent way more
> time on this than I want. Please let's just leave it here, then ? The mmu unload
> will make sure there's an invalid root hpa and whatever happens next, happens.
Sure; let's discuss the subtleties of hardware emulation over a beer.
>>> + /*
>>> + * If the (L2) guest does a vmfunc to the currently
>>> + * active ept pointer, we don't have to do anything else
>>> + */
>>> + if (vmcs12->ept_pointer != address) {
>>> + if (address >> cpuid_maxphyaddr(vcpu) ||
>>> + !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
I quickly rushed over the massive amount of comments. Sounds like you'll
be preparing a v5. Would be great if you could add some comments that
were the result of this discussion (for parts that are not that obvious
- triple faults) - thanks!
--
Thanks,
David
David Hildenbrand <[email protected]> writes:
>>>> + /*
>>>> + * If the (L2) guest does a vmfunc to the currently
>>>> + * active ept pointer, we don't have to do anything else
>>>> + */
>>>> + if (vmcs12->ept_pointer != address) {
>>>> + if (address >> cpuid_maxphyaddr(vcpu) ||
>>>> + !IS_ALIGNED(address, 4096))
>>>
>>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>>> (triggering a KVM_REQ_TRIPLE_FAULT)
>>
>> If there's a triple fault, I think it's a good idea to inject it
>> back. Basically, there's no need to take care of damage control
>> that L1 is intentionally doing.
>
> I quickly rushed over the massive amount of comments. Sounds like you'll
> be preparing a v5. Would be great if you could add some comments that
> were the result of this discussion (for parts that are not that obvious
> - triple faults) - thanks!
Will do. Basically, we agreed that we don't need to do anything with mmu_reload() failures
because the invalid eptp that mmu_unload will write to root_hpa will result in an ept
violation.
Bandan
Radim Krčmář <[email protected]> writes:
...
>> > and no other mentions of a VM exit, so I think that the VM exit happens
>> > only under these conditions:
>> >
>> > — The EPT memory type (bits 2:0) must be a value supported by the
>> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>> > Appendix A.10).
>> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>> > an EPT page-walk length of 4; see Section 28.2.2.
>> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>> > as 0, indicating that the processor does not support accessed and
>> > dirty flags for EPT.
>> > — Reserved bits 11:7 and 63:N (where N is the processor’s
>> > physical-address width) must all be 0.
>> >
>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>
>> I completely ignored AD and the #VE sections. I will add a TODO item
>> in the comment section.
>
> AFAIK, we don't support #VE, but AD would be nice to handle from the
> beginning. (I think that calling nested_ept_init_mmu_context() as-is
> isn't that bad.)
I went back to the spec to take a look at the AD handling. It doesn't look
like anything needs to be done since nested_ept_init_mmu_context() is already
being called with the correct eptp in prepare_vmcs02 ? Anything else that
needs to be done for AD handling in vmfunc context ?
Thanks,
Bandan
2017-07-17 13:58-0400, Bandan Das:
> Radim Krčmář <[email protected]> writes:
> ...
>>> > and no other mentions of a VM exit, so I think that the VM exit happens
>>> > only under these conditions:
>>> >
>>> > — The EPT memory type (bits 2:0) must be a value supported by the
>>> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>>> > Appendix A.10).
>>> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>>> > an EPT page-walk length of 4; see Section 28.2.2.
>>> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>>> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>>> > as 0, indicating that the processor does not support accessed and
>>> > dirty flags for EPT.
>>> > — Reserved bits 11:7 and 63:N (where N is the processor’s
>>> > physical-address width) must all be 0.
>>> >
>>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>>
>>> I completely ignored AD and the #VE sections. I will add a TODO item
>>> in the comment section.
>>
>> AFAIK, we don't support #VE, but AD would be nice to handle from the
>> beginning. (I think that calling nested_ept_init_mmu_context() as-is
>> isn't that bad.)
>
> I went back to the spec to take a look at the AD handling. It doesn't look
> like anything needs to be done since nested_ept_init_mmu_context() is already
> being called with the correct eptp in prepare_vmcs02 ? Anything else that
> needs to be done for AD handling in vmfunc context ?
AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and
we don't call prepare_vmcs02() after emulating VMFUNC vm exit.
We want to forward the new AD configuration to KVM's MMU.
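A sketch of forwarding it during the emulated switch; assuming the mmu's
ept_ad and base_role.ad_disabled fields are the right knobs (that mapping
is an assumption here, not settled by the thread):

    /*
     * Sketch: propagate the A/D-enable bit (EPTP bit 6) of the new
     * pointer to KVM's shadow EPT MMU during the emulated switch,
     * since prepare_vmcs02() will not run to do it for us.
     */
    bool accessed_dirty = !!(address & VMX_EPT_AD_ENABLE_BIT);
    struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

    kvm_mmu_unload(vcpu);
    mmu->ept_ad = accessed_dirty;
    mmu->base_role.ad_disabled = !accessed_dirty;
    vmcs12->ept_pointer = address;
    kvm_mmu_reload(vcpu);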
Radim Krčmář <[email protected]> writes:
> 2017-07-17 13:58-0400, Bandan Das:
>> Radim Krčmář <[email protected]> writes:
>> ...
>>>> > and no other mentions of a VM exit, so I think that the VM exit happens
>>>> > only under these conditions:
>>>> >
>>>> > — The EPT memory type (bits 2:0) must be a value supported by the
>>>> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>>>> > Appendix A.10).
>>>> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>>>> > an EPT page-walk length of 4; see Section 28.2.2.
>>>> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>>>> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>>>> > as 0, indicating that the processor does not support accessed and
>>>> > dirty flags for EPT.
>>>> > — Reserved bits 11:7 and 63:N (where N is the processor’s
>>>> > physical-address width) must all be 0.
>>>> >
>>>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>>>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>>>
>>>> I completely ignored AD and the #VE sections. I will add a TODO item
>>>> in the comment section.
>>>
>>> AFAIK, we don't support #VE, but AD would be nice to handle from the
>>> beginning. (I think that calling nested_ept_init_mmu_context() as-is
>>> isn't that bad.)
>>
>> I went back to the spec to take a look at the AD handling. It doesn't look
>> like anything needs to be done since nested_ept_init_mmu_context() is already
>> being called with the correct eptp in prepare_vmcs02 ? Anything else that
>> needs to be done for AD handling in vmfunc context ?
>
> AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and
> we don't call prepare_vmcs02() after emulating VMFUNC vm exit.
> We want to forward the new AD configuration to KVM's MMU.
Thanks, I had incorrectly assumed that prepare_vmcs02 would be called after
an exit unconditionally. I will work something up soon.
Bandan