From: Vitaly Kuznetsov
To: kvm@vger.kernel.org
Cc: x86@kernel.org, Paolo Bonzini, Radim Krčmář, "K. Y. Srinivasan",
    Haiyang Zhang, Stephen Hemminger, "Michael Kelley (EOSG)",
    Mohammed Gamal, Cathy Avery, Bandan Das, linux-kernel@vger.kernel.org
Subject: [RFC 6/6] x86/kvm: use enlightened VMCS when running on Hyper-V
Date: Mon, 15 Jan 2018 18:31:05 +0100
Message-Id: <20180115173105.31845-7-vkuznets@redhat.com>
In-Reply-To: <20180115173105.31845-1-vkuznets@redhat.com>
References: <20180115173105.31845-1-vkuznets@redhat.com>

Early prototype. When running nested KVM on Hyper-V it is possible to use
the so-called 'Enlightened VMCS' and do plain memory reads/writes instead
of VMWRITE/VMREAD instructions. Tests show that this speeds up a tight
CPUID loop almost 3 times:

Before:
./cpuid_tight
20459

After:
./cpuid_tight
7698

checkpatch.pl errors/warnings and 32-bit brokenness are known issues.

The main RFC questions I have are:
- Do we want to have this per L2 VM or per L1 host?
- How can we achieve zero overhead for non-Hyper-V deployments? Use static
  keys (rough sketch below)? But this will only work if we decide to do
  eVMCS per host.
- Can we do better than a big switch in evmcs_read()/evmcs_write()? And we
  should probably avoid the 'case' defines, which checkpatch.pl hates.
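For the static keys question, the rough per-host idea I have in mind looks
something like the sketch below. This is illustrative only and not part of
the patch: the key name and the init helper are made up, and the accessor is
simplified (no 32-bit handling).

/* Illustrative sketch: gate the eVMCS path with a static key so that
 * non-Hyper-V deployments only pay for a patched-out jump. */
DEFINE_STATIC_KEY_FALSE(evmcs_enabled_key);

static __always_inline u64 vmcs_read64(unsigned long field)
{
	vmcs_check64(field);
	/* Patched to a NOP unless the key was enabled at module init */
	if (static_branch_unlikely(&evmcs_enabled_key))
		return evmcs_read(field);
	return __vmcs_readl(field);
}

/* Flipped once, at module init, only when Hyper-V recommends eVMCS
 * (hypothetical helper, would be called from vmx_init()): */
static void __init evmcs_static_key_init(void)
{
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED)
		static_branch_enable(&evmcs_enabled_key);
}

As noted above, this only works if the eVMCS decision is made once per L1
host rather than per L2 VM.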
Signed-off-by: Vitaly Kuznetsov
---
 arch/x86/kvm/vmx.c | 595 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 593 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index efff9d035543..dfdfd15c3d60 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include

 #include "trace.h"
 #include "pmu.h"
@@ -198,6 +199,9 @@ extern const ulong vmx_return;

 #define NR_AUTOLOAD_MSRS 8

+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -1498,11 +1502,22 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 	loaded_vmcs->launched = 0;
 }

+static inline void vmcs_load_enlightened(u64 phys_addr)
+{
+	int cpu = smp_processor_id();
+
+	hv_vp_assist_page[cpu]->current_nested_vmcs = phys_addr;
+	hv_vp_assist_page[cpu]->enlighten_vmentry = 1;
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
 	u8 error;

+	if (enlightened_vmcs)
+		return vmcs_load_enlightened(phys_addr);
+
 	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 			: "cc", "memory");
@@ -1620,6 +1635,514 @@ static inline void ept_sync_context(u64 eptp)
 		ept_sync_global();
 }

+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *	POSTED_INTR_NV = 0x00000002,
+ *	GUEST_INTR_STATUS = 0x00000810,
+ *	GUEST_PML_INDEX = 0x00000812,
+ *	IO_BITMAP_A_HIGH = 0x00002001,
+ *	IO_BITMAP_B_HIGH = 0x00002003,
+ *	MSR_BITMAP_HIGH = 0x00002005,
+ *	VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
+ *	VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
+ *	VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ *	PML_ADDRESS = 0x0000200e,
+ *	PML_ADDRESS_HIGH = 0x0000200f,
+ *	TSC_OFFSET_HIGH = 0x00002011,
+ *	VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ *	APIC_ACCESS_ADDR = 0x00002014,
+ *	APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ *	POSTED_INTR_DESC_ADDR = 0x00002016,
+ *	POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ *	VM_FUNCTION_CONTROL = 0x00002018,
+ *	VM_FUNCTION_CONTROL_HIGH = 0x00002019,
+ *	EPT_POINTER_HIGH = 0x0000201b,
+ *	EOI_EXIT_BITMAP0 = 0x0000201c,
+ *	EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ *	EOI_EXIT_BITMAP1 = 0x0000201e,
+ *	EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ *	EOI_EXIT_BITMAP2 = 0x00002020,
+ *	EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ *	EOI_EXIT_BITMAP3 = 0x00002022,
+ *	EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ *	EPTP_LIST_ADDRESS = 0x00002024,
+ *	EPTP_LIST_ADDRESS_HIGH = 0x00002025,
+ *	VMREAD_BITMAP = 0x00002026,
+ *	VMWRITE_BITMAP = 0x00002028,
+ *	XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ *	TSC_MULTIPLIER = 0x00002032,
+ *	TSC_MULTIPLIER_HIGH = 0x00002033,
+ *	GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ *	VMCS_LINK_POINTER_HIGH = 0x00002801,
+ *	GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ *	GUEST_IA32_PAT_HIGH = 0x00002805,
+ *	GUEST_IA32_EFER_HIGH = 0x00002807,
+ *	GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ *	GUEST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002809,
+ *	GUEST_PDPTR0_HIGH = 0x0000280b,
+ *	GUEST_PDPTR1_HIGH = 0x0000280d,
+ *	GUEST_PDPTR2_HIGH = 0x0000280f,
+ *	GUEST_PDPTR3_HIGH = 0x00002811,
+ *	GUEST_BNDCFGS_HIGH = 0x00002813,
+ *	GUEST_IA32_RTIT_CTL = 0x00002814,
+ *	GUEST_IA32_RTIT_CTL_HIGH = 0x00002815,
+ *	HOST_IA32_PAT_HIGH = 0x00002c01,
+ *	HOST_IA32_EFER_HIGH = 0x00002c03,
+ *	HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ *	HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+ *	VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ *	VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ *	VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ *	PLE_GAP =
0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + */ + +#define evmcs_write_field(field, efield, mask) \ + case field: \ + evmcs->efield = value; \ + evmcs->hv_clean_fields &= ~mask; \ + break; + +#define evmcs_read_field(field, efield) \ + case field: \ + return evmcs->efield; \ + +static void evmcs_write(unsigned long field, u64 value) +{ + int cpu = smp_processor_id(); + struct hv_enlightened_vmcs *evmcs = + __va(hv_vp_assist_page[cpu]->current_nested_vmcs); + + switch (field) { + /* 64 bit fields */ + evmcs_write_field(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + evmcs_write_field(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC); + evmcs_write_field(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC); + evmcs_write_field(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_IA32_SYSENTER_ESP, + host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_IA32_SYSENTER_EIP, + host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP); + evmcs_write_field(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP); + evmcs_write_field(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP); + evmcs_write_field(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2); + evmcs_write_field(VIRTUAL_APIC_PAGE_ADDR, + virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2); + evmcs_write_field(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + 
evmcs_write_field(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_PENDING_DBG_EXCEPTIONS, + guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR); + evmcs_write_field(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER); + evmcs_write_field(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER); + evmcs_write_field(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER); + evmcs_write_field(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER); + evmcs_write_field(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER); + evmcs_write_field(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER); + evmcs_write_field(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT); + evmcs_write_field(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2); + /* no mask defined in the spec */ + evmcs_write_field(VM_EXIT_MSR_STORE_ADDR, + vm_exit_msr_store_addr, 0xffff); + evmcs_write_field(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + 0xffff); + evmcs_write_field(VM_ENTRY_MSR_LOAD_ADDR, + vm_entry_msr_load_addr, 0xffff); + evmcs_write_field(CR3_TARGET_VALUE0, cr3_target_value0, 0xffff); + evmcs_write_field(CR3_TARGET_VALUE1, cr3_target_value1, 0xffff); + evmcs_write_field(CR3_TARGET_VALUE2, cr3_target_value2, 0xffff); + evmcs_write_field(CR3_TARGET_VALUE3, cr3_target_value3, 0xffff); + + /* 32 bit fields */ + evmcs_write_field(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + evmcs_write_field(GUEST_INTERRUPTIBILITY_INFO, + guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC); + evmcs_write_field(CPU_BASED_VM_EXEC_CONTROL, + cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC); + evmcs_write_field(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN); + evmcs_write_field(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY); + evmcs_write_field(VM_ENTRY_INTR_INFO_FIELD, + vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT); + evmcs_write_field(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT); + evmcs_write_field(VM_ENTRY_INSTRUCTION_LEN, + vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT); + evmcs_write_field(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(PIN_BASED_VM_EXEC_CONTROL, + pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1); + evmcs_write_field(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1); + evmcs_write_field(SECONDARY_VM_EXEC_CONTROL, + secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1); + evmcs_write_field(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + evmcs_write_field(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + /* no mask defined in the spec */ + evmcs_write_field(PAGE_FAULT_ERROR_CODE_MASK, + page_fault_error_code_mask, 0xffff); + evmcs_write_field(PAGE_FAULT_ERROR_CODE_MATCH, + page_fault_error_code_match, 0xffff); + evmcs_write_field(CR3_TARGET_COUNT, cr3_target_count, + 0xffff); + evmcs_write_field(VM_EXIT_MSR_STORE_COUNT, + vm_exit_msr_store_count, 0xffff); + evmcs_write_field(VM_EXIT_MSR_LOAD_COUNT, + vm_exit_msr_load_count, 0xffff); + evmcs_write_field(VM_ENTRY_MSR_LOAD_COUNT, + vm_entry_msr_load_count, 0xffff); + + /* 16 bit fields */ + evmcs_write_field(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + 
evmcs_write_field(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1); + evmcs_write_field(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2); + evmcs_write_field(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT); + default: + pr_err("VMX: no EVMCS support write:0x%lx\n", field); + } +} + +static u64 evmcs_read(unsigned long field) +{ + int cpu = smp_processor_id(); + struct hv_enlightened_vmcs *evmcs = + __va(hv_vp_assist_page[cpu]->current_nested_vmcs); + + switch (field) { + /* 64 bit fields */ + evmcs_read_field(GUEST_RIP, guest_rip); + evmcs_read_field(GUEST_RSP, guest_rsp); + evmcs_read_field(GUEST_RFLAGS, guest_rflags); + evmcs_read_field(HOST_IA32_PAT, host_ia32_pat); + evmcs_read_field(HOST_IA32_EFER, host_ia32_efer); + evmcs_read_field(HOST_CR0, host_cr0); + evmcs_read_field(HOST_CR3, host_cr3); + evmcs_read_field(HOST_CR4, host_cr4); + evmcs_read_field(HOST_IA32_SYSENTER_ESP, + host_ia32_sysenter_esp); + evmcs_read_field(HOST_IA32_SYSENTER_EIP, + host_ia32_sysenter_eip); + evmcs_read_field(HOST_RIP, host_rip); + evmcs_read_field(IO_BITMAP_A, io_bitmap_a); + evmcs_read_field(IO_BITMAP_B, io_bitmap_b); + evmcs_read_field(MSR_BITMAP, msr_bitmap); + evmcs_read_field(GUEST_ES_BASE, guest_es_base); + evmcs_read_field(GUEST_CS_BASE, guest_cs_base); + evmcs_read_field(GUEST_SS_BASE, guest_ss_base); + evmcs_read_field(GUEST_DS_BASE, guest_ds_base); + evmcs_read_field(GUEST_FS_BASE, guest_fs_base); + evmcs_read_field(GUEST_GS_BASE, guest_gs_base); + evmcs_read_field(GUEST_LDTR_BASE, guest_ldtr_base); + evmcs_read_field(GUEST_TR_BASE, guest_tr_base); + evmcs_read_field(GUEST_GDTR_BASE, guest_gdtr_base); + evmcs_read_field(GUEST_IDTR_BASE, guest_idtr_base); + evmcs_read_field(TSC_OFFSET, tsc_offset); + evmcs_read_field(VIRTUAL_APIC_PAGE_ADDR, + virtual_apic_page_addr); + evmcs_read_field(VMCS_LINK_POINTER, vmcs_link_pointer); + evmcs_read_field(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl); + evmcs_read_field(GUEST_IA32_PAT, guest_ia32_pat); + evmcs_read_field(GUEST_IA32_EFER, guest_ia32_efer); + evmcs_read_field(GUEST_PDPTR0, guest_pdptr0); + evmcs_read_field(GUEST_PDPTR1, guest_pdptr1); + evmcs_read_field(GUEST_PDPTR2, guest_pdptr2); + evmcs_read_field(GUEST_PDPTR3, guest_pdptr3); + evmcs_read_field(GUEST_PENDING_DBG_EXCEPTIONS, + guest_pending_dbg_exceptions); + evmcs_read_field(GUEST_SYSENTER_ESP, guest_sysenter_esp); + evmcs_read_field(GUEST_SYSENTER_EIP, guest_sysenter_eip); 
+ evmcs_read_field(CR0_GUEST_HOST_MASK, cr0_guest_host_mask); + evmcs_read_field(CR4_GUEST_HOST_MASK, cr4_guest_host_mask); + evmcs_read_field(CR0_READ_SHADOW, cr0_read_shadow); + evmcs_read_field(CR4_READ_SHADOW, cr4_read_shadow); + evmcs_read_field(GUEST_CR0, guest_cr0); + evmcs_read_field(GUEST_CR3, guest_cr3); + evmcs_read_field(GUEST_CR4, guest_cr4); + evmcs_read_field(GUEST_DR7, guest_dr7); + evmcs_read_field(HOST_FS_BASE, host_fs_base); + evmcs_read_field(HOST_GS_BASE, host_gs_base); + evmcs_read_field(HOST_TR_BASE, host_tr_base); + evmcs_read_field(HOST_GDTR_BASE, host_gdtr_base); + evmcs_read_field(HOST_IDTR_BASE, host_idtr_base); + evmcs_read_field(HOST_RSP, host_rsp); + evmcs_read_field(EPT_POINTER, ept_pointer); + evmcs_read_field(GUEST_BNDCFGS, guest_bndcfgs); + evmcs_read_field(XSS_EXIT_BITMAP, xss_exit_bitmap); + evmcs_read_field(GUEST_PHYSICAL_ADDRESS, + guest_physical_address); + evmcs_read_field(EXIT_QUALIFICATION, exit_qualification); + /* + * Not implemented in KVM: + * evmcs_read_field(0x00006402, exit_io_instruction_ecx); + * evmcs_read_field(0x00006404, exit_io_instruction_esi); + * evmcs_read_field(0x00006406, exit_io_instruction_esi); + * evmcs_read_field(0x00006408, exit_io_instruction_eip); + */ + evmcs_read_field(GUEST_LINEAR_ADDRESS, guest_linear_address); + + /* no mask defined in the spec */ + evmcs_read_field(VM_EXIT_MSR_STORE_ADDR, + vm_exit_msr_store_addr); + evmcs_read_field(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr); + evmcs_read_field(VM_ENTRY_MSR_LOAD_ADDR, + vm_entry_msr_load_addr); + evmcs_read_field(CR3_TARGET_VALUE0, cr3_target_value0); + evmcs_read_field(CR3_TARGET_VALUE1, cr3_target_value1); + evmcs_read_field(CR3_TARGET_VALUE2, cr3_target_value2); + evmcs_read_field(CR3_TARGET_VALUE3, cr3_target_value3); + + /* 32 bit fields */ + evmcs_read_field(TPR_THRESHOLD, tpr_threshold); + evmcs_read_field(GUEST_INTERRUPTIBILITY_INFO, + guest_interruptibility_info); + evmcs_read_field(CPU_BASED_VM_EXEC_CONTROL, + cpu_based_vm_exec_control); + evmcs_read_field(EXCEPTION_BITMAP, exception_bitmap); + evmcs_read_field(VM_ENTRY_CONTROLS, vm_entry_controls); + evmcs_read_field(VM_ENTRY_INTR_INFO_FIELD, + vm_entry_intr_info_field); + evmcs_read_field(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code); + evmcs_read_field(VM_ENTRY_INSTRUCTION_LEN, + vm_entry_instruction_len); + evmcs_read_field(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs); + evmcs_read_field(PIN_BASED_VM_EXEC_CONTROL, + pin_based_vm_exec_control); + evmcs_read_field(VM_EXIT_CONTROLS, vm_exit_controls); + evmcs_read_field(SECONDARY_VM_EXEC_CONTROL, + secondary_vm_exec_control); + evmcs_read_field(GUEST_ES_LIMIT, guest_es_limit); + evmcs_read_field(GUEST_CS_LIMIT, guest_cs_limit); + evmcs_read_field(GUEST_SS_LIMIT, guest_ss_limit); + evmcs_read_field(GUEST_DS_LIMIT, guest_ds_limit); + evmcs_read_field(GUEST_FS_LIMIT, guest_fs_limit); + evmcs_read_field(GUEST_GS_LIMIT, guest_gs_limit); + evmcs_read_field(GUEST_LDTR_LIMIT, guest_ldtr_limit); + evmcs_read_field(GUEST_TR_LIMIT, guest_tr_limit); + evmcs_read_field(GUEST_GDTR_LIMIT, guest_gdtr_limit); + evmcs_read_field(GUEST_IDTR_LIMIT, guest_idtr_limit); + evmcs_read_field(GUEST_ES_AR_BYTES, guest_es_ar_bytes); + evmcs_read_field(GUEST_CS_AR_BYTES, guest_cs_ar_bytes); + evmcs_read_field(GUEST_SS_AR_BYTES, guest_ss_ar_bytes); + evmcs_read_field(GUEST_DS_AR_BYTES, guest_ds_ar_bytes); + evmcs_read_field(GUEST_FS_AR_BYTES, guest_fs_ar_bytes); + evmcs_read_field(GUEST_GS_AR_BYTES, guest_gs_ar_bytes); + evmcs_read_field(GUEST_LDTR_AR_BYTES, 
guest_ldtr_ar_bytes); + evmcs_read_field(GUEST_TR_AR_BYTES, guest_tr_ar_bytes); + evmcs_read_field(GUEST_ACTIVITY_STATE, guest_activity_state); + evmcs_read_field(GUEST_SYSENTER_CS, guest_sysenter_cs); + evmcs_read_field(VM_INSTRUCTION_ERROR, vm_instruction_error); + evmcs_read_field(VM_EXIT_REASON, vm_exit_reason); + evmcs_read_field(VM_EXIT_INTR_INFO, vm_exit_intr_info); + evmcs_read_field(VM_EXIT_INTR_ERROR_CODE, + vm_exit_intr_error_code); + evmcs_read_field(IDT_VECTORING_INFO_FIELD, + idt_vectoring_info_field); + evmcs_read_field(IDT_VECTORING_ERROR_CODE, + idt_vectoring_error_code); + evmcs_read_field(VM_EXIT_INSTRUCTION_LEN, + vm_exit_instruction_len); + evmcs_read_field(VMX_INSTRUCTION_INFO, vmx_instruction_info); + /* no mask defined in the spec */ + evmcs_read_field(PAGE_FAULT_ERROR_CODE_MASK, + page_fault_error_code_mask); + evmcs_read_field(PAGE_FAULT_ERROR_CODE_MATCH, + page_fault_error_code_match); + evmcs_read_field(CR3_TARGET_COUNT, cr3_target_count); + evmcs_read_field(VM_EXIT_MSR_STORE_COUNT, + vm_exit_msr_store_count); + evmcs_read_field(VM_EXIT_MSR_LOAD_COUNT, + vm_exit_msr_load_count); + evmcs_read_field(VM_ENTRY_MSR_LOAD_COUNT, + vm_entry_msr_load_count); + + /* 16 bit fields */ + evmcs_read_field(HOST_ES_SELECTOR, host_es_selector); + evmcs_read_field(HOST_CS_SELECTOR, host_cs_selector); + evmcs_read_field(HOST_SS_SELECTOR, host_ss_selector); + evmcs_read_field(HOST_DS_SELECTOR, host_ds_selector); + evmcs_read_field(HOST_FS_SELECTOR, host_fs_selector); + evmcs_read_field(HOST_GS_SELECTOR, host_gs_selector); + evmcs_read_field(HOST_TR_SELECTOR, host_tr_selector); + evmcs_read_field(GUEST_ES_SELECTOR, guest_es_selector); + evmcs_read_field(GUEST_CS_SELECTOR, guest_cs_selector); + evmcs_read_field(GUEST_SS_SELECTOR, guest_ss_selector); + evmcs_read_field(GUEST_DS_SELECTOR, guest_ds_selector); + evmcs_read_field(GUEST_FS_SELECTOR, guest_fs_selector); + evmcs_read_field(GUEST_GS_SELECTOR, guest_gs_selector); + evmcs_read_field(GUEST_LDTR_SELECTOR, guest_ldtr_selector); + evmcs_read_field(GUEST_TR_SELECTOR, guest_tr_selector); + evmcs_read_field(VIRTUAL_PROCESSOR_ID, virtual_processor_id); + + default: + pr_err("VMX: no EVMCS support read:0x%lx\n", field); + } + + return 0; +} + static __always_inline void vmcs_check16(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, @@ -1676,18 +2199,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) static __always_inline u16 vmcs_read16(unsigned long field) { vmcs_check16(field); + if (enlightened_vmcs) + return evmcs_read(field); return __vmcs_readl(field); } static __always_inline u32 vmcs_read32(unsigned long field) { vmcs_check32(field); + if (enlightened_vmcs) + return evmcs_read(field); return __vmcs_readl(field); } static __always_inline u64 vmcs_read64(unsigned long field) { vmcs_check64(field); + if (enlightened_vmcs) + return evmcs_read(field); #ifdef CONFIG_X86_64 return __vmcs_readl(field); #else @@ -1698,6 +2227,8 @@ static __always_inline u64 vmcs_read64(unsigned long field) static __always_inline unsigned long vmcs_readl(unsigned long field) { vmcs_checkl(field); + if (enlightened_vmcs) + return evmcs_read(field); return __vmcs_readl(field); } @@ -1721,18 +2252,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val static __always_inline void vmcs_write16(unsigned long field, u16 value) { vmcs_check16(field); + if (enlightened_vmcs) + return evmcs_write(field, value); + __vmcs_writel(field, value); } static 
__always_inline void vmcs_write32(unsigned long field, u32 value)
 {
 	vmcs_check32(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 }

 static __always_inline void vmcs_write64(unsigned long field, u64 value)
 {
 	vmcs_check64(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 #ifndef CONFIG_X86_64
 	asm volatile ("");
@@ -1743,6 +2283,9 @@ static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
 {
 	vmcs_checkl(field);
+	if (enlightened_vmcs)
+		return evmcs_write(field, value);
+
 	__vmcs_writel(field, value);
 }

@@ -1750,6 +2293,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
 {
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
			 "vmcs_clear_bits does not support 64-bit fields");
+	if (enlightened_vmcs)
+		return evmcs_write(field, evmcs_read(field) & ~mask);
+
 	__vmcs_writel(field, __vmcs_readl(field) & ~mask);
 }

@@ -1757,6 +2303,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
 {
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
			 "vmcs_set_bits does not support 64-bit fields");
+	if (enlightened_vmcs)
+		return evmcs_write(field, evmcs_read(field) | mask);
+
 	__vmcs_writel(field, __vmcs_readl(field) | mask);
 }

@@ -3891,7 +4440,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	vmcs_conf->size = vmx_msr_high & 0x1fff;
 	vmcs_conf->order = get_order(vmcs_conf->size);
 	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-	vmcs_conf->revision_id = vmx_msr_low;
+
+	if (enlightened_vmcs)
+		vmcs_conf->revision_id = ms_hyperv.nested_features & 0xff;
+	else
+		vmcs_conf->revision_id = vmx_msr_low;

 	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
 	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -9520,6 +10073,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
+	struct hv_enlightened_vmcs *evmcs = NULL;

 	/* Record the guest's net vcpu time for enforced NMI injections.
	 */
 	if (unlikely(!enable_vnmi &&
@@ -9581,6 +10135,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmx_arm_hv_timer(vcpu);

 	vmx->__launched = vmx->loaded_vmcs->launched;
+
+	if (enlightened_vmcs) {
+		int cpu = smp_processor_id();
+
+		evmcs = __va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+		/* Crude hack: put RSP-8 to enlightened VMCS host_rsp field */
+		asm volatile ("mov %%rsp, (%%rax); sub $32, (%%rax)" : :
+			      "a"(&evmcs->host_rsp));
+		vmx->host_rsp = evmcs->host_rsp;
+	}
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9686,6 +10251,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );

+	/* All fields are CLEAN */
+	if (evmcs)
+		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
 	if (have_spec_ctrl) {
 		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 		if (vmx->spec_ctrl)
@@ -12463,7 +13032,29 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {

 static int __init vmx_init(void)
 {
-	int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+	int r;
+
+#ifdef CONFIG_HYPERVISOR_GUEST
+	if (enlightened_vmcs &&
+	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+		int cpu;
+
+		/* check that we have assist pages on all CPUs */
+		for_each_online_cpu(cpu) {
+			if (!hv_vp_assist_page[cpu]) {
+				enlightened_vmcs = false;
+				break;
+			}
+		}
+
+		if (enlightened_vmcs)
+			pr_info("VMX: using Hyper-V Enlightened VMCS\n");
+	} else {
+		enlightened_vmcs = false;
+	}
+#endif
+
+	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		return r;
-- 
2.14.3