Changes since v2:
- Move hyperv.h out of uapi [Radim Krčmář], PATCHes 1 and 2 added.
- define KVM_EVMCS_VERSION, HV_X64_ENLIGHTENED_VMCS_VERSION [Radim Krčmář]
- WARN_ONCE in get_evmcs_offset[,_cf] [Radim Krčmář]
- add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and
dump_vmcs() [Radim Krčmář]
When running nested KVM on Hyper-V it's possible to use the so-called
'Enlightened VMCS' and do normal memory reads/writes instead of
VMWRITE/VMREAD instructions. In addition, the clean field mask provides
significant room for optimization on L0's side.
A tight CPUID loop test shows a significant speedup (E5-2667 v4 @ 3.20GHz):
Before: 18890 cycles
After: 8304 cycles
Ladi Prosek (1):
x86/kvm: rename HV_X64_MSR_APIC_ASSIST_PAGE to
HV_X64_MSR_VP_ASSIST_PAGE
Vitaly Kuznetsov (6):
x86/hyper-v: move hyperv.h out of uapi
x86/hyper-v: move definitions from TLFS to hyperv-tlfs.h
x86/hyper-v: allocate and use Virtual Processor Assist Pages
x86/hyper-v: define struct hv_enlightened_vmcs and clean field bits
x86/hyper-v: detect nested features
x86/kvm: use Enlightened VMCS when running on Hyper-V
MAINTAINERS | 2 +-
arch/x86/hyperv/hv_init.c | 35 +-
arch/x86/include/asm/hyperv-tlfs.h | 695 +++++++++++++++++++++++++++++++++++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/asm/mshyperv.h | 91 +----
arch/x86/include/uapi/asm/hyperv.h | 421 ---------------------
arch/x86/include/uapi/asm/kvm_para.h | 1 -
arch/x86/kernel/cpu/mshyperv.c | 21 +-
arch/x86/kvm/hyperv.c | 8 +-
arch/x86/kvm/lapic.h | 2 +-
arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 2 +-
drivers/hv/connection.c | 1 -
drivers/hv/hv.c | 1 -
drivers/hv/hyperv_vmbus.h | 1 +
drivers/hv/vmbus_drv.c | 1 -
include/linux/hyperv.h | 1 -
17 files changed, 1379 insertions(+), 530 deletions(-)
create mode 100644 arch/x86/include/asm/hyperv-tlfs.h
delete mode 100644 arch/x86/include/uapi/asm/hyperv.h
--
2.14.3
Enlightened VMCS is just a structure in memory; the main benefit,
besides avoiding the somewhat slower VMREAD/VMWRITE instructions, is the
clean field mask: we tell the underlying hypervisor which fields were
modified since the last VMEXIT so it doesn't need to inspect them all.
A tight CPUID loop test shows a significant speedup:
Before: 18890 cycles
After: 8304 cycles
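To illustrate what the clean field mask buys on L0's side (this sketch is
not part of the patch and the sync_* helpers are hypothetical), L0 can skip
whole groups of fields that L1 left untouched since the last VMEXIT:

static void l0_sync_evmcs(struct hv_enlightened_vmcs *evmcs)
{
	/* L1 clears a group's bit in hv_clean_fields when it writes a field */
	u32 dirty = ~evmcs->hv_clean_fields;

	if (dirty & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC)
		sync_guest_basic_state(evmcs);		/* hypothetical helper */
	if (dirty & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
		sync_msr_bitmap(evmcs->msr_bitmap);	/* hypothetical helper */
	/* ... every other clean group is skipped entirely ... */
}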
A static key is used to avoid a performance penalty for non-Hyper-V
deployments. Tests show we add around 3 (three) CPU cycles on each
VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
loop on bare metal). We could probably avoid one test/jmp in
vmx_vcpu_run(), but I don't see a clean way to use a static key in assembly.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
Changes since v2:
- define KVM_EVMCS_VERSION [Radim Krčmář]
- WARN_ONCE in get_evmcs_offset[,_cf] [Radim Krčmář]
- add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and
dump_vmcs() [Radim Krčmář]
---
arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 615 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 051dab74e4e9..44b6efa7d54e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -53,6 +53,7 @@
#include <asm/mmu_context.h>
#include <asm/microcode.h>
#include <asm/nospec-branch.h>
+#include <asm/mshyperv.h>
#include "trace.h"
#include "pmu.h"
@@ -1000,6 +1001,484 @@ static const u32 vmx_msr_index[] = {
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
+DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+#define KVM_EVMCS_VERSION 1
+
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_mask)[ROL16(number, 6)] = \
+ (u32)EVMCS1_OFFSET(name) | ((u32)clean_mask << 16)
+
+/*
+ * Lower 16 bits encode the offset of the field in struct hv_enlightened_vmcs,
+ * upper 16 bits hold the clean field mask.
+ */
+static const u32 vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_edi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+
+static inline u16 get_evmcs_offset(unsigned long field)
+{
+ unsigned int index = ROL16(field, 6);
+
+ if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
+ WARN_ONCE(1, "kvm: reading unsupported EVMCS field %lx\n",
+ field);
+ return 0;
+ }
+
+ return (u16)vmcs_field_to_evmcs_1[index];
+}
+
+static inline u16 get_evmcs_offset_cf(unsigned long field, u16 *clean_field)
+{
+ unsigned int index = ROL16(field, 6);
+ u32 evmcs_field;
+
+ if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
+ WARN_ONCE(1, "kvm: writing unsupported EVMCS field %lx\n",
+ field);
+ return 0;
+ }
+
+ evmcs_field = vmcs_field_to_evmcs_1[index];
+
+ *clean_field = evmcs_field >> 16;
+
+ return (u16)evmcs_field;
+}
+
+static inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ u16 offset = get_evmcs_offset_cf(field, &clean_field);
+
+ if (!offset)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ u16 offset = get_evmcs_offset_cf(field, &clean_field);
+
+ if (!offset)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ u16 offset = get_evmcs_offset_cf(field, &clean_field);
+
+ if (!offset)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline u64 evmcs_read64(unsigned long field)
+{
+ u16 offset = get_evmcs_offset(field);
+
+ if (!offset)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static inline u32 evmcs_read32(unsigned long field)
+{
+ u16 offset = get_evmcs_offset(field);
+
+ if (!offset)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static inline u16 evmcs_read16(unsigned long field)
+{
+ u16 offset = get_evmcs_offset(field);
+
+ if (!offset)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static void vmcs_load_enlightened(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+static void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl,
+ u32 *pin_based_exec_ctrl)
+{
+ /*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ */
+ *pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+
+ /*
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ */
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VM_FUNCTION_CONTROL = 0x00002018, */
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+ /*
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ */
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * TSC_MULTIPLIER = 0x00002032,
+ */
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
+
+ /*
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ */
+ *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+
+ /*
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ */
+ *pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /*
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+}
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static inline void evmcs_write64(unsigned long field, u64 value) {}
+static inline void evmcs_write32(unsigned long field, u32 value) {}
+static inline void evmcs_write16(unsigned long field, u16 value) {}
+static inline u64 evmcs_read64(unsigned long field) { return 0; }
+static inline u32 evmcs_read32(unsigned long field) { return 0; }
+static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void vmcs_load_enlightened(u64 phys_addr) {}
+static inline void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl, u32 *pin_based_exec_ctrl) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1473,6 +1952,9 @@ static void vmcs_load(struct vmcs *vmcs)
u64 phys_addr = __pa(vmcs);
u8 error;
+ if (static_branch_unlikely(&enable_evmcs))
+ return vmcs_load_enlightened(phys_addr);
+
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
@@ -1646,18 +2128,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
static __always_inline u16 vmcs_read16(unsigned long field)
{
vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read16(field);
return __vmcs_readl(field);
}
static __always_inline u32 vmcs_read32(unsigned long field)
{
vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read32(field);
return __vmcs_readl(field);
}
static __always_inline u64 vmcs_read64(unsigned long field)
{
vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
#ifdef CONFIG_X86_64
return __vmcs_readl(field);
#else
@@ -1668,6 +2156,8 @@ static __always_inline u64 vmcs_read64(unsigned long field)
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
return __vmcs_readl(field);
}
@@ -1691,18 +2181,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write16(field, value);
+
__vmcs_writel(field, value);
}
static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, value);
+
__vmcs_writel(field, value);
}
static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
__vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
@@ -1713,6 +2212,9 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
__vmcs_writel(field, value);
}
@@ -1720,6 +2222,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_clear_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
__vmcs_writel(field, __vmcs_readl(field) & ~mask);
}
@@ -1727,6 +2232,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_set_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
__vmcs_writel(field, __vmcs_readl(field) | mask);
}
@@ -3596,6 +4104,14 @@ static int hardware_enable(void)
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * the VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
@@ -3829,7 +4345,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
- vmcs_conf->revision_id = vmx_msr_low;
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs_conf->revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -6990,6 +7511,17 @@ static __init int hardware_setup(void)
goto out;
}
+ if (static_branch_unlikely(&enable_evmcs)) {
+ evmcs_sanitize_exec_ctrls(&vmcs_config.cpu_based_2nd_exec_ctrl,
+ &vmcs_config.pin_based_exec_ctrl);
+ /*
+ * Enlightened VMCSv1 doesn't support these:
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ */
+ cpu_has_load_perf_global_ctrl = false;
+ }
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -8745,6 +9277,10 @@ static void dump_vmcs(void)
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_sanitize_exec_ctrls(&secondary_exec_control,
+ &pin_based_exec_ctrl);
+
pr_err("*** Guest State ***\n");
pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
@@ -8784,7 +9320,8 @@ static void dump_vmcs(void)
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
- if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if (cpu_has_load_perf_global_ctrl &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
@@ -8820,7 +9357,8 @@ static void dump_vmcs(void)
pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
vmcs_read64(HOST_IA32_EFER),
vmcs_read64(HOST_IA32_PAT));
- if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if (cpu_has_load_perf_global_ctrl &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
@@ -9397,7 +9935,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr3, cr4, evmcs_rsp;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -9463,6 +10001,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
vmx->__launched = vmx->loaded_vmcs->launched;
+
+ evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
+ (unsigned long)&current_evmcs->host_rsp : 0;
+
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9471,15 +10013,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
+ /* Avoid VMWRITE when Enlightened VMCS is in use */
+ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "jz 2f \n\t"
+ "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
+ "jmp 1f \n\t"
+ "2: \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
- "je 2f \n\t"
+ "je 3f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
- "2: \n\t"
+ "3: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
@@ -9548,7 +10096,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
- : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
@@ -9573,10 +10121,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
#ifdef CONFIG_X86_64
- , "rax", "rbx", "rdi", "rsi"
+ , "rax", "rbx", "rdi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
- , "eax", "ebx", "edi", "esi"
+ , "eax", "ebx", "edi"
#endif
);
@@ -9604,6 +10152,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
+ /* All fields are clean at this point */
+ if (static_branch_unlikely(&enable_evmcs))
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
update_debugctlmsr(vmx->host_debugctlmsr);
@@ -12419,7 +12972,36 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
static int __init vmx_init(void)
{
- int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ int r;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Enlightened VMCS usage should be recommended by the hypervisor and
+ * the host needs to support eVMCS v1 or above. eVMCS support can
+ * also be disabled with the 'enlightened_vmcs' module parameter.
+ */
+ if (enlightened_vmcs &&
+ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+ int cpu;
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+ }
+#endif
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
@@ -12440,6 +13022,29 @@ static void __exit vmx_exit(void)
#endif
kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
}
module_init(vmx_init)
--
2.14.3
Using Virtual Processor Assist Pages allows us to do optimized EOI
processing for the APIC, enable Enlightened VMCS support in KVM, and more.
struct hv_vp_assist_page is defined according to the Hyper-V TLFS v5.0b.
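As an aside, a rough sketch of the optimized EOI processing the assist page
enables (purely illustrative, not part of this patch): the TLFS describes
bit 0 of apic_assist as a "no EOI required" hint set by the hypervisor, so
a guest could do something like:

static void hv_lazy_eoi_write(u32 reg, u32 val)
{
	struct hv_vp_assist_page *hvp =
		hv_get_vp_assist_page(smp_processor_id());

	/* Consume the hint; skip the EOI write when the hypervisor allows it */
	if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1))
		return;

	native_apic_mem_write(reg, val);	/* fallback path, for illustration */
}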
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
Changes since v1:
- move HV_X64_ENLIGHTENED_VMCS_RECOMMENDED definition to this patch
---
arch/x86/hyperv/hv_init.c | 33 +++++++++++++++++++++++++++++++++
arch/x86/include/asm/hyperv-tlfs.h | 13 +++++++++++++
arch/x86/include/asm/mshyperv.h | 10 ++++++++++
3 files changed, 56 insertions(+)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 4b82bc206929..2e0c0351c5f8 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
+struct hv_vp_assist_page **hv_vp_assist_page;
+EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+
u32 hv_max_vp_index;
static int hv_cpu_init(unsigned int cpu)
@@ -101,6 +104,23 @@ static int hv_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
+ if (!hv_vp_assist_page)
+ return 0;
+
+ if (!hv_vp_assist_page[smp_processor_id()])
+ hv_vp_assist_page[smp_processor_id()] =
+ __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+
+ if (hv_vp_assist_page[smp_processor_id()]) {
+ u64 val;
+
+ val = vmalloc_to_pfn(hv_vp_assist_page[smp_processor_id()]);
+ val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
+ HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
+ }
+
return 0;
}
@@ -198,6 +218,12 @@ static int hv_cpu_die(unsigned int cpu)
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
+ vfree(hv_vp_assist_page[cpu]);
+ hv_vp_assist_page[cpu] = NULL;
+ }
+
if (hv_reenlightenment_cb == NULL)
return 0;
@@ -241,6 +267,13 @@ void hyperv_init(void)
if (!hv_vp_index)
return;
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ if (!hv_vp_assist_page) {
+ ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ return;
+ }
+
if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
hv_cpu_init, hv_cpu_die) < 0)
goto free_vp_index;
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 5c0d8fab87a3..3d0fba18ab4c 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -156,6 +156,9 @@
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
+/* Recommend using enlightened VMCS */
+#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14)
+
/*
* Crash notification flag.
*/
@@ -469,6 +472,16 @@ struct hv_timer_message_payload {
__u64 delivery_time; /* When the message was delivered */
};
+/* Define virtual processor assist page structure. */
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved;
+ __u64 vtl_control[2];
+ __u64 nested_enlightenments_control[2];
+ __u32 enlighten_vmentry;
+ __u64 current_nested_vmcs;
+};
+
#define HV_STIMER_ENABLE (1ULL << 0)
#define HV_STIMER_PERIODIC (1ULL << 1)
#define HV_STIMER_LAZY (1ULL << 2)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 38cfbe9a5794..dcb40567b35e 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -218,6 +218,12 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
*/
extern u32 *hv_vp_index;
extern u32 hv_max_vp_index;
+extern struct hv_vp_assist_page **hv_vp_assist_page;
+
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ return hv_vp_assist_page[cpu];
+}
/**
* hv_cpu_number_to_vp_number() - Map CPU to VP.
@@ -254,6 +260,10 @@ static inline void hyperv_setup_mmu_ops(void) {}
static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
static inline void clear_hv_tscchange_cb(void) {}
static inline void hyperv_stop_tsc_emulation(void) {};
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ return NULL;
+}
#endif /* CONFIG_HYPERV */
#ifdef CONFIG_HYPERV_TSCPAGE
--
2.14.3
TLFS 5.0 says: "Support for an enlightened VMCS interface is reported with
CPUID leaf 0x40000004. If an enlightened VMCS interface is supported,
additional nested enlightenments may be discovered by reading the CPUID
leaf 0x4000000A (see 2.4.11)."
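Combining the definitions added in this series, the discovery sequence
boils down to something like the following (illustrative sketch only):

static bool hv_enlightened_vmcs_v1_supported(void)
{
	if (!(ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED))
		return false;

	/* Low byte of CPUID 0x4000000A.EAX holds the supported eVMCS version */
	return (cpuid_eax(HYPERV_CPUID_NESTED_FEATURES) &
		HV_X64_ENLIGHTENED_VMCS_VERSION) >= 1;
}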
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
Changes since v2:
- define HV_X64_ENLIGHTENED_VMCS_VERSION [Radim Krčmář]
---
arch/x86/include/asm/hyperv-tlfs.h | 4 ++++
arch/x86/include/asm/mshyperv.h | 1 +
arch/x86/kernel/cpu/mshyperv.c | 4 ++++
3 files changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 472dc38110fa..2e2f0a741e84 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -14,6 +14,7 @@
#define HYPERV_CPUID_FEATURES 0x40000003
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
#define HYPERV_CPUID_MIN 0x40000005
@@ -332,6 +333,9 @@ struct hv_tsc_emulation_status {
#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
(~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+/* Hyper-V Enlightened VMCS version mask in nested features CPUID */
+#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff
+
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index dcb40567b35e..cc784b1f6bf1 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -13,6 +13,7 @@ struct ms_hyperv_info {
u32 features;
u32 misc_features;
u32 hints;
+ u32 nested_features;
u32 max_vp_index;
u32 max_lp_index;
};
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0dfc568c110c..8432bf215e30 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -208,6 +208,10 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED)
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
--
2.14.3
The definitions are according to the Hyper-V TLFS v5.0. KVM on Hyper-V will
use these.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
arch/x86/include/asm/hyperv-tlfs.h | 200 +++++++++++++++++++++++++++++++++++++
1 file changed, 200 insertions(+)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 3d0fba18ab4c..472dc38110fa 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -482,6 +482,206 @@ struct hv_vp_assist_page {
__u64 current_nested_vmcs;
};
+struct hv_enlightened_vmcs {
+ u32 revision_id;
+ u32 abort;
+
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+
+ u64 host_cr0;
+ u64 host_cr3;
+ u64 host_cr4;
+
+ u64 host_ia32_sysenter_esp;
+ u64 host_ia32_sysenter_eip;
+ u64 host_rip;
+ u32 host_ia32_sysenter_cs;
+
+ u32 pin_based_vm_exec_control;
+ u32 vm_exit_controls;
+ u32 secondary_vm_exec_control;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+
+ u64 guest_es_base;
+ u64 guest_cs_base;
+ u64 guest_ss_base;
+ u64 guest_ds_base;
+ u64 guest_fs_base;
+ u64 guest_gs_base;
+ u64 guest_ldtr_base;
+ u64 guest_tr_base;
+ u64 guest_gdtr_base;
+ u64 guest_idtr_base;
+
+ u64 padding64_1[3];
+
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+
+ u64 cr3_target_value0;
+ u64 cr3_target_value1;
+ u64 cr3_target_value2;
+ u64 cr3_target_value3;
+
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+
+ u32 cr3_target_count;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_msr_load_count;
+
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 vmcs_link_pointer;
+
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+
+ u64 guest_pending_dbg_exceptions;
+ u64 guest_sysenter_esp;
+ u64 guest_sysenter_eip;
+
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+
+ u64 cr0_guest_host_mask;
+ u64 cr4_guest_host_mask;
+ u64 cr0_read_shadow;
+ u64 cr4_read_shadow;
+ u64 guest_cr0;
+ u64 guest_cr3;
+ u64 guest_cr4;
+ u64 guest_dr7;
+
+ u64 host_fs_base;
+ u64 host_gs_base;
+ u64 host_tr_base;
+ u64 host_gdtr_base;
+ u64 host_idtr_base;
+ u64 host_rsp;
+
+ u64 ept_pointer;
+
+ u16 virtual_processor_id;
+ u16 padding16[3];
+
+ u64 padding64_2[5];
+ u64 guest_physical_address;
+
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+
+ u64 exit_qualification;
+ u64 exit_io_instruction_ecx;
+ u64 exit_io_instruction_esi;
+ u64 exit_io_instruction_edi;
+ u64 exit_io_instruction_eip;
+
+ u64 guest_linear_address;
+ u64 guest_rsp;
+ u64 guest_rflags;
+
+ u32 guest_interruptibility_info;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 vm_entry_controls;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+
+ u64 guest_rip;
+
+ u32 hv_clean_fields;
+ u32 hv_padding_32;
+ u32 hv_synthetic_controls;
+ u32 hv_enlightenments_control;
+ u32 hv_vp_id;
+
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 padding64_4[4];
+ u64 guest_bndcfgs;
+ u64 padding64_5[7];
+ u64 xss_exit_bitmap;
+ u64 padding64_6[7];
+};
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+
#define HV_STIMER_ENABLE (1ULL << 0)
#define HV_STIMER_PERIODIC (1ULL << 1)
#define HV_STIMER_LAZY (1ULL << 2)
--
2.14.3
hyperv.h is not part of uapi; there are no (known) users outside of the
kernel. We are making changes to this file to match the current Hyper-V
TLFS and we don't want to maintain backwards compatibility.
Move the file, renaming it to hyperv-tlfs.h, to avoid confusing it with
mshyperv.h. In the future, all definitions from the TLFS should go there
and all kernel-only objects should go to mshyperv.h or include/linux/hyperv.h.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
MAINTAINERS | 2 +-
arch/x86/hyperv/hv_init.c | 2 +-
arch/x86/include/asm/hyperv-tlfs.h | 421 +++++++++++++++++++++++++++++++++++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/asm/mshyperv.h | 2 +-
arch/x86/include/uapi/asm/hyperv.h | 421 -----------------------------------
arch/x86/include/uapi/asm/kvm_para.h | 1 -
arch/x86/kernel/cpu/mshyperv.c | 2 +-
drivers/hv/connection.c | 1 -
drivers/hv/hv.c | 1 -
drivers/hv/hyperv_vmbus.h | 1 +
drivers/hv/vmbus_drv.c | 1 -
include/linux/hyperv.h | 1 -
13 files changed, 427 insertions(+), 430 deletions(-)
create mode 100644 arch/x86/include/asm/hyperv-tlfs.h
delete mode 100644 arch/x86/include/uapi/asm/hyperv.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 4623caf8d72d..80befd9f4775 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6531,7 +6531,7 @@ S: Maintained
F: Documentation/networking/netvsc.txt
F: arch/x86/include/asm/mshyperv.h
F: arch/x86/include/asm/trace/hyperv.h
-F: arch/x86/include/uapi/asm/hyperv.h
+F: arch/x86/include/asm/hyperv-tlfs.h
F: arch/x86/kernel/cpu/mshyperv.c
F: arch/x86/hyperv
F: drivers/hid/hid-hyperv.c
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 2edc49e7409b..4b82bc206929 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -21,7 +21,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
new file mode 100644
index 000000000000..e311a175014c
--- /dev/null
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -0,0 +1,421 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_HYPERV_TLFS_H
+#define _ASM_X86_HYPERV_TLFS_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
+#define HYPERV_CPUID_MIN 0x40000005
+#define HYPERV_CPUID_MAX 0x4000ffff
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/* Partition reference TSC MSR is available */
+#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9)
+
+/* A partition's reference time stamp counter (TSC) page */
+#define HV_X64_MSR_REFERENCE_TSC 0x40000021
+
+/*
+ * There is a single feature flag that signifies if the partition has access
+ * to MSRs with local APIC and TSC frequencies.
+ */
+#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
+
+/* AccessReenlightenmentControls privilege */
+#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
+
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/* Frequency MSRs available */
+#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8)
+
+/* Crash MSR available */
+#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+/* Guest crash data handler available */
+#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/*
+ * Virtual APIC support
+ */
+#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9)
+
+/* Recommend using the newer ExProcessorMasks interface */
+#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
+
+/*
+ * Crash notification flag.
+ */
+#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET 0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME 0x40000010
+
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+
+/* MSR used to retrieve the TSC frequency */
+#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
+
+/* MSR used to retrieve the local APIC timer frequency */
+#define HV_X64_MSR_APIC_FREQUENCY 0x40000023
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+/*
+ * Synthetic Timer MSRs. Four timers per vcpu.
+ */
+#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
+
+/* Hyper-V guest crash notification MSR's */
+#define HV_X64_MSR_CRASH_P0 0x40000100
+#define HV_X64_MSR_CRASH_P1 0x40000101
+#define HV_X64_MSR_CRASH_P2 0x40000102
+#define HV_X64_MSR_CRASH_P3 0x40000103
+#define HV_X64_MSR_CRASH_P4 0x40000104
+#define HV_X64_MSR_CRASH_CTL 0x40000105
+#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63)
+#define HV_X64_MSR_CRASH_PARAMS \
+ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+
+/* TSC emulation after migration */
+#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
+
+struct hv_reenlightenment_control {
+ __u64 vector:8;
+ __u64 reserved1:8;
+ __u64 enabled:1;
+ __u64 reserved2:15;
+ __u64 target_vp:32;
+};
+
+#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
+#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
+
+struct hv_tsc_emulation_control {
+ __u64 enabled:1;
+ __u64 reserved:63;
+};
+
+struct hv_tsc_emulation_status {
+ __u64 inprogress:1;
+ __u64 reserved:63;
+};
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Declare the various hypercall operations. */
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
+#define HVCALL_POST_MESSAGE 0x005c
+#define HVCALL_SIGNAL_EVENT 0x005d
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
+#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+#define HV_FLUSH_ALL_PROCESSORS BIT(0)
+#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
+#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
+#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
+
+enum HV_GENERIC_SET_FORMAT {
+ HV_GENERIC_SET_SPARCE_4K,
+ HV_GENERIC_SET_ALL,
+};
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+#define HV_STATUS_INSUFFICIENT_MEMORY 11
+#define HV_STATUS_INVALID_CONNECTION_ID 18
+#define HV_STATUS_INSUFFICIENT_BUFFERS 19
+
+typedef struct _HV_REFERENCE_TSC_PAGE {
+ __u32 tsc_sequence;
+ __u32 res1;
+ __u64 tsc_scale;
+ __s64 tsc_offset;
+} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+
+/* Define the number of synthetic interrupt sources. */
+#define HV_SYNIC_SINT_COUNT (16)
+/* Define the expected SynIC version. */
+#define HV_SYNIC_VERSION_1 (0x1)
+
+#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SINT_MASKED (1ULL << 16)
+#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
+#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
+
+#define HV_SYNIC_STIMER_COUNT (4)
+
+/* Define synthetic interrupt controller message constants. */
+#define HV_MESSAGE_SIZE (256)
+#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
+#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
+
+/* Define hypervisor message types. */
+enum hv_message_type {
+ HVMSG_NONE = 0x00000000,
+
+ /* Memory access messages. */
+ HVMSG_UNMAPPED_GPA = 0x80000000,
+ HVMSG_GPA_INTERCEPT = 0x80000001,
+
+ /* Timer notification messages. */
+ HVMSG_TIMER_EXPIRED = 0x80000010,
+
+ /* Error messages. */
+ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
+ HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
+ HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
+
+ /* Trace buffer complete messages. */
+ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
+
+ /* Platform-specific processor intercept messages. */
+ HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
+ HVMSG_X64_MSR_INTERCEPT = 0x80010001,
+ HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
+ HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
+ HVMSG_X64_APIC_EOI = 0x80010004,
+ HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
+};
+
+/* Define synthetic interrupt controller message flags. */
+union hv_message_flags {
+ __u8 asu8;
+ struct {
+ __u8 msg_pending:1;
+ __u8 reserved:7;
+ };
+};
+
+/* Define port identifier type. */
+union hv_port_id {
+ __u32 asu32;
+ struct {
+ __u32 id:24;
+ __u32 reserved:8;
+ } u;
+};
+
+/* Define synthetic interrupt controller message header. */
+struct hv_message_header {
+ __u32 message_type;
+ __u8 payload_size;
+ union hv_message_flags message_flags;
+ __u8 reserved[2];
+ union {
+ __u64 sender;
+ union hv_port_id port;
+ };
+};
+
+/* Define synthetic interrupt controller message format. */
+struct hv_message {
+ struct hv_message_header header;
+ union {
+ __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+ } u;
+};
+
+/* Define the synthetic interrupt message page layout. */
+struct hv_message_page {
+ struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
+};
+
+/* Define timer message payload structure. */
+struct hv_timer_message_payload {
+ __u32 timer_index;
+ __u32 reserved;
+ __u64 expiration_time; /* When the timer expired */
+ __u64 delivery_time; /* When the message was delivered */
+};
+
+#define HV_STIMER_ENABLE (1ULL << 0)
+#define HV_STIMER_PERIODIC (1ULL << 1)
+#define HV_STIMER_LAZY (1ULL << 2)
+#define HV_STIMER_AUTOENABLE (1ULL << 3)
+#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
+
+#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b605a5b6a30c..75bed25f1284 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,6 +34,7 @@
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
+#include <asm/hyperv-tlfs.h>
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 25283f7eb299..044323a59354 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -6,7 +6,7 @@
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <asm/io.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
/*
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
deleted file mode 100644
index 099414345865..000000000000
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ /dev/null
@@ -1,421 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_X86_HYPERV_H
-#define _ASM_X86_HYPERV_H
-
-#include <linux/types.h>
-
-/*
- * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
- * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
- */
-#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
-#define HYPERV_CPUID_INTERFACE 0x40000001
-#define HYPERV_CPUID_VERSION 0x40000002
-#define HYPERV_CPUID_FEATURES 0x40000003
-#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
-#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
-
-#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
-#define HYPERV_CPUID_MIN 0x40000005
-#define HYPERV_CPUID_MAX 0x4000ffff
-
-/*
- * Feature identification. EAX indicates which features are available
- * to the partition based upon the current partition privileges.
- */
-
-/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
-#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
-/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
-#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
-/* Partition reference TSC MSR is available */
-#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9)
-
-/* A partition's reference time stamp counter (TSC) page */
-#define HV_X64_MSR_REFERENCE_TSC 0x40000021
-
-/*
- * There is a single feature flag that signifies if the partition has access
- * to MSRs with local APIC and TSC frequencies.
- */
-#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
-
-/* AccessReenlightenmentControls privilege */
-#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
-
-/*
- * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
- * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
- */
-#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
-/*
- * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
- * HV_X64_MSR_STIMER3_COUNT) available
- */
-#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
-/*
- * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
- * are available
- */
-#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
-/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
-#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
-/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
-#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
-/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
-#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
- /*
- * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
- * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
- * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
- */
-#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
-
-/* Frequency MSRs available */
-#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8)
-
-/* Crash MSR available */
-#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
-
-/*
- * Feature identification: EBX indicates which flags were specified at
- * partition creation. The format is the same as the partition creation
- * flag structure defined in section Partition Creation Flags.
- */
-#define HV_X64_CREATE_PARTITIONS (1 << 0)
-#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
-#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
-#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
-#define HV_X64_POST_MESSAGES (1 << 4)
-#define HV_X64_SIGNAL_EVENTS (1 << 5)
-#define HV_X64_CREATE_PORT (1 << 6)
-#define HV_X64_CONNECT_PORT (1 << 7)
-#define HV_X64_ACCESS_STATS (1 << 8)
-#define HV_X64_DEBUGGING (1 << 11)
-#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
-#define HV_X64_CONFIGURE_PROFILER (1 << 13)
-
-/*
- * Feature identification. EDX indicates which miscellaneous features
- * are available to the partition.
- */
-/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
-#define HV_X64_MWAIT_AVAILABLE (1 << 0)
-/* Guest debugging support is available */
-#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
-/* Performance Monitor support is available*/
-#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
-/* Support for physical CPU dynamic partitioning events is available*/
-#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
-/*
- * Support for passing hypercall input parameter block via XMM
- * registers is available
- */
-#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
-/* Support for a virtual guest idle state is available */
-#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
-/* Guest crash data handler available */
-#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
-
-/*
- * Implementation recommendations. Indicates which behaviors the hypervisor
- * recommends the OS implement for optimal performance.
- */
- /*
- * Recommend using hypercall for address space switches rather
- * than MOV to CR3 instruction
- */
-#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0)
-/* Recommend using hypercall for local TLB flushes rather
- * than INVLPG or MOV to CR3 instructions */
-#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
-/*
- * Recommend using hypercall for remote TLB flushes rather
- * than inter-processor interrupts
- */
-#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
-/*
- * Recommend using MSRs for accessing APIC registers
- * EOI, ICR and TPR rather than their memory-mapped counterparts
- */
-#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
-/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
-#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
-/*
- * Recommend using relaxed timing for this partition. If used,
- * the VM should disable any watchdog timeouts that rely on the
- * timely delivery of external interrupts
- */
-#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
-
-/*
- * Virtual APIC support
- */
-#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9)
-
-/* Recommend using the newer ExProcessorMasks interface */
-#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
-
-/*
- * Crash notification flag.
- */
-#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63)
-
-/* MSR used to identify the guest OS. */
-#define HV_X64_MSR_GUEST_OS_ID 0x40000000
-
-/* MSR used to setup pages used to communicate with the hypervisor. */
-#define HV_X64_MSR_HYPERCALL 0x40000001
-
-/* MSR used to provide vcpu index */
-#define HV_X64_MSR_VP_INDEX 0x40000002
-
-/* MSR used to reset the guest OS. */
-#define HV_X64_MSR_RESET 0x40000003
-
-/* MSR used to provide vcpu runtime in 100ns units */
-#define HV_X64_MSR_VP_RUNTIME 0x40000010
-
-/* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
-
-/* MSR used to retrieve the TSC frequency */
-#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
-
-/* MSR used to retrieve the local APIC timer frequency */
-#define HV_X64_MSR_APIC_FREQUENCY 0x40000023
-
-/* Define the virtual APIC registers */
-#define HV_X64_MSR_EOI 0x40000070
-#define HV_X64_MSR_ICR 0x40000071
-#define HV_X64_MSR_TPR 0x40000072
-#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
-
-/* Define synthetic interrupt controller model specific registers. */
-#define HV_X64_MSR_SCONTROL 0x40000080
-#define HV_X64_MSR_SVERSION 0x40000081
-#define HV_X64_MSR_SIEFP 0x40000082
-#define HV_X64_MSR_SIMP 0x40000083
-#define HV_X64_MSR_EOM 0x40000084
-#define HV_X64_MSR_SINT0 0x40000090
-#define HV_X64_MSR_SINT1 0x40000091
-#define HV_X64_MSR_SINT2 0x40000092
-#define HV_X64_MSR_SINT3 0x40000093
-#define HV_X64_MSR_SINT4 0x40000094
-#define HV_X64_MSR_SINT5 0x40000095
-#define HV_X64_MSR_SINT6 0x40000096
-#define HV_X64_MSR_SINT7 0x40000097
-#define HV_X64_MSR_SINT8 0x40000098
-#define HV_X64_MSR_SINT9 0x40000099
-#define HV_X64_MSR_SINT10 0x4000009A
-#define HV_X64_MSR_SINT11 0x4000009B
-#define HV_X64_MSR_SINT12 0x4000009C
-#define HV_X64_MSR_SINT13 0x4000009D
-#define HV_X64_MSR_SINT14 0x4000009E
-#define HV_X64_MSR_SINT15 0x4000009F
-
-/*
- * Synthetic Timer MSRs. Four timers per vcpu.
- */
-#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
-#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
-#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
-#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
-#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
-#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
-#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
-#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
-
-/* Hyper-V guest crash notification MSR's */
-#define HV_X64_MSR_CRASH_P0 0x40000100
-#define HV_X64_MSR_CRASH_P1 0x40000101
-#define HV_X64_MSR_CRASH_P2 0x40000102
-#define HV_X64_MSR_CRASH_P3 0x40000103
-#define HV_X64_MSR_CRASH_P4 0x40000104
-#define HV_X64_MSR_CRASH_CTL 0x40000105
-#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63)
-#define HV_X64_MSR_CRASH_PARAMS \
- (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
-
-/* TSC emulation after migration */
-#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
-
-struct hv_reenlightenment_control {
- __u64 vector:8;
- __u64 reserved1:8;
- __u64 enabled:1;
- __u64 reserved2:15;
- __u64 target_vp:32;
-};
-
-#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
-#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
-
-struct hv_tsc_emulation_control {
- __u64 enabled:1;
- __u64 reserved:63;
-};
-
-struct hv_tsc_emulation_status {
- __u64 inprogress:1;
- __u64 reserved:63;
-};
-
-#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
-#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
-#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
- (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
-
-/* Declare the various hypercall operations. */
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
-#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
-#define HVCALL_POST_MESSAGE 0x005c
-#define HVCALL_SIGNAL_EVENT 0x005d
-
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
- (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
-
-#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
-#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
-
-#define HV_PROCESSOR_POWER_STATE_C0 0
-#define HV_PROCESSOR_POWER_STATE_C1 1
-#define HV_PROCESSOR_POWER_STATE_C2 2
-#define HV_PROCESSOR_POWER_STATE_C3 3
-
-#define HV_FLUSH_ALL_PROCESSORS BIT(0)
-#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
-#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
-#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
-
-enum HV_GENERIC_SET_FORMAT {
- HV_GENERIC_SET_SPARCE_4K,
- HV_GENERIC_SET_ALL,
-};
-
-/* hypercall status code */
-#define HV_STATUS_SUCCESS 0
-#define HV_STATUS_INVALID_HYPERCALL_CODE 2
-#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
-#define HV_STATUS_INVALID_ALIGNMENT 4
-#define HV_STATUS_INSUFFICIENT_MEMORY 11
-#define HV_STATUS_INVALID_CONNECTION_ID 18
-#define HV_STATUS_INSUFFICIENT_BUFFERS 19
-
-typedef struct _HV_REFERENCE_TSC_PAGE {
- __u32 tsc_sequence;
- __u32 res1;
- __u64 tsc_scale;
- __s64 tsc_offset;
-} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
-
-/* Define the number of synthetic interrupt sources. */
-#define HV_SYNIC_SINT_COUNT (16)
-/* Define the expected SynIC version. */
-#define HV_SYNIC_VERSION_1 (0x1)
-
-#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
-#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
-#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
-#define HV_SYNIC_SINT_MASKED (1ULL << 16)
-#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
-#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
-
-#define HV_SYNIC_STIMER_COUNT (4)
-
-/* Define synthetic interrupt controller message constants. */
-#define HV_MESSAGE_SIZE (256)
-#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
-#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
-
-/* Define hypervisor message types. */
-enum hv_message_type {
- HVMSG_NONE = 0x00000000,
-
- /* Memory access messages. */
- HVMSG_UNMAPPED_GPA = 0x80000000,
- HVMSG_GPA_INTERCEPT = 0x80000001,
-
- /* Timer notification messages. */
- HVMSG_TIMER_EXPIRED = 0x80000010,
-
- /* Error messages. */
- HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
- HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
- HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
-
- /* Trace buffer complete messages. */
- HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
-
- /* Platform-specific processor intercept messages. */
- HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
- HVMSG_X64_MSR_INTERCEPT = 0x80010001,
- HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
- HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
- HVMSG_X64_APIC_EOI = 0x80010004,
- HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
-};
-
-/* Define synthetic interrupt controller message flags. */
-union hv_message_flags {
- __u8 asu8;
- struct {
- __u8 msg_pending:1;
- __u8 reserved:7;
- };
-};
-
-/* Define port identifier type. */
-union hv_port_id {
- __u32 asu32;
- struct {
- __u32 id:24;
- __u32 reserved:8;
- } u;
-};
-
-/* Define synthetic interrupt controller message header. */
-struct hv_message_header {
- __u32 message_type;
- __u8 payload_size;
- union hv_message_flags message_flags;
- __u8 reserved[2];
- union {
- __u64 sender;
- union hv_port_id port;
- };
-};
-
-/* Define synthetic interrupt controller message format. */
-struct hv_message {
- struct hv_message_header header;
- union {
- __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
- } u;
-};
-
-/* Define the synthetic interrupt message page layout. */
-struct hv_message_page {
- struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
-};
-
-/* Define timer message payload structure. */
-struct hv_timer_message_payload {
- __u32 timer_index;
- __u32 reserved;
- __u64 expiration_time; /* When the timer expired */
- __u64 delivery_time; /* When the message was delivered */
-};
-
-#define HV_STIMER_ENABLE (1ULL << 0)
-#define HV_STIMER_PERIODIC (1ULL << 1)
-#define HV_STIMER_LAZY (1ULL << 2)
-#define HV_STIMER_AUTOENABLE (1ULL << 3)
-#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
-
-#endif
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6cfa9c8cb7d6..17f2d04a3d88 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -3,7 +3,6 @@
#define _UAPI_ASM_X86_KVM_PARA_H
#include <linux/types.h>
-#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 9340f41ce8d3..04f760432a17 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -22,7 +22,7 @@
#include <linux/kexec.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/irq_regs.h>
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 447371f4de56..72855182b191 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -31,7 +31,6 @@
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <linux/export.h>
-#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index fe96aab9e794..45f3694bbb76 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -29,7 +29,6 @@
#include <linux/version.h>
#include <linux/interrupt.h>
#include <linux/clockchips.h>
-#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 22300ec7b556..500f805a6ef2 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -27,6 +27,7 @@
#include <linux/list.h>
#include <asm/sync_bitops.h>
+#include <asm/hyperv-tlfs.h>
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index bc65c4d79c1f..b10fe26c4891 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -36,7 +36,6 @@
#include <linux/cpu.h>
#include <linux/sched/task_stack.h>
-#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <linux/notifier.h>
#include <linux/ptrace.h>
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 93bd6fcd6e62..eed8b33b0173 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -26,7 +26,6 @@
#define _HYPERV_H
#include <uapi/linux/hyperv.h>
-#include <uapi/asm/hyperv.h>
#include <linux/types.h>
#include <linux/scatterlist.h>
--
2.14.3
From: Ladi Prosek <[email protected]>
The assist page has been used only for the paravirtual EOI so far, hence
the "APIC" in the MSR name. Renaming to match the Hyper-V TLFS where it's
called "Virtual VP Assist MSR".
Signed-off-by: Ladi Prosek <[email protected]>
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
arch/x86/include/asm/hyperv-tlfs.h | 10 +++++-----
arch/x86/kvm/hyperv.c | 8 ++++----
arch/x86/kvm/lapic.h | 2 +-
arch/x86/kvm/x86.c | 2 +-
4 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index d1f1f9d9f807..5c0d8fab87a3 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -189,7 +189,7 @@
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
#define HV_X64_MSR_TPR 0x40000072
-#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
/* Define synthetic interrupt controller model specific registers. */
#define HV_X64_MSR_SCONTROL 0x40000080
@@ -324,10 +324,10 @@ struct hv_tsc_emulation_status {
#define HVCALL_POST_MESSAGE 0x005c
#define HVCALL_SIGNAL_EVENT 0x005d
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
- (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index dc97f2544b6f..9fb0ed9b1670 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1009,17 +1009,17 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
hv->vp_index = (u32)data;
break;
- case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
u64 gfn;
unsigned long addr;
- if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
hv->hv_vapic = data;
if (kvm_lapic_enable_pv_eoi(vcpu, 0))
return 1;
break;
}
- gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
@@ -1129,7 +1129,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
- case HV_X64_MSR_APIC_ASSIST_PAGE:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
data = hv->hv_vapic;
break;
case HV_X64_MSR_VP_RUNTIME:
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 56c36014f7b7..edce055e9fd7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -109,7 +109,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+ return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 18b5ca7a3197..372db116e010 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1032,7 +1032,7 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_SCONTROL,
HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ HV_X64_MSR_VP_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
MSR_IA32_TSC_ADJUST,
--
2.14.3
mshyperv.h now only contains functions/variables we define in the kernel; all
definitions from the TLFS should go to hyperv-tlfs.h.
'enum hv_cpuid_function' is removed as we already have this info in
hyperv-tlfs.h; code in mshyperv.c is adjusted accordingly.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
arch/x86/include/asm/hyperv-tlfs.h | 57 ++++++++++++++++++++++++++++
arch/x86/include/asm/mshyperv.h | 78 +-------------------------------------
arch/x86/kernel/cpu/mshyperv.c | 15 ++++----
3 files changed, 66 insertions(+), 84 deletions(-)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index e311a175014c..d1f1f9d9f807 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -237,6 +237,55 @@
#define HV_X64_MSR_CRASH_PARAMS \
(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+/*
+ * Declare the MSR used to setup pages used to communicate with the hypervisor.
+ */
+union hv_x64_msr_hypercall_contents {
+ u64 as_uint64;
+ struct {
+ u64 enable:1;
+ u64 reserved:11;
+ u64 guest_physical_address:52;
+ };
+};
+
+/*
+ * TSC page layout.
+ */
+struct ms_hyperv_tsc_page {
+ volatile u32 tsc_sequence;
+ u32 reserved1;
+ volatile u64 tsc_scale;
+ volatile s64 tsc_offset;
+ u64 reserved2[509];
+};
+
+/*
+ * The guest OS needs to register the guest ID with the hypervisor.
+ * The guest ID is a 64 bit entity and the structure of this ID is
+ * specified in the Hyper-V specification:
+ *
+ * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
+ *
+ * While the current guideline does not specify how Linux guest ID(s)
+ * need to be generated, our plan is to publish the guidelines for
+ * Linux and other guest operating systems that currently are hosted
+ * on Hyper-V. The implementation here conforms to this yet
+ * unpublished guidelines.
+ *
+ *
+ * Bit(s)
+ * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
+ * 62:56 - Os Type; Linux is 0x100
+ * 55:48 - Distro specific identification
+ * 47:16 - Linux kernel version number
+ * 15:0 - Distro specific identification
+ *
+ *
+ */
+
+#define HV_LINUX_VENDOR_ID 0x8100
+
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
@@ -298,6 +347,14 @@ enum HV_GENERIC_SET_FORMAT {
HV_GENERIC_SET_ALL,
};
+#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
+#define HV_HYPERCALL_FAST_BIT BIT(16)
+#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_REP_COMP_OFFSET 32
+#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
+#define HV_HYPERCALL_REP_START_OFFSET 48
+#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
+
/* hypercall status code */
#define HV_STATUS_SUCCESS 0
#define HV_STATUS_INVALID_HYPERCALL_CODE 2
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 044323a59354..38cfbe9a5794 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -9,25 +9,6 @@
#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
-/*
- * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
- * is set by CPUID(HVCPUID_VERSION_FEATURES).
- */
-enum hv_cpuid_function {
- HVCPUID_VERSION_FEATURES = 0x00000001,
- HVCPUID_VENDOR_MAXFUNCTION = 0x40000000,
- HVCPUID_INTERFACE = 0x40000001,
-
- /*
- * The remaining functions depend on the value of
- * HVCPUID_INTERFACE
- */
- HVCPUID_VERSION = 0x40000002,
- HVCPUID_FEATURES = 0x40000003,
- HVCPUID_ENLIGHTENMENT_INFO = 0x40000004,
- HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005,
-};
-
struct ms_hyperv_info {
u32 features;
u32 misc_features;
@@ -38,58 +19,9 @@ struct ms_hyperv_info {
extern struct ms_hyperv_info ms_hyperv;
-/*
- * Declare the MSR used to setup pages used to communicate with the hypervisor.
- */
-union hv_x64_msr_hypercall_contents {
- u64 as_uint64;
- struct {
- u64 enable:1;
- u64 reserved:11;
- u64 guest_physical_address:52;
- };
-};
-
-/*
- * TSC page layout.
- */
-
-struct ms_hyperv_tsc_page {
- volatile u32 tsc_sequence;
- u32 reserved1;
- volatile u64 tsc_scale;
- volatile s64 tsc_offset;
- u64 reserved2[509];
-};
/*
- * The guest OS needs to register the guest ID with the hypervisor.
- * The guest ID is a 64 bit entity and the structure of this ID is
- * specified in the Hyper-V specification:
- *
- * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
- *
- * While the current guideline does not specify how Linux guest ID(s)
- * need to be generated, our plan is to publish the guidelines for
- * Linux and other guest operating systems that currently are hosted
- * on Hyper-V. The implementation here conforms to this yet
- * unpublished guidelines.
- *
- *
- * Bit(s)
- * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
- * 62:56 - Os Type; Linux is 0x100
- * 55:48 - Distro specific identification
- * 47:16 - Linux kernel version number
- * 15:0 - Distro specific identification
- *
- *
- */
-
-#define HV_LINUX_VENDOR_ID 0x8100
-
-/*
- * Generate the guest ID based on the guideline described above.
+ * Generate the guest ID.
*/
static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
@@ -215,14 +147,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_status;
}
-#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
-#define HV_HYPERCALL_FAST_BIT BIT(16)
-#define HV_HYPERCALL_VARHEAD_OFFSET 17
-#define HV_HYPERCALL_REP_COMP_OFFSET 32
-#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
-#define HV_HYPERCALL_REP_START_OFFSET 48
-#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
-
/* Fast hypercall with 8 bytes of input and no output */
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
{
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 04f760432a17..0dfc568c110c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -180,8 +180,8 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
- ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS);
- ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS);
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
@@ -189,11 +189,12 @@ static void __init ms_hyperv_init_platform(void)
/*
* Extract host information.
*/
- if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) {
- hv_host_info_eax = cpuid_eax(HVCPUID_VERSION);
- hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION);
- hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
- hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
+ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
+ HYPERV_CPUID_VERSION) {
+ hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
+ hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
+ hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
+ hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
hv_host_info_eax, hv_host_info_ebx >> 16,
--
2.14.3
On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> Enlightened VMCS is just a structure in memory, the main benefit
> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
> mask: we tell the underlying hypervisor which fields were modified
> since VMEXIT so there's no need to inspect them all.
>
> Tight CPUID loop test shows significant speedup:
> Before: 18890 cycles
> After: 8304 cycles
>
> Static key is being used to avoid performance penalty for non-Hyper-V
> deployments. Tests show we add around 3 (three) CPU cycles on each
> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
> but I don't see a clean way to use static key in assembly.
STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
Thanks,
tglx
Thomas Gleixner <[email protected]> writes:
> On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
>
>> Enlightened VMCS is just a structure in memory, the main benefit
>> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
>> mask: we tell the underlying hypervisor which fields were modified
>> since VMEXIT so there's no need to inspect them all.
>>
>> Tight CPUID loop test shows significant speedup:
>> Before: 18890 cycles
>> After: 8304 cycles
>>
>> Static key is being used to avoid performance penalty for non-Hyper-V
>> deployments. Tests show we add around 3 (three) CPU cycles on each
>> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>> but I don't see a clean way to use static key in assembly.
>
> STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
>
Thanks for the tip,
with a single kernel user of these APIs it was easy to miss :-)
Unfortunately, these APIs are only present if HAVE_JUMP_LABEL and
(afaiu) we still care about KVM on !HAVE_JUMP_LABEL builds. It would be
nice if we can make them behave the same way static_branch_likely() and
friends do: compile into something else when !HAVE_JUMP_LABEL so we can
avoid nasty #ifdefs in C code.
That said I'd like to defer the question to KVM maintainers: Paolo,
Radim, what would you like me to do? Use STATIC_JUMP_IF_TRUE/FALSE as
they are, try to make them work for !HAVE_JUMP_LABEL and use them or
maybe we can commit the series as-is and have it as a future
optimization (e.g. when HAVE_JUMP_LABEL becomes mandatory)?
--
Vitaly
2018-03-12 15:19+0100, Vitaly Kuznetsov:
> Thomas Gleixner <[email protected]> writes:
>
> > On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> >
> >> Enlightened VMCS is just a structure in memory, the main benefit
> >> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
> >> mask: we tell the underlying hypervisor which fields were modified
> >> since VMEXIT so there's no need to inspect them all.
> >>
> >> Tight CPUID loop test shows significant speedup:
> >> Before: 18890 cycles
> >> After: 8304 cycles
> >>
> >> Static key is being used to avoid performance penalty for non-Hyper-V
> >> deployments. Tests show we add around 3 (three) CPU cycles on each
> >> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
> >> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
> >> but I don't see a clean way to use static key in assembly.
> >
> > STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
> >
>
> Thanks for the tip,
>
> with a single kernel user of these APIs it was easy to miss :-)
Indeed, I had no idea.
> Unfortunately, these APIs are only present if HAVE_JUMP_LABEL and
> (afaiu) we still care about KVM on !HAVE_JUMP_LABEL builds. It would be
> nice if we can make them behave the same way static_branch_likely() and
> friends do: compile into something else when !HAVE_JUMP_LABEL so we can
> avoid nasty #ifdefs in C code.
>
> That said I'd like to defer the question to KVM maintainers: Paolo,
> Radim, what would you like me to do? Use STATIC_JUMP_IF_TRUE/FALSE as
> they are, try to make them work for !HAVE_JUMP_LABEL and use them or
> maybe we can commit the series as-is and have it as a future
> optimization (e.g. when HAVE_JUMP_LABEL becomes mandatory)?
Please take a look into making a macro that uses STATIC_JUMP_IF_FALSE or
reads the value from provided static_key and does a test-jump, depending
on HAVE_JUMP_LABEL.
It doesn't need to be suited for general use, just something that moves
the ugliness away from vmx_vcpu_run.
(Although having it in jump_label.h would be great. I think the main
obstacle is clobbering of flags.)
If it were still looking horrible, I'm ok with the series as-is,
thanks.
> -----Original Message-----
> From: Vitaly Kuznetsov <[email protected]>
> Sent: Friday, March 9, 2018 6:03 AM
> To: [email protected]
> Cc: [email protected]; Paolo Bonzini <[email protected]>; Radim Krčmář
> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; Stephen Hemminger <[email protected]>; Michael
> Kelley (EOSG) <[email protected]>; Mohammed Gamal
> <[email protected]>; Cathy Avery <[email protected]>; Bandan Das <[email protected]>;
> [email protected]
> Subject: [PATCH v3 1/7] x86/hyper-v: move hyperv.h out of uapi
>
> hyperv.h is not part of uapi, there are no (known) users outside of kernel.
> We are making changes to this file to match current Hyper-V TLFS and we
> don't want to maintain backwards compatibility.
>
> Move the file, renaming it to hyperv-tlfs.h, to avoid confusing it with
> mshyperv.h. In the future, all definitions from the TLFS should go to it and
> all kernel objects should go to mshyperv.h or include/linux/hyperv.h.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
[snip]
> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> index 447371f4de56..72855182b191 100644
> --- a/drivers/hv/connection.c
> +++ b/drivers/hv/connection.c
> @@ -31,7 +31,6 @@
> #include <linux/vmalloc.h>
> #include <linux/hyperv.h>
> #include <linux/export.h>
> -#include <asm/hyperv.h>
This #include should remain and be changed to asm/hyperv-tlfs.h. This
file uses the hypercall status values HV_STATUS_*.
> #include <asm/mshyperv.h>
>
> #include "hyperv_vmbus.h"
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index fe96aab9e794..45f3694bbb76 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -29,7 +29,6 @@
> #include <linux/version.h>
> #include <linux/interrupt.h>
> #include <linux/clockchips.h>
> -#include <asm/hyperv.h>
This #include should remain and be changed to asm/hyperv-tlfs.h. This
file uses HV_MESSAGE_* values.
> #include <asm/mshyperv.h>
> #include "hyperv_vmbus.h"
>
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index 22300ec7b556..500f805a6ef2 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -27,6 +27,7 @@
>
> #include <linux/list.h>
> #include <asm/sync_bitops.h>
> +#include <asm/hyperv-tlfs.h>
> #include <linux/atomic.h>
> #include <linux/hyperv.h>
> #include <linux/interrupt.h>
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index bc65c4d79c1f..b10fe26c4891 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -36,7 +36,6 @@
> #include <linux/cpu.h>
> #include <linux/sched/task_stack.h>
>
> -#include <asm/hyperv.h>
This #include should remain and be changed to asm/hyperv-tlfs.h. This
file uses HVMSG_* values.
> #include <asm/mshyperv.h>
> #include <linux/notifier.h>
> #include <linux/ptrace.h>
Michael
> -----Original Message-----
> From: Vitaly Kuznetsov <[email protected]>
> Sent: Friday, March 9, 2018 6:03 AM
> To: [email protected]
> Cc: [email protected]; Paolo Bonzini <[email protected]>; Radim Krčmář
> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; Stephen Hemminger <[email protected]>; Michael
> Kelley (EOSG) <[email protected]>; Mohammed Gamal
> <[email protected]>; Cathy Avery <[email protected]>; Bandan Das <[email protected]>;
> [email protected]
> Subject: [PATCH v3 2/7] x86/hyper-v: move definitions from TLFS to hyperv-tlfs.h
>
> mshyperv.h now only contains functions/variables we define in the kernel; all
> definitions from the TLFS should go to hyperv-tlfs.h.
>
> 'enum hv_cpuid_function' is removed as we already have this info in
> hyperv-tlfs.h; code in mshyperv.c is adjusted accordingly.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Michael Kelley <[email protected]>
> -----Original Message-----
> From: Vitaly Kuznetsov <[email protected]>
> Sent: Friday, March 9, 2018 6:03 AM
> To: [email protected]
> Cc: [email protected]; Paolo Bonzini <[email protected]>; Radim Krčmář
> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; Stephen Hemminger <[email protected]>; Michael
> Kelley (EOSG) <[email protected]>; Mohammed Gamal
> <[email protected]>; Cathy Avery <[email protected]>; Bandan Das <[email protected]>;
> [email protected]
> Subject: [PATCH v3 4/7] x86/hyper-v: allocate and use Virtual Processor Assist Pages
>
> Virtual Processor Assist Pages usage allows us to do optimized EOI
> processing for APIC, enable Enlightened VMCS support in KVM and more.
> struct hv_vp_assist_page is defined according to the Hyper-V TLFS v5.0b.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Michael Kelley <[email protected]>
> -----Original Message-----
> From: Vitaly Kuznetsov <[email protected]>
> Sent: Friday, March 9, 2018 6:03 AM
> To: [email protected]
> Cc: [email protected]; Paolo Bonzini <[email protected]>; Radim Krčmář
> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; Stephen Hemminger <[email protected]>; Michael
> Kelley (EOSG) <[email protected]>; Mohammed Gamal
> <[email protected]>; Cathy Avery <[email protected]>; Bandan Das <[email protected]>;
> [email protected]
> Subject: [PATCH v3 5/7] x86/hyper-v: define struct hv_enlightened_vmcs and clean field bits
>
> The definitions are according to the Hyper-V TLFS v5.0. KVM on Hyper-V will
> use these.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Michael Kelley <[email protected]>
> -----Original Message-----
> From: Vitaly Kuznetsov <[email protected]>
> Sent: Friday, March 9, 2018 6:03 AM
> To: [email protected]
> Cc: [email protected]; Paolo Bonzini <[email protected]>; Radim Krčmář
> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; Stephen Hemminger <[email protected]>; Michael
> Kelley (EOSG) <[email protected]>; Mohammed Gamal
> <[email protected]>; Cathy Avery <[email protected]>; Bandan Das <[email protected]>;
> [email protected]
> Subject: [PATCH v3 6/7] x86/hyper-v: detect nested features
>
> TLFS 5.0 says: "Support for an enlightened VMCS interface is reported with
> CPUID leaf 0x40000004. If an enlightened VMCS interface is supported,
> additional nested enlightenments may be discovered by reading the CPUID
> leaf 0x4000000A (see 2.4.11)."
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Michael Kelley <[email protected]>
"Michael Kelley (EOSG)" <[email protected]> writes:
>> -----Original Message-----
>> From: Vitaly Kuznetsov <[email protected]>
>> Sent: Friday, March 9, 2018 6:03 AM
>> To: [email protected]
>> Cc: [email protected]; Paolo Bonzini <[email protected]>; Radim Krčmář
>> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
>> <[email protected]>; Stephen Hemminger <[email protected]>; Michael
>> Kelley (EOSG) <[email protected]>; Mohammed Gamal
>> <[email protected]>; Cathy Avery <[email protected]>; Bandan Das <[email protected]>;
>> [email protected]
>> Subject: [PATCH v3 1/7] x86/hyper-v: move hyperv.h out of uapi
>>
>> hyperv.h is not part of uapi, there are no (known) users outside of kernel.
>> We are making changes to this file to match current Hyper-V TLFS and we
>> don't want to maintain backwards compatibility.
>>
>> Move the file, renaming it to hyperv-tlfs.h, to avoid confusing it with
>> mshyperv.h. In the future, all definitions from the TLFS should go to it and
>> all kernel objects should go to mshyperv.h or include/linux/hyperv.h.
>>
>> Signed-off-by: Vitaly Kuznetsov <[email protected]>
>> ---
>
> [snip]
>
>> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
>> index 447371f4de56..72855182b191 100644
>> --- a/drivers/hv/connection.c
>> +++ b/drivers/hv/connection.c
>> @@ -31,7 +31,6 @@
>> #include <linux/vmalloc.h>
>> #include <linux/hyperv.h>
>> #include <linux/export.h>
>> -#include <asm/hyperv.h>
>
> This #include should remain and be changed to asm/hyperv-tlfs.h. This
> file uses the hypercall status values HV_STATUS_*.
>
>> #include <asm/mshyperv.h>
My idea was that <asm/mshyperv.h> already includes <asm/hyperv-tlfs.h> so
there's no need to include them both.
>>
>> #include "hyperv_vmbus.h"
>> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
>> index fe96aab9e794..45f3694bbb76 100644
>> --- a/drivers/hv/hv.c
>> +++ b/drivers/hv/hv.c
>> @@ -29,7 +29,6 @@
>> #include <linux/version.h>
>> #include <linux/interrupt.h>
>> #include <linux/clockchips.h>
>> -#include <asm/hyperv.h>
>
> This #include should remain and be changed to asm/hyperv-tlfs.h. This
> file uses HV_MESSAGE_* values.
>
Ditto.
>> #include <asm/mshyperv.h>
>> #include "hyperv_vmbus.h"
>>
>> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
>> index 22300ec7b556..500f805a6ef2 100644
>> --- a/drivers/hv/hyperv_vmbus.h
>> +++ b/drivers/hv/hyperv_vmbus.h
>> @@ -27,6 +27,7 @@
>>
>> #include <linux/list.h>
>> #include <asm/sync_bitops.h>
>> +#include <asm/hyperv-tlfs.h>
>> #include <linux/atomic.h>
>> #include <linux/hyperv.h>
>> #include <linux/interrupt.h>
>> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
>> index bc65c4d79c1f..b10fe26c4891 100644
>> --- a/drivers/hv/vmbus_drv.c
>> +++ b/drivers/hv/vmbus_drv.c
>> @@ -36,7 +36,6 @@
>> #include <linux/cpu.h>
>> #include <linux/sched/task_stack.h>
>>
>> -#include <asm/hyperv.h>
>
> This #include should remain and be changed to asm/hyperv-tlfs.h. This
> file uses HVMSG_* values.
>
Ditto.
>> #include <asm/mshyperv.h>
>> #include <linux/notifier.h>
>> #include <linux/ptrace.h>
Thanks for the review!
--
Vitaly
On 09/03/2018 15:02, Vitaly Kuznetsov wrote:
> Enlightened VMCS is just a structure in memory, the main benefit
> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
> mask: we tell the underlying hypervisor which fields were modified
> since VMEXIT so there's no need to inspect them all.
>
> Tight CPUID loop test shows significant speedup:
> Before: 18890 cycles
> After: 8304 cycles
>
> Static key is being used to avoid performance penalty for non-Hyper-V
> deployments. Tests show we add around 3 (three) CPU cycles on each
> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
> but I don't see a clean way to use static key in assembly.
If you want to live dangerously, you can use text_poke_early to change
the vmwrite to mov. It's just a single instruction, so it's probably
not too hard.
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> Changes since v2:
> - define KVM_EVMCS_VERSION [Radim Krčmář]
> - WARN_ONCE in get_evmcs_offset[,_cf] [Radim Krčmář]
> - add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and
> dump_vmcs() [Radim Krčmář]
> ---
> arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 615 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 051dab74e4e9..44b6efa7d54e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -53,6 +53,7 @@
> #include <asm/mmu_context.h>
> #include <asm/microcode.h>
> #include <asm/nospec-branch.h>
> +#include <asm/mshyperv.h>
>
> #include "trace.h"
> #include "pmu.h"
> @@ -1000,6 +1001,484 @@ static const u32 vmx_msr_index[] = {
> MSR_EFER, MSR_TSC_AUX, MSR_STAR,
> };
>
> +DEFINE_STATIC_KEY_FALSE(enable_evmcs);
> +
> +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> +static bool __read_mostly enlightened_vmcs = true;
> +module_param(enlightened_vmcs, bool, 0444);
> +
> +#define KVM_EVMCS_VERSION 1
> +
> +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
> +#define EVMCS1_FIELD(number, name, clean_mask)[ROL16(number, 6)] = \
> + (u32)EVMCS1_OFFSET(name) | ((u32)clean_mask << 16)
> +
> +/*
> + * Lower 16 bits encode offset of the field in struct hv_enlightened_vmcs,
> + * upped 16 bits hold clean field mask.
> + */
> +static const u32 vmcs_field_to_evmcs_1[] = {
> + /* 64 bit rw */
> + EVMCS1_FIELD(GUEST_RIP, guest_rip,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
Maybe we should use a single "#include"d file (like vmx_shadow_fields.h)
and share it between HV-on-KVM and KVM-on-HV.
...
> + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
> + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
> + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
> + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
> + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
> + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
> + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
We shouldn't use these on Hyper-V, should we (that is, shouldn't the
WARN below fire if you try---and so why include them at all)?
> +
> +static inline u16 get_evmcs_offset(unsigned long field)
> +{
> + unsigned int index = ROL16(field, 6);
> +
> + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
> + WARN_ONCE(1, "kvm: reading unsupported EVMCS field %lx\n",
> + field);
> + return 0;
> + }
> +
> + return (u16)vmcs_field_to_evmcs_1[index];
> +}
> +
> +static inline u16 get_evmcs_offset_cf(unsigned long field, u16 *clean_field)
> +{
> + unsigned int index = ROL16(field, 6);
> + u32 evmcs_field;
> +
> + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
> + WARN_ONCE(1, "kvm: writing unsupported EVMCS field %lx\n",
> + field);
> + return 0;
> + }
> +
> + evmcs_field = vmcs_field_to_evmcs_1[index];
> +
> + *clean_field = evmcs_field >> 16;
> +
> + return (u16)evmcs_field;
> +}
You can mark this __always_inline, and make it
if (clean_field)
*clean_field = evmcs_field >> 16;
or alternatively, use a two-element struct and do
evmcs_field = &vmcs_field_to_evmcs_1[index];
if (clean_field)
*clean_field = evmcs_field->clean_field;
return evmcs_field->offset;
Also, if you return int and make the WARN_ONCE case return -ENOENT, GCC
should be able to optimize out the "if (!offset)" (which becomes "if
(offset < 0)") in the callers. Nitpicking, but...
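Putting those pieces together, a rough sketch (purely illustrative, not the
posted code; EVMCS1_FIELD() would need adjusting to emit the struct
initializers) could look like:

struct evmcs_field {
	u16 offset;
	u16 clean_field;
};

static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
	/* EVMCS1_FIELD() adjusted to fill in {offset, clean_field}, e.g.: */
	[ROL16(GUEST_RIP, 6)] = {
		EVMCS1_OFFSET(guest_rip),
		HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE,
	},
	/* ... */
};

static __always_inline int get_evmcs_offset(unsigned long field,
					    u16 *clean_field)
{
	unsigned int index = ROL16(field, 6);
	const struct evmcs_field *evmcs_field;

	if (WARN_ONCE(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1),
		      "kvm: accessing unsupported EVMCS field %lx\n", field))
		return -ENOENT;

	evmcs_field = &vmcs_field_to_evmcs_1[index];

	/* Readers pass NULL, writers get the clean field mask back. */
	if (clean_field)
		*clean_field = evmcs_field->clean_field;

	return evmcs_field->offset;
}

Callers would then test for a negative return value instead of a zero offset.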
> +static void vmcs_load_enlightened(u64 phys_addr)
> +{
> + struct hv_vp_assist_page *vp_ap =
> + hv_get_vp_assist_page(smp_processor_id());
> +
> + vp_ap->current_nested_vmcs = phys_addr;
> + vp_ap->enlighten_vmentry = 1;
> +}
evmcs_load?
> +static void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl,
> + u32 *pin_based_exec_ctrl)
> +{
> + *pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> + *pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
How can these be set?
> @@ -3596,6 +4104,14 @@ static int hardware_enable(void)
> if (cr4_read_shadow() & X86_CR4_VMXE)
> return -EBUSY;
>
> + /*
> + * This can happen if we hot-added a CPU but failed to allocate
> + * VP assist page for it.
> + */
> + if (static_branch_unlikely(&enable_evmcs) &&
> + !hv_get_vp_assist_page(cpu))
> + return -EFAULT;
-ENODEV? Maybe add a printk, because this is really rare.
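For instance, something like this (rough sketch, message wording made up):

	if (static_branch_unlikely(&enable_evmcs) &&
	    !hv_get_vp_assist_page(cpu)) {
		pr_err("kvm: VP assist page missing for CPU%d, "
		       "cannot use enlightened VMCS\n", cpu);
		return -ENODEV;
	}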
> INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
> INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
> spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
> @@ -3829,7 +4345,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
> vmcs_conf->size = vmx_msr_high & 0x1fff;
> vmcs_conf->order = get_order(vmcs_conf->size);
> vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
> - vmcs_conf->revision_id = vmx_msr_low;
> +
> + /* KVM supports Enlightened VMCS v1 only */
> + if (static_branch_unlikely(&enable_evmcs))
> + vmcs_conf->revision_id = KVM_EVMCS_VERSION;
> + else
> + vmcs_conf->revision_id = vmx_msr_low;
>
> vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
> vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
> @@ -6990,6 +7511,17 @@ static __init int hardware_setup(void)
> goto out;
> }
>
> + if (static_branch_unlikely(&enable_evmcs)) {
> + evmcs_sanitize_exec_ctrls(&vmcs_config.cpu_based_2nd_exec_ctrl,
> + &vmcs_config.pin_based_exec_ctrl);
Why not do it in setup_vmcs_config after the vmcs_conf->vmentry_ctrl
assignment (and pass &vmcs_config, which there is "vmcs_conf", directly
to the function)? And if sanitizing clears the bits in vmentry_ctl and
vmexit_ctl, there's no need to clear cpu_has_load_perf_global_ctrl.
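Roughly like this (sketch only; the struct vmcs_config field names are assumed
here), called from setup_vmcs_config() under the enable_evmcs static key once
vmcs_conf->vmentry_ctrl has been filled in:

static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
	vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
	vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
	/* ... the remaining secondary controls eVMCSv1 doesn't support ... */
	vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
	vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
}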
> + /*
> + * Enlightened VMCSv1 doesn't support these:
> + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
> + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
> + */
> + cpu_has_load_perf_global_ctrl = false;
> + }
> +
> if (boot_cpu_has(X86_FEATURE_NX))
> kvm_enable_efer_bits(EFER_NX);
>
> @@ -8745,6 +9277,10 @@ static void dump_vmcs(void)
> if (cpu_has_secondary_exec_ctrls())
> secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>
> + if (static_branch_unlikely(&enable_evmcs))
> + evmcs_sanitize_exec_ctrls(&secondary_exec_control,
> + &pin_based_exec_ctrl);
This is wrong, we're reading the VMCS so the values must already be
sanitized (and if not, that's the bug and we want dump_vmcs to print the
"wrong" values).
> pr_err("*** Guest State ***\n");
> pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
> vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
> @@ -8784,7 +9320,8 @@ static void dump_vmcs(void)
> pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
> vmcs_read64(GUEST_IA32_DEBUGCTL),
> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
> - if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
> + if (cpu_has_load_perf_global_ctrl &&
> + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
> pr_err("PerfGlobCtl = 0x%016llx\n",
> vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
> if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
> @@ -8820,7 +9357,8 @@ static void dump_vmcs(void)
> pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
> vmcs_read64(HOST_IA32_EFER),
> vmcs_read64(HOST_IA32_PAT));
> - if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
> + if (cpu_has_load_perf_global_ctrl &&
> + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
> pr_err("PerfGlobCtl = 0x%016llx\n",
> vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
>
> @@ -9397,7 +9935,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
> static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> - unsigned long cr3, cr4;
> + unsigned long cr3, cr4, evmcs_rsp;
>
> /* Record the guest's net vcpu time for enforced NMI injections. */
> if (unlikely(!enable_vnmi &&
> @@ -9463,6 +10001,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>
> vmx->__launched = vmx->loaded_vmcs->launched;
> +
> + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
> + (unsigned long)&current_evmcs->host_rsp : 0;
(If you use text_poke_early, you can do this assignment unconditionally,
since it's just a single lea instruction).
> @@ -9604,6 +10152,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> /* Eliminate branch target predictions from guest mode */
> vmexit_fill_RSB();
>
> + /* All fields are clean at this point */
> + if (static_branch_unlikely(&enable_evmcs))
> + current_evmcs->hv_clean_fields |=
> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
> +
> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
> if (vmx->host_debugctlmsr)
> update_debugctlmsr(vmx->host_debugctlmsr);
> @@ -12419,7 +12972,36 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>
> static int __init vmx_init(void)
> {
> - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
> + int r;
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> + /*
> + * Enlightened VMCS usage should be recommended and the host needs
> + * to support eVMCS v1 or above. We can also disable eVMCS support
> + * with module parameter.
> + */
> + if (enlightened_vmcs &&
> + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
> + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
> + KVM_EVMCS_VERSION) {
> + int cpu;
> +
> + /* Check that we have assist pages on all online CPUs */
> + for_each_online_cpu(cpu) {
> + if (!hv_get_vp_assist_page(cpu)) {
> + enlightened_vmcs = false;
> + break;
> + }
> + }
> + if (enlightened_vmcs) {
> + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
> + static_branch_enable(&enable_evmcs);
> + }
> + }
A bit nicer to clear enlightened_vmcs in the "else" branch?
That's it. Nice work!
Paolo
> +#endif
> +
> + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
> __alignof__(struct vcpu_vmx), THIS_MODULE);
> if (r)
> return r;
> @@ -12440,6 +13022,29 @@ static void __exit vmx_exit(void)
> #endif
>
> kvm_exit();
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> + if (static_branch_unlikely(&enable_evmcs)) {
> + int cpu;
> + struct hv_vp_assist_page *vp_ap;
> + /*
> + * Reset everything to support using non-enlightened VMCS
> + * access later (e.g. when we reload the module with
> + * enlightened_vmcs=0)
> + */
> + for_each_online_cpu(cpu) {
> + vp_ap = hv_get_vp_assist_page(cpu);
> +
> + if (!vp_ap)
> + continue;
> +
> + vp_ap->current_nested_vmcs = 0;
> + vp_ap->enlighten_vmentry = 0;
> + }
> +
> + static_branch_disable(&enable_evmcs);
> + }
> +#endif
> }
>
> module_init(vmx_init)
>
On 12/03/2018 15:19, Vitaly Kuznetsov wrote:
>>> Static key is being used to avoid performance penalty for non-Hyper-V
>>> deployments. Tests show we add around 3 (three) CPU cycles on each
>>> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>>> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>>> but I don't see a clean way to use static key in assembly.
>> STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
>>
> Thanks for the tip,
>
> with a single kernel user of these APIs it was easy to miss :-)
>
> Unfortunately, these APIs are only present if HAVE_JUMP_LABEL and
> (afaiu) we still care about KVM on !HAVE_JUMP_LABEL builds. It would be
> nice if we can make them behave the same way static_branch_likely() and
> friends do: compile into something else when !HAVE_JUMP_LABEL so we can
> avoid nasty #ifdefs in C code.
>
> That said I'd like to defer the question to KVM maintainers: Paolo,
> Radim, what would you like me to do? Use STATIC_JUMP_IF_TRUE/FALSE as
> they are, try to make them work for !HAVE_JUMP_LABEL and use them or
> maybe we can commit the series as-is and have it as a future
> optimization (e.g. when HAVE_JUMP_LABEL becomes mandatory)?
With a single instruction to patch, poking at the text manually might be
an option... Otherwise, it's okay as-is.
Paolo
On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> @@ -198,6 +218,12 @@ static int hv_cpu_die(unsigned int cpu)
> struct hv_reenlightenment_control re_ctrl;
> unsigned int new_cpu;
>
> + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
> + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
> + vfree(hv_vp_assist_page[cpu]);
> + hv_vp_assist_page[cpu] = NULL;
So this is freed before the CPU is actually dead. And this runs in
preemptible context. Is the wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); enough to
prevent eventual users of the assist page on the outgoing CPU from
accessing it?
> if (hv_reenlightenment_cb == NULL)
> return 0;
>
> @@ -241,6 +267,13 @@ void hyperv_init(void)
> if (!hv_vp_index)
> return;
>
> + hv_vp_assist_page = kcalloc(num_possible_cpus(),
> + sizeof(*hv_vp_assist_page), GFP_KERNEL);
> + if (!hv_vp_assist_page) {
> + ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
> + return;
> + }
> +
> if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
> hv_cpu_init, hv_cpu_die) < 0)
> goto free_vp_index;
Shouldn't you free hv_vp_assist_page in the error path?
> +extern struct hv_vp_assist_page **hv_vp_assist_page;
> +
> +static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
> +{
> + return hv_vp_assist_page[cpu];
Shouldn't that check hv_vp_assist_page != NULL?
Thanks,
tglx
On Mon, 12 Mar 2018, Vitaly Kuznetsov wrote:
> Thomas Gleixner <[email protected]> writes:
> > On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> >> Static key is being used to avoid performance penalty for non-Hyper-V
> >> deployments. Tests show we add around 3 (three) CPU cycles on each
> >> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
> >> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
> >> but I don't see a clean way to use static key in assembly.
> >
> > STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
> >
>
> Thanks for the tip,
>
> with a single kernel user of these APIs it was easy to miss :-)
>
> Unfortunately, these APIs are only present if HAVE_JUMP_LABEL and
> (afaiu) we still care about KVM on !HAVE_JUMP_LABEL builds. It would be
> nice if we can make them behave the same way static_branch_likely() and
> friends do: compile into something else when !HAVE_JUMP_LABEL so we can
> avoid nasty #ifdefs in C code.
What's the reason for !jump label builds of a recent kernel? Old compilers?
Thanks,
tglx
On Fri, Mar 09, 2018 at 03:02:43PM +0100, Vitaly Kuznetsov wrote:
> hyperv.h is not part of uapi, there are no (known) users outside of kernel.
> We are making changes to this file to match current Hyper-V TLFS and we
> don't want to maintain backwards compatibility.
>
> Move the file renaming to hyperv-tlfs.h to avoid confusing it with
> mshyperv.h. In future, all definitions from TLFS should go to it and
> all kernel objects should go to mshyperv.h or include/linux/hyperv.h.
What is TLFS? And how does it differ from mshyperv.h?
On Wed, Mar 14, 2018 at 9:13 AM, Christoph Hellwig <[email protected]> wrote:
> On Fri, Mar 09, 2018 at 03:02:43PM +0100, Vitaly Kuznetsov wrote:
>> hyperv.h is not part of uapi, there are no (known) users outside of kernel.
>> We are making changes to this file to match current Hyper-V TLFS and we
>> don't want to maintain backwards compatibility.
>>
>> Move the file renaming to hyperv-tlfs.h to avoid confusing it with
>> mshyperv.h. In future, all definitions from TLFS should go to it and
>> all kernel objects should go to mshyperv.h or include/linux/hyperv.h.
>
> What is TLFS? And how does it differ from mshyperv.h?
The TLFS is the "Top-Level Functional Specification" for Hyper-V.
https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
Radim Krčmář <[email protected]> writes:
> 2018-03-12 15:19+0100, Vitaly Kuznetsov:
>>
>> That said I'd like to defer the question to KVM maintainers: Paolo,
>> Radim, what would you like me to do? Use STATIC_JUMP_IF_TRUE/FALSE as
>> they are, try to make them work for !HAVE_JUMP_LABEL and use them or
>> maybe we can commit the series as-is and have it as a future
>> optimization (e.g. when HAVE_JUMP_LABEL becomes mandatory)?
>
> Please take a look into making a macro that uses STATIC_JUMP_IF_FALSE or
> reads the value from provided static_key and does a test-jump, depending
> on HAVE_JUMP_LABEL.
> It doesn't need to be suited for general use, just something that moves
> the ugliness away from vmx_vcpu_run.
> (Although having it in jump_label.h would be great. I think the main
> obstacle is clobbering of flags.)
>
The other problem is that we actually have inline assembly and I'm not
sure how to use .macros from '#ifdef __ASSEMBLY__' sections there ...
anyway, I tried using the jump label magic and I ended up with the
following:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 44b6efa7d54e..fb15ccf260fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9932,10 +9932,26 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
}
+#ifdef HAVE_JUMP_LABEL
+#define STATIC_CHECK_EVMCS_INUSE(label, key) \
+ ".Lstatic_evmcs:\n\t" \
+ ".byte 0xe9\n\t" \
+ ".long " #label " - .Lstatic_evmcs_after\n\t" \
+ ".Lstatic_evmcs_after:\n" \
+ ".pushsection __jump_table, \"aw\" \n\t" \
+ _ASM_ALIGN "\n\t" \
+ _ASM_PTR ".Lstatic_evmcs, " #label ", %c[" #key "] + 1 \n\t" \
+ ".popsection \n\t"
+#else
+#define STATIC_CHECK_EVMCS_INUSE(label, key) \
+ "cmpl $0, (%c[" #key "])\n\t" \
+ "je " #label "\n\t"
+#endif
+
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4, evmcs_rsp;
+ unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -10002,9 +10018,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->__launched = vmx->loaded_vmcs->launched;
- evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
> -		(unsigned long)&current_evmcs->host_rsp : 0;
-
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -10013,12 +10026,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
- /* Avoid VMWRITE when Enlightened VMCS is in use */
- "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
- "jz 2f \n\t"
- "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
+ /* Avoid VMWRITE to HOST_SP when Enlightened VMCS is in use */
+ STATIC_CHECK_EVMCS_INUSE(.Lvmwrite_sp, enable_evmcs)
+ "mov %%" _ASM_SP ", %c[evmcs_hrsp](%2) \n\t"
"jmp 1f \n\t"
- "2: \n\t"
+ ".Lvmwrite_sp: \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
@@ -10096,10 +10108,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
- : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(current_evmcs),
+ [enable_evmcs]"i"(&enable_evmcs),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ [evmcs_hrsp]"i"(offsetof(struct hv_enlightened_vmcs, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
What I particularly dislike is that we now depend on jump label
internals. Generalizing this hack doesn't seem practical as the
non-HAVE_JUMP_LABEL path clobbers FLAGS and requiring users to know
that is cumbersome...
I'd say 'too ugly' but I can continue investigating if there are fresh ideas.
--
Vitaly
Thomas Gleixner <[email protected]> writes:
> On Mon, 12 Mar 2018, Vitaly Kuznetsov wrote:
>> Thomas Gleixner <[email protected]> writes:
>> > On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
>> >> Static key is being used to avoid performance penalty for non-Hyper-V
>> >> deployments. Tests show we add around 3 (three) CPU cycles on each
>> >> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>> >> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>> >> but I don't see a clean way to use static key in assembly.
>> >
>> > STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
>> >
>>
>> Thanks for the tip,
>>
>> with a single kernel user of these APIs it was easy to miss :-)
>>
>> Unfortunately, these APIs are only present if HAVE_JUMP_LABEL and
>> (afaiu) we still care about KVM on !HAVE_JUMP_LABEL builds. It would be
>> nice if we can make them behave the same way static_branch_likely() and
>> friends do: compile into something else when !HAVE_JUMP_LABEL so we can
>> avoid nasty #ifdefs in C code.
>
> What's the reason for !jump label builds of a recent kernel? Old compilers?
>
To be honest I don't see any, we can start depending on HAVE_JUMP_LABEL
for CONFIG_KVM I guess.
--
Vitaly
On Wed, 14 Mar 2018, Vitaly Kuznetsov wrote:
> Thomas Gleixner <[email protected]> writes:
> > On Mon, 12 Mar 2018, Vitaly Kuznetsov wrote:
> >> Thomas Gleixner <[email protected]> writes:
> >> > On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> >> >> Static key is being used to avoid performance penalty for non-Hyper-V
> >> >> deployments. Tests show we add around 3 (three) CPU cycles on each
> >> >> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
> >> >> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
> >> >> but I don't see a clean way to use static key in assembly.
> >> >
> >> > STATIC_JUMP_IF_TRUE, STATIC_JUMP_IF_FALSE are your friends.
> >> >
> >>
> >> Thanks for the tip,
> >>
> >> with a single kernel user of these APIs it was easy to miss :-)
> >>
> >> Unfortunately, these APIs are only present if HAVE_JUMP_LABEL and
> >> (afaiu) we still care about KVM on !HAVE_JUMP_LABEL builds. It would be
> >> nice if we can make them behave the same way static_branch_likely() and
> >> friends do: compile into something else when !HAVE_JUMP_LABEL so we can
> >> avoid nasty #ifdefs in C code.
> >
> > What's the reason for !jump label builds of a recent kernel? Old compilers?
> >
>
> To be honest I don't see any, we can start depending on HAVE_JUMP_LABEL
> for CONFIG_KVM I guess.
We currently try to move the minimum compiler version to one which provides
jump label support, so this should be a non-issue.
@Peter: What was the final conclusion of this discussion?
Thanks,
tglx
On Wed, Mar 14, 2018 at 08:59:25PM +0100, Thomas Gleixner wrote:
> We currently try to move the minimum compiler version to one which provides
> jump label support, so this should be a non-issue.
>
> @Peter: What was the final conclusion of this discussion?
We all said we'd do it. I just haven't come around to resending you that
patch. I'll try and do so tomorrow.
On Wed, Mar 14, 2018 at 09:42:42AM -0700, Joshua R. Poulson wrote:
> >> Move the file renaming to hyperv-tlfs.h to avoid confusing it with
> >> mshyperv.h. In future, all definitions from TLFS should go to it and
> >> all kernel objects should go to mshyperv.h or include/linux/hyperv.h.
> >
> > What is TLFS? And how does it differ from mshyperv.h?
>
> The TLFS is the "Top-Level Functional Specification" for Hyper-V.
>
> https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
Please put that reference in the changelog and header.
Paolo Bonzini <[email protected]> writes:
> On 09/03/2018 15:02, Vitaly Kuznetsov wrote:
>> Enlightened VMCS is just a structure in memory, the main benefit
>> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
>> mask: we tell the underlying hypervisor which fields were modified
>> since VMEXIT so there's no need to inspect them all.
>>
>> Tight CPUID loop test shows significant speedup:
>> Before: 18890 cycles
>> After: 8304 cycles
>>
>> Static key is being used to avoid performance penalty for non-Hyper-V
>> deployments. Tests show we add around 3 (three) CPU cycles on each
>> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>> but I don't see a clean way to use static key in assembly.
>
> If you want to live dangerously, you can use text_poke_early to change
> the vmwrite to mov. It's just a single instruction, so it's probably
> not too hard.
>
I'd say it's not worth it ...
>> Signed-off-by: Vitaly Kuznetsov <[email protected]>
>> ---
>> Changes since v2:
>> - define KVM_EVMCS_VERSION [Radim Krčmář]
>> - WARN_ONCE in get_evmcs_offset[,_cf] [Radim Krčmář]
>> - add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and
>> dump_vmcs() [Radim Krčmář]
>> ---
>> arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 615 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 051dab74e4e9..44b6efa7d54e 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -53,6 +53,7 @@
>> #include <asm/mmu_context.h>
>> #include <asm/microcode.h>
>> #include <asm/nospec-branch.h>
>> +#include <asm/mshyperv.h>
>>
>> #include "trace.h"
>> #include "pmu.h"
>> @@ -1000,6 +1001,484 @@ static const u32 vmx_msr_index[] = {
>> MSR_EFER, MSR_TSC_AUX, MSR_STAR,
>> };
>>
>> +DEFINE_STATIC_KEY_FALSE(enable_evmcs);
>> +
>> +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +static bool __read_mostly enlightened_vmcs = true;
>> +module_param(enlightened_vmcs, bool, 0444);
>> +
>> +#define KVM_EVMCS_VERSION 1
>> +
>> +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
>> +#define EVMCS1_FIELD(number, name, clean_mask)[ROL16(number, 6)] = \
>> + (u32)EVMCS1_OFFSET(name) | ((u32)clean_mask << 16)
>> +
>> +/*
>> + * Lower 16 bits encode offset of the field in struct hv_enlightened_vmcs,
>> + * upper 16 bits hold clean field mask.
>> + */
>> +static const u32 vmcs_field_to_evmcs_1[] = {
>> + /* 64 bit rw */
>> + EVMCS1_FIELD(GUEST_RIP, guest_rip,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
>
> Maybe we should use a single "#include"d file (like vmx_shadow_fields.h)
> and share it between HV-on-KVM and KVM-on-HV.
>
> ...
Actually, yes, looking at 13k+ lines of code in vmx.c makes me think
it's time we start doing something about it :-)
>
>> + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>
> We shouldn't use these on Hyper-V, should we (that is, shouldn't the
> WARN below fire if you try---and so why include them at all)?
>
True, these shouldn't be used and that's why there's no clean field
assigned to them. They, however, do have a corresponding eVMCS field.
I will try removing them in the next version.
>> +
>> +static inline u16 get_evmcs_offset(unsigned long field)
>> +{
>> + unsigned int index = ROL16(field, 6);
>> +
>> + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
>> + WARN_ONCE(1, "kvm: reading unsupported EVMCS field %lx\n",
>> + field);
>> + return 0;
>> + }
>> +
>> + return (u16)vmcs_field_to_evmcs_1[index];
>> +}
>> +
>> +static inline u16 get_evmcs_offset_cf(unsigned long field, u16 *clean_field)
>> +{
>> + unsigned int index = ROL16(field, 6);
>> + u32 evmcs_field;
>> +
>> + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
>> + WARN_ONCE(1, "kvm: writing unsupported EVMCS field %lx\n",
>> + field);
>> + return 0;
>> + }
>> +
>> + evmcs_field = vmcs_field_to_evmcs_1[index];
>> +
>> + *clean_field = evmcs_field >> 16;
>> +
>> + return (u16)evmcs_field;
>> +}
>
> You can mark this __always_inline, and make it
>
> if (clean_field)
> *clean_field = evmcs_field >> 16;
>
> or alternatively, use a two-element struct and do
>
> evmcs_field = &vmcs_field_to_evmcs_1[index];
> if (clean_field)
> *clean_field = evmcs_field->clean_field;
> return evmcs_field->offset;
>
> Also, if you return int and make the WARN_ONCE case return -ENOENT, GCC
> should be able to optimize out the "if (!offset)" (which becomes "if
> (offset < 0)") in the callers. Nitpicking, but...
>
Ok, good suggestion, I'll try.
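(Roughly, an untested sketch of the struct + __always_inline variant --
the struct name is made up, the rest just follows the suggestions above:)

struct evmcs_field {
	u16 offset;
	u16 clean_field;
};

static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
	/* ... same field list as before ... */
};

static __always_inline int get_evmcs_offset(unsigned long field,
					    u16 *clean_field)
{
	unsigned int index = ROL16(field, 6);
	const struct evmcs_field *evmcs_field;

	if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
		WARN_ONCE(1, "kvm: accessing unsupported EVMCS field %lx\n",
			  field);
		return -ENOENT;
	}

	evmcs_field = &vmcs_field_to_evmcs_1[index];

	if (clean_field)
		*clean_field = evmcs_field->clean_field;

	return evmcs_field->offset;
}

Callers could then do "offset = get_evmcs_offset(field, NULL)" and check
"if (offset < 0)", which GCC should fold nicely as you describe.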
>> +static void vmcs_load_enlightened(u64 phys_addr)
>> +{
>> + struct hv_vp_assist_page *vp_ap =
>> + hv_get_vp_assist_page(smp_processor_id());
>> +
>> + vp_ap->current_nested_vmcs = phys_addr;
>> + vp_ap->enlighten_vmentry = 1;
>> +}
>
> evmcs_load?
>
Works for me,
>> +static void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl,
>> + u32 *pin_based_exec_ctrl)
>> +{
>> +	*pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>> + *pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
>
> How can these be set?
>
They can not if Hyper-V behaves but Radim didn't want to trust it -- so
the suggestion was to forcefully disable unsupported controls.
>> @@ -3596,6 +4104,14 @@ static int hardware_enable(void)
>> if (cr4_read_shadow() & X86_CR4_VMXE)
>> return -EBUSY;
>>
>> + /*
>> + * This can happen if we hot-added a CPU but failed to allocate
>> + * VP assist page for it.
>> + */
>> + if (static_branch_unlikely(&enable_evmcs) &&
>> + !hv_get_vp_assist_page(cpu))
>> + return -EFAULT;
>
> -ENODEV? Maybe add a printk, because this is really rare.
>
Ok,
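(something along these lines -- the exact message is just a placeholder:)

	/*
	 * This can happen if we hot-added a CPU but failed to allocate
	 * the VP assist page for it.
	 */
	if (static_branch_unlikely(&enable_evmcs) &&
	    !hv_get_vp_assist_page(cpu)) {
		pr_err("kvm: VP assist page not allocated for CPU %u\n", cpu);
		return -ENODEV;
	}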
>> INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
>> INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
>> spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
>> @@ -3829,7 +4345,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>> vmcs_conf->size = vmx_msr_high & 0x1fff;
>> vmcs_conf->order = get_order(vmcs_conf->size);
>> vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
>> - vmcs_conf->revision_id = vmx_msr_low;
>> +
>> + /* KVM supports Enlightened VMCS v1 only */
>> + if (static_branch_unlikely(&enable_evmcs))
>> + vmcs_conf->revision_id = KVM_EVMCS_VERSION;
>> + else
>> + vmcs_conf->revision_id = vmx_msr_low;
>>
>> vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
>> vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
>> @@ -6990,6 +7511,17 @@ static __init int hardware_setup(void)
>> goto out;
>> }
>>
>> + if (static_branch_unlikely(&enable_evmcs)) {
>> + evmcs_sanitize_exec_ctrls(&vmcs_config.cpu_based_2nd_exec_ctrl,
>> + &vmcs_config.pin_based_exec_ctrl);
>
> Why not do it in setup_vmcs_config after the vmcs_conf->vmentry_ctrl
> assignment (and pass &vmcs_config, which there is "vmcs_conf", directly
> to the function)? And if sanitizing clears the bits in vmentry_ctl and
> vmexit_ctl, there's no need to clear cpu_has_load_perf_global_ctrl.
>
Ok, if we decide to keep 'sanitization' in place.
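(Sketch of that variant, assuming evmcs_sanitize_exec_ctrls() is
reworked to take the whole vmcs_config and to also clear the
VM_ENTRY/VM_EXIT perf-global-ctrl load bits:)

static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
	/* ... existing setup ... */
	vmcs_conf->vmexit_ctrl  = _vmexit_control;
	vmcs_conf->vmentry_ctrl = _vmentry_control;

	if (static_branch_unlikely(&enable_evmcs))
		evmcs_sanitize_exec_ctrls(vmcs_conf);

	/* ... */
	return 0;
}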
>> + /*
>> + * Enlightened VMCSv1 doesn't support these:
>> + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
>> + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
>> + */
>> +		cpu_has_load_perf_global_ctrl = false;
>> +	}
>> +
>> if (boot_cpu_has(X86_FEATURE_NX))
>> kvm_enable_efer_bits(EFER_NX);
>>
>> @@ -8745,6 +9277,10 @@ static void dump_vmcs(void)
>> if (cpu_has_secondary_exec_ctrls())
>> secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>>
>> + if (static_branch_unlikely(&enable_evmcs))
>> + evmcs_sanitize_exec_ctrls(&secondary_exec_control,
>> + &pin_based_exec_ctrl);
>
> This is wrong, we're reading the VMCS so the values must already be
> sanitized (and if not, that's the bug and we want dump_vmcs to print the
> "wrong" values).
The problem is that we vmcs_read these fields later in the function and
this will now WARN(). Initially, there was no WARN() for non-existent
fields so this could work (we would just print zeroes for unsupported
fields). Maybe the additional WARN_ON() is not a big deal here.
In reality, these controls should never be set.
>
>> pr_err("*** Guest State ***\n");
>> pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
>> vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
>> @@ -8784,7 +9320,8 @@ static void dump_vmcs(void)
>> pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
>> vmcs_read64(GUEST_IA32_DEBUGCTL),
>> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
>> - if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
>> + if (cpu_has_load_perf_global_ctrl &&
>> + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
>> pr_err("PerfGlobCtl = 0x%016llx\n",
>> vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
>> if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
>> @@ -8820,7 +9357,8 @@ static void dump_vmcs(void)
>> pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
>> vmcs_read64(HOST_IA32_EFER),
>> vmcs_read64(HOST_IA32_PAT));
>> - if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
>> + if (cpu_has_load_perf_global_ctrl &&
>> + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
>> pr_err("PerfGlobCtl = 0x%016llx\n",
>> vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
>>
>> @@ -9397,7 +9935,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
>> static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> {
>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>> - unsigned long cr3, cr4;
>> + unsigned long cr3, cr4, evmcs_rsp;
>>
>> /* Record the guest's net vcpu time for enforced NMI injections. */
>> if (unlikely(!enable_vnmi &&
>> @@ -9463,6 +10001,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>>
>> vmx->__launched = vmx->loaded_vmcs->launched;
>> +
>> + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
>> +		(unsigned long)&current_evmcs->host_rsp : 0;
>
> (If you use text_poke_early, you can do this assignment unconditionally,
> since it's just a single lea instruction).
>
Something to take a look at)
>> @@ -9604,6 +10152,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> /* Eliminate branch target predictions from guest mode */
>> vmexit_fill_RSB();
>>
>> + /* All fields are clean at this point */
>> + if (static_branch_unlikely(&enable_evmcs))
>> + current_evmcs->hv_clean_fields |=
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
>> +
>> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
>> if (vmx->host_debugctlmsr)
>> update_debugctlmsr(vmx->host_debugctlmsr);
>> @@ -12419,7 +12972,36 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>>
>> static int __init vmx_init(void)
>> {
>> - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
>> + int r;
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> + /*
>> + * Enlightened VMCS usage should be recommended and the host needs
>> + * to support eVMCS v1 or above. We can also disable eVMCS support
>> + * with module parameter.
>> + */
>> + if (enlightened_vmcs &&
>> + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
>> + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
>> + KVM_EVMCS_VERSION) {
>> + int cpu;
>> +
>> + /* Check that we have assist pages on all online CPUs */
>> + for_each_online_cpu(cpu) {
>> + if (!hv_get_vp_assist_page(cpu)) {
>> + enlightened_vmcs = false;
>> + break;
>> + }
>> + }
>> + if (enlightened_vmcs) {
>> + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
>> + static_branch_enable(&enable_evmcs);
>> + }
>> + }
>
> A bit nicer to clear enlightened_vmcs in the "else" branch?
Yes, as a precaution, why not. (But we should solely rely on
'enable_evmcs' later on).
>
> That's it. Nice work!
>
> Paolo
>
>> +#endif
>> +
>> + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
>> __alignof__(struct vcpu_vmx), THIS_MODULE);
>> if (r)
>> return r;
>> @@ -12440,6 +13022,29 @@ static void __exit vmx_exit(void)
>> #endif
>>
>> kvm_exit();
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> + if (static_branch_unlikely(&enable_evmcs)) {
>> + int cpu;
>> + struct hv_vp_assist_page *vp_ap;
>> + /*
>> + * Reset everything to support using non-enlightened VMCS
>> + * access later (e.g. when we reload the module with
>> + * enlightened_vmcs=0)
>> + */
>> + for_each_online_cpu(cpu) {
>> + vp_ap = hv_get_vp_assist_page(cpu);
>> +
>> + if (!vp_ap)
>> + continue;
>> +
>> + vp_ap->current_nested_vmcs = 0;
>> + vp_ap->enlighten_vmentry = 0;
>> + }
>> +
>> + static_branch_disable(&enable_evmcs);
>> + }
>> +#endif
>> }
>>
>> module_init(vmx_init)
>>
--
Vitaly
Thomas Gleixner <[email protected]> writes:
> On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
>> @@ -198,6 +218,12 @@ static int hv_cpu_die(unsigned int cpu)
>> struct hv_reenlightenment_control re_ctrl;
>> unsigned int new_cpu;
>>
>> + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
>> + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
>> + vfree(hv_vp_assist_page[cpu]);
>> + hv_vp_assist_page[cpu] = NULL;
>
> So this is freed before the CPU is actually dead. And this runs in
> preemtible context. Is the wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); enough to
> prevent eventual users of the assist page on the outgoing CPU from
> accessing it?
>
After we do wrmsrl() the page is no longer 'magic' so in case eventual
users try using it they'll most likely misbehave -- so changing the
shutdown order won't help.
The only user of these pages is currently KVM. Can we still have vCPUs
running on the outgoing CPU at this point? In case we can, we're in
trouble and we need to somehow kick them out first.
>> if (hv_reenlightenment_cb == NULL)
>> return 0;
>>
>> @@ -241,6 +267,13 @@ void hyperv_init(void)
>> if (!hv_vp_index)
>> return;
>>
>> + hv_vp_assist_page = kcalloc(num_possible_cpus(),
>> + sizeof(*hv_vp_assist_page), GFP_KERNEL);
>> + if (!hv_vp_assist_page) {
>> + ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
>> + return;
>> + }
>> +
>> if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
>> hv_cpu_init, hv_cpu_die) < 0)
>> goto free_vp_index;
>
> Shouldn't you free hv_vp_assist_page in the error path?
>
Yep, will do.
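(roughly -- the new label name is only illustrative, the kfree() just
has to come before the existing free_vp_index cleanup:)

	if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
			      hv_cpu_init, hv_cpu_die) < 0)
		goto free_vp_assist_page;
	/* ... */

free_vp_assist_page:
	kfree(hv_vp_assist_page);
	hv_vp_assist_page = NULL;
free_vp_index:
	/* existing cleanup */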
>> +extern struct hv_vp_assist_page **hv_vp_assist_page;
>> +
>> +static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
>> +{
>> + return hv_vp_assist_page[cpu];
>
> Shouldn't that check hv_vp_assist_page != NULL?
>
Not strictly required as we clear HV_X64_ENLIGHTENED_VMCS_RECOMMENDED
above so KVM won't use it, but I can add the check to make the API
better.
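(something like:)

static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
{
	if (!hv_vp_assist_page)
		return NULL;

	return hv_vp_assist_page[cpu];
}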
Thanks,
--
Vitaly
On 15/03/2018 10:56, Vitaly Kuznetsov wrote:
>> + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
Hmm, actually these six are used. I guess
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL is the best we can do, apart from
asking Microsoft to fix the spec.
>>> +{
>>> +	*pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
>>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
>>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>>> + *pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
>> How can these be set?
>>
> They can not if Hyper-V behaves but Radim didn't want to trust it -- so
> the suggestion was to forcefully disable unsupported controls.
Yeah, it's good to have, especially if placed before we start using the
values that are read.
>> This is wrong, we're reading the VMCS so the values must already be
>> sanitized (and if not, that's the bug and we want dump_vmcs to print the
>> "wrong" values).
>
> The problem is that we vmcs_read these fields later in the function and
> this will now WARN(). Initially, there was no WARN() for non-existent
> fields so this could work (we would just print zeroes for unsupported
> fields). Maybe the additional WARN_ON() is not a big deal here.
If you WARN(), isn't it because the secondary_exec_control had a bad
value to begin with? As you say, the controls should never be set.
Thanks,
Paolo
On Thu, 15 Mar 2018, Vitaly Kuznetsov wrote:
> Thomas Gleixner <[email protected]> writes:
> > On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> >> @@ -198,6 +218,12 @@ static int hv_cpu_die(unsigned int cpu)
> >> struct hv_reenlightenment_control re_ctrl;
> >> unsigned int new_cpu;
> >>
> >> + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
> >> + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
> >> + vfree(hv_vp_assist_page[cpu]);
> >> + hv_vp_assist_page[cpu] = NULL;
> >
> > So this is freed before the CPU is actually dead. And this runs in
> > preemtible context. Is the wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); enough to
> > prevent eventual users of the assist page on the outgoing CPU from
> > accessing it?
> >
>
> After we do wrmsrl() the page is no longer 'magic' so in case eventual
> users try using it they'll most likely misbehave -- so changing the
> shutdown order won't help.
>
> The only user of these pages is currently KVM. Can we still have vCPUs
> > running on the outgoing CPU at this point? In case we can, we're in
> trouble and we need to somehow kick them out first.
The first thing we do in unplug is to mark the CPU inactive, but I'm not
sure whether that prevents something which was on the CPU before and
perhaps preempted or is affine to that CPU to be scheduled in
again. Peter????
Thanks,
tglx
On Thu, Mar 15, 2018 at 12:45:03PM +0100, Thomas Gleixner wrote:
> On Thu, 15 Mar 2018, Vitaly Kuznetsov wrote:
> > Thomas Gleixner <[email protected]> writes:
> > > On Fri, 9 Mar 2018, Vitaly Kuznetsov wrote:
> > >> @@ -198,6 +218,12 @@ static int hv_cpu_die(unsigned int cpu)
> > >> struct hv_reenlightenment_control re_ctrl;
> > >> unsigned int new_cpu;
> > >>
> > >> + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
> > >> + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
> > >> + vfree(hv_vp_assist_page[cpu]);
> > >> + hv_vp_assist_page[cpu] = NULL;
> > >
> > > So this is freed before the CPU is actually dead. And this runs in
> > > preemtible context. Is the wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); enough to
> > > prevent eventual users of the assist page on the outgoing CPU from
> > > accessing it?
> > >
> >
> > After we do wrmsrl() the page is no longer 'magic' so in case eventual
> > users try using it they'll most likely misbehave -- so changing the
> > shutdown order won't help.
> >
> > The only user of these pages is currently KVM. Can we still have vCPUs
> > running on the outgoing CPU at this point? In case we can, we're in
> > trouble and we need to somehow kick them out first.
>
> The first thing we do in unplug is to mark the CPU inactive, but I'm not
> sure whether that prevents something which was on the CPU before and
> perhaps preempted or is affine to that CPU to be scheduled in
> again. Peter????
I think we can still have tasks running at this point.
AP_ACTIVE (sched_cpu_deactivate) simply takes the CPU out of the active
mask, which guarantees no new tasks will land on the CPU.
We'll then proceed all the way to TEARDOWN_CPU as 'normal', at which
point we'll call stop_machine() which does the old DYING callbacks.
It sounds like we want this done here, although possibly we can't do
vfree() from that context, in which case it needs to store the pointer
and do that from a BP callback (what used to be the OFFLINE callbacks or
something).
On Thu, 15 Mar 2018, Peter Zijlstra wrote:
> On Thu, Mar 15, 2018 at 12:45:03PM +0100, Thomas Gleixner wrote:
> > On Thu, 15 Mar 2018, Vitaly Kuznetsov wrote:
> > > The only user of these pages is currently KVM. Can we still have vCPUs
> > > running on the outgoing CPU at this point? In case we can, we're in
> > > trouble and we need to somehow kick them out first.
> >
> > The first thing we do in unplug is to mark the CPU inactive, but I'm not
> > sure whether that prevents something which was on the CPU before and
> > perhaps preempted or is affine to that CPU to be scheduled in
> > again. Peter????
>
> I think we can still have tasks running at this point.
>
> AP_ACTIVE (sched_cpu_deactivate) simply takes the CPU out of the active
> mask, which guarantees no new tasks will land on the CPU.
>
> We'll then proceed all the way to TEARDOWN_CPU as 'normal', at which
> point we'll call stop_machine() which does the old DYING callbacks.
>
> It sounds like we want this done here, although possibly we can't do
> vfree() from that context, in which case it needs to store the pointer
> and do that from a BP callback (what used to be the OFFLINE callbacks or
> something).
So the wrmsr() wants to be in the dying range. The vfree() is questionable
anyway because the re-onlining of that CPU will just allocate it again. So
it could very well stay around.
Thanks,
tglx
Paolo Bonzini <[email protected]> writes:
> On 09/03/2018 15:02, Vitaly Kuznetsov wrote:
>> Enlightened VMCS is just a structure in memory, the main benefit
>> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
>> mask: we tell the underlying hypervisor which fields were modified
>> since VMEXIT so there's no need to inspect them all.
>>
>> Tight CPUID loop test shows significant speedup:
>> Before: 18890 cycles
>> After: 8304 cycles
>>
>> Static key is being used to avoid performance penalty for non-Hyper-V
>> deployments. Tests show we add around 3 (three) CPU cycles on each
>> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>> but I don't see a clean way to use static key in assembly.
>
> If you want to live dangerously, you can use text_poke_early to change
> the vmwrite to mov. It's just a single instruction, so it's probably
> not too hard.
It is not:
+#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_X86_64)
+
+/* Luckily, both original and new instructions are of the same length */
+#define EVMCS_RSP_OPCODE_LEN 3
+static void evmcs_patch_vmx_cpu_run(void)
+{
+ u8 *addr;
+ u8 opcode_old[] = {0x0f, 0x79, 0xd4}; // vmwrite rsp, rdx
+ u8 opcode_new[] = {0x48, 0x89, 0x26}; // mov rsp, (rsi)
+
+ /*
+	 * What we're searching for MUST be present in vmx_vcpu_run().
+	 * We replace the first occurrence only.
+ */
+ for (addr = (u8 *)vmx_vcpu_run; ; addr++) {
+ if (!memcmp(addr, opcode_old, EVMCS_RSP_OPCODE_LEN)) {
+ /*
+ * vmx_vcpu_run is not currently running on other CPUs but
+ * using text_poke_early() would require us to do manual
+ * RW remapping of the area.
+ */
+ text_poke(addr, opcode_new, EVMCS_RSP_OPCODE_LEN);
+ break;
+ }
+ }
+}
+#endif
+
text_poke() also needs to be exported.
This works. But hell, this is a crude hack :-) Not sure if there's a
cleaner way to find what needs to be patched without something like jump
label table ...
--
Vitaly
2018-03-15 16:19+0100, Vitaly Kuznetsov:
> Paolo Bonzini <[email protected]> writes:
>
> > On 09/03/2018 15:02, Vitaly Kuznetsov wrote:
> >> Enlightened VMCS is just a structure in memory, the main benefit
> >> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
> >> mask: we tell the underlying hypervisor which fields were modified
> >> since VMEXIT so there's no need to inspect them all.
> >>
> >> Tight CPUID loop test shows significant speedup:
> >> Before: 18890 cycles
> >> After: 8304 cycles
> >>
> >> Static key is being used to avoid performance penalty for non-Hyper-V
> >> deployments. Tests show we add around 3 (three) CPU cycles on each
> >> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
> >> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
> >> but I don't see a clean way to use static key in assembly.
> >
> > If you want to live dangerously, you can use text_poke_early to change
> > the vmwrite to mov. It's just a single instruction, so it's probably
> > not too hard.
>
> It is not:
>
> +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_X86_64)
> +
> +/* Luckily, both original and new instructions are of the same length */
> +#define EVMCS_RSP_OPCODE_LEN 3
> +static void evmcs_patch_vmx_cpu_run(void)
> +{
> + u8 *addr;
> + u8 opcode_old[] = {0x0f, 0x79, 0xd4}; // vmwrite rsp, rdx
> + u8 opcode_new[] = {0x48, 0x89, 0x26}; // mov rsp, (rsi)
> +
> + /*
> +	 * What we're searching for MUST be present in vmx_vcpu_run().
> +	 * We replace the first occurrence only.
> + */
> + for (addr = (u8 *)vmx_vcpu_run; ; addr++) {
> + if (!memcmp(addr, opcode_old, EVMCS_RSP_OPCODE_LEN)) {
> + /*
> + * vmx_vcpu_run is not currently running on other CPUs but
> + * using text_poke_early() would require us to do manual
> + * RW remapping of the area.
> + */
> + text_poke(addr, opcode_new, EVMCS_RSP_OPCODE_LEN);
> + break;
> + }
> + }
> +}
> +#endif
> +
>
> text_poke() also needs to be exported.
>
> This works. But hell, this is a crude hack :-) Not sure if there's a
> cleaner way to find what needs to be patched without something like jump
> label table ...
Yeah, I can see us accidentally patching parts of other instructions. :)
The target instruction address can be made into a C-accessible symbol
with the same trick that vmx_return uses -- add a .global containing the
address of a label (not sure if a more direct approach would work).
The evil in me likes it. (The good is too lazy to add a decent patching
infrastructure for just one user.)
I would be a bit happier if we didn't assume the desired instruction and
therefore put constraints on remote code.
We actually already have mov in the assembly:
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" // here
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
Is there a drawback in switching '%c[host_rsp](%0)' to be a general
memory pointer and putting either &vmx->host_rsp or &current_evmcs->host_rsp
in there?
We could just overwrite ASM_VMX_VMWRITE_RSP_RDX with a nop then. :)
Thanks.
2018-03-15 18:02+0100, Radim Krčmář:
> We actually already have mov in the assembly:
>
> "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
Oh hell, I didn't pay attention to this line before.
> "je 1f \n\t"
> "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" // here
> __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
I bet this path is executed only once in VM's lifetime and what we're
doing is wasting more resources than we're ever going to save ...
> "1: \n\t"
Radim Krčmář <[email protected]> writes:
> 2018-03-15 18:02+0100, Radim Krčmář:
>> We actually already have mov in the assembly:
>>
>> "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
>
> Oh hell, I didn't pay attention to this line before.
>
This is still going to work if we conditionally replace it with a pointer
to the evmcs as you suggested before but ...
>> "je 1f \n\t"
>> "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" // here
>> __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
>
> I bet this path is executed only once in VM's lifetime and what we're
> doing is wasting more resources than we're ever going to save ...
>
yes, we're not gonna save anything...
--
Vitaly
On Thu, 15 Mar 2018, Radim Krčmář wrote:
> 2018-03-15 16:19+0100, Vitaly Kuznetsov:
> > This works. But hell, this is a crude hack :-) Not sure if there's a
> > cleaner way to find what needs to be patched without something like jump
> > label table ...
>
> Yeah, I can see us accidentally patching parts of other instructions. :)
>
> The target instruction address can be made into a C-accessible symbol
> with the same trick that vmx_return uses -- add a .global containing the
> address of a label (not sure if a more direct approach would work).
>
> The evil in me likes it. (The good is too lazy to add a decent patching
> infrastructure for just one user.)
Can we just use jump labels please? There is agreement that 4.17 will have
a dependency on a jump label capable compiler for x86.
Thanks,
tglx
2018-03-15 20:28+0100, Thomas Gleixner:
> On Thu, 15 Mar 2018, Radim Krčmář wrote:
> > 2018-03-15 16:19+0100, Vitaly Kuznetsov:
> > > This works. But hell, this is a crude hack :-) Not sure if there's a
> > > cleaner way to find what needs to be patched without something like jump
> > > label table ...
> >
> > Yeah, I can see us accidentally patching parts of other instructions. :)
> >
> > The target instruction address can be made into a C-accessible symbol
> > with the same trick that vmx_return uses -- add a .global containing the
> > address of a label (not sure if a more direct approach would work).
> >
> > The evil in me likes it. (The good is too lazy to add a decent patching
> > infrastructure for just one user.)
>
> Can we just use jump labels please? There is agreement that 4.17 will have
> a dependency on a jump label capable compiler for x86.
Luckily, it turned out that the path is very cold and should use the
simple test-and-jump.