This series allows guests to use the MSR_IA32_SPEC_CTRL and
MSR_IA32_PRED_CMD model-specific registers that were added as mitigations
for CVE-2017-5715.
These are only the KVM-specific parts of the fix. They do *not* yet
include any protection against the guest reading host memory, because
that will be done in the same way as in the rest of Linux. So there is
no IBRS *usage* here, no retpolines, no stuffing of the return stack buffer.
(KVM already includes a fix to clear all registers on vmexit, which is
enough to block Google Project Zero's PoC exploit).
However, I am including the changes to use IBPB (indirect branch
prediction barrier) if available. The barrier is issued only when a
physical CPU switches to a different VCPU, so the performance impact is small.
The patches are a bit hackish because the relevant cpufeature bits have
not been upstreamed yet, and because I wanted to make them easier to
backport to distro kernels if desired, but I would still like to have
them in 4.16.
Please review.
Thanks,
Paolo
Paolo Bonzini (5):
KVM: x86: add SPEC_CTRL and IBPB_SUPPORT accessors
x86/msr: add definitions for indirect branch predictor MSRs
kvm: vmx: pass MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD down to the guest
kvm: svm: pass MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD down to guest
KVM: x86: add SPEC_CTRL and IBPB_SUPPORT to MSR and CPUID lists
Tim Chen (1):
kvm: vmx: Set IBPB when running a different VCPU
Tom Lendacky (1):
x86/svm: Set IBPB when running a different VCPU
arch/x86/include/asm/msr-index.h | 5 ++++
arch/x86/kvm/cpuid.c | 27 +++++++++++++----
arch/x86/kvm/cpuid.h | 22 ++++++++++++++
arch/x86/kvm/svm.c | 65 +++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/vmx.c | 41 +++++++++++++++++++++++++
arch/x86/kvm/x86.c | 1 +
6 files changed, 154 insertions(+), 7 deletions(-)
--
1.8.3.1
KVM will start using them soon.
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/msr-index.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 03ffde6217d0..ec08f1d8d39b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -39,6 +39,11 @@
/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_SPEC_CTRL 0x00000048
+
+#define MSR_IA32_PRED_CMD 0x00000049
+#define FEATURE_SET_IBPB (1UL << 0)
+
#define MSR_PPIN_CTL 0x0000004e
#define MSR_PPIN 0x0000004f
--
1.8.3.1
Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
it yet).
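In short, the fast path around VM entry/exit becomes (a condensed sketch
of the vmx_vcpu_run() hunks below):

	if (have_spec_ctrl && vmx->spec_ctrl != 0)
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

	/* ... VMLAUNCH/VMRESUME and VM exit ... */

	if (have_spec_ctrl) {
		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
		if (vmx->spec_ctrl)
			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
	}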
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 669f5f74857d..d00bcad7336e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -120,6 +120,8 @@
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
+static bool __read_mostly have_spec_ctrl;
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -609,6 +611,8 @@ struct vcpu_vmx {
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
+ u64 spec_ctrl;
+
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
u32 secondary_exec_control;
@@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_SPEC_CTRL:
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ to_vmx(vcpu)->spec_ctrl = msr_info->data;
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
goto out;
}
+ /*
+ * FIXME: this is only needed until SPEC_CTRL is supported
+ * by upstream Linux in cpufeatures, then it can be replaced
+ * with static_cpu_has.
+ */
+ have_spec_ctrl = cpu_has_spec_ctrl();
+ if (have_spec_ctrl)
+ pr_info("kvm: SPEC_CTRL available\n");
+ else
+ pr_info("kvm: SPEC_CTRL not available\n");
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
pt_guest_enter(vmx);
+ if (have_spec_ctrl && vmx->spec_ctrl != 0)
+ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
atomic_switch_perf_msrs(vmx);
vmx_arm_hv_timer(vcpu);
@@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ if (have_spec_ctrl) {
+ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+ }
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
update_debugctlmsr(vmx->host_debugctlmsr);
--
1.8.3.1
Expose them to userspace, now that guests can use them.
I am not adding cpufeatures here to avoid having a kernel
that shows spec_ctrl in /proc/cpuinfo and actually has no
support whatsoever for IBRS/IBPB. Keep the ugly special-casing
for now, and clean it up once the generic arch/x86/ code
learns about them.
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/cpuid.c | 24 +++++++++++++++++++++---
arch/x86/kvm/x86.c | 1 +
2 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 767af697c20c..5f43a5940275 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -397,7 +397,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
+ KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) |
+ KF(SPEC_CTRL) | KF(STIBP);
+
+ /* cpuid 0x80000008.ebx */
+ const u32 kvm_cpuid_80000008_ebx_x86_features =
+ KF(IBPB_SUPPORT);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -483,7 +488,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
- entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
+ /*
+ * FIXME: the special casing of SPEC_CTRL and STIBP
+ * can be removed once they become regular
+ * cpufeatures.
+ */
+ entry->edx &= (
+ get_scattered_cpuid_leaf(7, 0, CPUID_EDX) |
+ KF(SPEC_CTRL) | KF(STIBP));
} else {
entry->ebx = 0;
entry->ecx = 0;
@@ -651,7 +663,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
if (!g_phys_as)
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
- entry->ebx = entry->edx = 0;
+ /*
+ * FIXME: mask against cpufeatures, with
+ * get_scattered_cpuid_leaf(0x80000008, 0, CPUID_EBX),
+ * once IBPB_SUPPORT becomes a regular cpufeature.
+ */
+ entry->ebx &= kvm_cpuid_80000008_ebx_x86_features;
+ entry->edx = 0;
break;
}
case 0x80000019:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index daa1918031df..4abb37d9f4d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1032,6 +1032,7 @@ unsigned int kvm_get_pt_addr_cnt(void)
MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_SPEC_CTRL,
};
static unsigned num_msrs_to_save;
--
1.8.3.1
From: Tom Lendacky <[email protected]>
Set IBPB (Indirect Branch Prediction Barrier) when the current CPU is
going to run a VCPU different from what was previously run. Nested
virtualization uses the same VMCB for the second level guest, but the
L1 hypervisor should be using IBRS to protect itself.
Signed-off-by: Tom Lendacky <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/svm.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 779981a00d01..dd744d896cec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -289,6 +289,7 @@ struct amd_svm_iommu_ir {
module_param(vgif, int, 0444);
static bool __read_mostly have_spec_ctrl;
+static bool __read_mostly have_ibpb_support;
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
@@ -540,6 +541,7 @@ struct svm_cpu_data {
struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
+ struct vmcb *current_vmcb;
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -1151,6 +1153,11 @@ static __init int svm_hardware_setup(void)
pr_info("kvm: SPEC_CTRL available\n");
else
pr_info("kvm: SPEC_CTRL not available\n");
+ have_ibpb_support = have_spec_ctrl || cpu_has_ibpb_support();
+ if (have_ibpb_support)
+ pr_info("kvm: IBPB_SUPPORT available\n");
+ else
+ pr_info("kvm: IBPB_SUPPORT not available\n");
return 0;
@@ -1725,11 +1732,19 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
+
+ /*
+ * The VMCB could be recycled, causing a false negative in
+ * svm_vcpu_load; block speculative execution.
+ */
+ if (have_ibpb_support)
+ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int i;
if (unlikely(cpu != vcpu->cpu)) {
@@ -1758,6 +1773,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+ if (have_ibpb_support)
+ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
+ }
+
avic_vcpu_load(vcpu, cpu);
}
@@ -2798,6 +2819,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_vmcb)
return 1;
+ /*
+ * No need for IBPB here, the L1 hypervisor should be running with
+ * IBRS=1 and inserts one already when switching L2 VMs.
+ */
+
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
svm->nested.vmcb = 0;
@@ -3061,6 +3087,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (!nested_vmcb)
return false;
+ /*
+ * No need for IBPB here, since the nested VM is less privileged. The
+ * L1 hypervisor inserts one already when switching L2 VMs.
+ */
+
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code = SVM_EXIT_ERR;
nested_vmcb->control.exit_code_hi = 0;
--
1.8.3.1
From: Tim Chen <[email protected]>
Ensure an IBPB (Indirect branch prediction barrier) before every VCPU
switch.
Signed-off-by: Tim Chen <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d00bcad7336e..bf127c570675 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2375,6 +2375,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
+ if (have_spec_ctrl)
+ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
}
if (!already_loaded) {
@@ -4029,6 +4031,13 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL;
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+
+ /*
+ * The VMCS could be recycled, causing a false negative in
+ * vmx_vcpu_load; block speculative execution.
+ */
+ if (have_spec_ctrl)
+ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
}
static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
--
1.8.3.1
Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
it yet).
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/svm.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 55dde3896898..779981a00d01 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -183,6 +183,8 @@ struct vcpu_svm {
u64 gs_base;
} host;
+ u64 spec_ctrl;
+
u32 *msrpm;
ulong nmi_iret_rip;
@@ -236,7 +238,7 @@ struct amd_svm_iommu_ir {
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
- bool always; /* True if intercept is always on */
+ bool always; /* True if intercept is always off */
} direct_access_msrs[] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
@@ -248,6 +250,8 @@ struct amd_svm_iommu_ir {
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = true },
+ { .index = MSR_IA32_PRED_CMD, .always = true },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -283,6 +287,8 @@ struct amd_svm_iommu_ir {
/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);
+
+static bool __read_mostly have_spec_ctrl;
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
@@ -1135,6 +1141,17 @@ static __init int svm_hardware_setup(void)
pr_info("Virtual GIF supported\n");
}
+ /*
+ * FIXME: this is only needed until SPEC_CTRL is supported
+ * by upstream Linux in cpufeatures, then it can be replaced
+ * with static_cpu_has.
+ */
+ have_spec_ctrl = cpu_has_spec_ctrl();
+ if (have_spec_ctrl)
+ pr_info("kvm: SPEC_CTRL available\n");
+ else
+ pr_info("kvm: SPEC_CTRL not available\n");
+
return 0;
err:
@@ -3599,6 +3616,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+ case MSR_IA32_SPEC_CTRL:
+ msr_info->data = svm->spec_ctrl;
+ break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
@@ -3754,6 +3774,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_SPEC_CTRL:
+ svm->spec_ctrl = data;
+ break;
case MSR_IA32_APICBASE:
if (kvm_vcpu_apicv_active(vcpu))
avic_update_vapic_bar(to_svm(vcpu), data);
@@ -4942,6 +4965,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
+ if (have_spec_ctrl && svm->spec_ctrl != 0)
+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
@@ -5015,6 +5041,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ if (have_spec_ctrl) {
+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+ if (svm->spec_ctrl != 0)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+ }
+
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
--
1.8.3.1
As an interim measure until SPEC_CTRL is supported by upstream
Linux in cpufeatures, add a function that lets vmx.c and svm.c
know whether to save/restore MSR_IA32_SPEC_CTRL.
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/cpuid.c | 3 ---
arch/x86/kvm/cpuid.h | 22 ++++++++++++++++++++++
2 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8e9a07c557f1..767af697c20c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void)
#define F(x) bit(X86_FEATURE_##x)
-/* These are scattered features in cpufeatures.h. */
-#define KVM_CPUID_BIT_AVX512_4VNNIW 2
-#define KVM_CPUID_BIT_AVX512_4FMAPS 3
#define KF(x) bit(KVM_CPUID_BIT_##x)
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c2cea6651279..8d04ccf177ce 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -155,6 +155,28 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
return x86_stepping(best->eax);
}
+/* These are scattered features in cpufeatures.h. */
+#define KVM_CPUID_BIT_AVX512_4VNNIW 2
+#define KVM_CPUID_BIT_AVX512_4FMAPS 3
+#define KVM_CPUID_BIT_SPEC_CTRL 26
+#define KVM_CPUID_BIT_STIBP 27
+
+/* CPUID[eax=0x80000008].ebx */
+#define KVM_CPUID_BIT_IBPB_SUPPORT 12
+
+static inline bool cpu_has_spec_ctrl(void)
+{
+ u32 eax, ebx, ecx, edx;
+ cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+
+ return edx & bit(KVM_CPUID_BIT_SPEC_CTRL);
+}
+
+static inline bool cpu_has_ibpb_support(void)
+{
+ return cpuid_ebx(0x80000008) & bit(KVM_CPUID_BIT_IBPB_SUPPORT);
+}
+
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
{
return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
--
1.8.3.1
On Mon, Jan 08, 2018 at 07:08:39PM +0100, Paolo Bonzini wrote:
> As an interim measure until SPEC_CTRL is supported by upstream
> Linux in cpufeatures, add a function that lets vmx.c and svm.c
> know whether to save/restore MSR_IA32_SPEC_CTRL.
Reviewed-by: Konrad Rzeszutek Wilk <[email protected]>
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/cpuid.c | 3 ---
> arch/x86/kvm/cpuid.h | 22 ++++++++++++++++++++++
> 2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 8e9a07c557f1..767af697c20c 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void)
>
> #define F(x) bit(X86_FEATURE_##x)
>
> -/* These are scattered features in cpufeatures.h. */
> -#define KVM_CPUID_BIT_AVX512_4VNNIW 2
> -#define KVM_CPUID_BIT_AVX512_4FMAPS 3
> #define KF(x) bit(KVM_CPUID_BIT_##x)
>
> int kvm_update_cpuid(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index c2cea6651279..8d04ccf177ce 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -155,6 +155,28 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
> return x86_stepping(best->eax);
> }
>
> +/* These are scattered features in cpufeatures.h. */
> +#define KVM_CPUID_BIT_AVX512_4VNNIW 2
> +#define KVM_CPUID_BIT_AVX512_4FMAPS 3
> +#define KVM_CPUID_BIT_SPEC_CTRL 26
> +#define KVM_CPUID_BIT_STIBP 27
> +
> +/* CPUID[eax=0x80000008].ebx */
> +#define KVM_CPUID_BIT_IBPB_SUPPORT 12
> +
> +static inline bool cpu_has_spec_ctrl(void)
> +{
> + u32 eax, ebx, ecx, edx;
> + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
> +
> + return edx & bit(KVM_CPUID_BIT_SPEC_CTRL);
> +}
> +
> +static inline bool cpu_has_ibpb_support(void)
> +{
> + return cpuid_ebx(0x80000008) & bit(KVM_CPUID_BIT_IBPB_SUPPORT);
> +}
> +
> static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
> {
> return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
> --
> 1.8.3.1
>
>
On Mon, Jan 08, 2018 at 07:08:40PM +0100, Paolo Bonzini wrote:
> KVM will start using them soon.
Perhaps include a bit of description?
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/include/asm/msr-index.h | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 03ffde6217d0..ec08f1d8d39b 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -39,6 +39,11 @@
>
> /* Intel MSRs. Some also available on other CPUs */
>
> +#define MSR_IA32_SPEC_CTRL 0x00000048
> +
> +#define MSR_IA32_PRED_CMD 0x00000049
> +#define FEATURE_SET_IBPB (1UL << 0)
> +
> #define MSR_PPIN_CTL 0x0000004e
> #define MSR_PPIN 0x0000004f
>
> --
> 1.8.3.1
>
>
On Mon, Jan 08, 2018 at 07:08:41PM +0100, Paolo Bonzini wrote:
> Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
> for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
> IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
> it yet).
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
> 1 file changed, 32 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 669f5f74857d..d00bcad7336e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -120,6 +120,8 @@
> module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
> #endif
>
> +static bool __read_mostly have_spec_ctrl;
> +
> #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
> #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
> #define KVM_VM_CR0_ALWAYS_ON \
> @@ -609,6 +611,8 @@ struct vcpu_vmx {
> u64 msr_host_kernel_gs_base;
> u64 msr_guest_kernel_gs_base;
> #endif
> + u64 spec_ctrl;
> +
> u32 vm_entry_controls_shadow;
> u32 vm_exit_controls_shadow;
> u32 secondary_exec_control;
> @@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> msr_info->data = guest_read_tsc(vcpu);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + msr_info->data = to_vmx(vcpu)->spec_ctrl;
> + break;
> case MSR_IA32_SYSENTER_CS:
> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> break;
> @@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> kvm_write_tsc(vcpu, msr_info);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + to_vmx(vcpu)->spec_ctrl = msr_info->data;
> + break;
> case MSR_IA32_CR_PAT:
> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> @@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
> goto out;
> }
>
> + /*
> + * FIXME: this is only needed until SPEC_CTRL is supported
> + * by upstream Linux in cpufeatures, then it can be replaced
> + * with static_cpu_has.
> + */
> + have_spec_ctrl = cpu_has_spec_ctrl();
> + if (have_spec_ctrl)
> + pr_info("kvm: SPEC_CTRL available\n");
> + else
> + pr_info("kvm: SPEC_CTRL not available\n");
> +
> if (boot_cpu_has(X86_FEATURE_NX))
> kvm_enable_efer_bits(EFER_NX);
>
> @@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>
> memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
> vmx_msr_bitmap_legacy, PAGE_SIZE);
> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>
> pt_guest_enter(vmx);
>
> + if (have_spec_ctrl && vmx->spec_ctrl)
> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +
Shouldn't this be as close to the assembler code as possible?
> atomic_switch_perf_msrs(vmx);
>
> vmx_arm_hv_timer(vcpu);
I don't really understand the organization of this file, but I put
these MSRs in the /* Intel defined MSRs. */ block, between
MSR_IA32_TSC_ADJUST and MSR_IA32_BNDCFGS.
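I.e. roughly (the neighboring defines are quoted here just for context):

	#define MSR_IA32_TSC_ADJUST		0x0000003b
	#define MSR_IA32_SPEC_CTRL		0x00000048
	#define MSR_IA32_PRED_CMD		0x00000049
	#define MSR_IA32_BNDCFGS		0x00000d90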
On Mon, Jan 8, 2018 at 10:35 AM, Konrad Rzeszutek Wilk
<[email protected]> wrote:
> On Mon, Jan 08, 2018 at 07:08:40PM +0100, Paolo Bonzini wrote:
>> KVM will start using them soon.
>
> Perhaps include a bit of description?
>>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/include/asm/msr-index.h | 5 +++++
>> 1 file changed, 5 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
>> index 03ffde6217d0..ec08f1d8d39b 100644
>> --- a/arch/x86/include/asm/msr-index.h
>> +++ b/arch/x86/include/asm/msr-index.h
>> @@ -39,6 +39,11 @@
>>
>> /* Intel MSRs. Some also available on other CPUs */
>>
>> +#define MSR_IA32_SPEC_CTRL 0x00000048
>> +
>> +#define MSR_IA32_PRED_CMD 0x00000049
>> +#define FEATURE_SET_IBPB (1UL << 0)
>> +
>> #define MSR_PPIN_CTL 0x0000004e
>> #define MSR_PPIN 0x0000004f
>>
>> --
>> 1.8.3.1
>>
>>
On 08/01/18 20:08, Paolo Bonzini wrote:
> KVM will start using them soon.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/include/asm/msr-index.h | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 03ffde6217d0..ec08f1d8d39b 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -39,6 +39,11 @@
>
> /* Intel MSRs. Some also available on other CPUs */
>
> +#define MSR_IA32_SPEC_CTRL 0x00000048
> +
> +#define MSR_IA32_PRED_CMD 0x00000049
> +#define FEATURE_SET_IBPB (1UL << 0)
> +
> #define MSR_PPIN_CTL 0x0000004e
> #define MSR_PPIN 0x0000004f
>
>
Trivially,
Reviewed-by: Liran Alon <[email protected]>
On 08/01/18 20:08, Paolo Bonzini wrote:
> As an interim measure until SPEC_CTRL is supported by upstream
> Linux in cpufeatures, add a function that lets vmx.c and svm.c
> know whether to save/restore MSR_IA32_SPEC_CTRL.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/cpuid.c | 3 ---
> arch/x86/kvm/cpuid.h | 22 ++++++++++++++++++++++
> 2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 8e9a07c557f1..767af697c20c 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void)
>
> #define F(x) bit(X86_FEATURE_##x)
>
> -/* These are scattered features in cpufeatures.h. */
> -#define KVM_CPUID_BIT_AVX512_4VNNIW 2
> -#define KVM_CPUID_BIT_AVX512_4FMAPS 3
> #define KF(x) bit(KVM_CPUID_BIT_##x)
>
> int kvm_update_cpuid(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index c2cea6651279..8d04ccf177ce 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -155,6 +155,28 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
> return x86_stepping(best->eax);
> }
>
> +/* These are scattered features in cpufeatures.h. */
> +#define KVM_CPUID_BIT_AVX512_4VNNIW 2
> +#define KVM_CPUID_BIT_AVX512_4FMAPS 3
> +#define KVM_CPUID_BIT_SPEC_CTRL 26
> +#define KVM_CPUID_BIT_STIBP 27
> +
> +/* CPUID[eax=0x80000008].ebx */
> +#define KVM_CPUID_BIT_IBPB_SUPPORT 12
> +
> +static inline bool cpu_has_spec_ctrl(void)
> +{
> + u32 eax, ebx, ecx, edx;
> + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
> +
> + return edx & bit(KVM_CPUID_BIT_SPEC_CTRL);
Why not just "return cpuid_edx(7) & bit(KVM_CPUID_BIT_SPEC_CTRL);"?
This is also consistent with how cpu_has_ibpb_support() is written.
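I.e. something like (untested):

	static inline bool cpu_has_spec_ctrl(void)
	{
		return cpuid_edx(7) & bit(KVM_CPUID_BIT_SPEC_CTRL);
	}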
> +}
> +
> +static inline bool cpu_has_ibpb_support(void)
> +{
> + return cpuid_ebx(0x80000008) & bit(KVM_CPUID_BIT_IBPB_SUPPORT);
> +}
> +
> static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
> {
> return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
>
Guest usage of MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD should be
predicated on guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL).
On Mon, Jan 8, 2018 at 10:08 AM, Paolo Bonzini <[email protected]> wrote:
> Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
> for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
> IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
> it yet).
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
> 1 file changed, 32 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 669f5f74857d..d00bcad7336e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -120,6 +120,8 @@
> module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
> #endif
>
> +static bool __read_mostly have_spec_ctrl;
> +
> #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
> #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
> #define KVM_VM_CR0_ALWAYS_ON \
> @@ -609,6 +611,8 @@ struct vcpu_vmx {
> u64 msr_host_kernel_gs_base;
> u64 msr_guest_kernel_gs_base;
> #endif
> + u64 spec_ctrl;
> +
> u32 vm_entry_controls_shadow;
> u32 vm_exit_controls_shadow;
> u32 secondary_exec_control;
> @@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> msr_info->data = guest_read_tsc(vcpu);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + msr_info->data = to_vmx(vcpu)->spec_ctrl;
> + break;
I have:
	if (!have_spec_ctrl ||
	    (!msr_info->host_initiated &&
	     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
		return 1;
	msr_info->data = to_vmx(vcpu)->msr_ia32_spec_ctrl;
	break;
> case MSR_IA32_SYSENTER_CS:
> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> break;
> @@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> kvm_write_tsc(vcpu, msr_info);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + to_vmx(vcpu)->spec_ctrl = msr_info->data;
> + break;
I have:
	if (!have_spec_ctrl ||
	    (!msr_info->host_initiated &&
	     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
		return 1;
	to_vmx(vcpu)->msr_ia32_spec_ctrl = data;
	break;
> case MSR_IA32_CR_PAT:
> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> @@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
> goto out;
> }
>
> + /*
> + * FIXME: this is only needed until SPEC_CTRL is supported
> + * by upstream Linux in cpufeatures, then it can be replaced
> + * with static_cpu_has.
> + */
> + have_spec_ctrl = cpu_has_spec_ctrl();
> + if (have_spec_ctrl)
> + pr_info("kvm: SPEC_CTRL available\n");
> + else
> + pr_info("kvm: SPEC_CTRL not available\n");
> +
> if (boot_cpu_has(X86_FEATURE_NX))
> kvm_enable_efer_bits(EFER_NX);
>
> @@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
I have a lot of changes to MSR permission bitmap handling, but these
intercepts should only be disabled when guest_cpuid_has(vcpu,
X86_FEATURE_SPEC_CTRL).
> memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
> vmx_msr_bitmap_legacy, PAGE_SIZE);
> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>
> pt_guest_enter(vmx);
>
> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +
Here, I have:
	/*
	 * If the guest is allowed to write to MSR_IA32_SPEC_CTRL,
	 * store it on VM-exit.
	 */
	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
		add_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
	else
		clear_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);

	/*
	 * If the guest's IA32_SPEC_CTRL MSR doesn't match the host's
	 * IA32_SPEC_CTRL MSR, then add the MSR to the atomic switch
	 * MSRs, so that the guest value will be loaded on VM-entry
	 * and the host value will be loaded on VM-exit.
	 */
	if (vmx->msr_ia32_spec_ctrl != spec_ctrl_enabled())
		add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
				      vmx->msr_ia32_spec_ctrl,
				      spec_ctrl_enabled());
	else
		clear_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL);
> atomic_switch_perf_msrs(vmx);
>
> vmx_arm_hv_timer(vcpu);
> @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> #endif
> );
>
> + if (have_spec_ctrl) {
> + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> + if (vmx->spec_ctrl)
> + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> + }
> +
I know the VM-exit MSR load and store lists are probably slower, but
I'm a little uncomfortable restoring the host's IA32_SPEC_CTRL MSR
late if the guest has it clear and the host has it set.
> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
> if (vmx->host_debugctlmsr)
> update_debugctlmsr(vmx->host_debugctlmsr);
> --
> 1.8.3.1
>
>
On 08/01/18 20:08, Paolo Bonzini wrote:
> Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
> for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
> IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
> it yet).
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
> 1 file changed, 32 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 669f5f74857d..d00bcad7336e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -120,6 +120,8 @@
> module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
> #endif
>
> +static bool __read_mostly have_spec_ctrl;
> +
> #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
> #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
> #define KVM_VM_CR0_ALWAYS_ON \
> @@ -609,6 +611,8 @@ struct vcpu_vmx {
> u64 msr_host_kernel_gs_base;
> u64 msr_guest_kernel_gs_base;
> #endif
> + u64 spec_ctrl;
> +
> u32 vm_entry_controls_shadow;
> u32 vm_exit_controls_shadow;
> u32 secondary_exec_control;
> @@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> msr_info->data = guest_read_tsc(vcpu);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + msr_info->data = to_vmx(vcpu)->spec_ctrl;
> + break;
> case MSR_IA32_SYSENTER_CS:
> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> break;
> @@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> kvm_write_tsc(vcpu, msr_info);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + to_vmx(vcpu)->spec_ctrl = msr_info->data;
> + break;
> case MSR_IA32_CR_PAT:
> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> @@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
> goto out;
> }
>
> + /*
> + * FIXME: this is only needed until SPEC_CTRL is supported
> + * by upstream Linux in cpufeatures, then it can be replaced
> + * with static_cpu_has.
> + */
> + have_spec_ctrl = cpu_has_spec_ctrl();
> + if (have_spec_ctrl)
> + pr_info("kvm: SPEC_CTRL available\n");
> + else
> + pr_info("kvm: SPEC_CTRL not available\n");
> +
> if (boot_cpu_has(X86_FEATURE_NX))
> kvm_enable_efer_bits(EFER_NX);
>
> @@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>
> memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
> vmx_msr_bitmap_legacy, PAGE_SIZE);
> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>
> pt_guest_enter(vmx);
>
> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +
Intel specifies that the WRMSR restoring the guest value of
MSR_IA32_SPEC_CTRL should be done after the last indirect branch before
VM entry. However, atomic_switch_perf_msrs() calls perf_guest_get_msrs(),
which calls x86_pmu.guest_get_msrs(), which is an indirect branch.
Therefore, it seems that this block should be moved after the call to
vmx_arm_hv_timer().
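I.e. something like this (just moving the hunk above):

	atomic_switch_perf_msrs(vmx);

	vmx_arm_hv_timer(vcpu);

	if (have_spec_ctrl && vmx->spec_ctrl != 0)
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);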
> atomic_switch_perf_msrs(vmx);
>
> vmx_arm_hv_timer(vcpu);
> @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> #endif
> );
>
> + if (have_spec_ctrl) {
> + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> + if (vmx->spec_ctrl)
> + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> + }
> +
> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
> if (vmx->host_debugctlmsr)
> update_debugctlmsr(vmx->host_debugctlmsr);
>
On 08/01/18 20:08, Paolo Bonzini wrote:
> From: Tim Chen <[email protected]>
>
> Ensure an IBPB (Indirect branch prediction barrier) before every VCPU
> switch.
>
> Signed-off-by: Tim Chen <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d00bcad7336e..bf127c570675 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2375,6 +2375,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
> per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
> vmcs_load(vmx->loaded_vmcs->vmcs);
> + if (have_spec_ctrl)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> if (!already_loaded) {
> @@ -4029,6 +4031,13 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
> free_vmcs(loaded_vmcs->vmcs);
> loaded_vmcs->vmcs = NULL;
> WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
> +
> + /*
> + * The VMCS could be recycled, causing a false negative in
> + * vmx_vcpu_load; block speculative execution.
> + */
> + if (have_spec_ctrl)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
>
Reviewed-by: Liran Alon <[email protected]>
Shouldn't there be an IBPB on *any* context switch away from a VCPU
thread, even if it is to a non-VCPU thread?
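(To illustrate what I mean, not as a concrete proposal: something like
issuing the barrier on the vcpu_put/sched-out path,

	/* hypothetical placement, not part of this series */
	if (have_spec_ctrl)
		wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);

where exactly such a hook would live is part of the question.)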
On Mon, Jan 8, 2018 at 10:08 AM, Paolo Bonzini <[email protected]> wrote:
> From: Tim Chen <[email protected]>
>
> Ensure an IBPB (Indirect branch prediction barrier) before every VCPU
> switch.
>
> Signed-off-by: Tim Chen <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d00bcad7336e..bf127c570675 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2375,6 +2375,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
> per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
> vmcs_load(vmx->loaded_vmcs->vmcs);
> + if (have_spec_ctrl)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> if (!already_loaded) {
> @@ -4029,6 +4031,13 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
> free_vmcs(loaded_vmcs->vmcs);
> loaded_vmcs->vmcs = NULL;
> WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
> +
> + /*
> + * The VMCS could be recycled, causing a false negative in
> + * vmx_vcpu_load; block speculative execution.
> + */
> + if (have_spec_ctrl)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
> --
> 1.8.3.1
>
>
On Mon, 2018-01-08 at 19:08 +0100, Paolo Bonzini wrote:
>
> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +
I think this one probably *is* safe even without an 'else lfence',
which means that the CPU can speculate around it, but it wants a
comment explaining that someone has properly analysed it and saying
precisely why.
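For clarity, the fully fenced variant I am referring to would be
something like (sketch only):

	if (have_spec_ctrl && vmx->spec_ctrl != 0)
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
	else
		asm volatile("lfence" ::: "memory");	/* don't speculate past the check */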
On 08/01/18 20:08, Paolo Bonzini wrote:
> Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
> for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
> IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
> it yet).
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/svm.c | 34 +++++++++++++++++++++++++++++++++-
> 1 file changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 55dde3896898..779981a00d01 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -183,6 +183,8 @@ struct vcpu_svm {
> u64 gs_base;
> } host;
>
> + u64 spec_ctrl;
> +
> u32 *msrpm;
>
> ulong nmi_iret_rip;
> @@ -236,7 +238,7 @@ struct amd_svm_iommu_ir {
>
> static const struct svm_direct_access_msrs {
> u32 index; /* Index of the MSR */
> - bool always; /* True if intercept is always on */
> + bool always; /* True if intercept is always off */
Examining svm_vcpu_init_msrpm(), you seem to be correct: if always==true,
the read/write intercept bits are cleared in the msrpm (which passes the
MSR through to the guest).
However, I do think this comment change should be done in a separate
commit, as it is unrelated to the rest of the patch.
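For reference, the pass-through logic in svm_vcpu_init_msrpm() that I am
referring to is roughly this (quoting from memory, so double-check the
exact code):

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;
		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
	}

i.e. always==true means the read/write intercepts are always cleared, so
the MSR is always passed through.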
> } direct_access_msrs[] = {
> { .index = MSR_STAR, .always = true },
> { .index = MSR_IA32_SYSENTER_CS, .always = true },
> @@ -248,6 +250,8 @@ struct amd_svm_iommu_ir {
> { .index = MSR_CSTAR, .always = true },
> { .index = MSR_SYSCALL_MASK, .always = true },
> #endif
> + { .index = MSR_IA32_SPEC_CTRL, .always = true },
> + { .index = MSR_IA32_PRED_CMD, .always = true },
> { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
> { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
> { .index = MSR_IA32_LASTINTFROMIP, .always = false },
> @@ -283,6 +287,8 @@ struct amd_svm_iommu_ir {
> /* enable/disable Virtual GIF */
> static int vgif = true;
> module_param(vgif, int, 0444);
> +
> +static bool __read_mostly have_spec_ctrl;
>
> static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
> static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
> @@ -1135,6 +1141,17 @@ static __init int svm_hardware_setup(void)
> pr_info("Virtual GIF supported\n");
> }
>
> + /*
> + * FIXME: this is only needed until SPEC_CTRL is supported
> + * by upstream Linux in cpufeatures, then it can be replaced
> + * with static_cpu_has.
> + */
> + have_spec_ctrl = cpu_has_spec_ctrl();
> + if (have_spec_ctrl)
> + pr_info("kvm: SPEC_CTRL available\n");
> + else
> + pr_info("kvm: SPEC_CTRL not available\n");
> +
> return 0;
>
> err:
> @@ -3599,6 +3616,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_VM_CR:
> msr_info->data = svm->nested.vm_cr_msr;
> break;
> + case MSR_IA32_SPEC_CTRL:
> + msr_info->data = svm->spec_ctrl;
> + break;
> case MSR_IA32_UCODE_REV:
> msr_info->data = 0x01000065;
> break;
> @@ -3754,6 +3774,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
> case MSR_VM_IGNNE:
> vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + svm->spec_ctrl = data;
> + break;
> case MSR_IA32_APICBASE:
> if (kvm_vcpu_apicv_active(vcpu))
> avic_update_vapic_bar(to_svm(vcpu), data);
> @@ -4942,6 +4965,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
>
> local_irq_enable();
>
> + if (have_spec_ctrl && svm->spec_ctrl != 0)
> + wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
> +
> asm volatile (
> "push %%" _ASM_BP "; \n\t"
> "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
> @@ -5015,6 +5041,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
> #endif
> );
>
> + if (have_spec_ctrl) {
> + rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
> + if (svm->spec_ctrl != 0)
Very small stuff, but I hate inconsistencies:
either use (svm->spec_ctrl) here, like in arch/x86/kvm/vmx.c, or use
(vmx->spec_ctrl != 0) there.
> + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> + }
> +
> #ifdef CONFIG_X86_64
> wrmsrl(MSR_GS_BASE, svm->host.gs_base);
> #else
>
On 08/01/18 20:08, Paolo Bonzini wrote:
> From: Tom Lendacky <[email protected]>
>
> Set IBPB (Indirect Branch Prediction Barrier) when the current CPU is
> going to run a VCPU different from what was previously run. Nested
> virtualization uses the same VMCB for the second level guest, but the
> L1 hypervisor should be using IBRS to protect itself.
>
> Signed-off-by: Tom Lendacky <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/svm.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 779981a00d01..dd744d896cec 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -289,6 +289,7 @@ struct amd_svm_iommu_ir {
> module_param(vgif, int, 0444);
>
> static bool __read_mostly have_spec_ctrl;
> +static bool __read_mostly have_ibpb_support;
>
> static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
> static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
> @@ -540,6 +541,7 @@ struct svm_cpu_data {
> struct kvm_ldttss_desc *tss_desc;
>
> struct page *save_area;
> + struct vmcb *current_vmcb;
> };
>
> static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
> @@ -1151,6 +1153,11 @@ static __init int svm_hardware_setup(void)
> pr_info("kvm: SPEC_CTRL available\n");
> else
> pr_info("kvm: SPEC_CTRL not available\n");
> + have_ibpb_support = have_spec_ctrl || cpu_has_ibpb_support();
> + if (have_ibpb_support)
> + pr_info("kvm: IBPB_SUPPORT available\n");
> + else
> + pr_info("kvm: IBPB_SUPPORT not available\n");
>
> return 0;
>
> @@ -1725,11 +1732,19 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> kvm_vcpu_uninit(vcpu);
> kmem_cache_free(kvm_vcpu_cache, svm);
> +
> + /*
> + * The VMCB could be recycled, causing a false negative in
> + * svm_vcpu_load; block speculative execution.
> + */
> + if (have_ibpb_support)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
> int i;
>
> if (unlikely(cpu != vcpu->cpu)) {
> @@ -1758,6 +1773,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> if (static_cpu_has(X86_FEATURE_RDTSCP))
> wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
>
> + if (sd->current_vmcb != svm->vmcb) {
> + sd->current_vmcb = svm->vmcb;
> + if (have_ibpb_support)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> + }
> +
> avic_vcpu_load(vcpu, cpu);
> }
>
> @@ -2798,6 +2819,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
> if (!nested_vmcb)
> return 1;
>
> + /*
> + * No need for IBPB here, the L1 hypervisor should be running with
> + * IBRS=1 and inserts one already when switching L2 VMs.
> + */
> +
I am not yet fully convinced this is OK from a security perspective.
From the CPU's point of view, both L1 & L2 run in the same prediction
mode (as they are both running in SVM guest mode). Therefore, IBRS will
not hide L1's BHB & BTB from L2.
This is how I understand things:
1. When transitioning between contexts in the same prediction mode,
software is responsible for issuing an IBPB to basically "delete" the
"branch prediction data" of the previous context, so that the new
context won't be able to use it.
This is orthogonal to IBRS, which makes sure a new context won't use
"branch prediction data" that was created by a previous, less-privileged
prediction mode; here we are talking about transitioning between two
contexts in the same physical prediction mode.
2. Because of (1), KVM was modified to issue an IBPB when replacing the
active VMCB/VMCS on a CPU.
3. In the nested-virtualization case, L1 & L2 both run in the same
physical prediction mode: from the physical CPU's perspective, they are
both running in SVM guest mode. Therefore, L0 should issue an IBPB when
transitioning from L1 to L2 and vice versa.
4. In the nested VMX case, this already happens, because transitioning
between L1 & L2 involves changing the active VMCS on the CPU (from
vmcs01 to vmcs02), which already issues an IBPB.
5. However, nested SVM reuses the same VMCB when transitioning between
L1 & L2, and therefore we should explicitly issue an IBPB in
nested_svm_vmrun() & nested_svm_vmexit().
6. One could implicitly assume that the L1 hypervisor already issued an
IBPB when loading the VMCB, and therefore L0 doesn't have to do it again.
However, I see two problems with this approach:
(a) We don't protect old, non-patched L1 hypervisors from Spectre, even
though we could easily do that from L0 with a small performance hit.
(b) When the L1 hypervisor runs only a single L2 VM, it doesn't have to
issue an IBPB when loading the VMCB (as it didn't run any other VMCB
before), and from L1's perspective it should be sufficient to just write
1 to IBRS. However, in our nested case, this is a security hole.
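Concretely, for (5) I mean something like this (untested sketch, reusing
the have_ibpb_support flag introduced earlier in the series):

	/* in nested_svm_vmrun() and nested_svm_vmexit() */
	if (have_ibpb_support)
		wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);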
Am I misunderstanding something?
Regards,
-Liran
> /* Exit Guest-Mode */
> leave_guest_mode(&svm->vcpu);
> svm->nested.vmcb = 0;
> @@ -3061,6 +3087,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
> if (!nested_vmcb)
> return false;
>
> + /*
> + * No need for IBPB here, since the nested VM is less privileged. The
> + * L1 hypervisor inserts one already when switching L2 VMs.
> + */
> +
> if (!nested_vmcb_checks(nested_vmcb)) {
> nested_vmcb->control.exit_code = SVM_EXIT_ERR;
> nested_vmcb->control.exit_code_hi = 0;
>
On 08/01/18 20:08, Paolo Bonzini wrote:
> Expose them to userspace, now that guests can use them.
> I am not adding cpufeatures here to avoid having a kernel
> that shows spec_ctrl in /proc/cpuinfo and actually has no
> support whatsoever for IBRS/IBPB. Keep the ugly special-casing
> for now, and clean it up once the generic arch/x86/ code
> learns about them.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/cpuid.c | 24 +++++++++++++++++++++---
> arch/x86/kvm/x86.c | 1 +
> 2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 767af697c20c..5f43a5940275 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -397,7 +397,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>
> /* cpuid 7.0.edx*/
> const u32 kvm_cpuid_7_0_edx_x86_features =
> - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
> + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) |
> + KF(SPEC_CTRL) | KF(STIBP);
> +
> + /* cpuid 0x80000008.ebx */
> + const u32 kvm_cpuid_80000008_ebx_x86_features =
> + KF(IBPB_SUPPORT);
>
> /* all calls to cpuid_count() should be made on the same cpu */
> get_cpu();
> @@ -483,7 +488,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
> entry->ecx &= ~F(PKU);
> entry->edx &= kvm_cpuid_7_0_edx_x86_features;
> - entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
> + /*
> + * FIXME: the special casing of SPEC_CTRL and STIBP
> + * can be removed once they become regular
> + * cpufeatures.
> + */
> + entry->edx &= (
> + get_scattered_cpuid_leaf(7, 0, CPUID_EDX) |
> + KF(SPEC_CTRL) | KF(STIBP));
> } else {
> entry->ebx = 0;
> entry->ecx = 0;
> @@ -651,7 +663,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> if (!g_phys_as)
> g_phys_as = phys_as;
> entry->eax = g_phys_as | (virt_as << 8);
> - entry->ebx = entry->edx = 0;
> + /*
> + * FIXME: mask against cpufeatures, with
> + * get_scattered_cpuid_leaf(0x80000008, 0, CPUID_EBX),
> + * once IBPB_SUPPORT becomes a regular cpufeature.
> + */
> + entry->ebx &= kvm_cpuid_80000008_ebx_x86_features;
> + entry->edx = 0;
> break;
> }
> case 0x80000019:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index daa1918031df..4abb37d9f4d8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1032,6 +1032,7 @@ unsigned int kvm_get_pt_addr_cnt(void)
> MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
> MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
> MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
> + MSR_IA32_SPEC_CTRL,
> };
>
> static unsigned num_msrs_to_save;
>
Reviewed-by: Liran Alon <[email protected]>
Reviewed-by: Jim Mattson <[email protected]>
On Mon, Jan 8, 2018 at 12:07 PM, Liran Alon <[email protected]> wrote:
>
>
> On 08/01/18 20:08, Paolo Bonzini wrote:
>>
>> Expose them to userspace, now that guests can use them.
>> I am not adding cpufeatures here to avoid having a kernel
>> that shows spec_ctrl in /proc/cpuinfo and actually has no
>> support whatsoever for IBRS/IBPB. Keep the ugly special-casing
>> for now, and clean it up once the generic arch/x86/ code
>> learns about them.
>>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/cpuid.c | 24 +++++++++++++++++++++---
>> arch/x86/kvm/x86.c | 1 +
>> 2 files changed, 22 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index 767af697c20c..5f43a5940275 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -397,7 +397,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>
>> /* cpuid 7.0.edx*/
>> const u32 kvm_cpuid_7_0_edx_x86_features =
>> - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
>> + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) |
>> + KF(SPEC_CTRL) | KF(STIBP);
>> +
>> + /* cpuid 0x80000008.ebx */
>> + const u32 kvm_cpuid_80000008_ebx_x86_features =
>> + KF(IBPB_SUPPORT);
>>
>> /* all calls to cpuid_count() should be made on the same cpu */
>> get_cpu();
>> @@ -483,7 +488,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>> if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
>> entry->ecx &= ~F(PKU);
>> entry->edx &= kvm_cpuid_7_0_edx_x86_features;
>> - entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
>> + /*
>> + * FIXME: the special casing of SPEC_CTRL and STIBP
>> + * can be removed once they become regular
>> + * cpufeatures.
>> + */
>> + entry->edx &= (
>> + get_scattered_cpuid_leaf(7, 0, CPUID_EDX) |
>> + KF(SPEC_CTRL) | KF(STIBP));
>> } else {
>> entry->ebx = 0;
>> entry->ecx = 0;
>> @@ -651,7 +663,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>> if (!g_phys_as)
>> g_phys_as = phys_as;
>> entry->eax = g_phys_as | (virt_as << 8);
>> - entry->ebx = entry->edx = 0;
>> + /*
>> + * FIXME: mask against cpufeatures, with
>> + * get_scattered_cpuid_leaf(0x80000008, 0, CPUID_EBX),
>> + * once IBPB_SUPPORT becomes a regular cpufeature.
>> + */
>> + entry->ebx &= kvm_cpuid_80000008_ebx_x86_features;
>> + entry->edx = 0;
>> break;
>> }
>> case 0x80000019:
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index daa1918031df..4abb37d9f4d8 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1032,6 +1032,7 @@ unsigned int kvm_get_pt_addr_cnt(void)
>> MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
>> MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
>> MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
>> + MSR_IA32_SPEC_CTRL,
>> };
>>
>> static unsigned num_msrs_to_save;
>>
>
> Reviewed-by: Liran Alon <[email protected]>
On 08/01/18 21:18, Jim Mattson wrote:
> Guest usage of MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD should be
> predicated on guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL).
>
> On Mon, Jan 8, 2018 at 10:08 AM, Paolo Bonzini <[email protected]> wrote:
>> Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
>> for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
>> IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
>> it yet).
>>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
>> 1 file changed, 32 insertions(+)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 669f5f74857d..d00bcad7336e 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -120,6 +120,8 @@
>> module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
>> #endif
>>
>> +static bool __read_mostly have_spec_ctrl;
>> +
>> #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
>> #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
>> #define KVM_VM_CR0_ALWAYS_ON \
>> @@ -609,6 +611,8 @@ struct vcpu_vmx {
>> u64 msr_host_kernel_gs_base;
>> u64 msr_guest_kernel_gs_base;
>> #endif
>> + u64 spec_ctrl;
>> +
>> u32 vm_entry_controls_shadow;
>> u32 vm_exit_controls_shadow;
>> u32 secondary_exec_control;
>> @@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> case MSR_IA32_TSC:
>> msr_info->data = guest_read_tsc(vcpu);
>> break;
>> + case MSR_IA32_SPEC_CTRL:
>> + msr_info->data = to_vmx(vcpu)->spec_ctrl;
>> + break;
>
> I have:
>
> 	if (!have_spec_ctrl ||
> 	    (!msr_info->host_initiated &&
> 	     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
> 		return 1;
> 	msr_info->data = to_vmx(vcpu)->msr_ia32_spec_ctrl;
> 	break;
>
>> case MSR_IA32_SYSENTER_CS:
>> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
>> break;
>> @@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> case MSR_IA32_TSC:
>> kvm_write_tsc(vcpu, msr_info);
>> break;
>> + case MSR_IA32_SPEC_CTRL:
>> + to_vmx(vcpu)->spec_ctrl = msr_info->data;
>> + break;
>
> I have:
>
> 	if (!have_spec_ctrl ||
> 	    (!msr_info->host_initiated &&
> 	     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
> 		return 1;
> 	to_vmx(vcpu)->msr_ia32_spec_ctrl = data;
> 	break;
>
>> case MSR_IA32_CR_PAT:
>> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
>> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
>> @@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
>> goto out;
>> }
>>
>> + /*
>> + * FIXME: this is only needed until SPEC_CTRL is supported
>> + * by upstream Linux in cpufeatures, then it can be replaced
>> + * with static_cpu_has.
>> + */
>> + have_spec_ctrl = cpu_has_spec_ctrl();
>> + if (have_spec_ctrl)
>> + pr_info("kvm: SPEC_CTRL available\n");
>> + else
>> + pr_info("kvm: SPEC_CTRL not available\n");
>> +
>> if (boot_cpu_has(X86_FEATURE_NX))
>> kvm_enable_efer_bits(EFER_NX);
>>
>> @@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
>> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
>> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
>> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
>> + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
>> + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>
> I have a lot of changes to MSR permission bitmap handling, but these
> intercepts should only be disabled when guest_cpuid_has(vcpu,
> X86_FEATURE_SPEC_CTRL).
>
>> memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
>> vmx_msr_bitmap_legacy, PAGE_SIZE);
>> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>
>> pt_guest_enter(vmx);
>>
>> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
>> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>> +
>
> Here, I have:
>
> /*
> * If the guest is allowed to write to MSR_IA32_SPEC_CTRL,
> * store it on VM-exit.
> */
> if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
> add_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
> else
> clear_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
>
> /*
> * If the guest's IA32_SPEC_CTRL MSR doesn't match the host's
> * IA32_SPEC_CTRL MSR, then add the MSR to the atomic switch
> * MSRs, so that the guest value will be loaded on VM-entry
> * and the host value will be loaded on VM-exit.
> */
> if (vmx->msr_ia32_spec_ctrl != spec_ctrl_enabled())
> add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
> vmx->msr_ia32_spec_ctrl,
> spec_ctrl_enabled());
> else
> clear_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL);
>
I totally agree with this.
This exactly solves the issue I mentioned before, where the guest value
of MSR_IA32_SPEC_CTRL is restored with WRMSR before calling
atomic_switch_perf_msrs(), which performs an indirect branch.
>> atomic_switch_perf_msrs(vmx);
>>
>> vmx_arm_hv_timer(vcpu);
>> @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> #endif
>> );
>>
>> + if (have_spec_ctrl) {
>> + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>> + if (vmx->spec_ctrl)
>> + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
>> + }
>> +
>
> I know the VM-exit MSR load and store lists are probably slower, but
> I'm a little uncomfortable restoring the host's IA32_SPEC_CTRL MSR
> late if the guest has it clear and the host has it set.
Again, I totally agree. This is a better approach for handling this.
>
>> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
>> if (vmx->host_debugctlmsr)
>> update_debugctlmsr(vmx->host_debugctlmsr);
>> --
>> 1.8.3.1
>>
>>
Hi Paolo
Do you assume that the host isn't using IBRS and only the guest uses it?
On Mon, Jan 8, 2018 at 10:08 AM, Paolo Bonzini <[email protected]> wrote:
> Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
> for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
> IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
> it yet).
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
> 1 file changed, 32 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 669f5f74857d..d00bcad7336e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -120,6 +120,8 @@
> module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
> #endif
>
> +static bool __read_mostly have_spec_ctrl;
> +
> #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
> #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
> #define KVM_VM_CR0_ALWAYS_ON \
> @@ -609,6 +611,8 @@ struct vcpu_vmx {
> u64 msr_host_kernel_gs_base;
> u64 msr_guest_kernel_gs_base;
> #endif
> + u64 spec_ctrl;
> +
> u32 vm_entry_controls_shadow;
> u32 vm_exit_controls_shadow;
> u32 secondary_exec_control;
> @@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> msr_info->data = guest_read_tsc(vcpu);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + msr_info->data = to_vmx(vcpu)->spec_ctrl;
> + break;
> case MSR_IA32_SYSENTER_CS:
> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> break;
> @@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> kvm_write_tsc(vcpu, msr_info);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + to_vmx(vcpu)->spec_ctrl = msr_info->data;
> + break;
> case MSR_IA32_CR_PAT:
> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> @@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
> goto out;
> }
>
> + /*
> + * FIXME: this is only needed until SPEC_CTRL is supported
> + * by upstream Linux in cpufeatures, then it can be replaced
> + * with static_cpu_has.
> + */
> + have_spec_ctrl = cpu_has_spec_ctrl();
> + if (have_spec_ctrl)
> + pr_info("kvm: SPEC_CTRL available\n");
> + else
> + pr_info("kvm: SPEC_CTRL not available\n");
> +
> if (boot_cpu_has(X86_FEATURE_NX))
> kvm_enable_efer_bits(EFER_NX);
>
> @@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
> + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>
> memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
> vmx_msr_bitmap_legacy, PAGE_SIZE);
> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>
> pt_guest_enter(vmx);
>
> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +
Do we even need to optimize this? What if host Linux has enabled IBRS, but
the guest has it turned off?
Wouldn't it be simpler to blindly update it with whatever value
vmx->spec_ctrl holds?
> atomic_switch_perf_msrs(vmx);
>
> vmx_arm_hv_timer(vcpu);
> @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> #endif
> );
>
> + if (have_spec_ctrl) {
> + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> + if (vmx->spec_ctrl)
> + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> + }
> +
Same thing here... if the host OS has enabled IBRS, wouldn't you want to
keep the same value?
> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
> if (vmx->host_debugctlmsr)
> update_debugctlmsr(vmx->host_debugctlmsr);
> --
> 1.8.3.1
>
>
----- Original Message -----
> From: "Ashok Raj" <[email protected]>
> To: "Paolo Bonzini" <[email protected]>
> Cc: [email protected], [email protected], [email protected], [email protected], "thomas lendacky"
> <[email protected]>, [email protected], [email protected], "Ashok Raj" <[email protected]>
> Sent: Monday, January 8, 2018 11:09:53 PM
> Subject: Re: [PATCH 3/7] kvm: vmx: pass MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD down to the guest
>
> Hi Paolo
>
> Do you assume that host isn't using IBRS and only guest uses it?
For now, yes.
Patches to add the IBRS and IBPB cpufeatures will have to adjust the
MSR writes from this patch.
Paolo
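As a rough sketch of what that adjustment might look like once the host
itself programs IBRS (spec_ctrl_enabled() below is only a placeholder for
whatever helper ends up exposing the host value, as in Jim's snippets
elsewhere in this thread; it does not exist upstream yet):
	if (have_spec_ctrl) {
		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
		/* Restore the host value instead of unconditionally clearing IBRS. */
		if (vmx->spec_ctrl != spec_ctrl_enabled())
			wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl_enabled());
	}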
>
>
>
> On Mon, Jan 8, 2018 at 10:08 AM, Paolo Bonzini <[email protected]> wrote:
> > Direct access to MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD is important
> > for performance. Allow load/store of MSR_IA32_SPEC_CTRL, restore guest
> > IBRS on VM entry and set it to 0 on VM exit (because Linux does not use
> > it yet).
> >
> > Signed-off-by: Paolo Bonzini <[email protected]>
> > ---
> > arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
> > 1 file changed, 32 insertions(+)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 669f5f74857d..d00bcad7336e 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -120,6 +120,8 @@
> > module_param_named(preemption_timer, enable_preemption_timer, bool,
> > S_IRUGO);
> > #endif
> >
> > +static bool __read_mostly have_spec_ctrl;
> > +
> > #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
> > #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
> > #define KVM_VM_CR0_ALWAYS_ON \
> > @@ -609,6 +611,8 @@ struct vcpu_vmx {
> > u64 msr_host_kernel_gs_base;
> > u64 msr_guest_kernel_gs_base;
> > #endif
> > + u64 spec_ctrl;
> > +
> > u32 vm_entry_controls_shadow;
> > u32 vm_exit_controls_shadow;
> > u32 secondary_exec_control;
> > @@ -3361,6 +3365,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct
> > msr_data *msr_info)
> > case MSR_IA32_TSC:
> > msr_info->data = guest_read_tsc(vcpu);
> > break;
> > + case MSR_IA32_SPEC_CTRL:
> > + msr_info->data = to_vmx(vcpu)->spec_ctrl;
> > + break;
> > case MSR_IA32_SYSENTER_CS:
> > msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> > break;
> > @@ -3500,6 +3507,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct
> > msr_data *msr_info)
> > case MSR_IA32_TSC:
> > kvm_write_tsc(vcpu, msr_info);
> > break;
> > + case MSR_IA32_SPEC_CTRL:
> > + to_vmx(vcpu)->spec_ctrl = msr_info->data;
> > + break;
> > case MSR_IA32_CR_PAT:
> > if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> > if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> > @@ -7062,6 +7072,17 @@ static __init int hardware_setup(void)
> > goto out;
> > }
> >
> > + /*
> > + * FIXME: this is only needed until SPEC_CTRL is supported
> > + * by upstream Linux in cpufeatures, then it can be replaced
> > + * with static_cpu_has.
> > + */
> > + have_spec_ctrl = cpu_has_spec_ctrl();
> > + if (have_spec_ctrl)
> > + pr_info("kvm: SPEC_CTRL available\n");
> > + else
> > + pr_info("kvm: SPEC_CTRL not available\n");
> > +
> > if (boot_cpu_has(X86_FEATURE_NX))
> > kvm_enable_efer_bits(EFER_NX);
> >
> > @@ -7131,6 +7152,8 @@ static __init int hardware_setup(void)
> > vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
> > vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
> > vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
> > + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
> > + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
> >
> > memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
> > vmx_msr_bitmap_legacy, PAGE_SIZE);
> > @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu
> > *vcpu)
> >
> > pt_guest_enter(vmx);
> >
> > + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> > + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > +
>
> Do we even need to optimize this? what if host Linux enabled IBRS, but
> guest has it turned off?
> Thought it might be simpler to blindly update it with what
> vmx->spec_ctrl value is?
>
> > atomic_switch_perf_msrs(vmx);
> >
> > vmx_arm_hv_timer(vcpu);
> > @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu
> > *vcpu)
> > #endif
> > );
> >
> > + if (have_spec_ctrl) {
> > + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > + if (vmx->spec_ctrl)
> > + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> > + }
> > +
>
> Same thing here.. if the host OS has enabled IBRS wouldn't you want to
> keep the same value?
>
> > /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed
> > */
> > if (vmx->host_debugctlmsr)
> > update_debugctlmsr(vmx->host_debugctlmsr);
> > --
> > 1.8.3.1
> >
> >
>
> I have:
>
> if (!have_spec_ctrl ||
> (!msr_info->host_initiated &&
> !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
> return 1;
> msr_info->data = to_vmx(vcpu)->msr_ia32_spec_ctrl;
> break;
> I have:
>
> if (!have_spec_ctrl ||
> (!msr_info->host_initiated &&
> !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
> return 1;
> to_vmx(vcpu)->msr_ia32_spec_ctrl = data;
> break;
Both better than mine.
> > + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
> > + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>
> I have a lot of changes to MSR permission bitmap handling, but these
> intercepts should only be disabled when guest_cpuid_has(vcpu,
> X86_FEATURE_SPEC_CTRL).
That's harder to backport and not strictly necessary (just like
e.g. we don't check guest CPUID bits before emulation). I agree that
your version is better, but I think the above is fine as a minimal
fix.
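As a rough sketch, gating the intercepts on the guest CPUID bit could look
something like the following (this assumes per-vCPU MSR bitmap handling
along the lines of what Jim describes; upstream the bitmaps are still
global, so the snippet is only illustrative):
	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) {
		/* Only open the bitmap for guests that were given SPEC_CTRL. */
		vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
		vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
	}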
> Here, I have:
>
> /*
> * If the guest is allowed to write to MSR_IA32_SPEC_CTRL,
> * store it on VM-exit.
> */
> if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
> add_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
> else
> clear_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
>
> /*
> * If the guest's IA32_SPEC_CTRL MSR doesn't match the host's
> * IA32_SPEC_CTRL MSR, then add the MSR to the atomic switch
> * MSRs, so that the guest value will be loaded on VM-entry
> * and the host value will be loaded on VM-exit.
> */
> if (vmx->msr_ia32_spec_ctrl != spec_ctrl_enabled())
> add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
> vmx->msr_ia32_spec_ctrl,
> spec_ctrl_enabled());
> else
> clear_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL);
>
> > atomic_switch_perf_msrs(vmx);
> >
> > vmx_arm_hv_timer(vcpu);
> > @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu
> > *vcpu)
> > #endif
> > );
> >
> > + if (have_spec_ctrl) {
> > + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > + if (vmx->spec_ctrl)
> > + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> > + }
> > +
>
> I know the VM-exit MSR load and store lists are probably slower, but
> I'm a little uncomfortable restoring the host's IA32_SPEC_CTRL MSR
> late if the guest has it clear and the host has it set.
There is no indirect branch before it, though, is there?
Paolo
> > /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed
> > */
> > if (vmx->host_debugctlmsr)
> > update_debugctlmsr(vmx->host_debugctlmsr);
> > --
> > 1.8.3.1
> >
> >
>
----- Original Message -----
> From: "David Woodhouse" <[email protected]>
> To: "Paolo Bonzini" <[email protected]>, [email protected], [email protected]
> Cc: [email protected], [email protected], "thomas lendacky" <[email protected]>, [email protected]
> Sent: Monday, January 8, 2018 8:41:07 PM
> Subject: Re: [PATCH 3/7] kvm: vmx: pass MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD down to the guest
>
> On Mon, 2018-01-08 at 19:08 +0100, Paolo Bonzini wrote:
> >
> > + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> > + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > +
>
> I think this one probably *is* safe even without an 'else lfence',
> which means that the CPU can speculate around it, but it wants a
> comment explaining that someone has properly analysed it and saying
> precisely why.
This one is okay as long as there are no indirect jumps until
vmresume. But the one on vmexit is only okay because right now
it's *disabling* IBRS. Once IBRS is used by Linux, we'll need an
lfence there. I'll add a comment.
Paolo
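For reference, the vmexit-side code with such a comment might read as
follows (the code itself is unchanged from the patch; only the comment
wording is a suggestion):
	if (have_spec_ctrl) {
		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
		/*
		 * Right now this only ever *clears* IBRS, so speculating
		 * past the check is harmless.  Once the host itself runs
		 * with IBRS=1, an lfence will be needed here before the
		 * first indirect branch after vmexit.
		 */
		if (vmx->spec_ctrl)
			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
	}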
On Mon, Jan 8, 2018 at 2:32 PM, Paolo Bonzini <[email protected]> wrote:
>
>> I have:
>>
>> if (!have_spec_ctrl ||
>> (!msr_info->host_initiated &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
>> return 1;
>> msr_info->data = to_vmx(vcpu)->msr_ia32_spec_ctrl;
>> break;
>
>> I have:
>>
>> if (!have_spec_ctrl ||
>> (!msr_info->host_initiated &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
>> return 1;
>> to_vmx(vcpu)->msr_ia32_spec_ctrl = data;
>> break;
>
> Both better than mine.
>
>> > + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
>> > + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>>
>> I have a lot of changes to MSR permission bitmap handling, but these
>> intercepts should only be disabled when guest_cpuid_has(vcpu,
>> X86_FEATURE_SPEC_CTRL).
>
> That's harder to backport and not strictly necessary (just like
> e.g. we don't check guest CPUID bits before emulation). I agree that
> your version is better, but I think the above is fine as a minimal
> fix.
Due to the impacts that spec_ctrl has on the neighboring hyperthread,
one may want to disable MSRs 0x48 and 0x49 for a particular VM. We do
this by masking off CPUID.(EAX=7, ECX=0).EDX[26] and CPUID.(EAX=7,
ECX=0).EDX[27] from the userspace agent. However, with your patch,
*any* VCPU gets unrestricted access to these MSRs, without any
mechanism for disabling them.
>> Here, I have:
>>
>> /*
>> * If the guest is allowed to write to MSR_IA32_SPEC_CTRL,
>> * store it on VM-exit.
>> */
>> if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>> add_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
>> else
>> clear_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
>>
>> /*
>> * If the guest's IA32_SPEC_CTRL MSR doesn't match the host's
>> * IA32_SPEC_CTRL MSR, then add the MSR to the atomic switch
>> * MSRs, so that the guest value will be loaded on VM-entry
>> * and the host value will be loaded on VM-exit.
>> */
>> if (vmx->msr_ia32_spec_ctrl != spec_ctrl_enabled())
>> add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
>> vmx->msr_ia32_spec_ctrl,
>> spec_ctrl_enabled());
>> else
>> clear_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL);
>>
>> > atomic_switch_perf_msrs(vmx);
>> >
>> > vmx_arm_hv_timer(vcpu);
>> > @@ -9707,6 +9733,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu
>> > *vcpu)
>> > #endif
>> > );
>> >
>> > + if (have_spec_ctrl) {
>> > + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>> > + if (vmx->spec_ctrl)
>> > + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
>> > + }
>> > +
>>
>> I know the VM-exit MSR load and store lists are probably slower, but
>> I'm a little uncomfortable restoring the host's IA32_SPEC_CTRL MSR
>> late if the guest has it clear and the host has it set.
>
> There is no indirect branch before though, isn't it?
I guess that depends on how you define "before."
> Paolo
>
>> > /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed
>> > */
>> > if (vmx->host_debugctlmsr)
>> > update_debugctlmsr(vmx->host_debugctlmsr);
>> > --
>> > 1.8.3.1
>> >
>> >
>>
----- [email protected] wrote:
> ----- Original Message -----
> > From: "David Woodhouse" <[email protected]>
> > To: "Paolo Bonzini" <[email protected]>,
> [email protected], [email protected]
> > Cc: [email protected], [email protected], "thomas lendacky"
> <[email protected]>, [email protected]
> > Sent: Monday, January 8, 2018 8:41:07 PM
> > Subject: Re: [PATCH 3/7] kvm: vmx: pass MSR_IA32_SPEC_CTRL and
> MSR_IA32_PRED_CMD down to the guest
> >
> > On Mon, 2018-01-08 at 19:08 +0100, Paolo Bonzini wrote:
> > >
> > > + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> > > + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > > +
> >
> > I think this one probably *is* safe even without an 'else lfence',
> > which means that the CPU can speculate around it, but it wants a
> > comment explaining that someone has properly analysed it and saying
> > precisely why.
>
> This one is okay as long as there are no indirect jumps until
> vmresume. But the one on vmexit is only okay because right now
> it's *disabling* IBRS. Once IBRS is used by Linux, we'll need an
> lfence there. I'll add a comment.
>
> Paolo
That is true, but from what I understand there is an indirect branch between this point and vmresume.
That indirect branch resides in atomic_switch_perf_msrs(), which is called immediately after this WRMSR:
atomic_switch_perf_msrs() -> perf_guest_get_msrs() -> x86_pmu.guest_get_msrs().
-Liran
On 08/01/2018 20:36, Jim Mattson wrote:
> Shouldn't there be an IBPB on *any* context switch away from a VCPU
> thread, even if it is to a non-VCPU thread?
Yes, but that's the task of patches to the generic Linux context
switching code. As mentioned in the cover letter, this isn't yet a full
solution---just the KVM bits, which is what Radim and I have full
control over. Hence the hacks that cache the CPUID bits in local bools.
Paolo
On 09/01/2018 00:58, Liran Alon wrote:
>
> ----- [email protected] wrote:
>
>> ----- Original Message -----
>>> From: "David Woodhouse" <[email protected]>
>>> To: "Paolo Bonzini" <[email protected]>,
>> [email protected], [email protected]
>>> Cc: [email protected], [email protected], "thomas lendacky"
>> <[email protected]>, [email protected]
>>> Sent: Monday, January 8, 2018 8:41:07 PM
>>> Subject: Re: [PATCH 3/7] kvm: vmx: pass MSR_IA32_SPEC_CTRL and
>> MSR_IA32_PRED_CMD down to the guest
>>>
>>> On Mon, 2018-01-08 at 19:08 +0100, Paolo Bonzini wrote:
>>>>
>>>> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
>>>> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>>>> +
>>>
>>> I think this one probably *is* safe even without an 'else lfence',
>>> which means that the CPU can speculate around it, but it wants a
>>> comment explaining that someone has properly analysed it and saying
>>> precisely why.
>>
>> This one is okay as long as there are no indirect jumps until
>> vmresume. But the one on vmexit is only okay because right now
>> it's *disabling* IBRS. Once IBRS is used by Linux, we'll need an
>> lfence there. I'll add a comment.
>>
>> Paolo
>
> That is true but from what I understand, there is an indirect branch from this point until vmresume.
> That indirect branch resides in atomic_switch_perf_msrs() immediately called after this WRMSR:
> atomic_switch_perf_msrs() -> perf_guest_get_msrs() -> x86_pmu.guest_get_msrs().
Sure, it has to move later as pointed out by other reviewers.
Paolo
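Concretely, moving the write after the last indirect branch before
vmresume would look roughly like this (a sketch only; the exact placement
still has to be checked against everything that runs before the asm
block):
	pt_guest_enter(vmx);
	atomic_switch_perf_msrs(vmx);
	vmx_arm_hv_timer(vcpu);
	/*
	 * Load the guest IA32_SPEC_CTRL value as late as possible, after
	 * atomic_switch_perf_msrs(), which reaches an indirect call via
	 * perf_guest_get_msrs() -> x86_pmu.guest_get_msrs().
	 */
	if (have_spec_ctrl && vmx->spec_ctrl != 0)
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);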
On 09/01/2018 00:19, Jim Mattson wrote:
>>>> + vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
>>>> + vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
>>> I have a lot of changes to MSR permission bitmap handling, but these
>>> intercepts should only be disabled when guest_cpuid_has(vcpu,
>>> X86_FEATURE_SPEC_CTRL).
>> That's harder to backport and not strictly necessary (just like
>> e.g. we don't check guest CPUID bits before emulation). I agree that
>> your version is better, but I think the above is fine as a minimal
>> fix.
>
> Due to the impacts that spec_ctrl has on the neighboring hyperthread,
> one may want to disable MSRs 0x48 and 0x49 for a particular VM. We do
> this by masking off CPUID.(EAX=7, ECX=0).EDX[26] and CPUID.(EAX=7,
> ECX=0).EDX[27] from the userspace agent. However, with your patch,
> *any* VCPU gets unrestricted access to these MSRs, without any
> mechanism for disabling them.
Yes, I agree that having the check is superior. However, I also want to
get there step by step.
>>>> + if (have_spec_ctrl) {
>>>> + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>>>> + if (vmx->spec_ctrl)
>>>> + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
>>>> + }
>>>> +
>>>
>>> I know the VM-exit MSR load and store lists are probably slower, but
>>> I'm a little uncomfortable restoring the host's IA32_SPEC_CTRL MSR
>>> late if the guest has it clear and the host has it set.
>>
>> There is no indirect branch before though, isn't it?
>
> I guess that depends on how you define "before."
--verbose? :-/
Paolo
On Mon, 8 Jan 2018, Paolo Bonzini wrote:
> This series allows guests to use the MSR_IA32_SPEC_CTRL and
> MSR_IA32_PRED_CMD model specific registers that were added as mitigations
> for CVE-2017-5715.
>
> These are only the KVM specific parts of the fix. It does *not* yet
> include any protection for reading host memory from the guest, because
> that would be done in the same way as the rest of Linux. So there is no
> IBRS *usage* here, no retpolines, no stuffing of the return stack buffer.
> (KVM already includes a fix to clear all registers on vmexit, which is
> enough to block Google Project Zero's PoC exploit).
>
> However, I am including the changes to use IBPB (indirect branch
> predictor barrier) if available. That occurs only when there is a VCPU
> switch on a physical CPU, thus it has a small impact on performance.
>
> The patches are a bit hackish because the relevant cpufeatures have
> not been included yet, and because I wanted to make the patches easier
> to backport to distro kernels if desired, but I would still like to
> have them in 4.16.
>
> Please review.
CC'ing [email protected] on this would have been asking too much, right?
On 08/01/2018 20:09, Liran Alon wrote:
>>
>> +static inline bool cpu_has_spec_ctrl(void)
>> +{
>> + u32 eax, ebx, ecx, edx;
>> + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
>> +
>> + return edx & bit(KVM_CPUID_BIT_SPEC_CTRL);
>
> Why not just "return cpuid_edx(7) & bit(KVM_CPUID_BIT_SPEC_CTRL);"?
> This is also consistent with how cpu_has_ibpb_support() is written.
Leaf 7 explicitly requires you to clear ECX (there could be a leaf for
EAX=7,ECX=1 in the future). Even though cpuid_edx does do that, it's
not clear from the function that cpuid_edx(7) would work.
Paolo
On 08.01.2018 19:08, Paolo Bonzini wrote:
> From: Tim Chen <[email protected]>
>
> Ensure an IBPB (Indirect branch prediction barrier) before every VCPU
> switch.
>
> Signed-off-by: Tim Chen <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d00bcad7336e..bf127c570675 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2375,6 +2375,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
> per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
> vmcs_load(vmx->loaded_vmcs->vmcs);
> + if (have_spec_ctrl)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> if (!already_loaded) {
> @@ -4029,6 +4031,13 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
> free_vmcs(loaded_vmcs->vmcs);
> loaded_vmcs->vmcs = NULL;
> WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
> +
> + /*
> + * The VMCS could be recycled, causing a false negative in
> + * vmx_vcpu_load; block speculative execution.
> + */
> + if (have_spec_ctrl)
> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> }
>
> static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
>
Reviewed-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On 08/01/2018 21:00, Liran Alon wrote:
>
>
> On 08/01/18 20:08, Paolo Bonzini wrote:
>> From: Tom Lendacky <[email protected]>
>>
>> Set IBPB (Indirect Branch Prediction Barrier) when the current CPU is
>> going to run a VCPU different from what was previously run. Nested
>> virtualization uses the same VMCB for the second level guest, but the
>> L1 hypervisor should be using IBRS to protect itself.
>>
>> Signed-off-by: Tom Lendacky <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/svm.c | 31 +++++++++++++++++++++++++++++++
>> 1 file changed, 31 insertions(+)
>>
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 779981a00d01..dd744d896cec 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -289,6 +289,7 @@ struct amd_svm_iommu_ir {
>> module_param(vgif, int, 0444);
>>
>> static bool __read_mostly have_spec_ctrl;
>> +static bool __read_mostly have_ibpb_support;
>>
>> static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
>> static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
>> @@ -540,6 +541,7 @@ struct svm_cpu_data {
>> struct kvm_ldttss_desc *tss_desc;
>>
>> struct page *save_area;
>> + struct vmcb *current_vmcb;
>> };
>>
>> static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
>> @@ -1151,6 +1153,11 @@ static __init int svm_hardware_setup(void)
>> pr_info("kvm: SPEC_CTRL available\n");
>> else
>> pr_info("kvm: SPEC_CTRL not available\n");
>> + have_ibpb_support = have_spec_ctrl || cpu_has_ibpb_support();
>> + if (have_ibpb_support)
>> + pr_info("kvm: IBPB_SUPPORT available\n");
>> + else
>> + pr_info("kvm: IBPB_SUPPORT not available\n");
>>
>> return 0;
>>
>> @@ -1725,11 +1732,19 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
>> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
>> kvm_vcpu_uninit(vcpu);
>> kmem_cache_free(kvm_vcpu_cache, svm);
>> +
>> + /*
>> + * The VMCB could be recycled, causing a false negative in
>> + * svm_vcpu_load; block speculative execution.
>> + */
>> + if (have_ibpb_support)
>> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
>> }
>>
>> static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>> {
>> struct vcpu_svm *svm = to_svm(vcpu);
>> + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
>> int i;
>>
>> if (unlikely(cpu != vcpu->cpu)) {
>> @@ -1758,6 +1773,12 @@ static void svm_vcpu_load(struct kvm_vcpu
>> *vcpu, int cpu)
>> if (static_cpu_has(X86_FEATURE_RDTSCP))
>> wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
>>
>> + if (sd->current_vmcb != svm->vmcb) {
>> + sd->current_vmcb = svm->vmcb;
>> + if (have_ibpb_support)
>> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
>> + }
>> +
>> avic_vcpu_load(vcpu, cpu);
>> }
>>
>> @@ -2798,6 +2819,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
>> if (!nested_vmcb)
>> return 1;
>>
>> + /*
>> + * No need for IBPB here, the L1 hypervisor should be running with
>> + * IBRS=1 and inserts one already when switching L2 VMs.
>> + */
>> +
>
> I am not fully convinced yet this is OK from security perspective.
> From the CPU point-of-view, both L1 & L2 run in the same prediction-mode
> (as they are both running in SVM guest-mode). Therefore, IBRS will not
> hide the BHB & BTB of L1 from L2.
Indeed, IBRS will not hide the indirect branch predictor state of L2
user mode from L1 user mode.
On current generation processors, as I understand it, IBRS simply
disables the indirect branch predictor when set to 1. Therefore, as
long as the L1 hypervisor sets IBRS=1, the indirect branch predictor
state left by L2 does not affect execution of the L1 hypervisor.
Future processors might have a mode that says "just set IBRS=1 and I'll
DTRT". If that's done by indexing the BTB using the full virtual
address, the CPL _and_ the ASID/PCID/VPID, then IBPB is not needed here
because the L2 VM uses a different ASID. If that's done by only
augmenting the BTB index with the CPL, we'd need an IBPB. But in this
new world we've been thrown into, that would be a processor bug...
Note that AMD currently doesn't implement IBRS at all. In order to
mitigate against CVE-2017-5715, the hypervisor should issue an IBPB on
every vmexit (in both L0 and L1). However, the cost of that is very
high. While we may want a paranoia mode that does it, it is outside the
scope of this series (which is more of a "let's unblock QEMU patches"
thing than a complete fix).
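If such a paranoia mode were ever added, it might be as simple as the
following sketch (paranoid_ibpb would be a new, purely hypothetical
module parameter, and the natural place would be right after #VMEXIT in
svm_vcpu_run(); none of this is part of this series):
	/* Barrier on every vmexit to protect even unpatched L1 hypervisors. */
	if (paranoid_ibpb && have_ibpb_support)
		wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);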
> 6. One can implicitly assume that L1 hypervisor did issued a IBPB when
> loading an VMCB and therefore it doesn't have to do it again in L0.
> However, I see 2 problems with this approach:
> (a) We don't protect old non-patched L1 hypervisors from Spectre even
> though we could easily do that from L0 with small performance hit.
Yeah, that's a nice thing to have. However, I disagree on the "small"
performance hit... on a patched hypervisor, the cost of a double fix can
be very noticeable.
> (b) When L1 hypervisor runs only a single L2 VM, it doesn't have to
> issue an IBPB when loading the VMCB (as it didn't run any other VMCB
> before) and it should be sufficient from L1 perspective to just write 1
> to IBRS. However, in our nested-case, this is a security-hole.
This is the main difference in our reasoning. I think IBRS=1 (or IBPB
on vmexit) should be enough.
Paolo
> Am I misunderstanding something?
>
> Regards,
> -Liran
>
>> /* Exit Guest-Mode */
>> leave_guest_mode(&svm->vcpu);
>> svm->nested.vmcb = 0;
>> @@ -3061,6 +3087,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
>> if (!nested_vmcb)
>> return false;
>>
>> + /*
>> + * No need for IBPB here, since the nested VM is less
>> privileged. The
>> + * L1 hypervisor inserts one already when switching L2 VMs.
>> + */
>> +
>> if (!nested_vmcb_checks(nested_vmcb)) {
>> nested_vmcb->control.exit_code = SVM_EXIT_ERR;
>> nested_vmcb->control.exit_code_hi = 0;
>>
>
On 09/01/2018 11:15, Thomas Gleixner wrote:
> On Mon, 8 Jan 2018, Paolo Bonzini wrote:
>
>> This series allows guests to use the MSR_IA32_SPEC_CTRL and
>> MSR_IA32_PRED_CMD model specific registers that were added as mitigations
>> for CVE-2017-5715.
>>
>> These are only the KVM specific parts of the fix. It does *not* yet
>> include any protection for reading host memory from the guest, because
>> that would be done in the same way as the rest of Linux. So there is no
>> IBRS *usage* here, no retpolines, no stuffing of the return stack buffer.
>> (KVM already includes a fix to clear all registers on vmexit, which is
>> enough to block Google Project Zero's PoC exploit).
>>
>> However, I am including the changes to use IBPB (indirect branch
>> predictor barrier) if available. That occurs only when there is a VCPU
>> switch on a physical CPU, thus it has a small impact on performance.
>>
>> The patches are a bit hackish because the relevant cpufeatures have
>> not been included yet, and because I wanted to make the patches easier
>> to backport to distro kernels if desired, but I would still like to
>> have them in 4.16.
>>
>> Please review.
>
> CC'ing [email protected] on this would have been asked too much, right?
Sorry, my mistake. I'll CC you on v2.
Paolo
On 08.01.2018 19:08, Paolo Bonzini wrote:
> As an interim measure until SPEC_CTRL is supported by upstream
> Linux in cpufeatures, add a function that lets vmx.c and svm.c
> know whether to save/restore MSR_IA32_SPEC_CTRL.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/cpuid.c | 3 ---
> arch/x86/kvm/cpuid.h | 22 ++++++++++++++++++++++
> 2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 8e9a07c557f1..767af697c20c 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void)
>
> #define F(x) bit(X86_FEATURE_##x)
>
> -/* These are scattered features in cpufeatures.h. */
> -#define KVM_CPUID_BIT_AVX512_4VNNIW 2
> -#define KVM_CPUID_BIT_AVX512_4FMAPS 3
> #define KF(x) bit(KVM_CPUID_BIT_##x)
>
> int kvm_update_cpuid(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index c2cea6651279..8d04ccf177ce 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -155,6 +155,28 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
> return x86_stepping(best->eax);
> }
>
> +/* These are scattered features in cpufeatures.h. */
> +#define KVM_CPUID_BIT_AVX512_4VNNIW 2
> +#define KVM_CPUID_BIT_AVX512_4FMAPS 3
> +#define KVM_CPUID_BIT_SPEC_CTRL 26
> +#define KVM_CPUID_BIT_STIBP 27
I can see that STIBP is never checked in the KVM code, only forwarded to
the guest if available.
I am wondering if we would have to check for that, too, before
issuing a FEATURE_SET_IBPB.
(can somebody point me at the Intel documentation?)
> +
> +/* CPUID[eax=0x80000008].ebx */
> +#define KVM_CPUID_BIT_IBPB_SUPPORT 12
> +
> +static inline bool cpu_has_spec_ctrl(void)
> +{
> + u32 eax, ebx, ecx, edx;
> + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
> +
> + return edx & bit(KVM_CPUID_BIT_SPEC_CTRL);
> +}
> +
> +static inline bool cpu_has_ibpb_support(void)
> +{
> + return cpuid_ebx(0x80000008) & bit(KVM_CPUID_BIT_IBPB_SUPPORT);
> +}
> +
> static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
> {
> return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
>
--
Thanks,
David / dhildenb
On 09/01/2018 12:14, David Hildenbrand wrote:
>>
>> +/* These are scattered features in cpufeatures.h. */
>> +#define KVM_CPUID_BIT_AVX512_4VNNIW 2
>> +#define KVM_CPUID_BIT_AVX512_4FMAPS 3
>> +#define KVM_CPUID_BIT_SPEC_CTRL 26
>> +#define KVM_CPUID_BIT_STIBP 27
> I can see that STIBP is never checked in KVM code but only forwarded to
> the guest if available.
>
> I am wondering if we would have to check against that, too, before
> issuing a FEATURE_SET_IBPB.
STIBP is "single-threaded indirect branch prediction". SPEC_CTRL always
implies that FEATURE_SET_IBPB is available.
Paolo
----- [email protected] wrote:
> On 08/01/2018 21:00, Liran Alon wrote:
> >
> >
> > On 08/01/18 20:08, Paolo Bonzini wrote:
> >> From: Tom Lendacky <[email protected]>
> >>
> >> Set IBPB (Indirect Branch Prediction Barrier) when the current CPU
> is
> >> going to run a VCPU different from what was previously run.
> Nested
> >> virtualization uses the same VMCB for the second level guest, but
> the
> >> L1 hypervisor should be using IBRS to protect itself.
> >>
> >> Signed-off-by: Tom Lendacky <[email protected]>
> >> Signed-off-by: Paolo Bonzini <[email protected]>
> >> ---
> >> arch/x86/kvm/svm.c | 31 +++++++++++++++++++++++++++++++
> >> 1 file changed, 31 insertions(+)
> >>
> >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >> index 779981a00d01..dd744d896cec 100644
> >> --- a/arch/x86/kvm/svm.c
> >> +++ b/arch/x86/kvm/svm.c
> >> @@ -289,6 +289,7 @@ struct amd_svm_iommu_ir {
> >> module_param(vgif, int, 0444);
> >>
> >> static bool __read_mostly have_spec_ctrl;
> >> +static bool __read_mostly have_ibpb_support;
> >>
> >> static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long
> cr0);
> >> static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool
> invalidate_gpa);
> >> @@ -540,6 +541,7 @@ struct svm_cpu_data {
> >> struct kvm_ldttss_desc *tss_desc;
> >>
> >> struct page *save_area;
> >> + struct vmcb *current_vmcb;
> >> };
> >>
> >> static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
> >> @@ -1151,6 +1153,11 @@ static __init int svm_hardware_setup(void)
> >> pr_info("kvm: SPEC_CTRL available\n");
> >> else
> >> pr_info("kvm: SPEC_CTRL not available\n");
> >> + have_ibpb_support = have_spec_ctrl || cpu_has_ibpb_support();
> >> + if (have_ibpb_support)
> >> + pr_info("kvm: IBPB_SUPPORT available\n");
> >> + else
> >> + pr_info("kvm: IBPB_SUPPORT not available\n");
> >>
> >> return 0;
> >>
> >> @@ -1725,11 +1732,19 @@ static void svm_free_vcpu(struct kvm_vcpu
> *vcpu)
> >> __free_pages(virt_to_page(svm->nested.msrpm),
> MSRPM_ALLOC_ORDER);
> >> kvm_vcpu_uninit(vcpu);
> >> kmem_cache_free(kvm_vcpu_cache, svm);
> >> +
> >> + /*
> >> + * The VMCB could be recycled, causing a false negative in
> >> + * svm_vcpu_load; block speculative execution.
> >> + */
> >> + if (have_ibpb_support)
> >> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> >> }
> >>
> >> static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >> {
> >> struct vcpu_svm *svm = to_svm(vcpu);
> >> + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
> >> int i;
> >>
> >> if (unlikely(cpu != vcpu->cpu)) {
> >> @@ -1758,6 +1773,12 @@ static void svm_vcpu_load(struct kvm_vcpu
> >> *vcpu, int cpu)
> >> if (static_cpu_has(X86_FEATURE_RDTSCP))
> >> wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
> >>
> >> + if (sd->current_vmcb != svm->vmcb) {
> >> + sd->current_vmcb = svm->vmcb;
> >> + if (have_ibpb_support)
> >> + wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
> >> + }
> >> +
> >> avic_vcpu_load(vcpu, cpu);
> >> }
> >>
> >> @@ -2798,6 +2819,11 @@ static int nested_svm_vmexit(struct vcpu_svm
> *svm)
> >> if (!nested_vmcb)
> >> return 1;
> >>
> >> + /*
> >> + * No need for IBPB here, the L1 hypervisor should be running
> with
> >> + * IBRS=1 and inserts one already when switching L2 VMs.
> >> + */
> >> +
> >
> > I am not fully convinced yet this is OK from security perspective.
> > From the CPU point-of-view, both L1 & L2 run in the same
> prediction-mode
> > (as they are both running in SVM guest-mode). Therefore, IBRS will
> not
> > hide the BHB & BTB of L1 from L2.
>
> Indeed, IBRS will not hide the indirect branch predictor state of L2
> user mode from L1 user mode.
>
> On current generation processors, as I understand it, IBRS simply
> disables the indirect branch predictor when set to 1. Therefore, as
This is not how I understood what IBRS does.
Intel (not sure about AMD) states that if IBRS is set, indirect branches will not allow their predicted target address to be controlled by code that executed in a less privileged prediction-mode before IBRS was last written with a value of 1.
(Intel also states that an indirect branch may thus still be affected by code in a less privileged prediction-mode that executed AFTER IBRS was last written with a value of 1.)
Therefore, L2 could also inject BTB/BHB entries that will be used by L1:
Consider the case where L2 injects values into the BTB/BHB and then issues a hypercall which causes a #VMExit that is forwarded to L1. Because L2 & L1 run in the same prediction-mode (both Ring0 SVM guest-mode) from the physical CPU's perspective, Ring0 L1 code could be using BTB/BHB entries injected by Ring0 L2.
(The fact that L0 has set IBRS to 1 when exiting from L2 to L0 doesn't prevent those entries from being used by L1 after L0 enters L1.)
This is different from the non-nested case, where L0 & L1 run in different physical prediction-modes, so there setting IBRS=1 on every #VMExit should suffice.
Therefore, I don't understand how L1 setting IBRS to 1 helps it in this case.
Maybe this mechanism works differently on AMD?
-Liran
> long as the L1 hypervisor sets IBRS=1, the indirect branch predictor
> state left by L2 does not affect execution of the L1 hypervisor.
>
> Future processors might have a mode that says "just set IBRS=1 and
> I'll
> DTRT". If that's done by indexing the BTB using the full virtual
> address, the CPL _and_ the ASID/PCID/VPID, then IBPB is not needed
> here
> because the L2 VM uses a different ASID. If that's done by only
> augmenting the BTB index with the CPL, we'd need an IBPB. But in
> this
> new world we've been thrown into, that would be a processor bug...
>
> Note that AMD currently doesn't implement IBRS at all. In order to
> mitigate against CVE-2017-5715, the hypervisor should issue an IBPB
> on
> every vmexit (in both L0 and L1). However, the cost of that is very
> high. While we may want a paranoia mode that does it, it is outside
> the
> scope of this series (which is more of a "let's unblock QEMU patches"
> thing than a complete fix).
>
> > 6. One can implicitly assume that L1 hypervisor did issued a IBPB
> when
> > loading an VMCB and therefore it doesn't have to do it again in L0.
> > However, I see 2 problems with this approach:
> > (a) We don't protect old non-patched L1 hypervisors from Spectre
> even
> > though we could easily do that from L0 with small performance hit.
>
> Yeah, that's a nice thing to have. However, I disagree on the
> "small"
> performance hit... on a patched hypervisor, the cost of a double fix
> can
> be very noticeable.
>
> > (b) When L1 hypervisor runs only a single L2 VM, it doesn't have to
> > issue an IBPB when loading the VMCB (as it didn't run any other
> VMCB
> > before) and it should be sufficient from L1 perspective to just
> write 1
> > to IBRS. However, in our nested-case, this is a security-hole.
>
> This is the main difference in our reasoning. I think IBRS=1 (or
> IBPB
> on vmexit) should be enough.
>
> Paolo
>
> > Am I misunderstanding something?
> >
> > Regards,
> > -Liran
> >
> >> /* Exit Guest-Mode */
> >> leave_guest_mode(&svm->vcpu);
> >> svm->nested.vmcb = 0;
> >> @@ -3061,6 +3087,11 @@ static bool nested_svm_vmrun(struct vcpu_svm
> *svm)
> >> if (!nested_vmcb)
> >> return false;
> >>
> >> + /*
> >> + * No need for IBPB here, since the nested VM is less
> >> privileged. The
> >> + * L1 hypervisor inserts one already when switching L2 VMs.
> >> + */
> >> +
> >> if (!nested_vmcb_checks(nested_vmcb)) {
> >> nested_vmcb->control.exit_code = SVM_EXIT_ERR;
> >> nested_vmcb->control.exit_code_hi = 0;
> >>
> >
On 09/01/2018 12:31, Liran Alon wrote:
>> On current generation processors, as I understand it, IBRS simply
>> disables the indirect branch predictor when set to 1. Therefore,
>> as
> This is not how I understood what IBRS does.
>
> Intel (not sure about AMD) states that if IBRS is set, indirect
> branches will now allow their predicted target address to be
> controlled by code that executed in a less privileged prediction-mode
> before the IBRS was last written with a value of 1. (Intel also
> states that thus an indirect branch may be affected by code in a less
> privileged prediction-mode that executed AFTER IBRS mode was last
> written with a value of 1).
Difficult to say, because AMD microcode for IBRS does not exist.
Is it even documented whether VMX/SVM non-root mode is a different
physical prediction mode?
Generally I agree you'd go by what the spec says, but we don't really
have a final spec from the vendors. The microarchitectural level
matters much more than the spec in this case, in my opinion: what current
microcode patches do, and how future processors plan to close the leak
once and for all.
The above ("IBRS simply disables the indirect branch predictor") was my
take-away message from private discussion with Intel. My guess is that
the vendors are just handwaving a spec that doesn't match what they have
implemented, because honestly a microcode update is unlikely to do much
more than an old-fashioned chicken bit. Maybe on Skylake it does
though, since the performance characteristics of IBRS are so different
from previous processors. Let's ask Arjan who might have more
information about it, and hope he actually can disclose it...
Paolo
> Therefore, L2 could also inject BTB/BHB entries that will be used by
> L1: Consider the case that L2 injects values into BTB/BHB and then
> issues a hypercall which will cause #VMExit which will be forwarded
> to L1. Because L2 & L1 runs in the same prediction-mode (both Ring0
> SVM guest-mode) from physical CPU perspective, Ring0 L1 code could be
> using BTB/BHB entries injected by Ring0 L2. (The fact that L0 have
> set IBRS to 1 when exiting from L2 to L0 doesn't prevent those
> entries from being used by L1 after L0 enters L1).
>
> This is different than what happens in non-nested case as L0 & L1
> runs in different physical prediction-modes and therefore setting
> IBRS=1 on every #VMExit should suffice.
>
> Therefore, I don't understand how L1 setting IBRS to 1 will help him
> in this case. Maybe this mechanism works differently on AMD?
>
>> Future processors might have a mode that says "just set IBRS=1 and
>> I'll DTRT". If that's done by indexing the BTB using the full virtual
>> address, the CPL _and_ the ASID/PCID/VPID, then IBPB is not needed
>> here because the L2 VM uses a different ASID. If that's done by only
>> augmenting the BTB index with the CPL, we'd need an IBPB. But in
>> this new world we've been thrown into, that would be a processor bug...
On Tue, 9 Jan 2018, Paolo Bonzini wrote:
> On 09/01/2018 11:15, Thomas Gleixner wrote:
> > On Mon, 8 Jan 2018, Paolo Bonzini wrote:
> >
> >> This series allows guests to use the MSR_IA32_SPEC_CTRL and
> >> MSR_IA32_PRED_CMD model specific registers that were added as mitigations
> >> for CVE-2017-5715.
> >>
> >> These are only the KVM specific parts of the fix. It does *not* yet
> >> include any protection for reading host memory from the guest, because
> >> that would be done in the same way as the rest of Linux. So there is no
> >> IBRS *usage* here, no retpolines, no stuffing of the return stack buffer.
> >> (KVM already includes a fix to clear all registers on vmexit, which is
> >> enough to block Google Project Zero's PoC exploit).
> >>
> >> However, I am including the changes to use IBPB (indirect branch
> >> predictor barrier) if available. That occurs only when there is a VCPU
> >> switch on a physical CPU, thus it has a small impact on performance.
> >>
> >> The patches are a bit hackish because the relevant cpufeatures have
> >> not been included yet, and because I wanted to make the patches easier
> >> to backport to distro kernels if desired, but I would still like to
> >> have them in 4.16.
We really want to coordinate that properly with the ongoing integration of
the IB** support for bare metal.
And that stuff really does not need to be hackish at all. We've spent a lot
of effort keeping all of it clean _AND_ available for 4.14 stable
consumption. Everything before 4.9 is a big fricking and incompatible mess
anyway.
> >> Please review.
> >
> > CC'ing [email protected] on this would have been asked too much, right?
>
> Sorry, my mistake. I'll CC you on v2.
All good ...
Please add the crowd which has been involved in the bare metal IBRS stuff
as well.
Thanks,
tglx
On 09/01/2018 13:03, Thomas Gleixner wrote:
> On Tue, 9 Jan 2018, Paolo Bonzini wrote:
>> On 09/01/2018 11:15, Thomas Gleixner wrote:
>>> On Mon, 8 Jan 2018, Paolo Bonzini wrote:
>>>
>>>> This series allows guests to use the MSR_IA32_SPEC_CTRL and
>>>> MSR_IA32_PRED_CMD model specific registers that were added as mitigations
>>>> for CVE-2017-5715.
>>>>
>>>> These are only the KVM specific parts of the fix. It does *not* yet
>>>> include any protection for reading host memory from the guest, because
>>>> that would be done in the same way as the rest of Linux. So there is no
>>>> IBRS *usage* here, no retpolines, no stuffing of the return stack buffer.
>>>> (KVM already includes a fix to clear all registers on vmexit, which is
>>>> enough to block Google Project Zero's PoC exploit).
>>>>
>>>> However, I am including the changes to use IBPB (indirect branch
>>>> predictor barrier) if available. That occurs only when there is a VCPU
>>>> switch on a physical CPU, thus it has a small impact on performance.
>>>>
>>>> The patches are a bit hackish because the relevant cpufeatures have
>>>> not been included yet, and because I wanted to make the patches easier
>>>> to backport to distro kernels if desired, but I would still like to
>>>> have them in 4.16.
>
> We really want to coordinate that proper with the ongoing integration of
> the IB** for bare metal.
Yes, this can get merged to master together with the bare-metal parts.
If, as I expect, the -rc rules will be bent a bit for IB** (and
perhaps retpolines too) in 4.16, we have some time to sort it out.
Thanks,
Paolo
> And that stuff really does not need to be hackish at all. We've spent a lot
> of effort keeping all of it clean _AND_ available for 4.14 stable
> consumption. Everything before 4.9 is a big fricking and incompatible mess
> anyway.
On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
> The above ("IBRS simply disables the indirect branch predictor") was my
> take-away message from private discussion with Intel. My guess is that
> the vendors are just handwaving a spec that doesn't match what they have
> implemented, because honestly a microcode update is unlikely to do much
> more than an old-fashioned chicken bit. Maybe on Skylake it does
> though, since the performance characteristics of IBRS are so different
> from previous processors. Let's ask Arjan who might have more
> information about it, and hope he actually can disclose it...
IBRS will ensure that, when set after the ring transition, no earlier
branch prediction data is used for indirect branches while IBRS is set
(this is an English summary of two pages of technical spec so it lacks
the language lawyer precision)
because of this promise, the implementation tends to be impactful
and it is very strongly recommended that retpoline is used instead of IBRS.
(with all the caveats already on lkml)
IBPB is different; it is a convenient thing for switching between VM guests, etc.
----- [email protected] wrote:
> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
> > The above ("IBRS simply disables the indirect branch predictor") was
> my
> > take-away message from private discussion with Intel. My guess is
> that
> > the vendors are just handwaving a spec that doesn't match what they
> have
> > implemented, because honestly a microcode update is unlikely to do
> much
> > more than an old-fashioned chicken bit. Maybe on Skylake it does
> > though, since the performance characteristics of IBRS are so
> different
> > from previous processors. Let's ask Arjan who might have more
> > information about it, and hope he actually can disclose it...
>
> IBRS will ensure that, when set after the ring transition, no earlier
> branch prediction data is used for indirect branches while IBRS is
> set
Consider the following scenario:
1. L1 runs with IBRS=1 in Ring0.
2. L1 restores L2 SPEC_CTRL and enters L2.
3. L1's VMRUN exits into L0, which backs up L1 SPEC_CTRL and enters L2 (using the same VMCB).
4. L2 populates the BTB/BHB with values and causes a hypercall which #VMExits into L0.
5. L0 backs up L2 SPEC_CTRL and writes IBRS=1.
6. L0 restores L1 SPEC_CTRL and enters L1.
7. L1 backs up L2 SPEC_CTRL and writes IBRS=1.
Just to make sure I understand:
You state that the L2 BTB/BHB won't be used by L1 because L1 has set IBRS=1 in step (7).
And that is even though L1 & L2 could both be running in SVM guest-mode & Ring0 from the physical CPU's perspective, and therefore in the same prediction-mode.
So basically you are saying that IBRS doesn't merely prevent the use of BTB/BHB entries from lower prediction-modes, but instead avoids the use of all earlier BTB/BHB entries while IBRS is set.
Did I understand everything correctly?
Thanks,
-Liran
>
> (this is a english summary of two pages of technical spec so it lacks
> the language lawyer precision)
>
> because of this promise, the implementation tends to be impactful
> and it is very strongly recommended that retpoline is used instead of
> IBRS.
> (with all the caveats already on lkml)
>
> the IBPB is different, this is a covenient thing for switching between
> VM guests etc
On 1/9/2018 7:00 AM, Liran Alon wrote:
>
> ----- [email protected] wrote:
>
>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
>>> The above ("IBRS simply disables the indirect branch predictor") was
>> my
>>> take-away message from private discussion with Intel. My guess is
>> that
>>> the vendors are just handwaving a spec that doesn't match what they
>> have
>>> implemented, because honestly a microcode update is unlikely to do
>> much
>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
>>> though, since the performance characteristics of IBRS are so
>> different
>>> from previous processors. Let's ask Arjan who might have more
>>> information about it, and hope he actually can disclose it...
>>
>> IBRS will ensure that, when set after the ring transition, no earlier
>> branch prediction data is used for indirect branches while IBRS is
>> set
>
> Consider the following scenario:
> 1. L1 runs with IBRS=1 in Ring0.
> 2. L1 restores L2 SPEC_CTRL and enters into L2.
> 3. L1 VMRUN exits into L0 which backups L1 SPEC_CTRL and enters L2 (using same VMCB).
> 4. L2 populates BTB/BHB with values and cause a hypercall which #VMExit into L0.
> 5. L0 backups L2 SPEC_CTRL and writes IBRS=1.
> 6. L0 restores L1 SPEC_CTRL and enters L1.
> 7. L1 backups L2 SPEC_CTRL and writes IBRS=1.
>
I'm sorry I'm not familiar with your L0/L1/L2 terminology
(maybe it's before coffee has had time to permeate the brain)
----- [email protected] wrote:
> On 1/9/2018 7:00 AM, Liran Alon wrote:
> >
> > ----- [email protected] wrote:
> >
> >> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
> >>> The above ("IBRS simply disables the indirect branch predictor")
> was
> >> my
> >>> take-away message from private discussion with Intel. My guess
> is
> >> that
> >>> the vendors are just handwaving a spec that doesn't match what
> they
> >> have
> >>> implemented, because honestly a microcode update is unlikely to
> do
> >> much
> >>> more than an old-fashioned chicken bit. Maybe on Skylake it does
> >>> though, since the performance characteristics of IBRS are so
> >> different
> >>> from previous processors. Let's ask Arjan who might have more
> >>> information about it, and hope he actually can disclose it...
> >>
> >> IBRS will ensure that, when set after the ring transition, no
> earlier
> >> branch prediction data is used for indirect branches while IBRS is
> >> set
> >
> > Consider the following scenario:
> > 1. L1 runs with IBRS=1 in Ring0.
> > 2. L1 restores L2 SPEC_CTRL and enters into L2.
> > 3. L1 VMRUN exits into L0 which backups L1 SPEC_CTRL and enters L2
> (using same VMCB).
> > 4. L2 populates BTB/BHB with values and cause a hypercall which
> #VMExit into L0.
> > 5. L0 backups L2 SPEC_CTRL and writes IBRS=1.
> > 6. L0 restores L1 SPEC_CTRL and enters L1.
> > 7. L1 backups L2 SPEC_CTRL and writes IBRS=1.
> >
>
> I'm sorry I'm not familiar with your L0/L1/L2 terminology
> (maybe it's before coffee has had time to permeate the brain)
This is standard terminology for guest levels:
L0 == hypervisor that runs on bare metal.
L1 == hypervisor that runs as an L0 guest.
L2 == software that runs as an L1 guest.
(We are talking about nested virtualization here.)
-Liran
>> I'm sorry I'm not familiar with your L0/L1/L2 terminology
>> (maybe it's before coffee has had time to permeate the brain)
>
> These are standard terminology for guest levels:
> L0 == hypervisor that runs on bare-metal
> L1 == hypervisor that runs as L0 guest.
> L2 == software that runs as L1 guest.
> (We are talking about nested virtualization here)
1. I really really hope that the guests don't use IBRS but use retpoline. At least for Linux, that is going to be the preferred approach.
2. For the CPU, there really is only "bare metal" vs "guest"; all guests are "guests" no matter how deeply nested. So for the language of privilege domains etc,
nested guests equal their parent.
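For reference, the retpoline thunk being referred to above replaces an indirect
call/jump with a construct along these lines (a simplified sketch of the x86
thunk for %rax, shown only for illustration; it is not part of this series):

__x86_indirect_thunk_rax:
	call	1f		/* push address of the fixup code, filling one RSB entry */
2:	pause
	lfence			/* speculation trap: a speculated ret spins here harmlessly */
	jmp	2b
1:	mov	%rax, (%rsp)	/* overwrite the return address with the real branch target */
	ret			/* "return" to the target; predicted from the RSB, not the BTB */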
----- [email protected] wrote:
> >> I'm sorry I'm not familiar with your L0/L1/L2 terminology
> >> (maybe it's before coffee has had time to permeate the brain)
> >
> > These are standard terminology for guest levels:
> > L0 == hypervisor that runs on bare-metal
> > L1 == hypervisor that runs as L0 guest.
> > L2 == software that runs as L1 guest.
> > (We are talking about nested virtualization here)
>
> 1. I really really hope that the guests don't use IBRS but use
> retpoline. At least for Linux that is going to be the preferred
> approach.
>
> 2. For the CPU, there really is only "bare metal" vs "guest"; all
> guests are "guests" no matter how deeply nested. So for the language
> of privilege domains etc,
> nested guests equal their parent.
So in the scenario I mentioned above, would L1 use BTB/BHB entries inserted by L2?
To me it seems that it would if IBRS takes prediction-mode into consideration.
And therefore, we must issue IBPB when switching between L1 & L2.
Same as happens on nVMX when switching between vmcs01 & vmcs02.
-Liran
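A minimal sketch of that IBPB on the vmcs01/vmcs02 switch, using the MSR
definitions added earlier in this series (the hook name, and the use of
have_spec_ctrl as the capability check, are illustrative assumptions):

/*
 * Flush indirect branch predictions when KVM switches between vmcs01 and
 * vmcs02, so that L1 cannot consume BTB/BHB entries seeded by L2 and vice
 * versa.
 */
static void vmx_switch_vmcs_ibpb(void)
{
	if (have_spec_ctrl)
		wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
}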
On 09/01/2018 16:19, Arjan van de Ven wrote:
> On 1/9/2018 7:00 AM, Liran Alon wrote:
>>
>> ----- [email protected] wrote:
>>
>>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
>>>> The above ("IBRS simply disables the indirect branch predictor") was my
>>>> take-away message from private discussion with Intel. My guess is that
>>>> the vendors are just handwaving a spec that doesn't match what they have
>>>> implemented, because honestly a microcode update is unlikely to do much
>>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
>>>> though, since the performance characteristics of IBRS are so different
>>>> from previous processors. Let's ask Arjan who might have more
>>>> information about it, and hope he actually can disclose it...
>>>
>>> IBRS will ensure that, when set after the ring transition, no earlier
>>> branch prediction data is used for indirect branches while IBRS is
>>> set
Let me ask you my questions, which are independent of L0/L1/L2 terminology.
1) Is vmentry/vmexit considered a ring transition, even if the guest is
running in ring 0? If IBRS=1 in the guest and the host is using IBRS,
the host will not do a wrmsr on exit. Is this safe for the host kernel?
2) How will the future processors work where IBRS should always be =1?
Will they still need IBPB? If I get a vmexit from a guest with IBRS=1,
and do a vmentry to the same VMCS *but with a different VPID*, will the
code running after the vmentry share BTB entries with code running
before the vmexit?
Thanks,
Paolo
On 1/9/2018 8:17 AM, Paolo Bonzini wrote:
> On 09/01/2018 16:19, Arjan van de Ven wrote:
>> On 1/9/2018 7:00 AM, Liran Alon wrote:
>>>
>>> ----- [email protected] wrote:
>>>
>>>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
>>>>> The above ("IBRS simply disables the indirect branch predictor") was my
>>>>> take-away message from private discussion with Intel. My guess is that
>>>>> the vendors are just handwaving a spec that doesn't match what they have
>>>>> implemented, because honestly a microcode update is unlikely to do much
>>>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
>>>>> though, since the performance characteristics of IBRS are so different
>>>>> from previous processors. Let's ask Arjan who might have more
>>>>> information about it, and hope he actually can disclose it...
>>>>
>>>> IBRS will ensure that, when set after the ring transition, no earlier
>>>> branch prediction data is used for indirect branches while IBRS is
>>>> set
>
> Let me ask you my questions, which are independent of L0/L1/L2 terminology.
>
> 1) Is vmentry/vmexit considered a ring transition, even if the guest is
> running in ring 0? If IBRS=1 in the guest and the host is using IBRS,
> the host will not do a wrmsr on exit. Is this safe for the host kernel?
I think the CPU folks would want us to write the msr again.
> 2) How will the future processors work where IBRS should always be =1?
IBRS=1 should be "fire and forget this ever happened".
This is the only time anyone should use IBRS in practice
(and then the host turns it on and makes sure to not expose it to the guests I hope)
On 09/01/2018 17:23, Arjan van de Ven wrote:
> On 1/9/2018 8:17 AM, Paolo Bonzini wrote:
>> On 09/01/2018 16:19, Arjan van de Ven wrote:
>>> On 1/9/2018 7:00 AM, Liran Alon wrote:
>>>>
>>>> ----- [email protected] wrote:
>>>>
>>>>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
>>>>>> The above ("IBRS simply disables the indirect branch predictor")
>>>>>> was my
>>>>>> take-away message from private discussion with Intel. My guess is
>>>>>> that
>>>>>> the vendors are just handwaving a spec that doesn't match what
>>>>>> they have
>>>>>> implemented, because honestly a microcode update is unlikely to do
>>>>>> much
>>>>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
>>>>>> though, since the performance characteristics of IBRS are so
>>>>>> different
>>>>>> from previous processors. Let's ask Arjan who might have more
>>>>>> information about it, and hope he actually can disclose it...
>>>>>
>>>>> IBRS will ensure that, when set after the ring transition, no earlier
>>>>> branch prediction data is used for indirect branches while IBRS is
>>>>> set
>>
>> Let me ask you my questions, which are independent of L0/L1/L2
>> terminology.
>>
>> 1) Is vmentry/vmexit considered a ring transition, even if the guest is
>> running in ring 0? If IBRS=1 in the guest and the host is using IBRS,
>> the host will not do a wrmsr on exit. Is this safe for the host kernel?
>
> I think the CPU folks would want us to write the msr again.
Want us, or need us---and if we don't do that, what happens? And if we
have to do it, how is IBRS=1 different from an IBPB?...
Since I am at it, what happens on *current generation* CPUs if you
always leave IBRS=1? Slow and safe, or fast and unsafe?
>> 2) How will the future processors work where IBRS should always be =1?
>
> IBRS=1 should be "fire and forget this ever happened".
> This is the only time anyone should use IBRS in practice
And IBPB too I hope? But besides that, I need to know exactly how that
is implemented to ensure that it's doing the right thing.
> (and then the host turns it on and makes sure to not expose it to the
> guests I hope)
That's not that easy, because guests might have support for SPEC_CTRL
but not for IA32_ARCH_CAPABILITIES.
You could disable the SPEC_CTRL bit, but then the guest might think it
is not secure. It might also actually *be* insecure, if you migrated to
an older CPU where IBRS is not fire-and-forget.
Paolo
On Tue, Jan 09, 2018 at 05:49:08PM +0100, Paolo Bonzini wrote:
> On 09/01/2018 17:23, Arjan van de Ven wrote:
> > On 1/9/2018 8:17 AM, Paolo Bonzini wrote:
> >> On 09/01/2018 16:19, Arjan van de Ven wrote:
> >>> On 1/9/2018 7:00 AM, Liran Alon wrote:
> >>>>
> >>>> ----- [email protected] wrote:
> >>>>
> >>>>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
> >>>>>> The above ("IBRS simply disables the indirect branch predictor")
> >>>>>> was my
> >>>>>> take-away message from private discussion with Intel. My guess is
> >>>>>> that
> >>>>>> the vendors are just handwaving a spec that doesn't match what
> >>>>>> they have
> >>>>>> implemented, because honestly a microcode update is unlikely to do
> >>>>>> much
> >>>>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
> >>>>>> though, since the performance characteristics of IBRS are so
> >>>>>> different
> >>>>>> from previous processors. Let's ask Arjan who might have more
> >>>>>> information about it, and hope he actually can disclose it...
> >>>>>
> >>>>> IBRS will ensure that, when set after the ring transition, no earlier
> >>>>> branch prediction data is used for indirect branches while IBRS is
> >>>>> set
> >>
> >> Let me ask you my questions, which are independent of L0/L1/L2
> >> terminology.
> >>
> >> 1) Is vmentry/vmexit considered a ring transition, even if the guest is
> >> running in ring 0? If IBRS=1 in the guest and the host is using IBRS,
> >> the host will not do a wrmsr on exit. Is this safe for the host kernel?
> >
> > I think the CPU folks would want us to write the msr again.
>
> Want us, or need us---and if we don't do that, what happens? And if we
> have to do it, how is IBRS=1 different from an IBPB?...
Arjan says 'ring transition' but I am pretty sure it is more of a 'prediction
mode change'. And from what I have gathered so far, moving from lower (guest)
to higher (hypervisor) privilege has no bearing on the branch predictor. Meaning
the guest ring0 can attack us if we don't touch this MSR.
We have to WRMSR 0x48 to 1 to flush out the lower prediction mode. Aka this is a
'reset' button, and at every 'prediction mode' change you have to hit this.
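In code, that 'reset button' on the vmexit path would be a single write
(SPEC_CTRL_IBRS is an assumed name for bit 0 of MSR 0x48; the series so far
only defines the MSR number itself):

#define SPEC_CTRL_IBRS		(1UL << 0)	/* illustrative, not yet in msr-index.h */

	/*
	 * Transitioning to a higher-privileged prediction mode (guest ->
	 * host): discard predictions seeded by the lower mode.
	 */
	wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);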
<sigh> Can we have a discussion on making a kvm-security mailing list
where we can figure all this out during the embargo and not have these
misunderstandings.
On Tue, Jan 09, 2018 at 03:39:09PM -0500, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 09, 2018 at 05:49:08PM +0100, Paolo Bonzini wrote:
> > On 09/01/2018 17:23, Arjan van de Ven wrote:
> > > On 1/9/2018 8:17 AM, Paolo Bonzini wrote:
> > >> On 09/01/2018 16:19, Arjan van de Ven wrote:
> > >>> On 1/9/2018 7:00 AM, Liran Alon wrote:
> > >>>>
> > >>>> ----- [email protected] wrote:
> > >>>>
> > >>>>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
> > >>>>>> The above ("IBRS simply disables the indirect branch predictor")
> > >>>>>> was my
> > >>>>>> take-away message from private discussion with Intel. My guess is
> > >>>>>> that
> > >>>>>> the vendors are just handwaving a spec that doesn't match what
> > >>>>>> they have
> > >>>>>> implemented, because honestly a microcode update is unlikely to do
> > >>>>>> much
> > >>>>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
> > >>>>>> though, since the performance characteristics of IBRS are so
> > >>>>>> different
> > >>>>>> from previous processors. Let's ask Arjan who might have more
> > >>>>>> information about it, and hope he actually can disclose it...
> > >>>>>
> > >>>>> IBRS will ensure that, when set after the ring transition, no earlier
> > >>>>> branch prediction data is used for indirect branches while IBRS is
> > >>>>> set
> > >>
> > >> Let me ask you my questions, which are independent of L0/L1/L2
> > >> terminology.
> > >>
> > >> 1) Is vmentry/vmexit considered a ring transition, even if the guest is
> > >> running in ring 0? If IBRS=1 in the guest and the host is using IBRS,
> > >> the host will not do a wrmsr on exit. Is this safe for the host kernel?
> > >
> > > I think the CPU folks would want us to write the msr again.
> >
> > Want us, or need us---and if we don't do that, what happens? And if we
> > have to do it, how is IBRS=1 different from an IBPB?...
>
> Arjan says 'ring transition' but I am pretty sure it is more of a 'prediction
> mode change'. And from what I have gathered so far, moving from lower (guest)
> to higher (hypervisor) privilege has no bearing on the branch predictor. Meaning
> the guest ring0 can attack us if we don't touch this MSR.
>
> We have to WRMSR 0x48 to 1 to flush out the lower prediction mode. Aka this is a
> 'reset' button, and at every 'prediction mode' change you have to hit this.
I suppose that means that when we VMENTER, the original fix (where we
compare the host value to the guest value) can stay, as we are entering a lower
prediction mode. I wonder then what writing 0 does to it? A nop?
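The 'original fix' referred to here is the vmentry-side check from this
series, which assumes the host runs with IA32_SPEC_CTRL at 0:

	/*
	 * Entering the guest moves to a lower prediction mode, so no flush
	 * is needed; only write the MSR if the guest wants a value that
	 * differs from the host's (assumed 0).
	 */
	if (have_spec_ctrl && vmx->spec_ctrl != 0)
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);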
Before VM-entry, don't we need to flush the BHB and the RSB to avoid
revealing KASLR information to the guest? (Thanks to Liran for
pointing this out.)
On Tue, Jan 09, 2018 at 12:57:38PM -0800, Jim Mattson wrote:
> Before VM-entry, don't we need to flush the BHB and the RSB to avoid
> revealing KASLR information to the guest? (Thanks to Liran for
> pointing this out.)
Exactly.
Or is touching it with any value good enough?
(Removing '[email protected]' from the email. Adding Jun.)
If my documentation is up-to-date, writing IBRS does not clear the RSB
(except for parts which contain an RSB that is not filled by 32
CALLs).
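The '32 CALLs' refers to an RSB-stuffing sequence roughly like the one in the
retpoline series (simplified sketch; 16 loop iterations, two calls each, so 32
RSB entries are overwritten with benign targets):

	mov	$16, %ecx
1:	call	2f		/* each call pushes one benign entry onto the RSB */
3:	pause
	lfence			/* trap any speculation that consumes the entry */
	jmp	3b
2:	call	4f
5:	pause
	lfence
	jmp	5b
4:	dec	%ecx
	jnz	1b
	add	$256, %rsp	/* drop the 32 x 8-byte return addresses we pushed */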
On Tue, Jan 9, 2018 at 1:11 PM, Konrad Rzeszutek Wilk
<[email protected]> wrote:
> On Tue, Jan 09, 2018 at 12:57:38PM -0800, Jim Mattson wrote:
>> Before VM-entry, don't we need to flush the BHB and the RSB to avoid
>> revealing KASLR information to the guest? (Thanks to Liran for
>> pointing this out.)
>
> Exactly.
>
> Or is touching it with any value good enough?
>
> (Removing '[email protected]' from the email. Adding Jun.)
On 09/01/2018 21:57, Jim Mattson wrote:
> Before VM-entry, don't we need to flush the BHB and the RSB to avoid
> revealing KASLR information to the guest? (Thanks to Liran for
> pointing this out.)
I don't know how you would flush the BHB. As to the RSB, that would also be
part of generic Linux code, so I've not included it yet in this series,
which was mostly about the new MSRs and CPUID bits.
Paolo
On 09/01/2018 21:39, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 09, 2018 at 05:49:08PM +0100, Paolo Bonzini wrote:
>> On 09/01/2018 17:23, Arjan van de Ven wrote:
>>> On 1/9/2018 8:17 AM, Paolo Bonzini wrote:
>>>> On 09/01/2018 16:19, Arjan van de Ven wrote:
>>>>> On 1/9/2018 7:00 AM, Liran Alon wrote:
>>>>>>
>>>>>> ----- [email protected] wrote:
>>>>>>
>>>>>>> On 1/9/2018 3:41 AM, Paolo Bonzini wrote:
>>>>>>>> The above ("IBRS simply disables the indirect branch predictor")
>>>>>>>> was my
>>>>>>>> take-away message from private discussion with Intel. My guess is
>>>>>>>> that
>>>>>>>> the vendors are just handwaving a spec that doesn't match what
>>>>>>>> they have
>>>>>>>> implemented, because honestly a microcode update is unlikely to do
>>>>>>>> much
>>>>>>>> more than an old-fashioned chicken bit. Maybe on Skylake it does
>>>>>>>> though, since the performance characteristics of IBRS are so
>>>>>>>> different
>>>>>>>> from previous processors. Let's ask Arjan who might have more
>>>>>>>> information about it, and hope he actually can disclose it...
>>>>>>>
>>>>>>> IBRS will ensure that, when set after the ring transition, no earlier
>>>>>>> branch prediction data is used for indirect branches while IBRS is
>>>>>>> set
>>>>
>>>> Let me ask you my questions, which are independent of L0/L1/L2
>>>> terminology.
>>>>
>>>> 1) Is vmentry/vmexit considered a ring transition, even if the guest is
>>>> running in ring 0? If IBRS=1 in the guest and the host is using IBRS,
>>>> the host will not do a wrmsr on exit. Is this safe for the host kernel?
>>>
>>> I think the CPU folks would want us to write the msr again.
>>
>> Want us, or need us---and if we don't do that, what happens? And if we
>> have to do it, how is IBRS=1 different from an IBPB?...
>
> Arjan says 'ring transition' but I am pretty sure it is more of a 'prediction
> mode change'. And from what I have gathered so far, moving from lower (guest)
> to higher (hypervisor) privilege has no bearing on the branch predictor. Meaning
> the guest ring0 can attack us if we don't touch this MSR.
>
> We have to WRMSR 0x48 to 1 to flush out the lower prediction mode. Aka this is a
> 'reset' button, and at every 'prediction mode' change you have to hit this.
That however makes me wonder why Intel said "before transitioning to
ring 3, do WRMSR to IA32_SPEC_CTRL to clear IBRS to 0".
I have looked again at the slides I had and "IBRS all the time" seems to
require an IBPB anyway, thus making me wonder what's the point of it at
all. Why can't we have proper indexing of the BTB by PCID and VPID, and
forget IBRS completely on newer machines?!?
> <sigh> Can we have a discussion on making a kvm-security mailing list
> where we can figure all this out during the embargo and not have these
> misunderstandings.
Being told who knows what from the other companies would also have been a
start. Instead, CVE-2017-5715 was disclosed to each partner
individually, and now _we_ are reaping what Intel has sown.
Paolo
It's unclear from Intel's documentation whether writing bit 0 of
IA32_SPEC_CTRL or bit 0 of IA32_PRED_CMD will flush the BHB. (At
least, it's unclear from the documentation I have.)
The retpoline patches include code for *filling* the RSB, but if you
invoke the RSB refill code from kernel text before VM-entry, you still
reveal information about KASLR to the guest. I think we may need a
copy of the RSB refill code on a dynamically allocated page. In fact,
we may need ~32 branches on that page to clobber the BHB. That means
that maybe we can't do VM-entry from kernel text (unless one of IBRS
or IBPB flushes the BHB).
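A rough sketch of that last idea, with all names hypothetical: copy the fill
stub to an executable page outside kernel text, so the return targets it
pushes reveal nothing about KASLR (assumes the 3-argument __vmalloc() of this
kernel version):

#include <linux/vmalloc.h>
#include <linux/string.h>

extern const char rsb_fill_stub[], rsb_fill_stub_end[];	/* the ~32-branch stub */

static void (*rsb_fill_nontext)(void);

static int __init kvm_alloc_rsb_fill_page(void)
{
	void *page = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC);

	if (!page)
		return -ENOMEM;
	memcpy(page, rsb_fill_stub, rsb_fill_stub_end - rsb_fill_stub);
	rsb_fill_nontext = (void (*)(void))page;
	/* call rsb_fill_nontext() right before VM-entry */
	return 0;
}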
On Tue, Jan 9, 2018 at 1:42 PM, Paolo Bonzini <[email protected]> wrote:
> On 09/01/2018 21:57, Jim Mattson wrote:
>> Before VM-entry, don't we need to flush the BHB and the RSB to avoid
>> revealing KASLR information to the guest? (Thanks to Liran for
>> pointing this out.)
>
> I don't know how you flush the BHB? As to the RSB, that would also be
> part of generic Linux code so I've not included it yet in this series
> which was mostly about the new MSRs and CPUID bits.
>
> Paolo
On 01/08/2018 10:08 AM, Paolo Bonzini wrote:
> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>
> pt_guest_enter(vmx);
>
> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +
Say the host uses IBRS (have_spec_ctrl==1) and has IBRS set to 1.
And say the guest's vmx->spec_ctrl is 0, i.e. the guest is not using IBRS.

We will be leaving the IBRS MSR at 1 and won't be
switching it to 0 before entering the guest.
The guest will be running with an incorrect IBRS value.

Seems like the correct logic is:

if (host_supports_ibrs) {
	/* switch IBRS if host and guest IBRS differ */
	if ((host_uses_ibrs && vmx->spec_ctrl == 0) ||	/* host IBRS 1, guest IBRS 0 */
	    (!host_uses_ibrs && vmx->spec_ctrl == 1))	/* host IBRS 0, guest IBRS 1 */
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
}

Your have_spec_ctrl logic specifies that IBRS is available,
but that doesn't necessarily mean that we will use IBRS in the
host.
Tim
On 11/01/2018 03:47, Tim Chen wrote:
> On 01/08/2018 10:08 AM, Paolo Bonzini wrote:
>
>> @@ -9597,6 +9620,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>
>> pt_guest_enter(vmx);
>>
>> + if (have_spec_ctrl && vmx->spec_ctrl != 0)
>> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>> +
>
> Say the host uses IBRS (have_spec_ctrl==1) and has IBRS set to 1.
> And say the guest's vmx->spec_ctrl is 0, i.e. the guest is not using IBRS.
>
> We will be leaving the IBRS MSR at 1 and won't be
> switching it to 0 before entering the guest.
> The guest will be running with an incorrect IBRS value.
>
> Seems like the correct logic is:
>
> if (host_supports_ibrs) {
> 	/* switch IBRS if host and guest IBRS differ */
> 	if ((host_uses_ibrs && vmx->spec_ctrl == 0) ||	/* host IBRS 1, guest IBRS 0 */
> 	    (!host_uses_ibrs && vmx->spec_ctrl == 1))	/* host IBRS 0, guest IBRS 1 */
> 		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> }
>
> Your have_spec_ctrl logic specifies that IBRS is available,
> but that doesn't necessarily mean that we will use IBRS in the
> host.
Of course. But these patches are just an initial version for a host
that doesn't support IBRS.
Paolo
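For completeness, a sketch of the direction Tim is pointing at for a host
that does use IBRS itself (host_spec_ctrl is a hypothetical variable holding
the host's IA32_SPEC_CTRL value; none of this is in the series yet):

	/* vmentry: install the guest value only if it differs from the host's */
	if (have_spec_ctrl && vmx->spec_ctrl != host_spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

	/* ... guest runs ... */

	/*
	 * vmexit (after saving the guest's value, as the exit path in this
	 * series already does): restore whatever the host wants, rather
	 * than unconditionally writing 0.
	 */
	if (have_spec_ctrl && vmx->spec_ctrl != host_spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, host_spec_ctrl);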