2018-01-28 19:31:28

by KarimAllah Ahmed

Subject: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
retpoline+IBPB based approach.

To avoid the overhead of atomically saving and restoring MSR_IA32_SPEC_CTRL
for guests that do not actually use the MSR, only call add_atomic_switch_msr()
when a non-zero value is written to it.

Cc: Asit Mallick <[email protected]>
Cc: Arjan Van De Ven <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Jun Nakajima <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: David Woodhouse <[email protected]>
Cc: Greg KH <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Signed-off-by: KarimAllah Ahmed <[email protected]>
Signed-off-by: Ashok Raj <[email protected]>
---
arch/x86/kvm/cpuid.c | 4 +++-
arch/x86/kvm/cpuid.h | 1 +
arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0099e10..dc78095 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
/* These are scattered features in cpufeatures.h. */
#define KVM_CPUID_BIT_AVX512_4VNNIW 2
#define KVM_CPUID_BIT_AVX512_4FMAPS 3
+#define KVM_CPUID_BIT_SPEC_CTRL 26
#define KF(x) bit(KVM_CPUID_BIT_##x)

int kvm_update_cpuid(struct kvm_vcpu *vcpu)
@@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,

/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
+ KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
+ (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);

/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index cdc70a3..dcfe227 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
[CPUID_7_ECX] = { 7, 0, CPUID_ECX},
[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
};

static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aa8638a..1b743a0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type);
+

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
m->host[i].value = host_val;
}

+/* do not touch guest_val and host_val if the msr is not found */
+static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 *guest_val, u64 *host_val)
+{
+ unsigned i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ break;
+
+ if (i == m->nr)
+ return 1;
+
+ if (guest_val)
+ *guest_val = m->guest[i].value;
+ if (host_val)
+ *host_val = m->host[i].value;
+
+ return 0;
+}
+
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer = vmx->vcpu.arch.efer;
@@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
+ u64 spec_ctrl = 0;
struct shared_msr_entry *msr;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);

switch (msr_info->index) {
#ifdef CONFIG_X86_64
@@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ /*
+ * If the MSR is not in the atomic list yet, then it was never
+ * written to. So the MSR value will be '0'.
+ */
+ read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);
+
+ msr_info->data = spec_ctrl;
+ break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
+ unsigned long *msr_bitmap;
+
+ /*
+ * IBRS is not used (yet) to protect the host. Once it does, this
+ * variable needs to be a bit smarter.
+ */
+ u64 host_spec_ctrl = 0;

switch (msr_index) {
case MSR_EFER:
@@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ /*
+ * Now we know that the guest is actually using the MSR, so
+ * atomically load and save the SPEC_CTRL MSR and pass it
+ * through to the guest.
+ */
+ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, msr_info->data,
+ host_spec_ctrl);
+ msr_bitmap = vmx->vmcs01.msr_bitmap;
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
--
2.7.4



2018-01-28 20:22:27

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On January 28, 2018 2:29:10 PM EST, KarimAllah Ahmed <[email protected]> wrote:
>Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
>guests
>that will only mitigate Spectre V2 through IBRS+IBPB and will not be
>using a
>retpoline+IBPB based approach.
>
>To avoid the overhead of atomically saving and restoring the
>MSR_IA32_SPEC_CTRL
>for guests that do not actually use the MSR, only add_atomic_switch_msr
>when a
>non-zero is written to it.


We tried this and found that it was about 3% slower than doing the old way of rdmsr and wrmsr.

But that was also with the host doing IBRS as well.

On what type of hardware did you run this?

Ccing Daniel.


2018-01-28 20:43:18

by David Woodhouse

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Sun, 2018-01-28 at 15:21 -0500, Konrad Rzeszutek Wilk wrote:
> >To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
> >for guests that do not actually use the MSR, only add_atomic_switch_msr when a
> >non-zero is written to it.
>
>
> We tried this and found that it was about 3% slower that doing the
> old way of rdmsr and wrmsr.
>
> But that was also with the host doing IBRS  as well.

The common case will be that neither host nor guest are doing IBRS.
Until the guest touches the MSR we do absolutely *nothing* with it,
which is definitely the fastest option.



2018-01-28 20:46:40

by David Woodhouse

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Sun, 2018-01-28 at 12:40 -0800, Andy Lutomirski wrote:
>
> Do you mean that the host would intercept the guest WRMSR and do
> WRMSR itself?  I would suggest that doing so is inconsistent with the
> docs.  As specified, doing WRMSR to write 1 to IBRS does *not*
> protect the guest.

I believe it does. Guest kernel is protected from any guest userspace
predictions learned before IBRS was last set to 1 in *any* mode,
including host.

> For that matter, what are the semantics of VMRESUME doing a write to
> IBRS as part of its MSR switch?  Is it treated as IBRS=1 from guest
> context?

Why does it matter? We *have* confirmed, FWIW, that VMRESUME writing 1
to IBRS as part of its MSR switch when it was already 1 is not
optimised away and *is* treated as writing IBRS=1 again.



2018-01-28 20:53:48

by Andy Lutomirski

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL


> On Jan 28, 2018, at 12:21 PM, Konrad Rzeszutek Wilk <[email protected]> wrote:
>
>> On January 28, 2018 2:29:10 PM EST, KarimAllah Ahmed <[email protected]> wrote:
>> Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
>> guests
>> that will only mitigate Spectre V2 through IBRS+IBPB and will not be
>> using a
>> retpoline+IBPB based approach.
>>
>> To avoid the overhead of atomically saving and restoring the
>> MSR_IA32_SPEC_CTRL
>> for guests that do not actually use the MSR, only add_atomic_switch_msr
>> when a
>> non-zero is written to it.
>
>
> We tried this and found that it was about 3% slower that doing the old way of rdmsr and wrmsr.
>

Do you mean that the host would intercept the guest WRMSR and do WRMSR itself? I would suggest that doing so is inconsistent with the docs. As specified, doing WRMSR to write 1 to IBRS does *not* protect the guest.

For that matter, what are the semantics of VMRESUME doing a write to IBRS as part of its MSR switch? Is it treated as IBRS=1 from guest context?

2018-01-28 20:55:31

by Andy Lutomirski

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL




> On Jan 28, 2018, at 12:44 PM, David Woodhouse <[email protected]> wrote:
>
>> On Sun, 2018-01-28 at 12:40 -0800, Andy Lutomirski wrote:
>>
>> Do you mean that the host would intercept the guest WRMSR and do
>> WRMSR itself? I would suggest that doing so is inconsistent with the
>> docs. As specified, doing WRMSR to write 1 to IBRS does *not*
>> protect the guest.
>
> I believe it does. Guest kernel is protected from any guest userspace
> predictions learned before IBRS was last set to 1 in *any* mode,
> including host.

Hmm, you're probably right.

I would love to know what awful hack Intel did that resulted in these semantics.

>
>> For that matter, what are the semantics of VMRESUME doing a write to
>> IBRS as part of its MSR switch? Is it treated as IBRS=1 from guest
>> context?
>
> Why does it matter? We *have* confirmed, FWIW, that VMRESUME writing 1
> to IBRS as part of its MSR switch when it was already 1 is not
> optimised away and *is* treated as writing IBRS=1 again.

That's good news.

2018-01-28 21:09:16

by David Woodhouse

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Sun, 2018-01-28 at 12:53 -0800, Andy Lutomirski wrote:
>
> > I believe it does. Guest kernel is protected from any guest userspace
> > predictions learned before IBRS was last set to 1 in *any* mode,
> > including host.
>
> Hmm, you're probably right.
>
> I would love to know what awful hack Intel did that resulted in these semantics.

I am not convinced I ever really want to know. I just want it all to go
away in a future CPU with a SPCTR_NO bit in IA32_ARCH_CAPABILITIES.
(Not the IBRS_ALL interim hack).

I think it's a mixture of ongoing checking, and a barrier. And perhaps
varying proportions of each, in different CPU generations. By defining
it thus, they can actually implement it *either* way.



2018-01-28 22:06:30

by Van De Ven, Arjan

Subject: RE: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

>
> On Sun, 2018-01-28 at 12:40 -0800, Andy Lutomirski wrote:
> >
> > Do you mean that the host would intercept the guest WRMSR and do
> > WRMSR itself?  I would suggest that doing so is inconsistent with the
> > docs.  As specified, doing WRMSR to write 1 to IBRS does *not*
> > protect the guest.
>
> I believe it does. Guest kernel is protected from any guest userspace
> predictions learned before IBRS was last set to 1 in *any* mode,
> including host.

The specification requires you to write a 1 on each transition to a higher privilege level.


>
> > For that matter, what are the semantics of VMRESUME doing a write to
> > IBRS as part of its MSR switch?  Is it treated as IBRS=1 from guest
> > context?

The guest ring 3 wouldn't have had time to do anything evil in the meantime, so the vmresume write is valid. (Anything else would be unworkable.)

2018-01-28 22:13:38

by David Woodhouse

Subject: RE: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL


>>
>> On Sun, 2018-01-28 at 12:40 -0800, Andy Lutomirski wrote:
>> >
>> > Do you mean that the host would intercept the guest WRMSR and do
>> > WRMSR itself?  I would suggest that doing so is inconsistent with the
>> > docs.  As specified, doing WRMSR to write 1 to IBRS does *not*
>> > protect the guest.
>>
>> I believe it does. Guest kernel is protected from any guest userspace
>> predictions learned before IBRS was last set to 1 in *any* mode,
>> including host.
>
> the specification requires you to write a 1 on each transition to higher
> privilege.

Right. Andy's concern was about VMX non-root (i.e. guest) ring 0
attempting to write IBRS but it's trapped and actually happens in the
host.

As long as it *remains* set when the host re-enters the vCPU that should
be fine.


--
dwmw2


2018-01-29 03:00:25

by Liran Alon

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL


----- [email protected] wrote:

> On Sun, 2018-01-28 at 15:21 -0500, Konrad Rzeszutek Wilk wrote:
> > >To avoid the overhead of atomically saving and restoring the
> MSR_IA32_SPEC_CTRL
> > >for guests that do not actually use the MSR, only
> add_atomic_switch_msr when a
> > >non-zero is written to it.
> >
> >
> > We tried this and found that it was about 3% slower that doing the
> > old way of rdmsr and wrmsr.
> >
> > But that was also with the host doing IBRS  as well.
>
> The common case will be that neither host nor guest are doing IBRS.
> Until the guest touches the MSR we do absolutely *nothing* with it,
> which is definitely the fastest option.

Windows uses IBRS, and Microsoft doesn't have any plans to switch to retpoline.
Running a Windows guest should be a pretty common use-case, no?

In addition, your handling of the first WRMSR intercept could be different.
It could signal you to start doing the following:
1. Disable the intercept on the SPEC_CTRL MSR.
2. On VMEntry, write the vCPU SPEC_CTRL value into the physical MSR.
3. On VMExit, read the physical MSR back into the vCPU SPEC_CTRL value.
(And if IBRS is used on the host, also set the physical SPEC_CTRL MSR to 1 here.)

That way, you get the fastest option as long as the guest doesn't use IBRS,
and you also avoid the 3% performance hit compared to Konrad's proposal.
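
In rough code, steps 1-3 above could look something like this (only a
sketch; the spec_ctrl and spec_ctrl_passthrough fields are assumptions and
not part of the posted patch, while vmx_disable_intercept_for_msr() is the
helper the patch already forward-declares):

    /* 1. In vmx_set_msr(), on the first guest write: stop intercepting. */
    case MSR_IA32_SPEC_CTRL:
        vmx->spec_ctrl = data;
        if (!vmx->spec_ctrl_passthrough) {
            vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
                                          MSR_IA32_SPEC_CTRL,
                                          MSR_TYPE_RW);
            vmx->spec_ctrl_passthrough = true;
        }
        break;

    /* 2. In vmx_vcpu_run(), just before VMEntry: load the guest value. */
    if (vmx->spec_ctrl_passthrough)
        wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

    /* 3. In vmx_vcpu_run(), right after VMExit: save the guest value and
     *    restore the host value (0 here, or SPEC_CTRL_IBRS if the host
     *    itself uses IBRS) before any indirect branch runs. */
    if (vmx->spec_ctrl_passthrough) {
        rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
        wrmsrl(MSR_IA32_SPEC_CTRL, 0);
    }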

Am I missing something?

-Liran

2018-01-29 03:02:08

by KarimAllah Ahmed

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On 01/28/2018 09:21 PM, Konrad Rzeszutek Wilk wrote:
> On January 28, 2018 2:29:10 PM EST, KarimAllah Ahmed <[email protected]> wrote:
>> Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
>> guests
>> that will only mitigate Spectre V2 through IBRS+IBPB and will not be
>> using a
>> retpoline+IBPB based approach.
>>
>> To avoid the overhead of atomically saving and restoring the
>> MSR_IA32_SPEC_CTRL
>> for guests that do not actually use the MSR, only add_atomic_switch_msr
>> when a
>> non-zero is written to it.
>
>
> We tried this and found that it was about 3% slower that doing the old way of rdmsr and wrmsr.

I actually have not measured the performance difference between using
the atomic switch vs. just doing rdmsr/wrmsr. I was mostly focused
on not saving and restoring when the guest does not actually use the MSR.

Interesting data point, though. I will update the code to use rdmsr/wrmsr
and see if I see the same on my hardware (I am using a Skylake processor).

>
> But that was also with the host doing IBRS as well.
>
> On what type of hardware did you run this?
>
> Ccing Daniel.

2018-01-29 08:47:23

by David Woodhouse

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Sun, 2018-01-28 at 16:39 -0800, Liran Alon wrote:
>
> Windows use IBRS and Microsoft don't have any plans to switch to retpoline.
> Running a Windows guest should be a pretty common use-case no?
>
> In addition, your handle of the first WRMSR intercept could be different.
> It could signal you to start doing the following:
> 1. Disable intercept on SPEC_CTRL MSR.
> 2. On VMEntry, Write vCPU SPEC_CTRL value into physical MSR.
> 3. On VMExit, read physical MSR into vCPU SPEC_CTRL value.
> (And if IBRS is used at host, also set physical SPEC_CTRL MSR here to 1)
>
> That way, you will both have fastest option as long as guest don't use IBRS
> and also won't have the 3% performance hit compared to Konrad's proposal.
>
> Am I missing something?

Reads from the SPEC_CTRL MSR are strangely slow. I suspect a large part
of the 3% speedup you observe is because in the above, the vmentry path
doesn't need to *read* the host's value and store it; the host is
expected to restore it for itself anyway?

I'd actually quite like to repeat the benchmark on the new fixed
microcode, if anyone has it yet, to see if that read/swap slowness is
still quite as excessive. I'm certainly not ruling this out, but I'm
just a little wary of premature optimisation, and I'd like to make sure
we have everything *else* in the KVM patches right first.

The fact that the save-and-restrict macros I have in the tip of my
working tree at the moment are horrid and causing 0-day nastygrams,
probably doesn't help persuade me to favour the approach ;)

... hm, the CPU actually has separate MSR save/restore lists for
entry/exit, doesn't it? Is there any way to sanely make use of that and
do the restoration manually on vmentry but let it be automatic on
vmexit, by having it *only* in the guest's MSR-store area (so it is saved
automatically on exit), but *not* in the host's MSR-load area (so nothing is
automatically restored on exit)?

Reading the code and comparing with the SDM, I can't see where we're
ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
case...
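
What I have in mind is something like this (purely a sketch; the
msr_autostore field is invented here, while VM_EXIT_MSR_STORE_ADDR and
VM_EXIT_MSR_STORE_COUNT are the real VMCS fields from asm/vmx.h):

    /* One-entry store list hanging off struct vcpu_vmx. The guest's
     * SPEC_CTRL then gets saved here automatically on every vmexit. */
    vmx->msr_autostore.guest[0].index = MSR_IA32_SPEC_CTRL;
    vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest));
    vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 1);

    /* ... and we restore it by hand just before vmentry, with nothing
     * in the host's VM-exit MSR-load area at all. */
    if (vmx->msr_autostore.guest[0].value)
        wrmsrl(MSR_IA32_SPEC_CTRL, vmx->msr_autostore.guest[0].value);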



2018-01-29 09:44:42

by KarimAllah Ahmed

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On 01/29/2018 09:46 AM, David Woodhouse wrote:
> On Sun, 2018-01-28 at 16:39 -0800, Liran Alon wrote:
>>
>> Windows use IBRS and Microsoft don't have any plans to switch to retpoline.
>> Running a Windows guest should be a pretty common use-case no?
>>
>> In addition, your handle of the first WRMSR intercept could be different.
>> It could signal you to start doing the following:
>> 1. Disable intercept on SPEC_CTRL MSR.
>> 2. On VMEntry, Write vCPU SPEC_CTRL value into physical MSR.
>> 3. On VMExit, read physical MSR into vCPU SPEC_CTRL value.
>> (And if IBRS is used at host, also set physical SPEC_CTRL MSR here to 1)
>>
>> That way, you will both have fastest option as long as guest don't use IBRS
>> and also won't have the 3% performance hit compared to Konrad's proposal.
>>
>> Am I missing something?
>
> Reads from the SPEC_CTRL MSR are strangely slow. I suspect a large part
> of the 3% speedup you observe is because in the above, the vmentry path
> doesn't need to *read* the host's value and store it; the host is
> expected to restore it for itself anyway?
>
> I'd actually quite like to repeat the benchmark on the new fixed
> microcode, if anyone has it yet, to see if that read/swap slowness is
> still quite as excessive. I'm certainly not ruling this out, but I'm
> just a little wary of premature optimisation, and I'd like to make sure
> we have everything *else* in the KVM patches right first.
>
> The fact that the save-and-restrict macros I have in the tip of my
> working tree at the moment are horrid and causing 0-day nastygrams,
> probably doesn't help persuade me to favour the approach ;)
>
> ... hm, the CPU actually has separate MSR save/restore lists for
> entry/exit, doesn't it? Is there any way to sanely make use of that and
> do the restoration manually on vmentry but let it be automatic on
> vmexit, by having it *only* in the guest's MSR-store area to be saved
> on exit and restored on exit, but *not* in the host's MSR-store area?
>
> Reading the code and comparing with the SDM, I can't see where we're
> ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
> case...

Hmmm ... you are probably right! I think all users of this interface
always trap + update save area and never passthrough the MSR. That is
why only LOAD is needed *so far*.

Okay, let me sort this out in v3 then.

>

2018-01-29 10:39:42

by David Woodhouse

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, 2018-01-29 at 10:43 +0100, KarimAllah Ahmed wrote:
> On 01/29/2018 09:46 AM, David Woodhouse wrote:
> > Reading the code and comparing with the SDM, I can't see where we're
> > ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
> > case...
> Hmmm ... you are probably right! I think all users of this interface
> always trap + update save area and never passthrough the MSR. That is
> why only LOAD is needed *so far*.
>
> Okay, let me sort this out in v3 then.

I'm starting to think a variant of Ashok's patch might actually be the
simpler approach, and not "premature optimisation". Especially if we
need to support the !cpu_has_vmx_msr_bitmaps() case?

Start with vmx->spec_ctrl set to zero. When first touched, make it
passthrough (but not atomically switched) and set a flag (e.g.
"spec_ctrl_live") which triggers the 'restore_branch_speculation' and
'save_and_restrict_branch_speculation' behaviours. Except don't use
those macros. Those can look something like

 /* If this vCPU has touched SPEC_CTRL then restore its value if needed */
 if (vmx->spec_ctrl_live && vmx->spec_ctrl)
     wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 /* vmentry is serialising on affected CPUs, so the conditional branch is safe */


... and, respectively, ...

 /* If this vCPU has touched SPEC_CTRL then save its value and ensure we have zero */
 if (vmx->spec_ctrl_live) {
     rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
     if (vmx->spec_ctrl)
         wrmsrl(MSR_IA32_SPEC_CTRL, 0);
 }


Perhaps we can ditch the separate 'spec_ctrl_live' flag and check the
pass-through MSR bitmap directly, in the case that it exists? 
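
Something along these lines, maybe (untested; the write bitmap for the low
MSR range sits at offset 0x800 per the SDM, and this uses the vmcs01 bitmap
the patch already touches):

    static bool spec_ctrl_intercepted(struct vcpu_vmx *vmx)
    {
        int f = sizeof(unsigned long);
        unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;

        if (!cpu_has_vmx_msr_bitmap())
            return true;

        /* write bitmap for MSRs 0x00000000-0x00001fff is at 0x800 */
        return !!test_bit(MSR_IA32_SPEC_CTRL, msr_bitmap + 0x800 / f);
    }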



2018-01-29 10:49:25

by Paolo Bonzini

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On 29/01/2018 09:46, David Woodhouse wrote:
> I'd actually quite like to repeat the benchmark on the new fixed
> microcode, if anyone has it yet, to see if that read/swap slowness is
> still quite as excessive. I'm certainly not ruling this out, but I'm
> just a little wary of premature optimisation, and I'd like to make sure
> we have everything *else* in the KVM patches right first.
>
> The fact that the save-and-restrict macros I have in the tip of my
> working tree at the moment are horrid and causing 0-day nastygrams,
> probably doesn't help persuade me to favour the approach ;)
>
> ... hm, the CPU actually has separate MSR save/restore lists for
> entry/exit, doesn't it? Is there any way to sanely make use of that and
> do the restoration manually on vmentry but let it be automatic on
> vmexit, by having it *only* in the guest's MSR-store area to be saved
> on exit and restored on exit, but *not* in the host's MSR-store area?

Right now we don't even use the store-on-vmexit list at all, so the
Simplest Patch That Can Possibly Work is definitely the one using
rdmsr/wrmsr. It's not really premature optimization---though it doesn't
hurt that it isn't awfully slow.

Paolo

2018-01-29 10:55:55

by Paolo Bonzini

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On 29/01/2018 11:37, David Woodhouse wrote:
> On Mon, 2018-01-29 at 10:43 +0100, KarimAllah Ahmed wrote:
>> On 01/29/2018 09:46 AM, David Woodhouse wrote:
>>> Reading the code and comparing with the SDM, I can't see where we're
>>> ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
>>> case...
>> Hmmm ... you are probably right! I think all users of this interface
>> always trap + update save area and never passthrough the MSR. That is
>> why only LOAD is needed *so far*.
>>
>> Okay, let me sort this out in v3 then.
>
> I'm starting to think a variant of Ashok's patch might actually be the
> simpler approach, and not "premature optimisation". Especially if we
> need to support the !cpu_has_vmx_msr_bitmaps() case?

That case is awfully slow anyway, so it doesn't matter; the
direct-access flag would simply always be 0 if you have no MSR bitmaps.

> Start with vmx->spec_ctrl set to zero. When first touched, make it
> passthrough (but not atomically switched) and set a flag (e.g.
> "spec_ctrl_live") which triggers the 'restore_branch_speculation' and
> 'save_and_restrict_branch_speculation' behaviours. Except don't use
> those macros. Those can look something like
>
>  /* If this vCPU has touched SPEC_CTRL then restore its value if needed */
>  if (vmx->spec_ctrl_live && vmx->spec_ctrl)
>      wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>  /* vmentry is serialising on affected CPUs, so the conditional branch is safe */
>
>
> ... and, respectively, ...
>
> /* If this vCPU has touched SPEC_CTRL then save its value and ensure we have zero */
> if (vmx->spec_ctrl_live) {
> rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> if (vmx->spec_ctrl)
> wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> }
>
> Perhaps we can ditch the separate 'spec_ctrl_live' flag and check the
> pass-through MSR bitmap directly, in the case that it exists? 

Probably a cache miss, or even a TLB miss if you're unlucky, so the
separate flag is okay.

Paolo

2018-01-29 17:29:35

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, Jan 29, 2018 at 10:37:44AM +0000, David Woodhouse wrote:
> On Mon, 2018-01-29 at 10:43 +0100, KarimAllah Ahmed wrote:
> > On 01/29/2018 09:46 AM, David Woodhouse wrote:
> > > Reading the code and comparing with the SDM, I can't see where we're
> > > ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
> > > case...
> > Hmmm ... you are probably right! I think all users of this interface
> > always trap + update save area and never passthrough the MSR. That is
> > why only LOAD is needed *so far*.
> >
> > Okay, let me sort this out in v3 then.
>
> I'm starting to think a variant of Ashok's patch might actually be the
> simpler approach, and not "premature optimisation". Especially if we
> need to support the !cpu_has_vmx_msr_bitmaps() case?
>
> Start with vmx->spec_ctrl set to zero. When first touched, make it
> passthrough (but not atomically switched) and set a flag (e.g.
> "spec_ctrl_live") which triggers the 'restore_branch_speculation' and
> 'save_and_restrict_branch_speculation' behaviours. Except don't use
> those macros. Those can look something like
>
> /* If this vCPU has touched SPEC_CTRL then restore its value if needed */
> if (vmx->spec_ctrl_live && vmx->spec_ctrl)
>     wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> /* vmentry is serialising on affected CPUs, so the conditional branch is safe */
>
>
> ... and, respectively, ...
>
> /* If this vCPU has touched SPEC_CTRL then save its value and ensure we have zero */
> if (vmx->spec_ctrl_live) {
> rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> if (vmx->spec_ctrl)
> wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> }
>
>
> Perhaps we can ditch the separate 'spec_ctrl_live' flag and check the
> pass-through MSR bitmap directly, in the case that it exists?

Or the cpuid_flag as that would determine whether the MSR bitmap intercept
is set or not.



2018-01-29 17:33:06

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, Jan 29, 2018 at 08:46:03AM +0000, David Woodhouse wrote:
> On Sun, 2018-01-28 at 16:39 -0800, Liran Alon wrote:
> >
> > Windows use IBRS and Microsoft don't have any plans to switch to retpoline.
> > Running a Windows guest should be a pretty common use-case no?
> >
> > In addition, your handle of the first WRMSR intercept could be different.
> > It could signal you to start doing the following:
> > 1. Disable intercept on SPEC_CTRL MSR.
> > 2. On VMEntry, Write vCPU SPEC_CTRL value into physical MSR.
> > 3. On VMExit, read physical MSR into vCPU SPEC_CTRL value.
> > (And if IBRS is used at host, also set physical SPEC_CTRL MSR here to 1)
> >
> > That way, you will both have fastest option as long as guest don't use IBRS
> > and also won't have the 3% performance hit compared to Konrad's proposal.
> >
> > Am I missing something?
>
> Reads from the SPEC_CTRL MSR are strangely slow. I suspect a large part
> of the 3% speedup you observe is because in the above, the vmentry path
> doesn't need to *read* the host's value and store it; the host is
> expected to restore it for itself anyway?

Yes, at least for the purpose of correctness. That is based on what I have
heard: when you transition to a higher ring you have to write 1, then write
zero when you transition back to lower rings. That is, it is like a knob.

But then I heard that on some CPUs it is more like a reset button and
just writing 1 to IBRS is fine. But again, correctness here.

>
> I'd actually quite like to repeat the benchmark on the new fixed
> microcode, if anyone has it yet, to see if that read/swap slowness is
> still quite as excessive. I'm certainly not ruling this out, but I'm
> just a little wary of premature optimisation, and I'd like to make sure
> we have everything *else* in the KVM patches right first.
>
> The fact that the save-and-restrict macros I have in the tip of my
> working tree at the moment are horrid and causing 0-day nastygrams,
> probably doesn't help persuade me to favour the approach ;)
>
> ... hm, the CPU actually has separate MSR save/restore lists for
> entry/exit, doesn't it? Is there any way to sanely make use of that and
> do the restoration manually on vmentry but let it be automatic on
> vmexit, by having it *only* in the guest's MSR-store area to be saved
> on exit and restored on exit, but *not* in the host's MSR-store area?

Oh. That sounds interesting.
>
> Reading the code and comparing with the SDM, I can't see where we're
> ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
> case...

Right. We (well, Daniel Kiper, CC-ed) implemented it for this, and
that is where we got the numbers.

Daniel, are you OK posting it here? Granted, with the caveat that it won't even
compile against upstream, as it was based on a distro kernel.



2018-01-29 18:44:11

by Jim Mattson

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Sun, Jan 28, 2018 at 11:29 AM, KarimAllah Ahmed <[email protected]> wrote:
> Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
> that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
> retpoline+IBPB based approach.
>
> To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
> for guests that do not actually use the MSR, only add_atomic_switch_msr when a
> non-zero is written to it.
>
> Cc: Asit Mallick <[email protected]>
> Cc: Arjan Van De Ven <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Andrea Arcangeli <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Dan Williams <[email protected]>
> Cc: Jun Nakajima <[email protected]>
> Cc: Paolo Bonzini <[email protected]>
> Cc: David Woodhouse <[email protected]>
> Cc: Greg KH <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Signed-off-by: KarimAllah Ahmed <[email protected]>
> Signed-off-by: Ashok Raj <[email protected]>
> ---
> arch/x86/kvm/cpuid.c | 4 +++-
> arch/x86/kvm/cpuid.h | 1 +
> arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 67 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 0099e10..dc78095 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
> /* These are scattered features in cpufeatures.h. */
> #define KVM_CPUID_BIT_AVX512_4VNNIW 2
> #define KVM_CPUID_BIT_AVX512_4FMAPS 3
> +#define KVM_CPUID_BIT_SPEC_CTRL 26
> #define KF(x) bit(KVM_CPUID_BIT_##x)
>
> int kvm_update_cpuid(struct kvm_vcpu *vcpu)
> @@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>
> /* cpuid 7.0.edx*/
> const u32 kvm_cpuid_7_0_edx_x86_features =
> - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
> + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
> + (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);

Isn't 'boot_cpu_has()' superfluous here? And aren't there two bits to
pass through for existing CPUs (26 and 27)?
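
I.e. something like this, assuming a KVM_CPUID_BIT_STIBP define at bit 27
is added alongside the SPEC_CTRL one (sketch only):

#define KVM_CPUID_BIT_STIBP 27

const u32 kvm_cpuid_7_0_edx_x86_features =
	KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) |
	KF(SPEC_CTRL) | KF(STIBP);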

>
> /* all calls to cpuid_count() should be made on the same cpu */
> get_cpu();
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index cdc70a3..dcfe227 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
> [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
> [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
> [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
> + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
> };
>
> static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index aa8638a..1b743a0 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
> static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
> u16 error_code);
> static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
> +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> + u32 msr, int type);
> +
>
> static DEFINE_PER_CPU(struct vmcs *, vmxarea);
> static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> @@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
> m->host[i].value = host_val;
> }
>
> +/* do not touch guest_val and host_val if the msr is not found */
> +static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
> + u64 *guest_val, u64 *host_val)
> +{
> + unsigned i;
> + struct msr_autoload *m = &vmx->msr_autoload;
> +
> + for (i = 0; i < m->nr; ++i)
> + if (m->guest[i].index == msr)
> + break;
> +
> + if (i == m->nr)
> + return 1;
> +
> + if (guest_val)
> + *guest_val = m->guest[i].value;
> + if (host_val)
> + *host_val = m->host[i].value;
> +
> + return 0;
> +}
> +
> static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
> {
> u64 guest_efer = vmx->vcpu.arch.efer;
> @@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
> */
> static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> {
> + u64 spec_ctrl = 0;
> struct shared_msr_entry *msr;
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> switch (msr_info->index) {
> #ifdef CONFIG_X86_64
> @@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> msr_info->data = guest_read_tsc(vcpu);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + if (!msr_info->host_initiated &&
> + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))

Shouldn't this conjunct be:
!(guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
guest_cpuid_has(vcpu, X86_FEATURE_STIBP))?

> + return 1;

What if !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
!boot_cpu_has(X86_FEATURE_STIBP)? That should also return 1, I think.

> +
> + /*
> + * If the MSR is not in the atomic list yet, then it was never
> + * written to. So the MSR value will be '0'.
> + */
> + read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);

Why not just add msr_ia32_spec_ctrl to struct vcpu_vmx, so that you
don't have to search the atomic switch list?
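
E.g. (sketch only; the field name is invented here):

/* in struct vcpu_vmx */
u64 spec_ctrl;

/* then the read side is simply */
case MSR_IA32_SPEC_CTRL:
	if (!msr_info->host_initiated &&
	    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
		return 1;
	msr_info->data = to_vmx(vcpu)->spec_ctrl;
	break;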

> +
> + msr_info->data = spec_ctrl;
> + break;
> case MSR_IA32_SYSENTER_CS:
> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> break;
> @@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> int ret = 0;
> u32 msr_index = msr_info->index;
> u64 data = msr_info->data;
> + unsigned long *msr_bitmap;
> +
> + /*
> + * IBRS is not used (yet) to protect the host. Once it does, this
> + * variable needs to be a bit smarter.
> + */
> + u64 host_spec_ctrl = 0;
>
> switch (msr_index) {
> case MSR_EFER:
> @@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_TSC:
> kvm_write_tsc(vcpu, msr_info);
> break;
> + case MSR_IA32_SPEC_CTRL:
> + if (!msr_info->host_initiated &&
> + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
> + return 1;

This looks incomplete. As above, what if
!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
!boot_cpu_has(X86_FEATURE_STIBP)?
If the host doesn't support MSR_IA32_SPEC_CTRL, you'll get a VMX-abort
on loading the host MSRs from the VM-exit MSR load list.
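
I.e. a guard along these lines early in the case, before touching the
VM-exit/VM-entry MSR lists (sketch):

if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
    !boot_cpu_has(X86_FEATURE_STIBP))
	return 1;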

Also, what if the value being written is illegal?

/*
* Processors that support IBRS but not STIBP
* (CPUID.(EAX=07H, ECX=0):EDX[27:26] = 01b) will
* ignore attempts to set STIBP instead of causing an
* exception due to setting that reserved bit.
*/
if ((data & ~(u64)(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ||
((data & SPEC_CTRL_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
return 1;

> +
> + /*
> + * Now we know that the guest is actually using the MSR, so
> + * atomically load and save the SPEC_CTRL MSR and pass it
> + * through to the guest.
> + */
> + add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, msr_info->data,
> + host_spec_ctrl);
> + msr_bitmap = vmx->vmcs01.msr_bitmap;
> + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);

I assume you mean MSR_IA32_SPEC_CTRL rather than MSR_FS_BASE.

Also, what if the host and the guest support a different set of bits
in MSR_IA32_SPEC_CTRL, due to a userspace modification of the guest's
CPUID info?

> +
> + break;
> case MSR_IA32_CR_PAT:
> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> --
> 2.7.4
>

Where do you preserve the guest's MSR_IA32_SPEC_CTRL value on VM-exit,
if the guest has been given permission to write the MSR?

You also have to clear the guest's MSR_IA32_SPEC_CTRL on
vmx_vcpu_reset, don't you?
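
(Presumably the reset side is as simple as the following in vmx_vcpu_reset(),
assuming the guest value is cached in a vmx->spec_ctrl field as suggested
above, together with an rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl) after
VM-exit while the MSR is passed through:)

vmx->spec_ctrl = 0;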

2018-01-29 19:02:13

by David Woodhouse

Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL


(Top-posting; sorry.)

Much of that is already fixed during our day, in
http://git.infradead.org/linux-retpoline.git/shortlog/refs/heads/ibpb

I forgot to fix up the wrong-MSR typo though, and we do still need to address reset.

> >                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> >                         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> > --
> > 2.7.4
> >
> Where do you preserve the guest's MSR_IA32_SPEC_CTRL value on VM-exit,
> if the guest has been given permission to write the MSR?
>
> You also have to clear the guest's MSR_IA32_SPEC_CTRL on
> vmx_vcpu_reset, don't you?
>



2018-01-29 19:06:30

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

Can I assume you'll send out a new version with the fixes?

On Mon, Jan 29, 2018 at 11:01 AM, David Woodhouse <[email protected]> wrote:
>
> (Top-posting; sorry.)
>
> Much of that is already fixed during our day, in
> http://git.infradead.org/linux-retpoline.git/shortlog/refs/heads/ibpb
>
> I forgot to fix up the wrong-MSR typo though, and we do still need to address reset.
>
> On Mon, 2018-01-29 at 10:43 -0800, Jim Mattson wrote:
>> On Sun, Jan 28, 2018 at 11:29 AM, KarimAllah Ahmed wrote:
>> >
>> > Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
>> > that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
>> > retpoline+IBPB based approach.
>> >
>> > To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
>> > for guests that do not actually use the MSR, only add_atomic_switch_msr when a
>> > non-zero is written to it.
>> >
>> > Cc: Asit Mallick <[email protected]>
>> > Cc: Arjan Van De Ven <[email protected]>
>> > Cc: Dave Hansen <[email protected]>
>> > Cc: Andi Kleen <[email protected]>
>> > Cc: Andrea Arcangeli <[email protected]>
>> > Cc: Linus Torvalds <[email protected]>
>> > Cc: Tim Chen <[email protected]>
>> > Cc: Thomas Gleixner <[email protected]>
>> > Cc: Dan Williams <[email protected]>
>> > Cc: Jun Nakajima <[email protected]>
>> > Cc: Paolo Bonzini <[email protected]>
>> > Cc: David Woodhouse <[email protected]>
>> > Cc: Greg KH <[email protected]>
>> > Cc: Andy Lutomirski <[email protected]>
>> > Signed-off-by: KarimAllah Ahmed <[email protected]>
>> > Signed-off-by: Ashok Raj <[email protected]>
>> > ---
>> > arch/x86/kvm/cpuid.c | 4 +++-
>> > arch/x86/kvm/cpuid.h | 1 +
>> > arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>> > 3 files changed, 67 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> > index 0099e10..dc78095 100644
>> > --- a/arch/x86/kvm/cpuid.c
>> > +++ b/arch/x86/kvm/cpuid.c
>> > @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
>> > /* These are scattered features in cpufeatures.h. */
>> > #define KVM_CPUID_BIT_AVX512_4VNNIW 2
>> > #define KVM_CPUID_BIT_AVX512_4FMAPS 3
>> > +#define KVM_CPUID_BIT_SPEC_CTRL 26
>> > #define KF(x) bit(KVM_CPUID_BIT_##x)
>> >
>> > int kvm_update_cpuid(struct kvm_vcpu *vcpu)
>> > @@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>> >
>> > /* cpuid 7.0.edx*/
>> > const u32 kvm_cpuid_7_0_edx_x86_features =
>> > - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
>> > + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
>> > + (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);
>> Isn't 'boot_cpu_has()' superfluous here? And aren't there two bits to
>> pass through for existing CPUs (26 and 27)?
>>
>> >
>> >
>> > /* all calls to cpuid_count() should be made on the same cpu */
>> > get_cpu();
>> > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>> > index cdc70a3..dcfe227 100644
>> > --- a/arch/x86/kvm/cpuid.h
>> > +++ b/arch/x86/kvm/cpuid.h
>> > @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
>> > [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
>> > [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
>> > [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
>> > + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
>> > };
>> >
>> > static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
>> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> > index aa8638a..1b743a0 100644
>> > --- a/arch/x86/kvm/vmx.c
>> > +++ b/arch/x86/kvm/vmx.c
>> > @@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
>> > static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
>> > u16 error_code);
>> > static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
>> > +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
>> > + u32 msr, int type);
>> > +
>> >
>> > static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>> > static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
>> > @@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>> > m->host[i].value = host_val;
>> > }
>> >
>> > +/* do not touch guest_val and host_val if the msr is not found */
>> > +static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>> > + u64 *guest_val, u64 *host_val)
>> > +{
>> > + unsigned i;
>> > + struct msr_autoload *m = &vmx->msr_autoload;
>> > +
>> > + for (i = 0; i < m->nr; ++i)
>> > + if (m->guest[i].index == msr)
>> > + break;
>> > +
>> > + if (i == m->nr)
>> > + return 1;
>> > +
>> > + if (guest_val)
>> > + *guest_val = m->guest[i].value;
>> > + if (host_val)
>> > + *host_val = m->host[i].value;
>> > +
>> > + return 0;
>> > +}
>> > +
>> > static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
>> > {
>> > u64 guest_efer = vmx->vcpu.arch.efer;
>> > @@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
>> > */
>> > static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > {
>> > + u64 spec_ctrl = 0;
>> > struct shared_msr_entry *msr;
>> > + struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >
>> > switch (msr_info->index) {
>> > #ifdef CONFIG_X86_64
>> > @@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > case MSR_IA32_TSC:
>> > msr_info->data = guest_read_tsc(vcpu);
>> > break;
>> > + case MSR_IA32_SPEC_CTRL:
>> > + if (!msr_info->host_initiated &&
>> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>> Shouldn't this conjunct be:
>> !(guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
>> guest_cpuid_has(vcpu, X86_FEATURE_STIBP))?
>>
>> >
>> > + return 1;
>> What if !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>> !boot_cpu_has(X86_FEATURE_STIBP)? That should also return 1, I think.
>>
>> >
>> > +
>> > + /*
>> > + * If the MSR is not in the atomic list yet, then it was never
>> > + * written to. So the MSR value will be '0'.
>> > + */
>> > + read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);
>> Why not just add msr_ia32_spec_ctrl to struct vcpu_vmx, so that you
>> don't have to search the atomic switch list?
>>
>> >
>> > +
>> > + msr_info->data = spec_ctrl;
>> > + break;
>> > case MSR_IA32_SYSENTER_CS:
>> > msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
>> > break;
>> > @@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > int ret = 0;
>> > u32 msr_index = msr_info->index;
>> > u64 data = msr_info->data;
>> > + unsigned long *msr_bitmap;
>> > +
>> > + /*
>> > + * IBRS is not used (yet) to protect the host. Once it does, this
>> > + * variable needs to be a bit smarter.
>> > + */
>> > + u64 host_spec_ctrl = 0;
>> >
>> > switch (msr_index) {
>> > case MSR_EFER:
>> > @@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > case MSR_IA32_TSC:
>> > kvm_write_tsc(vcpu, msr_info);
>> > break;
>> > + case MSR_IA32_SPEC_CTRL:
>> > + if (!msr_info->host_initiated &&
>> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>> > + return 1;
>> This looks incomplete. As above, what if
>> !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>> !boot_cpu_has(X86_FEATURE_STIBP)?
>> If the host doesn't support MSR_IA32_SPEC_CTRL, you'll get a VMX-abort
>> on loading the host MSRs from the VM-exit MSR load list.
>>
>> Also, what if the value being written is illegal?
>>
>> /*
>> * Processors that support IBRS but not STIBP
>> * (CPUID.(EAX=07H, ECX=0):EDX[27:26] = 01b) will
>> * ignore attempts to set STIBP instead of causing an
>> * exception due to setting that reserved bit.
>> */
>> if ((data & ~(u64)(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ||
>> ((data & SPEC_CTRL_IBRS) &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
>> return 1;
>>
>> >
>> > +
>> > + /*
>> > + * Now we know that the guest is actually using the MSR, so
>> > + * atomically load and save the SPEC_CTRL MSR and pass it
>> > + * through to the guest.
>> > + */
>> > + add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, msr_info->data,
>> > + host_spec_ctrl);
>> > + msr_bitmap = vmx->vmcs01.msr_bitmap;
>> > + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
>> I assume you mean MSR_IA32_SPEC_CTRL rather than MSR_FS_BASE.
>>
>> Also, what if the host and the guest support a different set of bits
>> in MSR_IA32_SPEC_CTRL, due to a userspace modification of the guest's
>> CPUID info?
>>
>> >
>> > +
>> > + break;
>> > case MSR_IA32_CR_PAT:
>> > if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
>> > if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
>> > --
>> > 2.7.4
>> >
>> Where do you preserve the guest's MSR_IA32_SPEC_CTRL value on VM-exit,
>> if the guest has been given permission to write the MSR?
>>
>> You also have to clear the guest's MSR_IA32_SPEC_CTRL on
>> vmx_vcpu_reset, don't you?
>>

2018-01-29 19:11:50

by KarimAllah Ahmed

[permalink] [raw]
Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On 01/29/2018 08:04 PM, Jim Mattson wrote:
> Can I assume you'll send out a new version with the fixes?

Yes, I am currently doing some tests and once I am done I will send a
new round.

... and the typo is already fixed in 'ibpb-wip' :)
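
For the reset point raised in the quoted discussion below, a minimal sketch of
what the vmx_vcpu_reset() handling could look like, assuming the guest value
ends up cached in a per-vCPU field like the vmx->spec_ctrl used in later
snippets in this thread (names and placement are illustrative, not from the
posted patch):

        /* Called from vmx_vcpu_reset(): IA32_SPEC_CTRL is zero after reset. */
        vmx->spec_ctrl = 0;
        /* Stop atomically switching the MSR until the guest writes it again. */
        clear_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL);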

>
> On Mon, Jan 29, 2018 at 11:01 AM, David Woodhouse <[email protected]> wrote:
>>
>> (Top-posting; sorry.)
>>
>> Much of that is already fixed during our day, in
>> http://git.infradead.org/linux-retpoline.git/shortlog/refs/heads/ibpb
>>
>> I forgot to fix up the wrong-MSR typo though, and we do still need to address reset.
>>
>> On Mon, 2018-01-29 at 10:43 -0800, Jim Mattson wrote:
>>> On Sun, Jan 28, 2018 at 11:29 AM, KarimAllah Ahmed wrote:
>>>>
>>>> Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
>>>> that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
>>>> retpoline+IBPB based approach.
>>>>
>>>> To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
>>>> for guests that do not actually use the MSR, only add_atomic_switch_msr when a
>>>> non-zero is written to it.
>>>>
>>>> Cc: Asit Mallick <[email protected]>
>>>> Cc: Arjan Van De Ven <[email protected]>
>>>> Cc: Dave Hansen <[email protected]>
>>>> Cc: Andi Kleen <[email protected]>
>>>> Cc: Andrea Arcangeli <[email protected]>
>>>> Cc: Linus Torvalds <[email protected]>
>>>> Cc: Tim Chen <[email protected]>
>>>> Cc: Thomas Gleixner <[email protected]>
>>>> Cc: Dan Williams <[email protected]>
>>>> Cc: Jun Nakajima <[email protected]>
>>>> Cc: Paolo Bonzini <[email protected]>
>>>> Cc: David Woodhouse <[email protected]>
>>>> Cc: Greg KH <[email protected]>
>>>> Cc: Andy Lutomirski <[email protected]>
>>>> Signed-off-by: KarimAllah Ahmed <[email protected]>
>>>> Signed-off-by: Ashok Raj <[email protected]>
>>>> ---
>>>> arch/x86/kvm/cpuid.c | 4 +++-
>>>> arch/x86/kvm/cpuid.h | 1 +
>>>> arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>> 3 files changed, 67 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>>>> index 0099e10..dc78095 100644
>>>> --- a/arch/x86/kvm/cpuid.c
>>>> +++ b/arch/x86/kvm/cpuid.c
>>>> @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
>>>> /* These are scattered features in cpufeatures.h. */
>>>> #define KVM_CPUID_BIT_AVX512_4VNNIW 2
>>>> #define KVM_CPUID_BIT_AVX512_4FMAPS 3
>>>> +#define KVM_CPUID_BIT_SPEC_CTRL 26
>>>> #define KF(x) bit(KVM_CPUID_BIT_##x)
>>>>
>>>> int kvm_update_cpuid(struct kvm_vcpu *vcpu)
>>>> @@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>>>
>>>> /* cpuid 7.0.edx*/
>>>> const u32 kvm_cpuid_7_0_edx_x86_features =
>>>> - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
>>>> + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
>>>> + (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);
>>> Isn't 'boot_cpu_has()' superfluous here? And aren't there two bits to
>>> pass through for existing CPUs (26 and 27)?
>>>
>>>>
>>>>
>>>> /* all calls to cpuid_count() should be made on the same cpu */
>>>> get_cpu();
>>>> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>>>> index cdc70a3..dcfe227 100644
>>>> --- a/arch/x86/kvm/cpuid.h
>>>> +++ b/arch/x86/kvm/cpuid.h
>>>> @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
>>>> [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
>>>> [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
>>>> [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
>>>> + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
>>>> };
>>>>
>>>> static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index aa8638a..1b743a0 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
>>>> static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
>>>> u16 error_code);
>>>> static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
>>>> +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
>>>> + u32 msr, int type);
>>>> +
>>>>
>>>> static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>>>> static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
>>>> @@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>>>> m->host[i].value = host_val;
>>>> }
>>>>
>>>> +/* do not touch guest_val and host_val if the msr is not found */
>>>> +static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>>>> + u64 *guest_val, u64 *host_val)
>>>> +{
>>>> + unsigned i;
>>>> + struct msr_autoload *m = &vmx->msr_autoload;
>>>> +
>>>> + for (i = 0; i < m->nr; ++i)
>>>> + if (m->guest[i].index == msr)
>>>> + break;
>>>> +
>>>> + if (i == m->nr)
>>>> + return 1;
>>>> +
>>>> + if (guest_val)
>>>> + *guest_val = m->guest[i].value;
>>>> + if (host_val)
>>>> + *host_val = m->host[i].value;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
>>>> {
>>>> u64 guest_efer = vmx->vcpu.arch.efer;
>>>> @@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
>>>> */
>>>> static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>> {
>>>> + u64 spec_ctrl = 0;
>>>> struct shared_msr_entry *msr;
>>>> + struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>>
>>>> switch (msr_info->index) {
>>>> #ifdef CONFIG_X86_64
>>>> @@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>> case MSR_IA32_TSC:
>>>> msr_info->data = guest_read_tsc(vcpu);
>>>> break;
>>>> + case MSR_IA32_SPEC_CTRL:
>>>> + if (!msr_info->host_initiated &&
>>>> + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>>> Shouldn't this conjunct be:
>>> !(guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
>>> guest_cpuid_has(vcpu, X86_FEATURE_STIBP))?
>>>
>>>>
>>>> + return 1;
>>> What if !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>>> !boot_cpu_has(X86_FEATURE_STIBP)? That should also return 1, I think.
>>>
>>>>
>>>> +
>>>> + /*
>>>> + * If the MSR is not in the atomic list yet, then it was never
>>>> + * written to. So the MSR value will be '0'.
>>>> + */
>>>> + read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);
>>> Why not just add msr_ia32_spec_ctrl to struct vcpu_vmx, so that you
>>> don't have to search the atomic switch list?
>>>
>>>>
>>>> +
>>>> + msr_info->data = spec_ctrl;
>>>> + break;
>>>> case MSR_IA32_SYSENTER_CS:
>>>> msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
>>>> break;
>>>> @@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>> int ret = 0;
>>>> u32 msr_index = msr_info->index;
>>>> u64 data = msr_info->data;
>>>> + unsigned long *msr_bitmap;
>>>> +
>>>> + /*
>>>> + * IBRS is not used (yet) to protect the host. Once it does, this
>>>> + * variable needs to be a bit smarter.
>>>> + */
>>>> + u64 host_spec_ctrl = 0;
>>>>
>>>> switch (msr_index) {
>>>> case MSR_EFER:
>>>> @@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>> case MSR_IA32_TSC:
>>>> kvm_write_tsc(vcpu, msr_info);
>>>> break;
>>>> + case MSR_IA32_SPEC_CTRL:
>>>> + if (!msr_info->host_initiated &&
>>>> + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>>>> + return 1;
>>> This looks incomplete. As above, what if
>>> !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>>> !boot_cpu_has(X86_FEATURE_STIBP)?
>>> If the host doesn't support MSR_IA32_SPEC_CTRL, you'll get a VMX-abort
>>> on loading the host MSRs from the VM-exit MSR load list.
>>>
>>> Also, what if the value being written is illegal?
>>>
>>> /*
>>> * Processors that support IBRS but not STIBP
>>> * (CPUID.(EAX=07H, ECX=0):EDX[27:26] = 01b) will
>>> * ignore attempts to set STIBP instead of causing an
>>> * exception due to setting that reserved bit.
>>> */
>>> if ((data & ~(u64)(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ||
>>> ((data & SPEC_CTRL_IBRS) &&
>>> !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)))
>>> return 1;
>>>
>>>>
>>>> +
>>>> + /*
>>>> + * Now we know that the guest is actually using the MSR, so
>>>> + * atomically load and save the SPEC_CTRL MSR and pass it
>>>> + * through to the guest.
>>>> + */
>>>> + add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, msr_info->data,
>>>> + host_spec_ctrl);
>>>> + msr_bitmap = vmx->vmcs01.msr_bitmap;
>>>> + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
>>> I assume you mean MSR_IA32_SPEC_CTRL rather than MSR_FS_BASE.
>>>
>>> Also, what if the host and the guest support a different set of bits
>>> in MSR_IA32_SPEC_CTRL, due to a userspace modification of the guest's
>>> CPUID info?
>>>
>>>>
>>>> +
>>>> + break;
>>>> case MSR_IA32_CR_PAT:
>>>> if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
>>>> if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
>>>> --
>>>> 2.7.4
>>>>
>>> Where do you preserve the guest's MSR_IA32_SPEC_CTRL value on VM-exit,
>>> if the guest has been given permission to write the MSR?
>>>
>>> You also have to clear the guest's MSR_IA32_SPEC_CTRL on
>>> vmx_vcpu_reset, don't you?
>>>

2018-01-29 19:21:04

by Konrad Rzeszutek Wilk

[permalink] [raw]
Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, Jan 29, 2018 at 10:43:22AM -0800, Jim Mattson wrote:
> On Sun, Jan 28, 2018 at 11:29 AM, KarimAllah Ahmed <[email protected]> wrote:
> > Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
> > that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
> > retpoline+IBPB based approach.
> >
> > To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
> > for guests that do not actually use the MSR, only add_atomic_switch_msr when a
> > non-zero is written to it.
> >
> > Cc: Asit Mallick <[email protected]>
> > Cc: Arjan Van De Ven <[email protected]>
> > Cc: Dave Hansen <[email protected]>
> > Cc: Andi Kleen <[email protected]>
> > Cc: Andrea Arcangeli <[email protected]>
> > Cc: Linus Torvalds <[email protected]>
> > Cc: Tim Chen <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: Dan Williams <[email protected]>
> > Cc: Jun Nakajima <[email protected]>
> > Cc: Paolo Bonzini <[email protected]>
> > Cc: David Woodhouse <[email protected]>
> > Cc: Greg KH <[email protected]>
> > Cc: Andy Lutomirski <[email protected]>
> > Signed-off-by: KarimAllah Ahmed <[email protected]>
> > Signed-off-by: Ashok Raj <[email protected]>
> > ---
> > arch/x86/kvm/cpuid.c | 4 +++-
> > arch/x86/kvm/cpuid.h | 1 +
> > arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 67 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index 0099e10..dc78095 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
> > /* These are scattered features in cpufeatures.h. */
> > #define KVM_CPUID_BIT_AVX512_4VNNIW 2
> > #define KVM_CPUID_BIT_AVX512_4FMAPS 3
> > +#define KVM_CPUID_BIT_SPEC_CTRL 26
> > #define KF(x) bit(KVM_CPUID_BIT_##x)
> >
> > int kvm_update_cpuid(struct kvm_vcpu *vcpu)
> > @@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >
> > /* cpuid 7.0.edx*/
> > const u32 kvm_cpuid_7_0_edx_x86_features =
> > - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
> > + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
> > + (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);
>
> Isn't 'boot_cpu_has()' superfluous here? And aren't there two bits to
> pass through for existing CPUs (26 and 27)?
>
> >
> > /* all calls to cpuid_count() should be made on the same cpu */
> > get_cpu();
> > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> > index cdc70a3..dcfe227 100644
> > --- a/arch/x86/kvm/cpuid.h
> > +++ b/arch/x86/kvm/cpuid.h
> > @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
> > [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
> > [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
> > [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
> > + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
> > };
> >
> > static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index aa8638a..1b743a0 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
> > static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
> > u16 error_code);
> > static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
> > +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> > + u32 msr, int type);
> > +
> >
> > static DEFINE_PER_CPU(struct vmcs *, vmxarea);
> > static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> > @@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
> > m->host[i].value = host_val;
> > }
> >
> > +/* do not touch guest_val and host_val if the msr is not found */
> > +static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
> > + u64 *guest_val, u64 *host_val)
> > +{
> > + unsigned i;
> > + struct msr_autoload *m = &vmx->msr_autoload;
> > +
> > + for (i = 0; i < m->nr; ++i)
> > + if (m->guest[i].index == msr)
> > + break;
> > +
> > + if (i == m->nr)
> > + return 1;
> > +
> > + if (guest_val)
> > + *guest_val = m->guest[i].value;
> > + if (host_val)
> > + *host_val = m->host[i].value;
> > +
> > + return 0;
> > +}
> > +
> > static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
> > {
> > u64 guest_efer = vmx->vcpu.arch.efer;
> > @@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
> > */
> > static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > {
> > + u64 spec_ctrl = 0;
> > struct shared_msr_entry *msr;
> > + struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > switch (msr_info->index) {
> > #ifdef CONFIG_X86_64
> > @@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > case MSR_IA32_TSC:
> > msr_info->data = guest_read_tsc(vcpu);
> > break;
> > + case MSR_IA32_SPEC_CTRL:
> > + if (!msr_info->host_initiated &&
> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>
> Shouldn't this conjunct be:
> !(guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
> guest_cpuid_has(vcpu, X86_FEATURE_STIBP))?
>
> > + return 1;
>
> What if !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
> !boot_cpu_has(X86_FEATURE_STIBP)? That should also return 1, I think.
>
> > +
> > + /*
> > + * If the MSR is not in the atomic list yet, then it was never
> > + * written to. So the MSR value will be '0'.
> > + */
> > + read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);
>
> Why not just add msr_ia32_spec_ctrl to struct vcpu_vmx, so that you
> don't have to search the atomic switch list?
>
> > +
> > + msr_info->data = spec_ctrl;
> > + break;
> > case MSR_IA32_SYSENTER_CS:
> > msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
> > break;
> > @@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > int ret = 0;
> > u32 msr_index = msr_info->index;
> > u64 data = msr_info->data;
> > + unsigned long *msr_bitmap;
> > +
> > + /*
> > + * IBRS is not used (yet) to protect the host. Once it does, this
> > + * variable needs to be a bit smarter.
> > + */
> > + u64 host_spec_ctrl = 0;
> >
> > switch (msr_index) {
> > case MSR_EFER:
> > @@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > case MSR_IA32_TSC:
> > kvm_write_tsc(vcpu, msr_info);
> > break;
> > + case MSR_IA32_SPEC_CTRL:
> > + if (!msr_info->host_initiated &&
> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
> > + return 1;
>
> This looks incomplete. As above, what if
> !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
> !boot_cpu_has(X86_FEATURE_STIBP)?
> If the host doesn't support MSR_IA32_SPEC_CTRL, you'll get a VMX-abort
> on loading the host MSRs from the VM-exit MSR load list.

Yikes, right it will #GP.

>
> Also, what if the value being written is illegal?

You can write garbage and it won't #GP. Granted, it should only read back
correct values (0, 1, 2, or 3).

Albeit the spec says nothing about it (except calling those regions reserved,
which would imply rdmsr first and then 'or' it with what you are wrmsr'ing).
That of course would not be the best choice :-(

2018-01-29 19:28:08

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, Jan 29, 2018 at 11:16 AM, Konrad Rzeszutek Wilk
<[email protected]> wrote:
> On Mon, Jan 29, 2018 at 10:43:22AM -0800, Jim Mattson wrote:
>> On Sun, Jan 28, 2018 at 11:29 AM, KarimAllah Ahmed <[email protected]> wrote:
>> > Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
>> > that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
>> > retpoline+IBPB based approach.
>> >
>> > To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
>> > for guests that do not actually use the MSR, only add_atomic_switch_msr when a
>> > non-zero is written to it.
>> >
>> > Cc: Asit Mallick <[email protected]>
>> > Cc: Arjan Van De Ven <[email protected]>
>> > Cc: Dave Hansen <[email protected]>
>> > Cc: Andi Kleen <[email protected]>
>> > Cc: Andrea Arcangeli <[email protected]>
>> > Cc: Linus Torvalds <[email protected]>
>> > Cc: Tim Chen <[email protected]>
>> > Cc: Thomas Gleixner <[email protected]>
>> > Cc: Dan Williams <[email protected]>
>> > Cc: Jun Nakajima <[email protected]>
>> > Cc: Paolo Bonzini <[email protected]>
>> > Cc: David Woodhouse <[email protected]>
>> > Cc: Greg KH <[email protected]>
>> > Cc: Andy Lutomirski <[email protected]>
>> > Signed-off-by: KarimAllah Ahmed <[email protected]>
>> > Signed-off-by: Ashok Raj <[email protected]>
>> > ---
>> > arch/x86/kvm/cpuid.c | 4 +++-
>> > arch/x86/kvm/cpuid.h | 1 +
>> > arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>> > 3 files changed, 67 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> > index 0099e10..dc78095 100644
>> > --- a/arch/x86/kvm/cpuid.c
>> > +++ b/arch/x86/kvm/cpuid.c
>> > @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
>> > /* These are scattered features in cpufeatures.h. */
>> > #define KVM_CPUID_BIT_AVX512_4VNNIW 2
>> > #define KVM_CPUID_BIT_AVX512_4FMAPS 3
>> > +#define KVM_CPUID_BIT_SPEC_CTRL 26
>> > #define KF(x) bit(KVM_CPUID_BIT_##x)
>> >
>> > int kvm_update_cpuid(struct kvm_vcpu *vcpu)
>> > @@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>> >
>> > /* cpuid 7.0.edx*/
>> > const u32 kvm_cpuid_7_0_edx_x86_features =
>> > - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
>> > + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
>> > + (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);
>>
>> Isn't 'boot_cpu_has()' superfluous here? And aren't there two bits to
>> pass through for existing CPUs (26 and 27)?
>>
>> >
>> > /* all calls to cpuid_count() should be made on the same cpu */
>> > get_cpu();
>> > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>> > index cdc70a3..dcfe227 100644
>> > --- a/arch/x86/kvm/cpuid.h
>> > +++ b/arch/x86/kvm/cpuid.h
>> > @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
>> > [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
>> > [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
>> > [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
>> > + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
>> > };
>> >
>> > static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
>> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> > index aa8638a..1b743a0 100644
>> > --- a/arch/x86/kvm/vmx.c
>> > +++ b/arch/x86/kvm/vmx.c
>> > @@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
>> > static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
>> > u16 error_code);
>> > static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
>> > +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
>> > + u32 msr, int type);
>> > +
>> >
>> > static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>> > static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
>> > @@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>> > m->host[i].value = host_val;
>> > }
>> >
>> > +/* do not touch guest_val and host_val if the msr is not found */
>> > +static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>> > + u64 *guest_val, u64 *host_val)
>> > +{
>> > + unsigned i;
>> > + struct msr_autoload *m = &vmx->msr_autoload;
>> > +
>> > + for (i = 0; i < m->nr; ++i)
>> > + if (m->guest[i].index == msr)
>> > + break;
>> > +
>> > + if (i == m->nr)
>> > + return 1;
>> > +
>> > + if (guest_val)
>> > + *guest_val = m->guest[i].value;
>> > + if (host_val)
>> > + *host_val = m->host[i].value;
>> > +
>> > + return 0;
>> > +}
>> > +
>> > static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
>> > {
>> > u64 guest_efer = vmx->vcpu.arch.efer;
>> > @@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
>> > */
>> > static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > {
>> > + u64 spec_ctrl = 0;
>> > struct shared_msr_entry *msr;
>> > + struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >
>> > switch (msr_info->index) {
>> > #ifdef CONFIG_X86_64
>> > @@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > case MSR_IA32_TSC:
>> > msr_info->data = guest_read_tsc(vcpu);
>> > break;
>> > + case MSR_IA32_SPEC_CTRL:
>> > + if (!msr_info->host_initiated &&
>> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>>
>> Shouldn't this conjunct be:
>> !(guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
>> guest_cpuid_has(vcpu, X86_FEATURE_STIBP))?
>>
>> > + return 1;
>>
>> What if !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>> !boot_cpu_has(X86_FEATURE_STIBP)? That should also return 1, I think.
>>
>> > +
>> > + /*
>> > + * If the MSR is not in the atomic list yet, then it was never
>> > + * written to. So the MSR value will be '0'.
>> > + */
>> > + read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);
>>
>> Why not just add msr_ia32_spec_ctrl to struct vcpu_vmx, so that you
>> don't have to search the atomic switch list?
>>
>> > +
>> > + msr_info->data = spec_ctrl;
>> > + break;
>> > case MSR_IA32_SYSENTER_CS:
>> > msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
>> > break;
>> > @@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > int ret = 0;
>> > u32 msr_index = msr_info->index;
>> > u64 data = msr_info->data;
>> > + unsigned long *msr_bitmap;
>> > +
>> > + /*
>> > + * IBRS is not used (yet) to protect the host. Once it does, this
>> > + * variable needs to be a bit smarter.
>> > + */
>> > + u64 host_spec_ctrl = 0;
>> >
>> > switch (msr_index) {
>> > case MSR_EFER:
>> > @@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > case MSR_IA32_TSC:
>> > kvm_write_tsc(vcpu, msr_info);
>> > break;
>> > + case MSR_IA32_SPEC_CTRL:
>> > + if (!msr_info->host_initiated &&
>> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>> > + return 1;
>>
>> This looks incomplete. As above, what if
>> !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>> !boot_cpu_has(X86_FEATURE_STIBP)?
>> If the host doesn't support MSR_IA32_SPEC_CTRL, you'll get a VMX-abort
>> on loading the host MSRs from the VM-exit MSR load list.
>
> Yikes, right it will #GP.

Worse; it will VMX-abort, which shuts down the logical CPU.

>>
>> Also, what if the value being written is illegal?
>
> You can write garbage and it won't #GP. Granted, it should only read back
> correct values (0, 1, 2, or 3).

That may depend on the processor. On HSX processors with ucode 0x3b, I
find that you can write bits 0, 1, and 2 without a #GP, but bits 63:3
do raise #GP. Nonetheless, the virtual CPU implemented by kvm only
supports bits 0 and 1, regardless of the underlying host support, so
it should raise #GP if bits 63:2 are set.
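
For concreteness, a minimal sketch of that stricter check on the WRMSR path,
assuming SPEC_CTRL_IBRS and SPEC_CTRL_STIBP carry the usual bit 0 / bit 1
definitions (exact placement inside vmx_set_msr() is up to the patch author):

        case MSR_IA32_SPEC_CTRL:
                /*
                 * Sketch only: the virtual CPU implements just IBRS (bit 0)
                 * and STIBP (bit 1), so treat every other bit as reserved,
                 * independent of what the host silicon happens to accept.
                 * Returning 1 makes the WRMSR emulation inject #GP.
                 */
                if (data & ~(u64)(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
                        return 1;
                break;
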

>
> Albeit the spec says nothing about it (except calling those regions reserved,
> which would imply rdmsr first and then 'or' it with what you are wrmsr'ing).
> That of course would not be the best choice :-(

2018-01-29 21:53:22

by Daniel Kiper

[permalink] [raw]
Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, Jan 29, 2018 at 12:31:13PM -0500, Konrad Rzeszutek Wilk wrote:
> On Mon, Jan 29, 2018 at 08:46:03AM +0000, David Woodhouse wrote:
> > On Sun, 2018-01-28 at 16:39 -0800, Liran Alon wrote:
> > >
> > > Windows uses IBRS and Microsoft doesn't have any plans to switch to retpoline.
> > > Running a Windows guest should be a pretty common use case, no?
> > >
> > > In addition, your handling of the first WRMSR intercept could be different.
> > > It could signal you to start doing the following:
> > > 1. Disable intercept on SPEC_CTRL MSR.
> > > 2. On VMEntry, Write vCPU SPEC_CTRL value into physical MSR.
> > > 3. On VMExit, read physical MSR into vCPU SPEC_CTRL value.
> > > (And if IBRS is used at host, also set physical SPEC_CTRL MSR here to 1)
> > >
> > > That way, you will both have the fastest option as long as the guest doesn't use IBRS
> > > and also won't have the 3% performance hit compared to Konrad's proposal.
> > >
> > > Am I missing something?
> >
> > Reads from the SPEC_CTRL MSR are strangely slow. I suspect a large part
> > of the 3% speedup you observe is because in the above, the vmentry path
> > doesn't need to *read* the host's value and store it; the host is
> > expected to restore it for itself anyway?
>
> Yes, for at least the purpose of correctness. That is based on what I have
> heard, which is that when you transition to a higher ring you have to write 1, then write zero
> when you transition back to lower rings. That is, it is like a knob.
>
> But then I heard that on some CPUs it is more like a reset button and
> just writing 1 to IBRS is fine. But again, correctness here.
>
> >
> > I'd actually quite like to repeat the benchmark on the new fixed
> > microcode, if anyone has it yet, to see if that read/swap slowness is
> > still quite as excessive. I'm certainly not ruling this out, but I'm
> > just a little wary of premature optimisation, and I'd like to make sure
> > we have everything *else* in the KVM patches right first.
> >
> > The fact that the save-and-restrict macros I have in the tip of my
> > working tree at the moment are horrid and causing 0-day nastygrams,
> > probably doesn't help persuade me to favour the approach ;)
> >
> > ... hm, the CPU actually has separate MSR save/restore lists for
> > entry/exit, doesn't it? Is there any way to sanely make use of that and
> > do the restoration manually on vmentry but let it be automatic on
> > vmexit, by having it *only* in the guest's MSR-store area to be saved
> > on exit and restored on exit, but *not* in the host's MSR-store area?

s/on exit and restored on exit/on exit and restored on entry/?

Additionally, AIUI there is no "host's MSR-store area".

> Oh. That sounds sounds interesting

That is possible but you have to split add_atomic_switch_msr() accordingly.

> > Reading the code and comparing with the SDM, I can't see where we're
> > ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
> > case...
>
> Right. We (well Daniel Kiper, CC-ed) implemented it for this and
> that is where we got the numbers.
>
> Daniel, you OK posting it here? Granted with the caveat that it won't even
> compile against upstream as it was based on a distro kernel.

Please look below...

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aa9bc4f..e7c0f8b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -162,8 +162,10 @@ module_param(ple_window_max, int, S_IRUGO);

extern const ulong vmx_return;

-#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
+#define NR_AUTOLOAD_MSRS 8
+#define NR_AUTOSTORE_MSRS NR_AUTOLOAD_MSRS
+
+#define VMCS02_POOL_SIZE 1

struct vmcs {
u32 revision_id;
@@ -504,6 +506,10 @@ struct vcpu_vmx {
struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
} msr_autoload;
+ struct msr_autostore {
+ unsigned nr;
+ struct vmx_msr_entry guest[NR_AUTOSTORE_MSRS];
+ } msr_autostore;
struct {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
@@ -1704,6 +1710,37 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
m->host[i].value = host_val;
}

+static void add_atomic_store_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ unsigned i;
+ struct msr_autostore *m = &vmx->msr_autostore;
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ return;
+
+ if (i == NR_AUTOSTORE_MSRS) {
+ pr_err("Not enough msr store entries. Can't add msr %x\n", msr);
+ BUG();
+ }
+
+ m->guest[i].index = msr;
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, ++m->nr);
+}
+
+static u64 get_msr_vmcs_store(struct vcpu_vmx *vmx, unsigned msr)
+{
+ unsigned i;
+ struct msr_autostore *m = &vmx->msr_autostore;
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ return m->guest[i].value;
+
+ pr_err("Can't find msr %x in VMCS store\n", msr);
+ BUG();
+}
+
static void reload_tss(void)
{
/*
@@ -4716,6 +4753,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
#endif

vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest));
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
@@ -8192,8 +8230,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)

vmx->__launched = vmx->loaded_vmcs->launched;

- if (ibrs_inuse)
- wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+ if (ibrs_inuse) {
+ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, vmx->spec_ctrl,
+ SPEC_CTRL_FEATURE_ENABLE_IBRS);
+ add_atomic_store_msr(vmx, MSR_IA32_SPEC_CTRL);
+ }

asm(
/* Store host registers */
@@ -8317,12 +8358,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);

- if (ibrs_inuse) {
- rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
- wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_FEATURE_ENABLE_IBRS);
- }
stuff_RSB();

+ if (ibrs_inuse)
+ vmx->spec_ctrl = get_msr_vmcs_store(vmx, MSR_IA32_SPEC_CTRL);
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (debugctlmsr)
update_debugctlmsr(debugctlmsr);

By the way, I saw in one version of the patches that you use your own
rdmsrl()/wrmsrl() functions to save/restore IBRS instead of using the common
ones (I mean native_rdmsrl()/native_wrmsrl(), safe__rdmsrl()/safe_wrmsrl(),
etc.) available in the kernel. Could you explain why you do that?

Daniel

2018-01-29 23:02:51

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

On Mon, Jan 29, 2018 at 1:49 PM, Daniel Kiper <[email protected]> wrote:
> On Mon, Jan 29, 2018 at 12:31:13PM -0500, Konrad Rzeszutek Wilk wrote:
>> On Mon, Jan 29, 2018 at 08:46:03AM +0000, David Woodhouse wrote:
>> > On Sun, 2018-01-28 at 16:39 -0800, Liran Alon wrote:
>> > >
>> > > Windows uses IBRS and Microsoft doesn't have any plans to switch to retpoline.
>> > > Running a Windows guest should be a pretty common use case, no?
>> > >
>> > > In addition, your handling of the first WRMSR intercept could be different.
>> > > It could signal you to start doing the following:
>> > > 1. Disable intercept on SPEC_CTRL MSR.
>> > > 2. On VMEntry, Write vCPU SPEC_CTRL value into physical MSR.
>> > > 3. On VMExit, read physical MSR into vCPU SPEC_CTRL value.
>> > > (And if IBRS is used at host, also set physical SPEC_CTRL MSR here to 1)
>> > >
>> > > That way, you will both have the fastest option as long as the guest doesn't use IBRS
>> > > and also won't have the 3% performance hit compared to Konrad's proposal.
>> > >
>> > > Am I missing something?
>> >
>> > Reads from the SPEC_CTRL MSR are strangely slow. I suspect a large part
>> > of the 3% speedup you observe is because in the above, the vmentry path
>> > doesn't need to *read* the host's value and store it; the host is
>> > expected to restore it for itself anyway?
>>
>> Yes, for at least the purpose of correctness. That is based on what I have
>> heard, which is that when you transition to a higher ring you have to write 1, then write zero
>> when you transition back to lower rings. That is, it is like a knob.
>>
>> But then I heard that on some CPUs it is more like a reset button and
>> just writing 1 to IBRS is fine. But again, correctness here.
>>
>> >
>> > I'd actually quite like to repeat the benchmark on the new fixed
>> > microcode, if anyone has it yet, to see if that read/swap slowness is
>> > still quite as excessive. I'm certainly not ruling this out, but I'm
>> > just a little wary of premature optimisation, and I'd like to make sure
>> > we have everything *else* in the KVM patches right first.
>> >
>> > The fact that the save-and-restrict macros I have in the tip of my
>> > working tree at the moment are horrid and causing 0-day nastygrams,
>> > probably doesn't help persuade me to favour the approach ;)
>> >
>> > ... hm, the CPU actually has separate MSR save/restore lists for
>> > entry/exit, doesn't it? Is there any way to sanely make use of that and
>> > do the restoration manually on vmentry but let it be automatic on
>> > vmexit, by having it *only* in the guest's MSR-store area to be saved
>> > on exit and restored on exit, but *not* in the host's MSR-store area?
>
> s/on exit and restored on exit/on exit and restored on entry/?
>
> Additionally, AIUI there is no "host's MSR-store area".
>
>> Oh. That sounds sounds interesting
>
> That is possible but you have to split add_atomic_switch_msr() accordingly.
>
>> > Reading the code and comparing with the SDM, I can't see where we're
>> > ever setting VM_EXIT_MSR_STORE_{ADDR,COUNT} except in the nested
>> > case...
>>
>> Right. We (well Daniel Kiper, CC-ed) implemented it for this and
>> that is where we got the numbers.
>>
>> Daniel, you OK posting it here? Granted with the caveat that it won't even
>> compile against upstream as it was based on a distro kernel.
>
> Please look below...
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index aa9bc4f..e7c0f8b 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -162,8 +162,10 @@ module_param(ple_window_max, int, S_IRUGO);
>
> extern const ulong vmx_return;
>
> -#define NR_AUTOLOAD_MSRS 8
> -#define VMCS02_POOL_SIZE 1
> +#define NR_AUTOLOAD_MSRS 8
> +#define NR_AUTOSTORE_MSRS NR_AUTOLOAD_MSRS
> +
> +#define VMCS02_POOL_SIZE 1

I think you accidentally resurrected VMCS02_POOL_SIZE.

> struct vmcs {
> u32 revision_id;
> @@ -504,6 +506,10 @@ struct vcpu_vmx {
> struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
> struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
> } msr_autoload;
> + struct msr_autostore {
> + unsigned nr;
> + struct vmx_msr_entry guest[NR_AUTOSTORE_MSRS];
> + } msr_autostore;
> struct {
> int loaded;
> u16 fs_sel, gs_sel, ldt_sel;
> @@ -1704,6 +1710,37 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
> m->host[i].value = host_val;
> }
>
> +static void add_atomic_store_msr(struct vcpu_vmx *vmx, unsigned msr)
> +{
> + unsigned i;
> + struct msr_autostore *m = &vmx->msr_autostore;
> +
> + for (i = 0; i < m->nr; ++i)
> + if (m->guest[i].index == msr)
> + return;
> +
> + if (i == NR_AUTOSTORE_MSRS) {
> + pr_err("Not enough msr store entries. Can't add msr %x\n", msr);
> + BUG();

I wouldn't panic the host for this. add_atomic_switch_msr() just emits
a warning for the equivalent condition. (Resetting the vCPU might be
better in both cases.)
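
A rough sketch of the non-fatal variant, mirroring how add_atomic_switch_msr()
handles its own overflow case (whether silently skipping the MSR is acceptable
for SPEC_CTRL is a separate question):

        if (i == NR_AUTOSTORE_MSRS) {
                /* Warn once and bail out instead of taking the host down;
                 * the MSR simply will not be added to the store list. */
                printk_once(KERN_WARNING "Not enough MSR store entries. "
                            "Can't add MSR %x\n", msr);
                return;
        }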

> + }
> +
> + m->guest[i].index = msr;
> + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, ++m->nr);
> +}
> +
> +static u64 get_msr_vmcs_store(struct vcpu_vmx *vmx, unsigned msr)
> +{
> + unsigned i;
> + struct msr_autostore *m = &vmx->msr_autostore;
> +
> + for (i = 0; i < m->nr; ++i)
> + if (m->guest[i].index == msr)
> + return m->guest[i].value;
> +
> + pr_err("Can't find msr %x in VMCS store\n", msr);
> + BUG();

Same comment as above.

> +}
> +
> static void reload_tss(void)
> {
> /*
> @@ -4716,6 +4753,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> #endif
>
> vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
> + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest));
> vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
> vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
> vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
> @@ -8192,8 +8230,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>
> vmx->__launched = vmx->loaded_vmcs->launched;
>
> - if (ibrs_inuse)
> - wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> + if (ibrs_inuse) {
> + add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, vmx->spec_ctrl,
> + SPEC_CTRL_FEATURE_ENABLE_IBRS);
> + add_atomic_store_msr(vmx, MSR_IA32_SPEC_CTRL);
> + }

If ibrs_inuse can be cleared dynamically, perhaps there should be:
} else {
clear_autostore_msr(vmx, MSR_IA32_SPEC_CTRL);
}
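
For what it's worth, one way such a (hypothetical) clear_autostore_msr() could
look, reusing the msr_autostore bookkeeping introduced in the patch above and
following the style of the existing clear_atomic_switch_msr():

static void clear_autostore_msr(struct vcpu_vmx *vmx, unsigned msr)
{
        unsigned i;
        struct msr_autostore *m = &vmx->msr_autostore;

        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;

        if (i == m->nr)
                return;

        /* Swap in the last entry and shrink the VM-exit store count. */
        m->guest[i] = m->guest[--m->nr];
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, m->nr);
}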

>
> asm(
> /* Store host registers */
> @@ -8317,12 +8358,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> #endif
> );
>
> - if (ibrs_inuse) {
> - rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> - wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_FEATURE_ENABLE_IBRS);
> - }
> stuff_RSB();
>
> + if (ibrs_inuse)
> + vmx->spec_ctrl = get_msr_vmcs_store(vmx, MSR_IA32_SPEC_CTRL);
> +
> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
> if (debugctlmsr)
> update_debugctlmsr(debugctlmsr);
>
> By the way, I saw in one version of the patches that you use your own
> rdmsrl()/wrmsrl() functions to save/restore IBRS instead of using the common
> ones (I mean native_rdmsrl()/native_wrmsrl(), safe__rdmsrl()/safe_wrmsrl(),
> etc.) available in the kernel. Could you explain why you do that?
>
> Daniel

What about vmcs02?

If ibrs_inuse can be set dynamically, you may need the following in
nested_vmx_vmexit:

vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr);