2014-02-25 17:49:44

by Paolo Bonzini

[permalink] [raw]
Subject: [PATCH 0/2] KVM: x86: more xsave and mpx improvements

Here are the patches I mentioned while reviewing Liu Jinsong's MPX
series.

Patch 1 is a further cleanup of xcr0 handling, and patch 2 introduces
nested virtualization support for MPX.

Please review.

Thanks,

Paolo

Paolo Bonzini (2):
KVM: x86: introduce kvm_supported_xcr0()
KVM: x86: Add nested virtualization support for MPX

arch/x86/kvm/cpuid.c | 27 ++++++++++++++++-----------
arch/x86/kvm/vmx.c | 16 ++++++++++++++++
arch/x86/kvm/x86.c | 4 +---
arch/x86/kvm/x86.h | 2 ++
4 files changed, 35 insertions(+), 14 deletions(-)

--
1.8.3.1


2014-02-25 17:49:48

by Paolo Bonzini

[permalink] [raw]
Subject: [PATCH 1/2] KVM: x86: introduce kvm_supported_xcr0()

XSAVE support for KVM is already using host_xcr0 & KVM_SUPPORTED_XCR0 as
a "dynamic" version of KVM_SUPPORTED_XCR0.

However, this is not enough because the MPX bits should not be presented
to the guest unless kvm_x86_ops confirms the support. So, replace all
instances of host_xcr0 & KVM_SUPPORTED_XCR0 with a new function
kvm_supported_xcr0() that also has this check.

Note that here:

if (xstate_bv & ~KVM_SUPPORTED_XCR0)
return -EINVAL;
if (xstate_bv & ~host_xcr0)
return -EINVAL;

the code is equivalent to

if ((xstate_bv & ~KVM_SUPPORTED_XCR0) ||
(xstate_bv & ~host_xcr0))
return -EINVAL;

i.e. "xstate_bv & (~KVM_SUPPORTED_XCR0 | ~host_xcr0)" which is in turn
equal to "xstate_bv & ~(KVM_SUPPORTED_XCR0 & host_xcr0)". So we should
also use the new function there.

Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/cpuid.c | 27 ++++++++++++++++-----------
arch/x86/kvm/x86.c | 4 +---
arch/x86/kvm/x86.h | 2 ++
3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ddc8a7e165df..18aefb4d0927 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv)
return ret;
}

+u64 kvm_supported_xcr0(void)
+{
+ u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
+
+ if (!kvm_x86_ops->mpx_supported || !kvm_x86_ops->mpx_supported())
+ xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+
+ return xcr0;
+}
+
void kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -73,7 +83,7 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
} else {
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) &
- host_xcr0 & KVM_SUPPORTED_XCR0;
+ kvm_supported_xcr0();
vcpu->arch.guest_xstate_size = best->ebx =
xstate_required_size(vcpu->arch.xcr0);
}
@@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}

-static bool supported_xcr0_bit(unsigned bit)
-{
- u64 mask = ((u64)1 << bit);
-
- return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
-}
-
#define F(x) bit(X86_FEATURE_##x)

static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
@@ -439,16 +442,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
case 0xd: {
int idx, i;
+ u64 supported = kvm_supported_xcr0();

- entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
- entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
+ entry->eax &= supported;
+ entry->edx &= supported >> 32;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
for (idx = 1, i = 1; idx < 64; ++idx) {
+ u64 mask = ((u64)1 << idx);
if (*nent >= maxnent)
goto out;

do_cpuid_1_ent(&entry[i], function, idx);
- if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+ if (entry[i].eax == 0 || !(supported & mask))
continue;
entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6530019116b0..4e038cda45a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3048,9 +3048,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~KVM_SUPPORTED_XCR0)
- return -EINVAL;
- if (xstate_bv & ~host_xcr0)
+ if (xstate_bv & ~kvm_supported_xcr0())
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state->xsave,
guest_xsave->region, vcpu->arch.guest_xstate_size);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 392ecbff0030..8c97bac9a895 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -126,6 +126,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
| XSTATE_BNDREGS | XSTATE_BNDCSR)
extern u64 host_xcr0;

+extern u64 kvm_supported_xcr0(void);
+
extern unsigned int min_timer_period_us;

extern struct static_key kvm_no_apic_vcpu;
--
1.8.3.1

2014-02-25 17:50:00

by Paolo Bonzini

[permalink] [raw]
Subject: [PATCH 2/2] KVM: x86: Add nested virtualization support for MPX

This is simple to do, the "host" BNDCFGS is either 0 or the guest value.
However, both controls have to be present. We cannot provide MPX if
we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls.

Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 729b1e42aded..da28ac46ca88 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -202,6 +202,7 @@ struct __packed vmcs12 {
u64 guest_pdptr1;
u64 guest_pdptr2;
u64 guest_pdptr3;
+ u64 guest_bndcfgs;
u64 host_ia32_pat;
u64 host_ia32_efer;
u64 host_ia32_perf_global_ctrl;
@@ -534,6 +535,7 @@ static const unsigned long shadow_read_write_fields[] = {
GUEST_CS_LIMIT,
GUEST_CS_BASE,
GUEST_ES_BASE,
+ GUEST_BNDCFGS,
CR0_GUEST_HOST_MASK,
CR0_READ_SHADOW,
CR4_READ_SHADOW,
@@ -589,6 +591,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(GUEST_PDPTR1, guest_pdptr1),
FIELD64(GUEST_PDPTR2, guest_pdptr2),
FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
FIELD64(HOST_IA32_PAT, host_ia32_pat),
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
@@ -719,6 +722,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
+static bool vmx_mpx_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -2279,6 +2283,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
}
nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
+ if (vmx_mpx_supported())
+ nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;

/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2292,6 +2298,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
VM_ENTRY_LOAD_IA32_PAT;
nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
VM_ENTRY_LOAD_IA32_EFER);
+ if (vmx_mpx_supported())
+ nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;

/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -7847,6 +7855,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)

set_cr4_guest_host_mask(vmx);

+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vmcs_write64(TSC_OFFSET,
vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
@@ -8277,6 +8288,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+ vmcs12->guest_bndcfgs = vmcs_readl(GUEST_BNDCFGS);

/* update exit information fields: */

@@ -8386,6 +8398,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);

+ /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
vcpu->arch.pat = vmcs12->host_ia32_pat;
--
1.8.3.1

2014-02-25 18:05:57

by Jan Kiszka

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Add nested virtualization support for MPX

On 2014-02-25 18:49, Paolo Bonzini wrote:
> This is simple to do, the "host" BNDCFGS is either 0 or the guest value.
> However, both controls have to be present. We cannot provide MPX if
> we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 16 ++++++++++++++++
> 1 file changed, 16 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 729b1e42aded..da28ac46ca88 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -202,6 +202,7 @@ struct __packed vmcs12 {
> u64 guest_pdptr1;
> u64 guest_pdptr2;
> u64 guest_pdptr3;
> + u64 guest_bndcfgs;
> u64 host_ia32_pat;
> u64 host_ia32_efer;
> u64 host_ia32_perf_global_ctrl;
> @@ -534,6 +535,7 @@ static const unsigned long shadow_read_write_fields[] = {
> GUEST_CS_LIMIT,
> GUEST_CS_BASE,
> GUEST_ES_BASE,
> + GUEST_BNDCFGS,
> CR0_GUEST_HOST_MASK,
> CR0_READ_SHADOW,
> CR4_READ_SHADOW,
> @@ -589,6 +591,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
> FIELD64(GUEST_PDPTR1, guest_pdptr1),
> FIELD64(GUEST_PDPTR2, guest_pdptr2),
> FIELD64(GUEST_PDPTR3, guest_pdptr3),
> + FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
> FIELD64(HOST_IA32_PAT, host_ia32_pat),
> FIELD64(HOST_IA32_EFER, host_ia32_efer),
> FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
> @@ -719,6 +722,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
> static u64 construct_eptp(unsigned long root_hpa);
> static void kvm_cpu_vmxon(u64 addr);
> static void kvm_cpu_vmxoff(void);
> +static bool vmx_mpx_supported(void);
> static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
> static void vmx_set_segment(struct kvm_vcpu *vcpu,
> struct kvm_segment *var, int seg);
> @@ -2279,6 +2283,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
> }
> nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
> VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
> + if (vmx_mpx_supported())
> + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
>
> /* entry controls */
> rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
> @@ -2292,6 +2298,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
> VM_ENTRY_LOAD_IA32_PAT;
> nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
> VM_ENTRY_LOAD_IA32_EFER);
> + if (vmx_mpx_supported())
> + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
>
> /* cpu-based controls */
> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
> @@ -7847,6 +7855,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>
> set_cr4_guest_host_mask(vmx);
>
> + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
> + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
> +
> if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
> vmcs_write64(TSC_OFFSET,
> vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
> @@ -8277,6 +8288,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> + vmcs12->guest_bndcfgs = vmcs_readl(GUEST_BNDCFGS);

Can we read this value unconditionally, even when the host does not
support the feature?

>
> /* update exit information fields: */
>
> @@ -8386,6 +8398,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
> vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
> vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
>
> + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
> + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
> + vmcs_write64(GUEST_BNDCFGS, 0);
> +
> if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
> vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
> vcpu->arch.pat = vmcs12->host_ia32_pat;
>

Do we also have a unit test to stress this? Or are we lacking silicon
with MPX and corresponding VMX features?

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

2014-02-25 18:14:00

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Add nested virtualization support for MPX

Il 25/02/2014 19:05, Jan Kiszka ha scritto:
> On 2014-02-25 18:49, Paolo Bonzini wrote:
>> This is simple to do, the "host" BNDCFGS is either 0 or the guest value.
>> However, both controls have to be present. We cannot provide MPX if
>> we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls.
>>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>> ---
>> arch/x86/kvm/vmx.c | 16 ++++++++++++++++
>> 1 file changed, 16 insertions(+)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 729b1e42aded..da28ac46ca88 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -202,6 +202,7 @@ struct __packed vmcs12 {
>> u64 guest_pdptr1;
>> u64 guest_pdptr2;
>> u64 guest_pdptr3;
>> + u64 guest_bndcfgs;
>> u64 host_ia32_pat;
>> u64 host_ia32_efer;
>> u64 host_ia32_perf_global_ctrl;
>> @@ -534,6 +535,7 @@ static const unsigned long shadow_read_write_fields[] = {
>> GUEST_CS_LIMIT,
>> GUEST_CS_BASE,
>> GUEST_ES_BASE,
>> + GUEST_BNDCFGS,
>> CR0_GUEST_HOST_MASK,
>> CR0_READ_SHADOW,
>> CR4_READ_SHADOW,
>> @@ -589,6 +591,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>> FIELD64(GUEST_PDPTR1, guest_pdptr1),
>> FIELD64(GUEST_PDPTR2, guest_pdptr2),
>> FIELD64(GUEST_PDPTR3, guest_pdptr3),
>> + FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
>> FIELD64(HOST_IA32_PAT, host_ia32_pat),
>> FIELD64(HOST_IA32_EFER, host_ia32_efer),
>> FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
>> @@ -719,6 +722,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
>> static u64 construct_eptp(unsigned long root_hpa);
>> static void kvm_cpu_vmxon(u64 addr);
>> static void kvm_cpu_vmxoff(void);
>> +static bool vmx_mpx_supported(void);
>> static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>> static void vmx_set_segment(struct kvm_vcpu *vcpu,
>> struct kvm_segment *var, int seg);
>> @@ -2279,6 +2283,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>> }
>> nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>> VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
>> + if (vmx_mpx_supported())
>> + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
>>
>> /* entry controls */
>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
>> @@ -2292,6 +2298,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>> VM_ENTRY_LOAD_IA32_PAT;
>> nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
>> VM_ENTRY_LOAD_IA32_EFER);
>> + if (vmx_mpx_supported())
>> + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
>>
>> /* cpu-based controls */
>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
>> @@ -7847,6 +7855,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>
>> set_cr4_guest_host_mask(vmx);
>>
>> + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
>> + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
>> +
>> if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
>> vmcs_write64(TSC_OFFSET,
>> vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
>> @@ -8277,6 +8288,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>> vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
>> vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
>> vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
>> + vmcs12->guest_bndcfgs = vmcs_readl(GUEST_BNDCFGS);
>
> Can we read this value unconditionally, even when the host does not
> support the feature?

return -EWRONGPATCH;

>>
>> /* update exit information fields: */
>>
>> @@ -8386,6 +8398,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>> vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
>> vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
>>
>> + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
>> + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
>> + vmcs_write64(GUEST_BNDCFGS, 0);
>> +
>> if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
>> vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
>> vcpu->arch.pat = vmcs12->host_ia32_pat;
>>
>
> Do we also have a unit test to stress this? Or are we lacking silicon
> with MPX and corresponding VMX features?

No silicon yet.

There is an emulator, but it is already slow enough without nested
virtualization... it would be three-level virtualization :)

Paolo

2014-02-25 18:46:53

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Add nested virtualization support for MPX

Il 25/02/2014 19:13, Paolo Bonzini ha scritto:
> Il 25/02/2014 19:05, Jan Kiszka ha scritto:
>> On 2014-02-25 18:49, Paolo Bonzini wrote:
>>> This is simple to do, the "host" BNDCFGS is either 0 or the guest value.
>>> However, both controls have to be present. We cannot provide MPX if
>>> we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls.
>>>
>>> Signed-off-by: Paolo Bonzini <[email protected]>
>>> ---
>>> arch/x86/kvm/vmx.c | 16 ++++++++++++++++
>>> 1 file changed, 16 insertions(+)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 729b1e42aded..da28ac46ca88 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -202,6 +202,7 @@ struct __packed vmcs12 {
>>> u64 guest_pdptr1;
>>> u64 guest_pdptr2;
>>> u64 guest_pdptr3;
>>> + u64 guest_bndcfgs;
>>> u64 host_ia32_pat;
>>> u64 host_ia32_efer;
>>> u64 host_ia32_perf_global_ctrl;
>>> @@ -534,6 +535,7 @@ static const unsigned long
>>> shadow_read_write_fields[] = {
>>> GUEST_CS_LIMIT,
>>> GUEST_CS_BASE,
>>> GUEST_ES_BASE,
>>> + GUEST_BNDCFGS,
>>> CR0_GUEST_HOST_MASK,
>>> CR0_READ_SHADOW,
>>> CR4_READ_SHADOW,
>>> @@ -589,6 +591,7 @@ static const unsigned short
>>> vmcs_field_to_offset_table[] = {
>>> FIELD64(GUEST_PDPTR1, guest_pdptr1),
>>> FIELD64(GUEST_PDPTR2, guest_pdptr2),
>>> FIELD64(GUEST_PDPTR3, guest_pdptr3),
>>> + FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
>>> FIELD64(HOST_IA32_PAT, host_ia32_pat),
>>> FIELD64(HOST_IA32_EFER, host_ia32_efer),
>>> FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
>>> @@ -719,6 +722,7 @@ static unsigned long nested_ept_get_cr3(struct
>>> kvm_vcpu *vcpu);
>>> static u64 construct_eptp(unsigned long root_hpa);
>>> static void kvm_cpu_vmxon(u64 addr);
>>> static void kvm_cpu_vmxoff(void);
>>> +static bool vmx_mpx_supported(void);
>>> static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>>> static void vmx_set_segment(struct kvm_vcpu *vcpu,
>>> struct kvm_segment *var, int seg);
>>> @@ -2279,6 +2283,8 @@ static __init void
>>> nested_vmx_setup_ctls_msrs(void)
>>> }
>>> nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>> VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
>>> + if (vmx_mpx_supported())
>>> + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
>>>
>>> /* entry controls */
>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
>>> @@ -2292,6 +2298,8 @@ static __init void
>>> nested_vmx_setup_ctls_msrs(void)
>>> VM_ENTRY_LOAD_IA32_PAT;
>>> nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
>>> VM_ENTRY_LOAD_IA32_EFER);
>>> + if (vmx_mpx_supported())
>>> + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
>>>
>>> /* cpu-based controls */
>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
>>> @@ -7847,6 +7855,9 @@ static void prepare_vmcs02(struct kvm_vcpu
>>> *vcpu, struct vmcs12 *vmcs12)
>>>
>>> set_cr4_guest_host_mask(vmx);
>>>
>>> + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
>>> + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
>>> +
>>> if (vmcs12->cpu_based_vm_exec_control &
>>> CPU_BASED_USE_TSC_OFFSETING)
>>> vmcs_write64(TSC_OFFSET,
>>> vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
>>> @@ -8277,6 +8288,7 @@ static void prepare_vmcs12(struct kvm_vcpu
>>> *vcpu, struct vmcs12 *vmcs12,
>>> vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
>>> vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
>>> vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
>>> + vmcs12->guest_bndcfgs = vmcs_readl(GUEST_BNDCFGS);
>>
>> Can we read this value unconditionally, even when the host does not
>> support the feature?
>
> return -EWRONGPATCH;

Makes sense to clarify since I'll only be able to send the right patch
tomorrow.

I had not noticed this problem because vmcs_readl just returns a random
value if it fails and GUEST_BNDCFGS will not be vmcs_written.
x86/vmx.flat passes even with this bug.

However, this should be a vmcs_read64.

Paolo

>>>
>>> /* update exit information fields: */
>>>
>>> @@ -8386,6 +8398,10 @@ static void load_vmcs12_host_state(struct
>>> kvm_vcpu *vcpu,
>>> vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
>>> vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
>>>
>>> + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
>>> + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
>>> + vmcs_write64(GUEST_BNDCFGS, 0);
>>> +
>>> if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
>>> vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
>>> vcpu->arch.pat = vmcs12->host_ia32_pat;
>>>
>>
>> Do we also have a unit test to stress this? Or are we lacking silicon
>> with MPX and corresponding VMX features?
>
> No silicon yet.
>
> There is an emulator, but it is already slow enough without nested
> virtualization... it would be three-level virtualization :)
>
> Paolo
>

2014-02-25 18:50:14

by Jan Kiszka

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Add nested virtualization support for MPX

On 2014-02-25 19:13, Paolo Bonzini wrote:
> Il 25/02/2014 19:05, Jan Kiszka ha scritto:
>> On 2014-02-25 18:49, Paolo Bonzini wrote:
>>> This is simple to do, the "host" BNDCFGS is either 0 or the guest value.
>>> However, both controls have to be present. We cannot provide MPX if
>>> we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls.
>>>
>>> Signed-off-by: Paolo Bonzini <[email protected]>
>>> ---
>>> arch/x86/kvm/vmx.c | 16 ++++++++++++++++
>>> 1 file changed, 16 insertions(+)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 729b1e42aded..da28ac46ca88 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -202,6 +202,7 @@ struct __packed vmcs12 {
>>> u64 guest_pdptr1;
>>> u64 guest_pdptr2;
>>> u64 guest_pdptr3;
>>> + u64 guest_bndcfgs;
>>> u64 host_ia32_pat;
>>> u64 host_ia32_efer;
>>> u64 host_ia32_perf_global_ctrl;
>>> @@ -534,6 +535,7 @@ static const unsigned long
>>> shadow_read_write_fields[] = {
>>> GUEST_CS_LIMIT,
>>> GUEST_CS_BASE,
>>> GUEST_ES_BASE,
>>> + GUEST_BNDCFGS,
>>> CR0_GUEST_HOST_MASK,
>>> CR0_READ_SHADOW,
>>> CR4_READ_SHADOW,
>>> @@ -589,6 +591,7 @@ static const unsigned short
>>> vmcs_field_to_offset_table[] = {
>>> FIELD64(GUEST_PDPTR1, guest_pdptr1),
>>> FIELD64(GUEST_PDPTR2, guest_pdptr2),
>>> FIELD64(GUEST_PDPTR3, guest_pdptr3),
>>> + FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
>>> FIELD64(HOST_IA32_PAT, host_ia32_pat),
>>> FIELD64(HOST_IA32_EFER, host_ia32_efer),
>>> FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
>>> @@ -719,6 +722,7 @@ static unsigned long nested_ept_get_cr3(struct
>>> kvm_vcpu *vcpu);
>>> static u64 construct_eptp(unsigned long root_hpa);
>>> static void kvm_cpu_vmxon(u64 addr);
>>> static void kvm_cpu_vmxoff(void);
>>> +static bool vmx_mpx_supported(void);
>>> static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>>> static void vmx_set_segment(struct kvm_vcpu *vcpu,
>>> struct kvm_segment *var, int seg);
>>> @@ -2279,6 +2283,8 @@ static __init void
>>> nested_vmx_setup_ctls_msrs(void)
>>> }
>>> nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>> VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
>>> + if (vmx_mpx_supported())
>>> + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
>>>
>>> /* entry controls */
>>> rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
>>> @@ -2292,6 +2298,8 @@ static __init void
>>> nested_vmx_setup_ctls_msrs(void)
>>> VM_ENTRY_LOAD_IA32_PAT;
>>> nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
>>> VM_ENTRY_LOAD_IA32_EFER);
>>> + if (vmx_mpx_supported())
>>> + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
>>>
>>> /* cpu-based controls */
>>> rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
>>> @@ -7847,6 +7855,9 @@ static void prepare_vmcs02(struct kvm_vcpu
>>> *vcpu, struct vmcs12 *vmcs12)
>>>
>>> set_cr4_guest_host_mask(vmx);
>>>
>>> + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
>>> + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
>>> +
>>> if (vmcs12->cpu_based_vm_exec_control &
>>> CPU_BASED_USE_TSC_OFFSETING)
>>> vmcs_write64(TSC_OFFSET,
>>> vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
>>> @@ -8277,6 +8288,7 @@ static void prepare_vmcs12(struct kvm_vcpu
>>> *vcpu, struct vmcs12 *vmcs12,
>>> vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
>>> vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
>>> vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
>>> + vmcs12->guest_bndcfgs = vmcs_readl(GUEST_BNDCFGS);
>>
>> Can we read this value unconditionally, even when the host does not
>> support the feature?
>
> return -EWRONGPATCH;
>
>>>
>>> /* update exit information fields: */
>>>
>>> @@ -8386,6 +8398,10 @@ static void load_vmcs12_host_state(struct
>>> kvm_vcpu *vcpu,
>>> vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
>>> vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
>>>
>>> + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
>>> + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
>>> + vmcs_write64(GUEST_BNDCFGS, 0);
>>> +
>>> if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
>>> vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
>>> vcpu->arch.pat = vmcs12->host_ia32_pat;
>>>
>>
>> Do we also have a unit test to stress this? Or are we lacking silicon
>> with MPX and corresponding VMX features?
>
> No silicon yet.
>
> There is an emulator, but it is already slow enough without nested
> virtualization... it would be three-level virtualization :)

Cool! :) Maybe it will get faster again when we nest even deeper - no
one was there so far to tell us...

But even if not, the vmx unit tests are pretty short and should not
suffer much.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux