LinuxLists.cc - [PATCH v3 0/3] Per-VCPU MSR bitmaps patches

2018-01-31 21:10:43

Subject: [PATCH v3 0/3] Per-VCPU MSR bitmaps patches - topic branch for x86/pti

David and others,

the following changes since commit ba804bb4b72e57374b5f567b783aa0298fba0ce6:

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-01-26 09:03:16 -0800)

are available in the git repository at:

git://git.kernel.org/pub/scm/virt/kvm/kvm.git msr-bitmaps

for you to fetch changes up to 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6:

KVM: VMX: make MSR bitmaps per-VCPU (2018-01-26 22:59:32 +0100)

The patches are on top of Linus's tree and I checked that they apply okay
on top of the latest 4.14 tree as well as required for tip x86/pti.
One extra commit is needed that is pretty safe and would have been merged
for 4.16 anyway.

Radim, please pull this into kvm.git too. As before I've placed my
resolution on kvm.git, branch refs/heads/msr-bitmaps-merge-resolution.
It's based on kvm/next this time.

Thanks,

Paolo

Jim Mattson (1):
KVM: nVMX: Eliminate vmcs02 pool

Paolo Bonzini (2):
KVM: VMX: introduce alloc_loaded_vmcs
KVM: VMX: make MSR bitmaps per-VCPU

arch/x86/kvm/vmx.c | 440 ++++++++++++++++++++++-------------------------------
1 file changed, 186 insertions(+), 254 deletions(-)

--
2.14.3

2018-01-31 21:10:24

by Paolo Bonzini

[permalink] [raw]

Subject: [PATCH 2/3] KVM: VMX: introduce alloc_loaded_vmcs

Group together the calls to alloc_vmcs and loaded_vmcs_init. Soon we'll also
allocate an MSR bitmap there.

Cc: [email protected] # prereq for Spectre mitigation
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++--------------
1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad6a883b7a32..ab4b9bc99a52 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3829,11 +3829,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
return vmcs;
}

-static struct vmcs *alloc_vmcs(void)
-{
- return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
static void free_vmcs(struct vmcs *vmcs)
{
free_pages((unsigned long)vmcs, vmcs_config.order);
@@ -3852,6 +3847,22 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

+static struct vmcs *alloc_vmcs(void)
+{
+ return alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ loaded_vmcs->vmcs = alloc_vmcs();
+ if (!loaded_vmcs->vmcs)
+ return -ENOMEM;
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs_init(loaded_vmcs);
+ return 0;
+}
+
static void free_kvm_area(void)
{
int cpu;
@@ -7145,12 +7156,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ int r;

- vmx->nested.vmcs02.vmcs = alloc_vmcs();
- vmx->nested.vmcs02.shadow_vmcs = NULL;
- if (!vmx->nested.vmcs02.vmcs)
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+ if (r < 0)
goto out_vmcs02;
- loaded_vmcs_init(&vmx->nested.vmcs02);

if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
@@ -9545,13 +9555,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->guest_msrs)
goto free_pml;

- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx->loaded_vmcs->vmcs = alloc_vmcs();
- vmx->loaded_vmcs->shadow_vmcs = NULL;
- if (!vmx->loaded_vmcs->vmcs)
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (err < 0)
goto free_msrs;
- loaded_vmcs_init(vmx->loaded_vmcs);

+ vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
--
2.14.3

2018-01-31 21:10:48

by Paolo Bonzini

[permalink] [raw]

Subject: [PATCH 3/3] KVM: VMX: make MSR bitmaps per-VCPU

Place the MSR bitmap in struct loaded_vmcs, and update it in place
every time the x2apic or APICv state can change. This is rare and
the loop can handle 64 MSRs per iteration, in a similar fashion as
nested_vmx_prepare_msr_bitmap.

This prepares for choosing, on a per-VM basis, whether to intercept
the SPEC_CTRL and PRED_CMD MSRs.

Cc: [email protected] # prereq for Spectre mitigation
Suggested-by: Jim Mattson <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/vmx.c | 270 +++++++++++++++++++++++++++++------------------------
1 file changed, 147 insertions(+), 123 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ab4b9bc99a52..896af99a8606 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -111,6 +111,14 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+#define MSR_BITMAP_MODE_LM 4
+
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
@@ -209,6 +217,7 @@ struct loaded_vmcs {
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
struct list_head loaded_vmcss_on_cpu_link;
};

@@ -449,8 +458,6 @@ struct nested_vmx {
bool pi_pending;
u16 posted_intr_nv;

- unsigned long *msr_bitmap;
-
struct hrtimer preemption_timer;
bool preemption_timer_expired;

@@ -573,6 +580,7 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
u8 fail;
+ u8 msr_bitmap_mode;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
@@ -927,6 +935,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -946,12 +955,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
enum {
VMX_IO_BITMAP_A,
VMX_IO_BITMAP_B,
- VMX_MSR_BITMAP_LEGACY,
- VMX_MSR_BITMAP_LONGMODE,
- VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
- VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
- VMX_MSR_BITMAP_LEGACY_X2APIC,
- VMX_MSR_BITMAP_LONGMODE_X2APIC,
VMX_VMREAD_BITMAP,
VMX_VMWRITE_BITMAP,
VMX_BITMAP_NR
@@ -961,12 +964,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
-#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
-#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
-#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
-#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
-#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])

@@ -2564,36 +2561,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
vmx->guest_msrs[from] = tmp;
}

-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- unsigned long *msr_bitmap;
-
- if (is_guest_mode(vcpu))
- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
- else if (cpu_has_secondary_exec_ctrls() &&
- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
- } else {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
- }
- } else {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode;
- else
- msr_bitmap = vmx_msr_bitmap_legacy;
- }
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-}
-
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -2634,7 +2601,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs;

if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(&vmx->vcpu);
+ vmx_update_msr_bitmap(&vmx->vcpu);
}

/*
@@ -3844,6 +3811,8 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

@@ -3860,7 +3829,18 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)

loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs_init(loaded_vmcs);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+ }
return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
}

static void free_kvm_area(void)
@@ -4921,10 +4901,8 @@ static void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}

-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
{
int f = sizeof(unsigned long);

@@ -4958,6 +4936,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
}
}

+static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type, bool value)
+{
+ if (value)
+ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ else
+ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+}
+
/*
* If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -5004,30 +5026,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}

-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
{
- if (!longmode_only)
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
- msr, MSR_TYPE_R | MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
- msr, MSR_TYPE_R | MSR_TYPE_W);
+ u8 mode = 0;
+
+ if (cpu_has_secondary_exec_ctrls() &&
+ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode |= MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ }
+
+ if (is_long_mode(vcpu))
+ mode |= MSR_BITMAP_MODE_LM;
+
+ return mode;
}

-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+ u8 mode)
{
- if (apicv_active) {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
- msr, type);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
- msr, type);
- } else {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, type);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, type);
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+
+ if (mode & MSR_BITMAP_MODE_X2APIC) {
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ }
}
}

+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ u8 mode = vmx_msr_bitmap_mode(vcpu);
+ u8 changed = mode ^ vmx->msr_bitmap_mode;
+
+ if (!changed)
+ return;
+
+ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
+ !(mode & MSR_BITMAP_MODE_LM));
+
+ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+
+ vmx->msr_bitmap_mode = mode;
+}
+
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
return enable_apicv;
@@ -5277,7 +5339,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}

if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
}

static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -5464,7 +5526,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
}
if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

@@ -6747,7 +6809,7 @@ void vmx_enable_tdp(void)

static __init int hardware_setup(void)
{
- int r = -ENOMEM, i, msr;
+ int r = -ENOMEM, i;

rdmsrl_safe(MSR_EFER, &host_efer);

@@ -6767,9 +6829,6 @@ static __init int hardware_setup(void)

memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);

- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
goto out;
@@ -6838,42 +6897,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}

- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-
- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
- vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
- vmx_msr_bitmap_longmode, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_legacy_x2apic,
- vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic,
- vmx_msr_bitmap_longmode, PAGE_SIZE);
-
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

- for (msr = 0x800; msr <= 0x8ff; msr++) {
- if (msr == 0x839 /* TMCCT */)
- continue;
- vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
- }
-
- /*
- * TPR reads and writes can be virtualized even if virtual interrupt
- * delivery is not in use.
- */
- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
-
- /* EOI */
- vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
- /* SELF-IPI */
- vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
-
if (enable_ept)
vmx_enable_tdp();
else
@@ -7162,13 +7187,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_vmcs02;

- if (cpu_has_vmx_msr_bitmap()) {
- vmx->nested.msr_bitmap =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx->nested.msr_bitmap)
- goto out_msr_bitmap;
- }
-
vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
@@ -7195,9 +7213,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
- free_page((unsigned long)vmx->nested.msr_bitmap);
-
-out_msr_bitmap:
free_loaded_vmcs(&vmx->nested.vmcs02);

out_vmcs02:
@@ -7343,10 +7358,6 @@ static void free_nested(struct vcpu_vmx *vmx)
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- if (vmx->nested.msr_bitmap) {
- free_page((unsigned long)vmx->nested.msr_bitmap);
- vmx->nested.msr_bitmap = NULL;
- }
if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -8862,7 +8873,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);

- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
}

static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@@ -9523,6 +9534,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ unsigned long *msr_bitmap;
int cpu;

if (!vmx)
@@ -9559,6 +9571,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (err < 0)
goto free_msrs;

+ msr_bitmap = vmx->vmcs01.msr_bitmap;
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ vmx->msr_bitmap_mode = 0;
+
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -10022,7 +10043,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
struct page *page;
unsigned long *msr_bitmap_l1;
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;

/* This shortcut is ok because we support only x2APIC MSRs so far. */
if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
@@ -10599,6 +10620,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);

+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
if (enable_vpid) {
/*
* There is no direct mapping between vpid02 and vpid12, the
@@ -11397,7 +11421,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);

if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
--
2.14.3

2018-01-31 21:12:42

by Paolo Bonzini

[permalink] [raw]

Subject: [PATCH 1/3] KVM: nVMX: Eliminate vmcs02 pool

From: Jim Mattson <[email protected]>

The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.

Cc: [email protected] # prereq for Spectre mitigation
Signed-off-by: Jim Mattson <[email protected]>
Signed-off-by: Mark Kanda <[email protected]>
Reviewed-by: Ameya More <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
Reviewed-by: Paolo Bonzini <[email protected]>
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/vmx.c | 146 +++++++++--------------------------------------------
1 file changed, 23 insertions(+), 123 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c829d89e2e63..ad6a883b7a32 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -185,7 +185,6 @@ module_param(ple_window_max, int, S_IRUGO);
extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1

struct vmcs {
u32 revision_id;
@@ -226,7 +225,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
@@ -409,13 +408,6 @@ struct __packed vmcs12 {
*/
#define VMCS12_SIZE 0x1000

-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -440,15 +432,15 @@ struct nested_vmx {
*/
bool sync_shadow_vmcs;

- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
/*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -6973,94 +6965,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
}

-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_last_entry(&vmx->nested.vmcs02_pool,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- item->vmcs02.shadow_vmcs = NULL;
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
-
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- /*
- * Something will leak if the above WARN triggers. Better than
- * a use-after-free.
- */
- if (vmx->loaded_vmcs == &item->vmcs02)
- continue;
-
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- }
-}
-
/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified
@@ -7242,6 +7146,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;

+ vmx->nested.vmcs02.vmcs = alloc_vmcs();
+ vmx->nested.vmcs02.shadow_vmcs = NULL;
+ if (!vmx->nested.vmcs02.vmcs)
+ goto out_vmcs02;
+ loaded_vmcs_init(&vmx->nested.vmcs02);
+
if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -7264,9 +7174,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
}

- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7281,6 +7188,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
free_page((unsigned long)vmx->nested.msr_bitmap);

out_msr_bitmap:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
return -ENOMEM;
}

@@ -7434,7 +7344,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
- /* Unpin physical memory we referred to in current vmcs02 */
+ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
@@ -7450,7 +7360,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.pi_desc = NULL;
}

- nested_free_all_saved_vmcss(vmx);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}

/* Emulate the VMXOFF instruction */
@@ -7493,8 +7403,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero));

- nested_free_vmcs02(vmx, vmptr);
-
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -8406,10 +8314,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)

/*
* The host physical addresses of some pages of guest memory
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
- * may write to these pages via their host physical address while
- * L2 is running, bypassing any address-translation-based dirty
- * tracking (e.g. EPT write protection).
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
@@ -10903,20 +10812,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct loaded_vmcs *vmcs02;
u32 msr_entry_idx;
u32 exit_qual;

- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
enter_guest_mode(vcpu);

if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);

- vmx_switch_vmcs(vcpu, vmcs02);
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);

if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11534,10 +11438,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);

- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
--
2.14.3

2018-02-01 01:49:09

by Konrad Rzeszutek Wilk

[permalink] [raw]

Subject: Re: [PATCH v3 0/3] Per-VCPU MSR bitmaps patches - topic branch for x86/pti

On Wed, Jan 31, 2018 at 04:09:21PM -0500, Paolo Bonzini wrote:
> David and others,
>
> the following changes since commit ba804bb4b72e57374b5f567b783aa0298fba0ce6:
>
> Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-01-26 09:03:16 -0800)
>
> are available in the git repository at:
>
> git://git.kernel.org/pub/scm/virt/kvm/kvm.git msr-bitmaps
>
> for you to fetch changes up to 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6:
>
> KVM: VMX: make MSR bitmaps per-VCPU (2018-01-26 22:59:32 +0100)
>
> The patches are on top of Linus's tree and I checked that they apply okay
> on top of the latest 4.14 tree as well as required for tip x86/pti.
> One extra commit is needed that is pretty safe and would have been merged
> for 4.16 anyway.
>
> Radim, please pull this into kvm.git too. As before I've placed my
> resolution on kvm.git, branch refs/heads/msr-bitmaps-merge-resolution.
> It's based on kvm/next this time.

You don't want folks to review patch #2 and #3 first? There are tons
of people that I am sure would be thrilled with the whole Spectra circus.

>
> Thanks,
>
> Paolo
>
>
> Jim Mattson (1):
> KVM: nVMX: Eliminate vmcs02 pool
>
> Paolo Bonzini (2):
> KVM: VMX: introduce alloc_loaded_vmcs
> KVM: VMX: make MSR bitmaps per-VCPU
>
> arch/x86/kvm/vmx.c | 440 ++++++++++++++++++++++-------------------------------
> 1 file changed, 186 insertions(+), 254 deletions(-)
>
> --
> 2.14.3
>

2018-02-01 16:00:10

by Radim Krčmář

[permalink] [raw]

Subject: Re: [PATCH v3 0/3] Per-VCPU MSR bitmaps patches - topic branch for x86/pti

2018-01-31 20:48-0500, Konrad Rzeszutek Wilk:
> On Wed, Jan 31, 2018 at 04:09:21PM -0500, Paolo Bonzini wrote:
> > David and others,
> >
> > the following changes since commit ba804bb4b72e57374b5f567b783aa0298fba0ce6:
> >
> > Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-01-26 09:03:16 -0800)
> >
> > are available in the git repository at:
> >
> > git://git.kernel.org/pub/scm/virt/kvm/kvm.git msr-bitmaps
> >
> > for you to fetch changes up to 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6:
> >
> > KVM: VMX: make MSR bitmaps per-VCPU (2018-01-26 22:59:32 +0100)
> >
> > The patches are on top of Linus's tree and I checked that they apply okay
> > on top of the latest 4.14 tree as well as required for tip x86/pti.
> > One extra commit is needed that is pretty safe and would have been merged
> > for 4.16 anyway.
> >
> > Radim, please pull this into kvm.git too. As before I've placed my
> > resolution on kvm.git, branch refs/heads/msr-bitmaps-merge-resolution.
> > It's based on kvm/next this time.
>
> You don't want folks to review patch #2 and #3 first? There are tons
> of people that I am sure would be thrilled with the whole Spectra circus.

This is only a boring prerequisite that doesn't add any security ...
Jim Mattson gave r-b to [v1 3/3] and was a significant contributor to
the patches, David Hildenbrandt reviewed [v2 2,3/3] and there have been
only minor changes since.

v3 looks good to me, so I'll pull if there are no more requirements from
pti.