Thanks to the thorough review, I'm removing the RFC tag. The only
major remaining point of contention is the code duplication between
virt/kvm/kvm_main.c and arch/x86/kvm/smram.c.
As before, I have only tested these patches with Q35's high SMRAM
and with open SMRAM. Still, the TODO list has gotten smaller.
I still haven't tried using #defines for the offsets; the main obstacle
is the arithmetic needed to compute them.
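For reference, this is roughly what I have in mind (a sketch only; the
macro names are made up, the offsets are the ones used by
rsm_load_seg_32() in patch 8):

	/* hypothetical names, 32-bit SMM state save segment slots */
	#define SMRAM32_SEG_SEL(n)	(0x7fa8 + (n) * 4)
	#define SMRAM32_SEG_DESC(n)	((n) < 3 ? 0x7f84 + (n) * 12 : \
						   0x7f2c + ((n) - 3) * 12)
	#define SMRAM32_SEG_FLAGS(n)	(SMRAM32_SEG_DESC(n) + 0)
	#define SMRAM32_SEG_LIMIT(n)	(SMRAM32_SEG_DESC(n) + 4)
	#define SMRAM32_SEG_BASE(n)	(SMRAM32_SEG_DESC(n) + 8)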
Reviewed-by tags for individual patches are appreciated (especially for
patches 1 and 2, which I'd like to push separately).
Paolo
RFC->v1:
- two patches (Radim's kvm_write_guest_page optimization + my fix
for CR4.SMEP=1/CR0.WP=0) have already graduated to kvm/next
- one new patch ("kvm: x86: introduce num_emulated_msrs")
- fix pasto in do_get_msr
- introduced kvm_set_hflags
- replaced printk with tracepoints
- added smi.pending and smi.rsm_unmasks_nmi fields to kvm_vcpu_events,
to fix NMIs and add support for latching an SMI
- SMBASE is not guest-readable
- rebased on top of kvm/next so that SMBASE is not reset on INIT
- fix access rights (shift left by 8) in 32-bit SMM state save format
- disable if !unrestricted_guest && !emulate_invalid_guest_state
TODO:
- test on AMD
- test with true SMRAM support in QEMU (only affects patch 10)
Paolo Bonzini (12):
KVM: export __gfn_to_pfn_memslot, drop gfn_to_pfn_async
KVM: x86: introduce num_emulated_msrs
KVM: remove unnecessary arg from mark_page_dirty_in_slot, export it
KVM: x86: pass host_initiated to functions that read MSRs
KVM: x86: pass the whole hflags field to emulator and back
KVM: x86: API changes for SMM support
KVM: x86: stubs for SMM support
KVM: x86: save/load state on SMM switch
KVM: x86: add vcpu-specific functions to read/write/translate GFNs
KVM: x86: add SMM to the MMU role
KVM: x86: add KVM_MEM_X86_SMRAM memory slot flag
KVM: x86: advertise KVM_CAP_X86_SMM
Documentation/virtual/kvm/api.txt | 58 +++-
arch/x86/include/asm/kvm_emulate.h | 9 +-
arch/x86/include/asm/kvm_host.h | 42 ++-
arch/x86/include/asm/vmx.h | 1 +
arch/x86/include/uapi/asm/kvm.h | 14 +-
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/cpuid.h | 8 +
arch/x86/kvm/emulate.c | 262 +++++++++++++++++-
arch/x86/kvm/kvm_cache_regs.h | 5 +
arch/x86/kvm/lapic.c | 26 +-
arch/x86/kvm/mmu.c | 20 +-
arch/x86/kvm/paging_tmpl.h | 8 +-
arch/x86/kvm/smram.c | 229 ++++++++++++++++
arch/x86/kvm/svm.c | 69 +++--
arch/x86/kvm/trace.h | 22 ++
arch/x86/kvm/vmx.c | 80 +++---
arch/x86/kvm/x86.c | 543 +++++++++++++++++++++++++++++++------
include/linux/kvm_host.h | 20 +-
include/uapi/linux/kvm.h | 5 +-
virt/kvm/kvm_main.c | 44 ++-
20 files changed, 1223 insertions(+), 244 deletions(-)
create mode 100644 arch/x86/kvm/smram.c
--
1.8.3.1
gfn_to_pfn_async is used in just one place, and because of x86-specific
treatment that place will need to look at the memory slot. Hence inline
it into try_async_pf and export __gfn_to_pfn_memslot.
The patch also switches the subsequent call to gfn_to_pfn_prot to use
__gfn_to_pfn_memslot. For now this is just a small optimization, but
having a memslot argument will also be useful when implementing SMRAM
(which will need an x86-specific function for gfn-to-memslot conversion).
Finally, remove the now-unused async argument of __gfn_to_pfn.
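As a teaser for the SMRAM part, this is the kind of thing the export
enables (a sketch only, not part of this patch; x86_gfn_to_memslot() is
a placeholder for the x86-specific lookup added later in the series):

	static pfn_t x86_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
				    bool write, bool *writable)
	{
		/* SMRAM-aware, per-vcpu slot lookup (placeholder name) */
		struct kvm_memory_slot *slot = x86_gfn_to_memslot(vcpu, gfn);

		return __gfn_to_pfn_memslot(slot, gfn, false, NULL,
					    write, writable);
	}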
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/mmu.c | 9 +++++----
include/linux/kvm_host.h | 4 ++--
virt/kvm/kvm_main.c | 26 ++++++++------------------
3 files changed, 15 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 209fe1477465..371109546382 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3475,10 +3475,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
+ struct kvm_memory_slot *slot;
bool async;
- *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ async = false;
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
return false; /* *pfn has correct page already */
@@ -3492,8 +3494,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
- *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
return false;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7a08cd6f4a8..87fd74a04005 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -539,13 +539,13 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
- bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f202c4035134..bd3c08a7c6c2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1355,9 +1355,8 @@ exit:
return pfn;
}
-static pfn_t
-__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
- bool *async, bool write_fault, bool *writable)
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
@@ -1376,44 +1375,35 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
return hva_to_pfn(addr, atomic, async, write_fault,
writable);
}
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic,
bool write_fault, bool *writable)
{
struct kvm_memory_slot *slot;
- if (async)
- *async = false;
-
slot = gfn_to_memslot(kvm, gfn);
- return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, NULL, write_fault,
writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
- return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
+ return __gfn_to_pfn(kvm, gfn, true, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
- bool write_fault, bool *writable)
-{
- return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
- return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+ return __gfn_to_pfn(kvm, gfn, false, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
- return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+ return __gfn_to_pfn(kvm, gfn, false, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
--
1.8.3.1
We will want to filter away MSR_IA32_SMBASE from the emulated_msrs if
the host CPU does not support SMM virtualization. Introduce the
logic to do that, and also move paravirt MSRs to emulated_msrs for
simplicity and to get rid of KVM_SAVE_MSRS_BEGIN.
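For example, the SMBASE filtering will slot into the placeholder switch
that this patch adds to kvm_init_msr_list().  Sketch only; smm_supported()
stands in for whatever check the final patch ends up using:

	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
		switch (emulated_msrs[i]) {
		case MSR_IA32_SMBASE:
			if (!smm_supported())	/* hypothetical helper */
				continue;	/* drop the MSR from the list */
			break;
		default:
			break;
		}

		if (j < i)
			emulated_msrs[j] = emulated_msrs[i];
		j++;
	}
	num_emulated_msrs = j;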
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/x86.c | 40 +++++++++++++++++++++++++++-------------
1 file changed, 27 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdccbe1749a5..7b894f2523f6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -925,17 +925,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in the beginning of the list.
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
*/
-#define KVM_SAVE_MSRS_BEGIN 12
static u32 msrs_to_save[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -947,7 +941,14 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
-static const u32 emulated_msrs[] = {
+static u32 emulated_msrs[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
+
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
@@ -955,6 +956,8 @@ static const u32 emulated_msrs[] = {
MSR_IA32_MCG_CTL,
};
+static unsigned num_emulated_msrs;
+
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits)
@@ -2864,7 +2867,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
goto out;
n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
@@ -2876,7 +2879,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ num_emulated_msrs * sizeof(u32)))
goto out;
r = 0;
break;
@@ -4142,8 +4145,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i, j;
- /* skip the first msrs in the list. KVM-specific */
- for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
@@ -4168,6 +4170,18 @@ static void kvm_init_msr_list(void)
j++;
}
num_msrs_to_save = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+ switch (emulated_msrs[i]) {
+ default:
+ break;
+ }
+
+ if (j < i)
+ emulated_msrs[j] = emulated_msrs[i];
+ j++;
+ }
+ num_emulated_msrs = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
--
1.8.3.1
Reviewed-by: Marcelo Tosatti <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 14 ++++++--------
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 87fd74a04005..2946ec3f1b81 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -573,6 +573,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bd3c08a7c6c2..0fcc5d28f3a9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -103,8 +103,6 @@ static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
static void kvm_release_pfn_dirty(pfn_t pfn);
-static void mark_page_dirty_in_slot(struct kvm *kvm,
- struct kvm_memory_slot *memslot, gfn_t gfn);
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -1590,7 +1588,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
r = __copy_to_user((void __user *)addr + offset, data, len);
if (r)
return -EFAULT;
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ mark_page_dirty_in_slot(memslot, gfn);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
@@ -1673,7 +1671,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
r = __copy_to_user((void __user *)ghc->hva, data, len);
if (r)
return -EFAULT;
- mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+ mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
return 0;
}
@@ -1731,9 +1729,8 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
-static void mark_page_dirty_in_slot(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- gfn_t gfn)
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+ gfn_t gfn)
{
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
@@ -1741,13 +1738,14 @@ static void mark_page_dirty_in_slot(struct kvm *kvm,
set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
+EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *memslot;
memslot = gfn_to_memslot(kvm, gfn);
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ mark_page_dirty_in_slot(memslot, gfn);
}
EXPORT_SYMBOL_GPL(mark_page_dirty);
--
1.8.3.1
SMBASE is only readable from SMM for the VCPU, but it must always be
accessible if userspace is accessing it.  Thus, all functions that
read MSRs are changed to accept a struct msr_data; the host_initiated
and index fields are pre-initialized, while the data field is filled
in on return.
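In other words, a host-initiated read now looks like this (illustration
only; the wrapper name is made up and MSR_IA32_SMBASE itself only
appears later in the series):

	static int read_smbase_for_userspace(struct kvm_vcpu *vcpu, u64 *value)
	{
		struct msr_data msr;
		int r;

		msr.index = MSR_IA32_SMBASE;	/* introduced later in the series */
		msr.host_initiated = true;	/* false for a guest RDMSR */
		r = kvm_get_msr(vcpu, &msr);
		if (r)
			return r;

		*value = msr.data;		/* filled in by the vendor code */
		return 0;
	}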
Signed-off-by: Paolo Bonzini <[email protected]>
--
RFC->v1: fix pasto in do_get_msr
---
arch/x86/include/asm/kvm_host.h | 6 +--
arch/x86/kvm/svm.c | 54 ++++++++++----------
arch/x86/kvm/vmx.c | 62 ++++++++++++-----------
arch/x86/kvm/x86.c | 106 ++++++++++++++++++++++++----------------
4 files changed, 127 insertions(+), 101 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8b661d1946b5..6487ffc95b0a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -719,7 +719,7 @@ struct kvm_x86_ops {
void (*vcpu_put)(struct kvm_vcpu *vcpu);
void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
@@ -938,7 +938,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
struct x86_emulate_ctxt;
@@ -967,7 +967,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8954d7a07703..24282bc6580c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3069,42 +3069,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
svm_scale_tsc(vcpu, host_tsc);
}
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_svm *svm = to_svm(vcpu);
- switch (ecx) {
+ switch (msr_info->index) {
case MSR_IA32_TSC: {
- *data = svm->vmcb->control.tsc_offset +
+ msr_info->data = svm->vmcb->control.tsc_offset +
svm_scale_tsc(vcpu, native_read_tsc());
break;
}
case MSR_STAR:
- *data = svm->vmcb->save.star;
+ msr_info->data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
+ msr_info->data = svm->vmcb->save.lstar;
break;
case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
+ msr_info->data = svm->vmcb->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
+ msr_info->data = svm->vmcb->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
+ msr_info->data = svm->vmcb->save.sfmask;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
+ msr_info->data = svm->vmcb->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- *data = svm->sysenter_eip;
+ msr_info->data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
- *data = svm->sysenter_esp;
+ msr_info->data = svm->sysenter_esp;
break;
/*
* Nobody will change the following 5 values in the VMCB so we can
@@ -3112,31 +3112,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
* implemented.
*/
case MSR_IA32_DEBUGCTLMSR:
- *data = svm->vmcb->save.dbgctl;
+ msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- *data = svm->vmcb->save.br_from;
+ msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
- *data = svm->vmcb->save.br_to;
+ msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP:
- *data = svm->vmcb->save.last_excp_from;
+ msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP:
- *data = svm->vmcb->save.last_excp_to;
+ msr_info->data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
- *data = svm->nested.hsave_msr;
+ msr_info->data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
- *data = svm->nested.vm_cr_msr;
+ msr_info->data = svm->nested.vm_cr_msr;
break;
case MSR_IA32_UCODE_REV:
- *data = 0x01000065;
+ msr_info->data = 0x01000065;
break;
default:
- return kvm_get_msr_common(vcpu, ecx, data);
+ return kvm_get_msr_common(vcpu, msr_info);
}
return 0;
}
@@ -3144,16 +3144,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
static int rdmsr_interception(struct vcpu_svm *svm)
{
u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
- u64 data;
+ struct msr_data msr_info;
- if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (svm_get_msr(&svm->vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
} else {
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(ecx, msr_info.data);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
+ msr_info.data & 0xffffffff);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
+ msr_info.data >> 32);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5e8d41bccc26..bf961951786a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2622,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- u64 data;
struct shared_msr_entry *msr;
- if (!pdata) {
- printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
- return -EINVAL;
- }
-
- switch (msr_index) {
+ switch (msr_info->index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
- data = vmcs_readl(GUEST_FS_BASE);
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
- data = vmcs_readl(GUEST_GS_BASE);
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
vmx_load_host_state(to_vmx(vcpu));
- data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
- return kvm_get_msr_common(vcpu, msr_index, pdata);
+ return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- data = guest_read_tsc();
+ msr_info->data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
- data = vmcs_read32(GUEST_SYSENTER_CS);
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
- data = vmcs_readl(GUEST_SYSENTER_EIP);
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
- data = vmcs_readl(GUEST_SYSENTER_ESP);
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
if (!vmx_mpx_supported())
return 1;
- data = vmcs_read64(GUEST_BNDCFGS);
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
case MSR_IA32_FEATURE_CONTROL:
if (!nested_vmx_allowed(vcpu))
return 1;
- data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+ return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
- data = vcpu->arch.ia32_xss;
+ msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
/* Otherwise falls through */
default:
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
+ msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
if (msr) {
- data = msr->data;
+ msr_info->data = msr->data;
break;
}
- return kvm_get_msr_common(vcpu, msr_index, pdata);
+ return kvm_get_msr_common(vcpu, msr_info);
}
- *pdata = data;
return 0;
}
@@ -5473,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
static int handle_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data;
+ struct msr_data msr_info;
- if (vmx_get_msr(vcpu, ecx, &data)) {
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (vmx_get_msr(vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
}
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(ecx, msr_info.data);
/* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
skip_emulated_instruction(vcpu);
return 1;
}
@@ -9150,6 +9145,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
struct vmx_msr_entry e;
for (i = 0; i < count; i++) {
+ struct msr_data msr_info;
if (kvm_read_guest(vcpu->kvm,
gpa + i * sizeof(e),
&e, 2 * sizeof(u32))) {
@@ -9164,7 +9160,9 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
return -EINVAL;
}
- if (kvm_get_msr(vcpu, e.index, &e.value)) {
+ msr_info.host_initiated = false;
+ msr_info.index = e.index;
+ if (kvm_get_msr(vcpu, &msr_info)) {
pr_warn_ratelimited(
"%s cannot read MSR (%u, 0x%x)\n",
__func__, i, e.index);
@@ -9173,10 +9171,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
if (kvm_write_guest(vcpu->kvm,
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
- &e.value, sizeof(e.value))) {
+ &msr_info.data, sizeof(msr_info.data))) {
pr_warn_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
- __func__, i, e.index, e.value);
+ __func__, i, e.index, msr_info.data);
return -EINVAL;
}
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7b894f2523f6..52df3ac6a6bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1051,6 +1051,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = index;
+ msr.host_initiated = true;
+ r = kvm_get_msr(vcpu, &msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+ return 0;
+}
+
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct msr_data msr;
@@ -2384,9 +2399,9 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common);
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+ return kvm_x86_ops->get_msr(vcpu, msr);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
@@ -2523,11 +2538,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 data;
- switch (msr) {
+ switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -2550,26 +2565,26 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
case MSR_AMD64_BU_CFG2:
- data = 0;
+ msr_info->data = 0;
break;
case MSR_P6_PERFCTR0:
case MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0:
case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
- data = 0;
+ if (kvm_pmu_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
- data = 0x100000000ULL;
+ msr_info->data = 0x100000000ULL;
break;
case MSR_MTRRcap:
- data = 0x500 | KVM_NR_VAR_MTRR;
+ msr_info->data = 0x500 | KVM_NR_VAR_MTRR;
break;
case 0x200 ... 0x2ff:
- return get_msr_mtrr(vcpu, msr, pdata);
+ return get_msr_mtrr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
- data = 3;
+ msr_info->data = 3;
break;
/*
* MSR_EBC_FREQUENCY_ID
@@ -2583,48 +2598,48 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* multiplying by zero otherwise.
*/
case MSR_EBC_FREQUENCY_ID:
- data = 1 << 24;
+ msr_info->data = 1 << 24;
break;
case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
+ msr_info->data = kvm_get_apic_base(vcpu);
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
- return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
- data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
- data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
- data = vcpu->arch.ia32_misc_enable_msr;
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
- data = 1000ULL;
+ msr_info->data = 1000ULL;
/* CPU multiplier */
- data |= (((uint64_t)4ULL) << 40);
+ msr_info->data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.efer;
+ msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
case MSR_KVM_WALL_CLOCK_NEW:
- data = vcpu->kvm->arch.wall_clock;
+ msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
case MSR_KVM_SYSTEM_TIME_NEW:
- data = vcpu->arch.time;
+ msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
- data = vcpu->arch.apf.msr_val;
+ msr_info->data = vcpu->arch.apf.msr_val;
break;
case MSR_KVM_STEAL_TIME:
- data = vcpu->arch.st.msr_val;
+ msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
- data = vcpu->arch.pv_eoi.msr_val;
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
@@ -2632,7 +2647,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return get_msr_mce(vcpu, msr, pdata);
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -2643,17 +2658,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* type 6, model 8 and higher from exploding due to
* the rdmsr failing.
*/
- data = 0x20000000;
+ msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr)) {
+ if (kvm_hv_msr_partition_wide(msr_info->index)) {
int r;
mutex_lock(&vcpu->kvm->lock);
- r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data);
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
- return get_msr_hyperv(vcpu, msr, pdata);
+ return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
@@ -2666,31 +2681,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* L2 cache control register 3: 64GB range, 256KB size,
* enabled, latency 0x1, configured
*/
- data = 0xbe702111;
+ msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has_osvw(vcpu))
return 1;
- data = vcpu->arch.osvw.length;
+ msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
if (!guest_cpuid_has_osvw(vcpu))
return 1;
- data = vcpu->arch.osvw.status;
+ msr_info->data = vcpu->arch.osvw.status;
break;
default:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
+ if (kvm_pmu_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
if (!ignore_msrs) {
- vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
return 1;
} else {
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
- data = 0;
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
+ msr_info->data = 0;
}
break;
}
- *pdata = data;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
@@ -3461,7 +3475,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
break;
case KVM_SET_MSRS:
r = msr_io(vcpu, argp, do_set_msr, 0);
@@ -4992,7 +5006,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ struct msr_data msr;
+ int r;
+
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
+ if (r)
+ return r;
+
+ *pdata = msr.data;
+ return 0;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
--
1.8.3.1
The hflags field will contain information about system management mode
and will be useful for the emulator. Pass the entire field rather than
just the guest-mode information.
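A sketch of what this buys us: any HF_*-derived flag can be tested
directly in the emulator instead of growing a new bool per flag
(X86EMUL_SMM_MASK is only added later in the series):

	static bool emul_in_smm(struct x86_emulate_ctxt *ctxt)
	{
		/* mirrors HF_SMM_MASK, see the BUILD_BUG_ONs in x86.c */
		return ctxt->emul_flags & X86EMUL_SMM_MASK;
	}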
Signed-off-by: Paolo Bonzini <[email protected]>
---
RFC->v1: introduce kvm_set_hflags
---
arch/x86/include/asm/kvm_emulate.h | 5 ++++-
arch/x86/kvm/emulate.c | 6 +++---
arch/x86/kvm/x86.c | 10 +++++++++-
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 57a9d94fe160..7410879a41f7 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -262,6 +262,9 @@ enum x86emul_mode {
X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
};
+/* These match some of the HF_* flags defined in kvm_host.h */
+#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+
struct x86_emulate_ctxt {
const struct x86_emulate_ops *ops;
@@ -273,8 +276,8 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+ int emul_flags;
- bool guest_mode; /* guest running a nested guest */
bool perm_ok; /* do not check permissions if true */
bool ud; /* inject an #UD if host doesn't support insn */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5839fc56cb3e..e82a559df21a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4871,7 +4871,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(ctxt, &ctxt->dst);
}
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4900,7 +4900,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4953,7 +4953,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 52df3ac6a6bc..7d311a0de84c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5176,7 +5176,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
- ctxt->guest_mode = is_guest_mode(vcpu);
+ BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+ ctxt->emul_flags = vcpu->arch.hflags;
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5345,6 +5346,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+ vcpu->arch.hflags = emul_flags;
+}
+
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -5544,6 +5550,8 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ if (vcpu->arch.hflags != ctxt->emul_flags)
+ kvm_set_hflags(vcpu, ctxt->emul_flags);
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
--
1.8.3.1
This patch includes changes to the external API for SMM support.
All the changes are predicated on the availability of a new
capability, KVM_CAP_X86_SMM, which is added at the end of the
patch series.
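To give an idea of how userspace is expected to use this (illustration
only, error handling kept minimal; vcpu_fd is an already-created vCPU
file descriptor):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int inject_and_check_smi(int vcpu_fd)
	{
		struct kvm_vcpu_events events;

		if (ioctl(vcpu_fd, KVM_SMI) < 0)	/* queue an SMI */
			return -1;

		/* The SMI is taken on the next KVM_RUN; afterwards either
		 * kvm_run->flags & KVM_RUN_X86_SMM or events.smi.smm
		 * reports that the vCPU is in system management mode.
		 */
		if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
			return -1;

		return events.smi.smm;
	}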
Signed-off-by: Paolo Bonzini <[email protected]>
---
RFC->v1: add smi.pending and smi.rsm_unmasks_nmi fields, reduce padding
in struct kvm_vcpu_events; remove memset of events struct,
instead zero smi.pad. KVM_CAP_X86_SMM frozen at 117.
---
Documentation/virtual/kvm/api.txt | 40 +++++++++++++++++++++++++++++++++------
arch/x86/include/asm/kvm_host.h | 3 +++
arch/x86/include/uapi/asm/kvm.h | 11 ++++++++++-
arch/x86/kvm/kvm_cache_regs.h | 5 +++++
arch/x86/kvm/x86.c | 34 +++++++++++++++++++++++++++++++--
include/uapi/linux/kvm.h | 5 ++++-
6 files changed, 88 insertions(+), 10 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 695544420ff2..51523b70b6b2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -820,11 +820,21 @@ struct kvm_vcpu_events {
} nmi;
__u32 sipi_vector;
__u32 flags;
+ struct {
+ __u8 smm;
+ __u8 pending;
+ __u8 smm_inside_nmi;
+ __u8 pad;
+ } smi;
};
-KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
-interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+Only two fields are defined in the flags field:
+
+- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+ interrupt.shadow contains a valid state.
+- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
+ smi contains a valid state.
4.32 KVM_SET_VCPU_EVENTS
@@ -841,17 +851,20 @@ vcpu.
See KVM_GET_VCPU_EVENTS for the data structure.
Fields that may be modified asynchronously by running VCPUs can be excluded
-from the update. These fields are nmi.pending and sipi_vector. Keep the
-corresponding bits in the flags field cleared to suppress overwriting the
-current in-kernel state. The bits are:
+from the update. These fields are nmi.pending, sipi_vector, smi.smm,
+smi.pending. Keep the corresponding bits in the flags field cleared to
+suppress overwriting the current in-kernel state. The bits are:
KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
+KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct.
If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
the flags field to signal that interrupt.shadow contains a valid state and
shall be written into the VCPU.
+KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
+
4.33 KVM_GET_DEBUGREGS
@@ -2979,6 +2992,16 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
which is the maximum number of possibly pending cpu-local interrupts.
+4.90 KVM_SMI
+
+Capability: KVM_CAP_X86_SMM
+Architectures: x86
+Type: vcpu ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+
+Queues an SMI on the thread's vcpu.
+
5. The kvm_run structure
------------------------
@@ -3014,7 +3037,12 @@ an interrupt can be injected now with KVM_INTERRUPT.
The value of the current interrupt flag. Only valid if in-kernel
local APIC is not used.
- __u8 padding2[2];
+ __u16 flags;
+
+More architecture-specific flags detailing state of the VCPU that may
+affect the device's behavior. The only currently defined flag is
+KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the
+VCPU is in system management mode.
/* in (pre_kvm_run), out (post_kvm_run) */
__u64 cr8;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6487ffc95b0a..a1bef695dc99 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -469,6 +469,7 @@ struct kvm_vcpu_arch {
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
+ bool smi_pending; /* SMI queued after currently running handler */
struct mtrr_state_type mtrr_state;
u64 pat;
@@ -1112,6 +1113,8 @@ enum {
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define HF_SMM_MASK (1 << 6)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
/*
* Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 2fec75e4b1e1..30100a3c1bed 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,8 @@ struct kvm_ioapic_state {
#define KVM_IRQCHIP_IOAPIC 2
#define KVM_NR_IRQCHIPS 3
+#define KVM_RUN_X86_SMM (1 << 0)
+
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
@@ -281,6 +283,7 @@ struct kvm_reinject_control {
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+#define KVM_VCPUEVENT_VALID_SMM 0x00000008
/* Interrupt shadow states */
#define KVM_X86_SHADOW_INT_MOV_SS 0x01
@@ -309,7 +312,13 @@ struct kvm_vcpu_events {
} nmi;
__u32 sipi_vector;
__u32 flags;
- __u32 reserved[10];
+ struct {
+ __u8 smm;
+ __u8 pending;
+ __u8 smm_inside_nmi;
+ __u8 pad;
+ } smi;
+ __u32 reserved[9];
};
/* for KVM_GET/SET_DEBUGREGS */
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 544076c4f44b..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7d311a0de84c..8015c67a6d07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3037,6 +3037,11 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -3142,8 +3147,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->sipi_vector = 0; /* never valid when reporting to user space */
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+ events->smi.pad = 0;
+
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SHADOW);
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
memset(&events->reserved, 0, sizeof(events->reserved));
}
@@ -3152,7 +3164,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW))
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM))
return -EINVAL;
process_nmi(vcpu);
@@ -3177,6 +3190,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_vcpu_has_lapic(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ if (events->smi.smm)
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ vcpu->arch.smi_pending = events->smi.pending;
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
@@ -3436,6 +3461,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_nmi(vcpu);
break;
}
+ case KVM_SMI: {
+ r = kvm_vcpu_ioctl_smi(vcpu);
+ break;
+ }
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
@@ -6118,6 +6147,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 75bd9f7fd846..eace8babd227 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -202,7 +202,7 @@ struct kvm_run {
__u32 exit_reason;
__u8 ready_for_interrupt_injection;
__u8 if_flag;
- __u8 padding2[2];
+ __u16 flags;
/* in (pre_kvm_run), out (post_kvm_run) */
__u64 cr8;
@@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_S390_IRQ_STATE 114
#define KVM_CAP_PPC_HWRNG 115
#define KVM_CAP_DISABLE_QUIRKS 116
+#define KVM_CAP_X86_SMM 117
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_S390_IRQ_STATE */
#define KVM_S390_SET_IRQ_STATE _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
+/* Available with KVM_CAP_X86_SMM */
+#define KVM_SMI _IO(KVMIO, 0xb7)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
--
1.8.3.1
This patch adds the interface between x86.c and the emulator: the
SMBASE register, a new emulator flag, the RSM instruction. It also
adds a new request bit that will be used by the KVM_SMI ioctl.
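The intended flow, in short (the second half only becomes real with the
next patch): an SMI that arrives while the vCPU is already in SMM is
latched in smi_pending, and kvm_set_hflags() re-raises it when RSM
clears HF_SMM_MASK:

	/* process_smi(), called from vcpu_enter_guest() on KVM_REQ_SMI */
	if (is_smm(vcpu)) {
		vcpu->arch.smi_pending = true;		/* latch it */
		return;
	}

	/* kvm_set_hflags(), called when the emulator changes hflags (RSM) */
	if (is_smm(vcpu) && !(emul_flags & HF_SMM_MASK) &&
	    unlikely(vcpu->arch.smi_pending)) {
		kvm_make_request(KVM_REQ_SMI, vcpu);	/* replay latched SMI */
		vcpu->arch.smi_pending = 0;
	}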
Signed-off-by: Paolo Bonzini <[email protected]>
--
RFC->v1: make SMBASE host-readable only
add support for latching an SMI
do not reset SMBASE on INIT
---
arch/x86/include/asm/kvm_emulate.h | 4 +++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/asm/vmx.h | 1 +
arch/x86/kvm/emulate.c | 10 ++++++-
arch/x86/kvm/lapic.c | 4 ++-
arch/x86/kvm/svm.c | 1 +
arch/x86/kvm/vmx.c | 2 +-
arch/x86/kvm/x86.c | 54 +++++++++++++++++++++++++++++++++++++-
include/linux/kvm_host.h | 1 +
9 files changed, 74 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7410879a41f7..e16466ec473c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,8 @@ struct x86_emulate_ops {
int (*cpl)(struct x86_emulate_ctxt *ctxt);
int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
+ void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -264,6 +266,8 @@ enum x86emul_mode {
/* These match some of the HF_* flags defined in kvm_host.h */
#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define X86EMUL_SMM_MASK (1 << 6)
+#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
struct x86_emulate_ctxt {
const struct x86_emulate_ops *ops;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a1bef695dc99..fc05c7dda9f1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -367,6 +367,7 @@ struct kvm_vcpu_arch {
int32_t apic_arb_prio;
int mp_state;
u64 ia32_misc_enable_msr;
+ u64 smbase;
bool tpr_access_reporting;
u64 ia32_xss;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da772edd19ab..e8f998114df0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -108,6 +108,7 @@
#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
+#define VMX_MISC_IA32_SMBASE_MSR 0x00008000
/* VMCS Encodings */
enum vmcs_field {
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e82a559df21a..c5a6a407afba 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2262,6 +2262,14 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+ return emulate_ud(ctxt);
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
static void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
@@ -4173,7 +4181,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- DI(ImplicitOps, rsm),
+ II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dc5b57bb1f76..fe26f2051979 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -799,7 +799,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_SMI:
- apic_debug("Ignoring guest SMI\n");
+ result = 1;
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_NMI:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 24282bc6580c..6afffb82513c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3394,6 +3394,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_RSM] = emulate_on_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf961951786a..121a653919c0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2504,7 +2504,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
vmx->nested.nested_vmx_misc_low |=
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
- VMX_MISC_ACTIVITY_HLT;
+ VMX_MISC_ACTIVITY_HLT | VMX_MISC_IA32_SMBASE_MSR;
vmx->nested.nested_vmx_misc_high = 0;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8015c67a6d07..2bf5bc4ed00f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -954,6 +954,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_SMBASE,
};
static unsigned num_emulated_msrs;
@@ -2220,6 +2221,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
@@ -2615,6 +2621,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MISC_ENABLE:
msr_info->data = vcpu->arch.ia32_misc_enable_msr;
break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
+ break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
msr_info->data = 1000ULL;
@@ -3039,6 +3050,8 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+
return 0;
}
@@ -5059,6 +5072,20 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.smbase = smbase;
+}
+
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
@@ -5144,6 +5171,8 @@ static const struct x86_emulate_ops emulate_ops = {
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
+ .get_smbase = emulator_get_smbase,
+ .set_smbase = emulator_set_smbase,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
@@ -5206,6 +5235,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+ BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
ctxt->emul_flags = vcpu->arch.hflags;
init_decode_cache(ctxt);
@@ -5377,6 +5408,13 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu);
void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
{
+ if (is_smm(vcpu) && (emul_flags & HF_SMM_MASK) == 0) {
+ if (unlikely(vcpu->arch.smi_pending)) {
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ vcpu->arch.smi_pending = 0;
+ }
+ }
+
vcpu->arch.hflags = emul_flags;
}
@@ -6271,6 +6309,16 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ if (is_smm(vcpu)) {
+ vcpu->arch.smi_pending = true;
+ return;
+ }
+
+ printk_once(KERN_DEBUG "Ignoring guest SMI\n");
+}
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
@@ -6377,6 +6425,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@ -7276,8 +7326,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (!init_event)
+ if (!init_event) {
kvm_pmu_reset(vcpu);
+ vcpu->arch.smbase = 0x30000;
+ }
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2946ec3f1b81..19d09a08885b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,7 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_ENABLE_IBS 23
#define KVM_REQ_DISABLE_IBS 24
#define KVM_REQ_APIC_PAGE_RELOAD 25
+#define KVM_REQ_SMI 26
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
--
1.8.3.1
The big ugly one. This patch adds support for switching in and out of
system management mode, upon receiving KVM_REQ_SMI and upon executing
an RSM instruction respectively. Both 32- and 64-bit formats are
supported for the SMM state save area.
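One worked example of the offset arithmetic in the 32-bit path (also the
reason I'd eventually like #defines, see the cover letter): for n = 4
(FS, if I haven't miscounted the VCPU_SREG_* order) the loads in
rsm_load_seg_32() resolve to

	selector:   0x7fa8 + 4*4       = 0x7fb8
	descriptor: 0x7f2c + (4-3)*12  = 0x7f38 (flags),
	                                 0x7f3c (limit, desc + 4),
	                                 0x7f40 (base,  desc + 8)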
Signed-off-by: Paolo Bonzini <[email protected]>
---
RFC->v1: shift access rights left by 8 for 32-bit format
move tracepoint to kvm_set_hflags
fix NMI handling
---
arch/x86/kvm/cpuid.h | 8 ++
arch/x86/kvm/emulate.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/trace.h | 22 +++++
arch/x86/kvm/x86.c | 225 +++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 501 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c3b1ad9fca81..02b67616435d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
+static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_LM));
+}
+
static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c5a6a407afba..b278ea09ed80 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2262,12 +2262,258 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000001;
+ ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ return edx & bit(X86_FEATURE_LM);
+}
+
+#define get_smstate(type, smbase, offset) \
+ ({ \
+ type __val; \
+ int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
+ sizeof(__val), NULL); \
+ if (r != X86EMUL_CONTINUE) \
+ return X86EMUL_UNHANDLEABLE; \
+ __val; \
+ })
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->d = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->p = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+
+ selector = get_smstate(u32, smbase, 0x7fa8 + n * 4);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ set_desc_base(&desc, get_smstate(u32, smbase, offset + 8));
+ set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
+ rsm_set_desc_flags(&desc, get_smstate(u32, smbase, offset));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+ u32 base3;
+
+ offset = 0x7e00 + n * 16;
+
+ selector = get_smstate(u16, smbase, offset);
+ rsm_set_desc_flags(&desc, get_smstate(u16, smbase, offset + 2) << 8);
+ set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
+ set_desc_base(&desc, get_smstate(u32, smbase, offset + 8));
+ base3 = get_smstate(u32, smbase, offset + 12);
+
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ u64 cr0, u64 cr4)
+{
+ int bad;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u16 selector;
+ u32 val, cr0, cr4;
+ int i;
+
+ cr0 = get_smstate(u32, smbase, 0x7ffc);
+ ctxt->ops->set_cr(ctxt, 3, get_smstate(u32, smbase, 0x7ff8));
+ ctxt->eflags = get_smstate(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = get_smstate(u32, smbase, 0x7ff0);
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = get_smstate(u32, smbase, 0x7fd0 + i * 4);
+
+ val = get_smstate(u32, smbase, 0x7fcc);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = get_smstate(u32, smbase, 0x7fc8);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ selector = get_smstate(u32, smbase, 0x7fc4);
+ set_desc_base(&desc, get_smstate(u32, smbase, 0x7f64));
+ set_desc_limit(&desc, get_smstate(u32, smbase, 0x7f60));
+ rsm_set_desc_flags(&desc, get_smstate(u32, smbase, 0x7f5c));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
+
+ selector = get_smstate(u32, smbase, 0x7fc0);
+ set_desc_base(&desc, get_smstate(u32, smbase, 0x7f80));
+ set_desc_limit(&desc, get_smstate(u32, smbase, 0x7f7c));
+ rsm_set_desc_flags(&desc, get_smstate(u32, smbase, 0x7f78));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
+
+ dt.address = get_smstate(u32, smbase, 0x7f74);
+ dt.size = get_smstate(u32, smbase, 0x7f70);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ dt.address = get_smstate(u32, smbase, 0x7f58);
+ dt.size = get_smstate(u32, smbase, 0x7f54);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_32(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ cr4 = get_smstate(u32, smbase, 0x7f14);
+
+ ctxt->ops->set_smbase(ctxt, get_smstate(u32, smbase, 0x7ef8));
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u64 val, cr0, cr4;
+ u32 base3;
+ u16 selector;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = get_smstate(u64, smbase, 0x7ff8 - i * 8);
+
+ ctxt->_eip = get_smstate(u64, smbase, 0x7f78);
+ ctxt->eflags = get_smstate(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+
+ val = get_smstate(u32, smbase, 0x7f68);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = get_smstate(u32, smbase, 0x7f60);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ cr0 = get_smstate(u64, smbase, 0x7f58);
+ ctxt->ops->set_cr(ctxt, 3, get_smstate(u64, smbase, 0x7f50));
+ cr4 = get_smstate(u64, smbase, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, get_smstate(u32, smbase, 0x7f00));
+ val = get_smstate(u64, smbase, 0x7ed0);
+ ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ selector = get_smstate(u32, smbase, 0x7e90);
+ rsm_set_desc_flags(&desc, get_smstate(u32, smbase, 0x7e92) << 8);
+ set_desc_limit(&desc, get_smstate(u32, smbase, 0x7e94));
+ set_desc_base(&desc, get_smstate(u32, smbase, 0x7e98));
+ base3 = get_smstate(u32, smbase, 0x7e9c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
+
+ dt.size = get_smstate(u32, smbase, 0x7e84);
+ dt.address = get_smstate(u64, smbase, 0x7e88);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ selector = get_smstate(u32, smbase, 0x7e70);
+ rsm_set_desc_flags(&desc, get_smstate(u32, smbase, 0x7e72) << 8);
+ set_desc_limit(&desc, get_smstate(u32, smbase, 0x7e74));
+ set_desc_base(&desc, get_smstate(u32, smbase, 0x7e78));
+ base3 = get_smstate(u32, smbase, 0x7e7c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
+
+ dt.size = get_smstate(u32, smbase, 0x7e64);
+ dt.address = get_smstate(u64, smbase, 0x7e68);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_64(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
+ unsigned long cr0, cr4, efer;
+ u64 smbase;
+ int ret;
+
if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
- return X86EMUL_UNHANDLEABLE;
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
+ * to read_std/write_std are not virtual.
+ *
+ * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+ */
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ if (cr0 & X86_CR0_PE)
+ ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+
+ smbase = ctxt->ops->get_smbase(ctxt);
+ if (emulator_has_longmode(ctxt))
+ ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ else
+ ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+ if (ret != X86EMUL_CONTINUE) {
+ /* FIXME: should triple fault */
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
+ ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+ return X86EMUL_CONTINUE;
}
static void
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c7bc8bef21f..17e505125b2c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
__entry->delta < 0 ? "early" : "late")
);
+TRACE_EVENT(kvm_enter_smm,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2bf5bc4ed00f..9eba0d850d17 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5409,6 +5409,9 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu);
void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
{
if (is_smm(vcpu) && (emul_flags & HF_SMM_MASK) == 0) {
+ /* This is a good place to trace that we are exiting SMM. */
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+
if (unlikely(vcpu->arch.smi_pending)) {
kvm_make_request(KVM_REQ_SMI, vcpu);
vcpu->arch.smi_pending = 0;
@@ -6309,14 +6312,234 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
+static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+
+ kvm_get_segment(vcpu, &seg, n);
+ put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ put_smstate(u32, buf, offset + 8, seg.base);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+}
+
+static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+ u16 flags;
+
+ kvm_get_segment(vcpu, &seg, n);
+ offset = 0x7e00 + n * 16;
+
+ flags = process_smi_get_segment_flags(&seg) >> 8;
+ put_smstate(u16, buf, offset, seg.selector);
+ put_smstate(u16, buf, offset + 2, flags);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u64, buf, offset + 8, seg.base);
+}
+
+static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
+ put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
+ put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
+ put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
+
+ for (i = 0; i < 8; i++)
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u32, buf, 0x7fcc, (u32)val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u32, buf, 0x7fc8, (u32)val);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u32, buf, 0x7fc4, seg.selector);
+ put_smstate(u32, buf, 0x7f64, seg.base);
+ put_smstate(u32, buf, 0x7f60, seg.limit);
+ put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u32, buf, 0x7fc0, seg.selector);
+ put_smstate(u32, buf, 0x7f80, seg.base);
+ put_smstate(u32, buf, 0x7f7c, seg.limit);
+ put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f74, dt.address);
+ put_smstate(u32, buf, 0x7f70, dt.size);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f58, dt.address);
+ put_smstate(u32, buf, 0x7f54, dt.size);
+
+ for (i = 0; i < 6; i++)
+ process_smi_save_seg_32(vcpu, buf, i);
+
+ put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020000);
+ put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+}
+
+static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+{
+#ifdef CONFIG_X86_64
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+
+ put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
+ put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u64, buf, 0x7f68, val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u64, buf, 0x7f60, val);
+
+ put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
+ put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
+ put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
+
+ put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020064);
+
+ put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u16, buf, 0x7e90, seg.selector);
+ put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e94, seg.limit);
+ put_smstate(u64, buf, 0x7e98, seg.base);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e84, dt.size);
+ put_smstate(u64, buf, 0x7e88, dt.address);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u16, buf, 0x7e70, seg.selector);
+ put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e74, seg.limit);
+ put_smstate(u64, buf, 0x7e78, seg.base);
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e64, dt.size);
+ put_smstate(u64, buf, 0x7e68, dt.address);
+
+ for (i = 0; i < 6; i++)
+ process_smi_save_seg_64(vcpu, buf, i);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
static void process_smi(struct kvm_vcpu *vcpu)
{
+ struct kvm_segment cs, ds;
+ char buf[512];
+ u32 cr0;
+ int r;
+
if (is_smm(vcpu)) {
vcpu->arch.smi_pending = true;
return;
}
- printk_once(KERN_DEBUG "Ignoring guest SMI\n");
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ memset(buf, 0, 512);
+ if (guest_cpuid_has_longmode(vcpu))
+ process_smi_save_state_64(vcpu, buf);
+ else
+ process_smi_save_state_32(vcpu, buf);
+
+ r = kvm_write_guest(vcpu->kvm, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+ if (r < 0)
+ return;
+
+ if (kvm_x86_ops->get_nmi_mask(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_ops->set_nmi_mask(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ kvm_x86_ops->set_cr4(vcpu, 0);
+
+ __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+ if (guest_cpuid_has_longmode(vcpu))
+ kvm_x86_ops->set_efer(vcpu, 0);
+
+ kvm_update_cpuid(vcpu);
+ kvm_mmu_reset_context(vcpu);
}
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
--
1.8.3.1
We need to hide SMRAM from guests not running in SMM. Therefore,
all uses of kvm_read_guest* and kvm_write_guest* must be changed to
check whether the VCPU is in system management mode; a new family of
functions is introduced for this purpose.
For now, the code is just copied from virt/kvm/kvm_main.c, except
for calls to other read/write functions.
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 20 ++++
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/lapic.c | 22 ++---
arch/x86/kvm/mmu.c | 8 +-
arch/x86/kvm/paging_tmpl.h | 8 +-
arch/x86/kvm/smram.c | 208 ++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/svm.c | 8 +-
arch/x86/kvm/vmx.c | 10 +-
arch/x86/kvm/x86.c | 62 ++++++------
9 files changed, 288 insertions(+), 60 deletions(-)
create mode 100644 arch/x86/kvm/smram.c
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc05c7dda9f1..52b8716397d5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -984,6 +984,26 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
+
+struct kvm_memory_slot *x86_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long x86_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+int x86_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ gpa_t gpa, unsigned long len);
+int x86_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+ int len);
+int x86_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+ unsigned long len);
+int x86_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+ unsigned long len);
+int x86_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
+int x86_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data,
+ int offset, int len);
+int x86_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+ unsigned long len);
+int x86_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
+
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 16e8f962eaad..02b40b128490 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o smram.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fe26f2051979..7034ec2bd318 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -497,15 +497,15 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ return x86_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
sizeof(val));
}
static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
{
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
- sizeof(*val));
+ return x86_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
}
static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -1883,8 +1883,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32));
+ x86_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
@@ -1935,16 +1935,16 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32));
+ x86_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
if (vapic_addr) {
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
- &vcpu->arch.apic->vapic_cache,
- vapic_addr, sizeof(u32)))
+ if (x86_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
return -EINVAL;
__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
} else {
@@ -2036,7 +2036,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
return 0;
- return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ return x86_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
addr, sizeof(u8));
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 371109546382..4694ad42aa8b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -875,7 +875,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
{
struct kvm_memory_slot *slot;
- slot = gfn_to_memslot(vcpu->kvm, gfn);
+ slot = x86_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
(no_dirty_log && slot->dirty_bitmap))
slot = NULL;
@@ -3460,7 +3460,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, gva, x86_gfn_to_hva(vcpu, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3478,7 +3478,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
struct kvm_memory_slot *slot;
bool async;
- slot = gfn_to_memslot(vcpu->kvm, gfn);
+ slot = x86_gfn_to_memslot(vcpu, gfn);
async = false;
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
@@ -4108,7 +4108,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
+ r = x86_read_guest(vcpu, *gpa, &gentry, 8);
if (r)
gentry = 0;
new = (const u8 *)&gentry;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6e6d115fe9b5..82dd0c9f788f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
base_gpa = pte_gpa & ~mask;
index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
- r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ r = x86_read_guest_atomic(vcpu, base_gpa,
gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
curr_pte = gw->prefetch_ptes[index];
} else
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
+ r = x86_read_guest_atomic(vcpu, pte_gpa,
&curr_pte, sizeof(curr_pte));
return r || curr_pte != gw->ptes[level - 1];
@@ -869,7 +869,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (!rmap_can_add(vcpu))
break;
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ if (x86_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
break;
@@ -956,7 +956,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ if (x86_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
return -EINVAL;
diff --git a/arch/x86/kvm/smram.c b/arch/x86/kvm/smram.c
new file mode 100644
index 000000000000..73616edab631
--- /dev/null
+++ b/arch/x86/kvm/smram.c
@@ -0,0 +1,208 @@
+/*
+ * Helpers for SMRAM access
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kvm_host.h>
+
+struct kvm_memory_slot *x86_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(vcpu->kvm, gfn);
+
+ return slot;
+}
+EXPORT_SYMBOL_GPL(x86_gfn_to_memslot);
+
+static unsigned long x86_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool *writable)
+{
+ struct kvm_memory_slot *slot = x86_gfn_to_memslot(vcpu, gfn);
+
+ return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
+unsigned long x86_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = x86_gfn_to_memslot(vcpu, gfn);
+
+ return gfn_to_hva_memslot(slot, gfn);
+}
+EXPORT_SYMBOL_GPL(x86_gfn_to_hva);
+
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+ else
+ return len;
+}
+
+int x86_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+ int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = x86_gfn_to_hva_prot(vcpu, gfn, NULL);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = __copy_from_user(data, (void __user *)addr + offset, len);
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_read_guest_page);
+
+int x86_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+ unsigned long len)
+{
+ int r;
+ unsigned long addr;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+
+ addr = x86_gfn_to_hva_prot(vcpu, gfn, NULL);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ pagefault_disable();
+ r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ pagefault_enable();
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_read_guest_atomic);
+
+int x86_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+ unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = x86_read_guest_page(vcpu, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_read_guest);
+
+int x86_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ gpa_t gpa, unsigned long len)
+{
+ return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
+}
+EXPORT_SYMBOL_GPL(x86_gfn_to_hva_cache_init);
+
+int x86_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len)
+{
+ struct kvm_memslots *slots = kvm_memslots(vcpu->kvm);
+ int r;
+
+ BUG_ON(len > ghc->len);
+
+ if (slots->generation != ghc->generation)
+ x86_gfn_to_hva_cache_init(vcpu->kvm, ghc, ghc->gpa, ghc->len);
+
+ if (unlikely(!ghc->memslot))
+ return x86_read_guest(vcpu, ghc->gpa, data, len);
+
+ if (kvm_is_error_hva(ghc->hva))
+ return -EFAULT;
+
+ r = __copy_from_user(data, (void __user *)ghc->hva, len);
+ if (r)
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_read_guest_cached);
+
+int x86_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data,
+ int offset, int len)
+{
+ struct kvm_memory_slot *slot;
+ int r;
+ unsigned long addr;
+
+ slot = x86_gfn_to_memslot(vcpu, gfn);
+ addr = gfn_to_hva_memslot(slot, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = __copy_to_user((void __user *)addr + offset, data, len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty_in_slot(slot, gfn);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_write_guest_page);
+
+int x86_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+ unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = x86_write_guest_page(vcpu, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_write_guest);
+
+int x86_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len)
+{
+ struct kvm_memslots *slots = kvm_memslots(vcpu->kvm);
+ int r;
+
+ BUG_ON(len > ghc->len);
+
+ if (slots->generation != ghc->generation)
+ x86_gfn_to_hva_cache_init(vcpu->kvm, ghc, ghc->gpa, ghc->len);
+
+ if (unlikely(!ghc->memslot))
+ return x86_write_guest(vcpu, ghc->gpa, data, len);
+
+ if (kvm_is_error_hva(ghc->hva))
+ return -EFAULT;
+
+ r = __copy_to_user((void __user *)ghc->hva, data, len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(x86_write_guest_cached);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6afffb82513c..1422dd945b85 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1955,7 +1955,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
+ ret = x86_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
@@ -2153,7 +2153,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
mask = (0xf >> (4 - size)) << start_bit;
val = 0;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+ if (x86_read_guest(&svm->vcpu, gpa, &val, iopm_len))
return NESTED_EXIT_DONE;
return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2178,7 +2178,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
/* Offset is in 32 bit units but need in 8 bit units */
offset *= 4;
- if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+ if (x86_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
return NESTED_EXIT_DONE;
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2449,7 +2449,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
p = msrpm_offsets[i];
offset = svm->nested.vmcb_msrpm + (p * 4);
- if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ if (x86_read_guest(&svm->vcpu, offset, &value, 4))
return false;
svm->nested.msrpm[p] = svm->msrpm[p] | value;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 121a653919c0..d29f751ea10d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7326,7 +7326,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
bitmap += (port & 0x7fff) / 8;
if (last_bitmap != bitmap)
- if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+ if (x86_read_guest(vcpu, bitmap, &b, 1))
return true;
if (b & (1 << (port & 7)))
return true;
@@ -7370,7 +7370,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
/* Then read the msr_index'th bit from this bitmap: */
if (msr_index < 1024*8) {
unsigned char b;
- if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+ if (x86_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
return true;
return 1 & (b >> (msr_index & 7));
} else
@@ -9112,7 +9112,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr.host_initiated = false;
for (i = 0; i < count; i++) {
- if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
+ if (x86_read_guest(vcpu, gpa + i * sizeof(e),
&e, sizeof(e))) {
pr_warn_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
@@ -9146,7 +9146,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
for (i = 0; i < count; i++) {
struct msr_data msr_info;
- if (kvm_read_guest(vcpu->kvm,
+ if (x86_read_guest(vcpu,
gpa + i * sizeof(e),
&e, 2 * sizeof(u32))) {
pr_warn_ratelimited(
@@ -9168,7 +9168,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index);
return -EINVAL;
}
- if (kvm_write_guest(vcpu->kvm,
+ if (x86_write_guest(vcpu,
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
&msr_info.data, sizeof(msr_info.data))) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9eba0d850d17..9c89aee475d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr);
/*
* This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
+ * running guest. The difference to x86_read_guest_page is that this function
* can read from guest physical or from the guest's guest physical memory.
*/
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -493,7 +493,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
real_gfn = gpa_to_gfn(real_gfn);
- return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+ return x86_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
@@ -1128,7 +1128,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
}
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+static void kvm_write_wall_clock(struct kvm_vcpu *vcpu, gpa_t wall_clock)
{
int version;
int r;
@@ -1138,7 +1138,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
if (!wall_clock)
return;
- r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ r = x86_read_guest(vcpu, wall_clock, &version, sizeof(version));
if (r)
return;
@@ -1147,7 +1147,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
++version;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+ x86_write_guest(vcpu, wall_clock, &version, sizeof(version));
/*
* The guest calculates current wall clock time by adding
@@ -1157,18 +1157,18 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
*/
getboottime(&boot);
- if (kvm->arch.kvmclock_offset) {
- struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
+ if (vcpu->kvm->arch.kvmclock_offset) {
+ struct timespec ts = ns_to_timespec(vcpu->kvm->arch.kvmclock_offset);
boot = timespec_sub(boot, ts);
}
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
wc.version = version;
- kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+ x86_write_guest(vcpu, wall_clock, &wc, sizeof(wc));
version++;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+ x86_write_guest(vcpu, wall_clock, &version, sizeof(version));
}
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
@@ -1684,7 +1684,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ if (unlikely(x86_read_guest_cached(v, &vcpu->pv_time,
&guest_hv_clock, sizeof(guest_hv_clock))))
return 0;
@@ -1727,7 +1727,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ x86_write_guest_cached(v, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
@@ -1968,7 +1968,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
r = PTR_ERR(page);
goto out;
}
- if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+ if (x86_write_guest(vcpu, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
out_free:
@@ -2021,7 +2021,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
}
gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(kvm, gfn);
+ addr = x86_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
kvm_x86_ops->patch_hypercall(vcpu, instructions);
@@ -2040,7 +2040,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
break;
gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
+ if (x86_write_guest(vcpu, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
&tsc_ref, sizeof(tsc_ref)))
return 1;
mark_page_dirty(kvm, gfn);
@@ -2068,7 +2068,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
}
gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(vcpu->kvm, gfn);
+ addr = x86_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
if (__clear_user((void __user *)addr, PAGE_SIZE))
@@ -2110,8 +2110,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
- sizeof(u32)))
+ if (x86_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u32)))
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
@@ -2141,7 +2141,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ if (unlikely(x86_read_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
@@ -2149,7 +2149,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.version += 2;
vcpu->arch.st.accum_steal = 0;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ x86_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
@@ -2229,7 +2229,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
- kvm_write_wall_clock(vcpu->kvm, data);
+ kvm_write_wall_clock(vcpu, data);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
@@ -2257,7 +2257,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
gpa_offset = data & ~(PAGE_MASK | 1);
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (x86_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
vcpu->arch.pv_time_enabled = false;
@@ -2278,9 +2278,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS,
- sizeof(struct kvm_steal_time)))
+ if (x86_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+ data & KVM_STEAL_VALID_BITS,
+ sizeof(struct kvm_steal_time)))
return 1;
vcpu->arch.st.msr_val = data;
@@ -4355,7 +4355,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
+ ret = x86_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -4389,7 +4389,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
offset = addr & (PAGE_SIZE-1);
if (WARN_ON(offset + bytes > PAGE_SIZE))
bytes = (unsigned)PAGE_SIZE - offset;
- ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
+ ret = x86_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
offset, bytes);
if (unlikely(ret < 0))
return X86EMUL_IO_NEEDED;
@@ -4436,7 +4436,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+ ret = x86_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4489,7 +4489,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
- ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ ret = x86_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
@@ -4523,7 +4523,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
- return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+ return !x86_read_guest(vcpu, gpa, val, bytes);
}
static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -6490,7 +6490,7 @@ static void process_smi(struct kvm_vcpu *vcpu)
else
process_smi_save_state_32(vcpu, buf);
- r = kvm_write_guest(vcpu->kvm, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+ r = x86_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (r < 0)
return;
@@ -8279,7 +8279,7 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ return x86_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
sizeof(val));
}
--
1.8.3.1
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 11 +----------
arch/x86/kvm/mmu.c | 5 ++++-
arch/x86/kvm/x86.c | 1 +
3 files changed, 6 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 52b8716397d5..3caefa4be90b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -184,16 +184,6 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
};
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - direct mapping of virtual to physical mapping at gfn
- * used for real mode and two-dimensional paging
- * bits 17:19 - common access permissions for all ptes in this shadow page
- */
union kvm_mmu_page_role {
unsigned word;
struct {
@@ -207,6 +197,7 @@ union kvm_mmu_page_role {
unsigned nxe:1;
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
+ unsigned smm:1;
};
};
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4694ad42aa8b..043680d90964 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3879,6 +3879,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
+ context->base_role.smm = is_smm(vcpu);
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -3937,6 +3938,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
context->base_role.cr0_wp = is_write_protection(vcpu);
context->base_role.smep_andnot_wp
= smep && !is_write_protection(vcpu);
+ context->base_role.smm = is_smm(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
@@ -4239,7 +4241,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
- mask.cr0_wp = mask.cr4_pae = mask.nxe = mask.smep_andnot_wp = 1;
+ mask.cr0_wp = mask.cr4_pae = mask.nxe = mask.smep_andnot_wp =
+ mask.smm = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c89aee475d3..ce36aca2276d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5419,6 +5419,7 @@ void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
}
vcpu->arch.hflags = emul_flags;
+ kvm_mmu_reset_context(vcpu);
}
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
--
1.8.3.1
This adds an arch-specific memslot flag that hides slots unless the
VCPU is in system management mode.
Some care is needed to limit the overhead of x86_gfn_to_memslot compared
with gfn_to_memslot. Thankfully, __gfn_to_memslot and search_memslots
perform the same lookup, so we can add an extra output parameter to
search_memslots. The compiler will optimize it away as dead code in
__gfn_to_memslot, and will use it to thread jumps in x86_gfn_to_memslot.
Signed-off-by: Paolo Bonzini <[email protected]>
---
Documentation/virtual/kvm/api.txt | 18 ++++++++++++------
arch/x86/include/uapi/asm/kvm.h | 3 +++
arch/x86/kvm/smram.c | 25 +++++++++++++++++++++++--
include/linux/kvm_host.h | 14 ++++++++++----
virt/kvm/kvm_main.c | 4 ++++
5 files changed, 52 insertions(+), 12 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 51523b70b6b2..2bc99ae040da 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -933,18 +933,24 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
be identical. This allows large pages in the guest to be backed by large
pages in the host.
-The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
-KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of
-writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to
-use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
-to make a new slot read-only. In this case, writes to this memory will be
-posted to userspace as KVM_EXIT_MMIO exits.
+The flags field supports two architecture-independent flags:
+KVM_MEM_LOG_DIRTY_PAGES and KVM_MEM_READONLY. The former can be set
+to instruct KVM to keep track of writes to memory within the slot.
+See KVM_GET_DIRTY_LOG ioctl to know how to use it. The latter can
+be set, if KVM_CAP_READONLY_MEM capability allows it, to make a new
+slot read-only. In this case, writes to this memory will be posted to
+userspace as KVM_EXIT_MMIO exits.
When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
the memory region are automatically reflected into the guest. For example, an
mmap() that affects the region will be made visible immediately. Another
example is madvise(MADV_DROP).
+Each architecture can support other specific flags. Right now there is
+only one defined. On x86, if KVM_CAP_X86_SMM is available, the
+KVM_MEM_X86_SMRAM flag will hide the slot from VCPUs that are not
+in system management mode.
+
It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
allocation and is deprecated.
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 30100a3c1bed..46df15bc844f 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -45,6 +45,9 @@
#define __KVM_HAVE_XCRS
#define __KVM_HAVE_READONLY_MEM
+#define __KVM_ARCH_VALID_FLAGS KVM_MEM_X86_SMRAM
+#define KVM_MEM_X86_SMRAM (1 << 24)
+
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/kvm/smram.c b/arch/x86/kvm/smram.c
index 73616edab631..e7dd933673a4 100644
--- a/arch/x86/kvm/smram.c
+++ b/arch/x86/kvm/smram.c
@@ -19,10 +19,23 @@
#include <linux/module.h>
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
struct kvm_memory_slot *x86_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- struct kvm_memory_slot *slot = gfn_to_memslot(vcpu->kvm, gfn);
+ /* By using search_memslots directly the compiler can optimize away
+ * the "if (found)" check below.
+ *
+ * It cannot do the same for gfn_to_memslot because it is not inlined,
+ * and it also cannot do the same for __gfn_to_memslot because the
+ * kernel is compiled with -fno-delete-null-pointer-checks.
+ */
+ bool found;
+ struct kvm_memslots *memslots = kvm_memslots(vcpu->kvm);
+ struct kvm_memory_slot *slot = search_memslots(memslots, gfn, &found);
+
+ if (found && unlikely(slot->flags & KVM_MEM_X86_SMRAM) && !is_smm(vcpu))
+ return NULL;
return slot;
}
@@ -112,7 +125,15 @@ EXPORT_SYMBOL_GPL(x86_read_guest);
int x86_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa, unsigned long len)
{
- return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
+ int r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
+
+ if (r < 0)
+ return r;
+
+ /* Use slow path for reads and writes to SMRAM. */
+ if (ghc->memslot && (ghc->memslot->flags & KVM_MEM_X86_SMRAM))
+ ghc->memslot = NULL;
+ return r;
}
EXPORT_SYMBOL_GPL(x86_gfn_to_hva_cache_init);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 19d09a08885b..ae7c60262369 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -810,16 +810,18 @@ static inline void kvm_guest_exit(void)
* gfn_to_memslot() itself isn't here as an inline because that would
* bloat other code too much.
*/
-static inline struct kvm_memory_slot *
-search_memslots(struct kvm_memslots *slots, gfn_t gfn)
+static __always_inline struct kvm_memory_slot *
+search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool *found)
{
int start = 0, end = slots->used_slots;
int slot = atomic_read(&slots->lru_slot);
struct kvm_memory_slot *memslots = slots->memslots;
if (gfn >= memslots[slot].base_gfn &&
- gfn < memslots[slot].base_gfn + memslots[slot].npages)
+ gfn < memslots[slot].base_gfn + memslots[slot].npages) {
+ *found = true;
return &memslots[slot];
+ }
while (start < end) {
slot = start + (end - start) / 2;
@@ -833,16 +835,20 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn)
if (gfn >= memslots[start].base_gfn &&
gfn < memslots[start].base_gfn + memslots[start].npages) {
atomic_set(&slots->lru_slot, start);
+ *found = true;
return &memslots[start];
}
+ *found = false;
return NULL;
}
static inline struct kvm_memory_slot *
__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
{
- return search_memslots(slots, gfn);
+ bool found;
+
+ return search_memslots(slots, gfn, &found);
}
static inline unsigned long
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0fcc5d28f3a9..46bff2082479 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -716,6 +716,10 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
#ifdef __KVM_HAVE_READONLY_MEM
valid_flags |= KVM_MEM_READONLY;
#endif
+#ifdef __KVM_ARCH_VALID_FLAGS
+ BUILD_BUG_ON(__KVM_ARCH_VALID_FLAGS & KVM_MEMSLOT_INVALID);
+ valid_flags |= __KVM_ARCH_VALID_FLAGS;
+#endif
if (mem->flags & ~valid_flags)
return -EINVAL;
--
1.8.3.1
Signed-off-by: Paolo Bonzini <[email protected]>
---
RFC->v1: depend on support for real mode CS base above 1M
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/svm.c | 6 ++++++
arch/x86/kvm/vmx.c | 6 ++++++
arch/x86/kvm/x86.c | 15 +++++++++++++++
4 files changed, 28 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3caefa4be90b..1c3e68a571c7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -700,6 +700,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ bool (*cpu_has_high_real_mode_segbase)(void);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
/* Create, but do not attach this VCPU */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1422dd945b85..c8c4430222c3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4080,6 +4080,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
+static bool svm_has_high_real_mode_segbase(void)
+{
+ return true;
+}
+
static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
return 0;
@@ -4353,6 +4358,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d29f751ea10d..c547b20e7438 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8142,6 +8142,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
local_irq_enable();
}
+static bool vmx_has_high_real_mode_segbase(void)
+{
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+}
+
static bool vmx_mpx_supported(void)
{
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
@@ -10299,6 +10304,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
+ .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ce36aca2276d..c4dd64377c19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2836,6 +2836,17 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
#endif
r = 1;
break;
+ case KVM_CAP_X86_SMM:
+ /* SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+ break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
@@ -4229,6 +4240,10 @@ static void kvm_init_msr_list(void)
for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
switch (emulated_msrs[i]) {
+ case MSR_IA32_SMBASE:
+ if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
+ continue;
+ break;
default:
break;
}
--
1.8.3.1
2015-05-08 13:20+0200, Paolo Bonzini:
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> @@ -202,7 +202,7 @@ struct kvm_run {
> __u32 exit_reason;
> __u8 ready_for_interrupt_injection;
> __u8 if_flag;
> - __u8 padding2[2];
> + __u16 flags;
(It got lost last review and I'd really like to know ...
what is the advantage of giving both bytes to flags?)
2015-05-08 13:20+0200, Paolo Bonzini:
> This patch adds the interface between x86.c and the emulator: the
> SMBASE register, a new emulator flag, the RSM instruction. It also
> adds a new request bit that will be used by the KVM_SMI ioctl.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> --
> RFC->v1: make SMBASE host-readable only
> add support for latching an SMI
> do not reset SMBASE on INIT
> ---
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> @@ -367,6 +367,7 @@ struct kvm_vcpu_arch {
> int32_t apic_arb_prio;
> int mp_state;
> u64 ia32_misc_enable_msr;
> + u64 smbase;
smbase is u32 in hardware.
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> @@ -2504,7 +2504,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
> vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
> vmx->nested.nested_vmx_misc_low |=
> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
> - VMX_MISC_ACTIVITY_HLT;
> + VMX_MISC_ACTIVITY_HLT | VMX_MISC_IA32_SMBASE_MSR;
No need to expose this feature when the MSR isn't readable.
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> @@ -2220,6 +2221,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> + case MSR_IA32_SMBASE:
> + if (!msr_info->host_initiated)
> + return 1;
> + vcpu->arch.smbase = data;
> + break;
> @@ -2615,6 +2621,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> + case MSR_IA32_SMBASE:
> + if (!msr_info->host_initiated)
> + return 1;
> + msr_info->data = vcpu->arch.smbase;
> + break;
On 21/05/2015 16:49, Radim Krčmář wrote:
> 2015-05-08 13:20+0200, Paolo Bonzini:
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> @@ -202,7 +202,7 @@ struct kvm_run {
>> __u32 exit_reason;
>> __u8 ready_for_interrupt_injection;
>> __u8 if_flag;
>> - __u8 padding2[2];
>> + __u16 flags;
>
> (It got lost last review and I'd really like to know ...
> what is the advantage of giving both bytes to flags?)
No advantage. You just should leave padding2[1] in the middle so that
the offset of &run->padding2[0] doesn't change. Since it's not obvious
I gave two bytes to flags, but I can do it either way.
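(For concreteness, a sketch of the two layouts under discussion, written as
stand-alone structs rather than the real struct kvm_run; only the fields
quoted above come from the header, the struct names are made up:)

	/* What the patch does: both former padding bytes become flags. */
	struct layout_two_byte_flags {
		__u32 exit_reason;
		__u8  ready_for_interrupt_injection;
		__u8  if_flag;
		__u16 flags;
	};

	/* Alternative: one byte of flags, one byte kept in reserve. */
	struct layout_one_byte_flags {
		__u32 exit_reason;
		__u8  ready_for_interrupt_injection;
		__u8  if_flag;
		__u8  flags;
		__u8  padding2;
	};

Both variants keep the structure size and the offsets of the following
fields unchanged; the difference is only how much room is reserved for
future flag bits.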
Paolo
2015-05-08 13:20+0200, Paolo Bonzini:
> The big ugly one. This patch adds support for switching in and out of
> system management mode, respectively upon receiving KVM_REQ_SMI and upon
> executing a RSM instruction. Both 32- and 64-bit formats are supported
> for the SMM state save area.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> RFC->v1: shift access rights left by 8 for 32-bit format
> move tracepoint to kvm_set_hflags
> fix NMI handling
> ---
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> @@ -2262,12 +2262,258 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
> +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
> +{
> + struct desc_struct desc;
> + int offset;
> + u16 selector;
> +
> + selector = get_smstate(u32, smbase, 0x7fa8 + n * 4);
(u16, the SDM says that the most significant 2 bytes are reserved anyway.)
> + if (n < 3)
> + offset = 0x7f84 + n * 12;
> + else
> + offset = 0x7f2c + (n - 3) * 12;
These numbers made me look up where the hell that is defined, and the
easiest reference seemed to be http://www.sandpile.org/x86/smm.htm,
which has several layouts ... I hopefully checked the intersection of
various Intels and AMDs.
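(The two-branch arithmetic could also live in a tiny helper, which makes the
layout easier to audit against such references; a sketch only, reusing the
constants from the quoted code:)

	/* Offset of the n-th 32-bit segment-state triple (flags, limit and
	 * base follow at +0, +4, +8), relative to smbase + 0x8000, exactly
	 * as computed in the quoted rsm_load_seg_32(). */
	static int rsm_seg32_offset(int n)
	{
		return n < 3 ? 0x7f84 + n * 12 : 0x7f2c + (n - 3) * 12;
	}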
> + set_desc_base(&desc, get_smstate(u32, smbase, offset + 8));
> + set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
> + rsm_set_desc_flags(&desc, get_smstate(u32, smbase, offset));
(There wasn't a layout where this would be right, so we could save the
shifting of those flags in 64-bit mode. Intel P6 was close, and they
had only 2 bytes for access rights, which means they weren't shifted.)
> +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
> +{
> + cr0 = get_smstate(u32, smbase, 0x7ffc);
(I wonder why they made 'smbase + 0x8000' the default offset in SDM,
when 'smbase + 0xfe00' or 'smbase' would work as well.)
> +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
> +{
> + struct desc_struct desc;
> + u16 selector;
> + selector = get_smstate(u32, smbase, 0x7e90);
> + rsm_set_desc_flags(&desc, get_smstate(u32, smbase, 0x7e92) << 8);
(Both reads should be u16. Luckily, extra data gets ignored.)
> static int em_rsm(struct x86_emulate_ctxt *ctxt)
> {
> + if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
> + ctxt->ops->set_nmi_mask(ctxt, false);
NMI is always fun ... let's see two cases:
1. NMI -> SMI -> RSM -> NMI
NMI is not injected; ok.
2. NMI -> SMI -> IRET -> RSM -> NMI
NMI is injected; I think it shouldn't be ... have you based this
behavior on the 3rd paragraph of SDM 34.8 NMI HANDLING WHILE IN SMM
("A special case [...]")?
Why I think we should restore NMI mask on RSM:
- It's consistent with SMI -> IRET -> NMI -> RSM -> NMI (where we,
I think correctly, unmask NMIs) and the idea that SMM tries to be
transparent (but maybe they didn't care about retarded SMI handlers).
- APM 2:15.30.3 SMM_CTL MSR (C001_0116h)
• ENTER—Bit 1. Enter SMM: map the SMRAM memory areas, record whether
NMI was currently blocked and block further NMI and SMI interrupts.
• EXIT—Bit 3. Exit SMM: unmap the SMRAM memory areas, restore the
previous masking status of NMI and unconditionally reenable SMI.
The MSR should mimic real SMM signals and does restore the NMI mask.
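A minimal sketch of the alternative being argued for here, using only the
names visible in the quoted hunk (X86EMUL_SMM_INSIDE_NMI_MASK, which from
that hunk appears to record whether NMIs were blocked when the SMI was
taken); illustrative and untested, not a drop-in replacement:

    /* Restore the NMI masking state recorded at SMI time, instead of
     * only ever clearing it as the quoted hunk does. */
    static void rsm_restore_nmi_mask(struct x86_emulate_ctxt *ctxt)
    {
            bool masked = (ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) != 0;

            ctxt->ops->set_nmi_mask(ctxt, masked);
    }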
On 21/05/2015 18:20, Radim Krčmář wrote:
>
>> > + set_desc_base(&desc, get_smstate(u32, smbase, offset + 8));
>> > + set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
>> > + rsm_set_desc_flags(&desc, get_smstate(u32, smbase, offset));
> (There wasn't a layout where this would be right, so we could save the
> shifting of those flags in 64-bit mode. Intel P6 was close, and they
> had only 2 bytes for access rights, which means they weren't shifted.)
Check the AMD architecture manual.
Paolo
On 21/05/2015 18:20, Radim Krčmář wrote:
> 2. NMI -> SMI -> IRET -> RSM -> NMI
> NMI is injected; I think it shouldn't be ... have you based this
> behavior on the 3rd paragraph of SDM 34.8 NMI HANDLING WHILE IN SMM
> ("A special case [...]")?
Yes.
> Why I think we should restore NMI mask on RSM:
> - It's consistent with SMI -> IRET -> NMI -> RSM -> NMI (where we,
> I think correctly, unmask NMIs)
Yes, we do.
> and the idea that SMM tries to be
> transparent (but maybe they didn't care about retarded SMI handlers).
That's my reading of that paragraph of the manual. :)
> - APM 2:15.30.3 SMM_CTL MSR (C001_0116h)
> • ENTER—Bit 1. Enter SMM: map the SMRAM memory areas, record whether
> NMI was currently blocked and block further NMI and SMI interrupts.
> • EXIT—Bit 3. Exit SMM: unmap the SMRAM memory areas, restore the
> previous masking status of NMI and unconditionally reenable SMI.
>
> The MSR should mimic real SMM signals and does restore the NMI mask.
No idea... My implementation does restore the previous masking status,
but only if it was "unmasked".
Paolo
2015-05-21 16:59+0200, Paolo Bonzini:
> On 21/05/2015 16:49, Radim Krčmář wrote:
>> 2015-05-08 13:20+0200, Paolo Bonzini:
>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>> @@ -202,7 +202,7 @@ struct kvm_run {
>>> __u32 exit_reason;
>>> __u8 ready_for_interrupt_injection;
>>> __u8 if_flag;
>>> - __u8 padding2[2];
>>> + __u16 flags;
>>
>> (It got lost last review and I'd really like to know ...
>> what is the advantage of giving both bytes to flags?)
>
>> No advantage. You should just leave padding2[1] in the middle so that
> the offset of &run->padding2[0] doesn't change.
I don't get that. The position of padding should be decided by
comparing probabilities of extending 'if_flag' and 'flags'.
> Since it's not obvious
> I gave two bytes to flags, but I can do it either way.
if_flag seems to be set in stone as one bit, so I'd vote for
__u8 flags;
__u8 padding2;
(Or 'padding3', to prevent the same class of errors that removing it
altogether does; which we didn't do for other trailing padding).
For there isn't much space left in struct kvm ...
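To make the two layouts under discussion concrete, here is the fragment
from the quoted diff in both variants:

    /* Posted patch: both former padding bytes become flags. */
    __u32 exit_reason;
    __u8  ready_for_interrupt_injection;
    __u8  if_flag;
    __u16 flags;

    /* Suggested alternative: one byte of flags, one byte of padding. */
    __u32 exit_reason;
    __u8  ready_for_interrupt_injection;
    __u8  if_flag;
    __u8  flags;
    __u8  padding2;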
2015-05-21 18:21+0200, Paolo Bonzini:
> On 21/05/2015 18:20, Radim Krčmář wrote:
> >
> >> > + set_desc_base(&desc, get_smstate(u32, smbase, offset + 8));
> >> > + set_desc_limit(&desc, get_smstate(u32, smbase, offset + 4));
> >> > + rsm_set_desc_flags(&desc, get_smstate(u32, smbase, offset));
> > (There wasn't a layout where this would be right, so we could save the
> > shifting of those flags in 64-bit mode. Intel P6 was close, and they
> > had only 2 bytes for access rights, which means they weren't shifted.)
>
> Check the AMD architecture manual.
I must be blind, is there more than Table 10-2?
(And according to the AMD manual, we are overwriting the GDT and IDT base
at offsets 0xff88 and 0xff94 with ES and CS data, so it's not the best
reference for this case ...)
2015-05-21 18:23+0200, Paolo Bonzini:
> On 21/05/2015 18:20, Radim Krčmář wrote:
>> 2. NMI -> SMI -> IRET -> RSM -> NMI
>> NMI is injected; I think it shouldn't be ... have you based this
>> behavior on the 3rd paragraph of SDM 34.8 NMI HANDLING WHILE IN SMM
>> ("A special case [...]")?
>
> Yes.
Well, if I were to go lawyer
[...] saves the SMRAM state save map but does not save the attribute to
keep NMI interrupts disabled.
NMI masking is a single bit of state, so it'd be really wasteful not to
have an attribute to keep NMIs enabled in the same place ...
Potentially, an NMI could be latched (while in SMM or upon exit) and
serviced upon exit [...]
This "Potentially" could be in the sense that the whole 3rd paragraph is
only applicable to some ancient SMM design :)
The 1st paragraph has quite clear sentence:
If NMIs were blocked before the SMI occurred, they are blocked after
execution of RSM.
so I'd just ignore the 3rd paragraph ...
And the APM 2:10.3.3 Exceptions and Interrupts
NMI—If an NMI occurs while the processor is in SMM, it is latched by
the processor, but the NMI handler is not invoked until the processor
leaves SMM with the execution of an RSM instruction. A pending NMI
causes the handler to be invoked immediately after the RSM completes
and before the first instruction in the interrupted program is
executed.
An SMM handler can unmask NMI interrupts by simply executing an IRET.
Upon completion of the IRET instruction, the processor recognizes the
pending NMI, and transfers control to the NMI handler. Once an NMI is
recognized within SMM using this technique, subsequent NMIs are
recognized until SMM is exited. Later SMIs cause NMIs to be masked,
until the SMM handler unmasks them.
makes me think that we should unmask them unconditionally or that SMM
doesn't do anything with NMI masking.
If we can choose, less NMI nesting seems like a good idea.
On 21/05/2015 18:33, Radim Krčmář wrote:
>> > Check the AMD architecture manual.
> I must be blind, is there more than Table 10-2?
There's Table 10-1! :DDD
Paolo
On 21/05/2015 19:00, Radim Krčmář wrote:
> Potentially, an NMI could be latched (while in SMM or upon exit) and
> serviced upon exit [...]
>
> This "Potentially" could be in the sense that the whole 3rd paragraph is
> only applicable to some ancient SMM design :)
It could also be in the sense that you cannot exclude an NMI coming at
exactly the wrong time.
If you want to go full language lawyer, it does mention it whenever
behavior is specific to a processor family.
> The 1st paragraph has quite clear sentence:
>
> If NMIs were blocked before the SMI occurred, they are blocked after
> execution of RSM.
>
> so I'd just ignore the 3rd paragraph ...
>
> And the APM 2:10.3.3 Exceptions and Interrupts
> NMI—If an NMI occurs while the processor is in SMM, it is latched by
> the processor, but the NMI handler is not invoked until the processor
> leaves SMM with the execution of an RSM instruction. A pending NMI
> causes the handler to be invoked immediately after the RSM completes
> and before the first instruction in the interrupted program is
> executed.
>
> An SMM handler can unmask NMI interrupts by simply executing an IRET.
> Upon completion of the IRET instruction, the processor recognizes the
> pending NMI, and transfers control to the NMI handler. Once an NMI is
> recognized within SMM using this technique, subsequent NMIs are
> recognized until SMM is exited. Later SMIs cause NMIs to be masked,
> until the SMM handler unmasks them.
>
> makes me think that we should unmask them unconditionally or that SMM
> doesn't do anything with NMI masking.
Actually I hadn't noticed this paragraph. But I read it the same as the
Intel manual (i.e. what I implemented): it doesn't say anywhere that RSM
may cause the processor to *set* the "NMIs masked" flag.
It makes no sense; as you said it's 1 bit of state! But it seems that
it's the architectural behavior. :(
> If we can choose, less NMI nesting seems like a good idea.
It would---I'm just preempting future patches from Nadav. :) That said,
even if OVMF does do IRETs in SMM (in 64-bit mode it fills in page
tables lazily for memory above 4GB), we do not care about asynchronous
SMIs such as those for power management. So we should never enter SMM
with NMIs masked, to begin with.
Paolo
On 21/05/2015 18:26, Radim Krčmář wrote:
> 2015-05-21 16:59+0200, Paolo Bonzini:
>> On 21/05/2015 16:49, Radim Krčmář wrote:
>>> 2015-05-08 13:20+0200, Paolo Bonzini:
>>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>>> @@ -202,7 +202,7 @@ struct kvm_run {
>>>> __u32 exit_reason;
>>>> __u8 ready_for_interrupt_injection;
>>>> __u8 if_flag;
>>>> - __u8 padding2[2];
>>>> + __u16 flags;
>>>
>>> (It got lost last review and I'd really like to know ...
>>> what is the advantage of giving both bytes to flags?)
>>
>> No advantage. You should just leave padding2[1] in the middle so that
>> the offset of &run->padding2[0] doesn't change.
>
> I don't get that. The position of padding should be decided by
> comparing probabilities of extending 'if_flag' and 'flags'.
>
>> Since it's not obvious
>> I gave two bytes to flags, but I can do it either way.
>
> if_flag seems to be set in stone as one bit, so I'd vote for
>
> __u8 flags;
> __u8 padding2;
>
> (Or 'padding3', to prevent the same class of errors that removing it
> altogether does; which we didn't do for other trailing padding).
You're right that we didn't do it. I'll change it to flags + padding2.
Paolo
> For there isn't much space left in struct kvm ...
>
2015-05-21 22:24+0200, Paolo Bonzini:
> On 21/05/2015 18:33, Radim Krčmář wrote:
> >> > Check the AMD architecture manual.
> > I must be blind, is there more than Table 10-2?
>
> There's Table 10-1! :DDD
:D I think I understand ...
Table 10-1 says that amd64 doesn't shift the segment attributes (they
wouldn't fit into a word otherwise), but Table 10-2 says nothing about
the same for ia32 segment registers; that behavior is model-specific.
Some people on http://www.sandpile.org/x86/smm.htm found out that P6
stores SMM state like this
7F84h: ES selector
7F86h: ES access rights
7F88h: ES limit
7F8Ch: ES base
which has an extra selector there (makes little sense), but access
rights cannot be shifted for they have only a word of space.
I guess it stems from conflicting online resources, but it's not
architectural behavior, so we'll be wrong anyway :)
(Not shifting them would make the code a bit nicer ...)
2015-05-21 23:21+0200, Paolo Bonzini:
> On 21/05/2015 19:00, Radim Krčmář wrote:
>> Potentially, an NMI could be latched (while in SMM or upon exit) and
>> serviced upon exit [...]
>>
>> This "Potentially" could be in the sense that the whole 3rd paragraph is
>> only applicable to some ancient SMM design :)
>
> It could also be in the sense that you cannot exclude an NMI coming at
> exactly the wrong time.
Yes, but it is hard to figure out how big the wrong time window is ...
Taken to the extreme, the paragraph says that we must inject an NMI that
arrived while in SMM after RSM, regardless of NMI blocking before.
(Which is not how real hardware works.)
> If you want to go full language lawyer, it does mention it whenever
> behavior is specific to a processor family.
True, I don't know of an exception, but that is not proof of the
contrary here :/
>> The 1st paragraph has quite clear sentence:
>>
>> If NMIs were blocked before the SMI occurred, they are blocked after
>> execution of RSM.
>>
>> so I'd just ignore the 3rd paragraph ...
It's suspicious in other ways ... I'll focus on another part of the
sentence now
Potentially, an NMI could be latched (while in SMM or upon exit)
^^^^^^^^^^^^^^^^^^^^^
An NMI can't be latched in SMM mode and delivered after RSM when we
started with NMIs masked.
It was latched in SMM, so we either didn't unmask NMIs or we were
executing an NMI in SMM mode.
If NMIs were blocked before the SMI occurred, they are blocked after
execution of RSM.
The second case, when we specialize the above, would need to unmask NMIs
with IRET, accept an NMI, and then do RSM before IRET (because IRET
would immediately inject the latched NMI);
if the CPU unmasks NMIs in that case, I'd slap someone.
Btw. I had a good laugh on Intel's response to a similar question:
https://software.intel.com/en-us/forums/topic/305672
>> And the APM 2:10.3.3 Exceptions and Interrupts
| [...]
>> makes me think that we should unmask them unconditionally or that SMM
>> doesn't do anything with NMI masking.
>
> Actually I hadn't noticed this paragraph. But I read it the same as the
> Intel manual (i.e. what I implemented): it doesn't say anywhere that RSM
> may cause the processor to *set* the "NMIs masked" flag.
>
> It makes no sense; as you said it's 1 bit of state! But it seems that
> it's the architectural behavior. :(
Ok, it's sad and I'm too lazy to actually try it ...
>> If we can choose, less NMI nesting seems like a good idea.
>
> It would---I'm just preempting future patches from Nadav. :)
Me too :D
> That said,
> even if OVMF does do IRETs in SMM (in 64-bit mode it fills in page
> tables lazily for memory above 4GB), we do not care about asynchronous
> SMIs such as those for power management. So we should never enter SMM
> with NMIs masked, to begin with.
Yeah, it's a stupid corner case, the place where most of the time and
sanity is lost.
On 22/05/2015 16:17, Radim Krčmář wrote:
> Btw. I had a good laugh on Intel's response to a similar question:
> https://software.intel.com/en-us/forums/topic/305672
Duh... the question is dumb (because he's not doing IRET in SMM), and
the answer is dumber...
Paolo
On 05/08/2015 02:20 PM, Paolo Bonzini wrote:
> This adds an arch-specific memslot flag that hides slots unless the
> VCPU is in system management mode.
>
> Some care is needed in order to limit the overhead of x86_gfn_to_memslot
> when compared with gfn_to_memslot. Thankfully, we have __gfn_to_memslot
> and search_memslots which are the same, so we can add some extra output
> to search_memslots. The compiler will optimize it as dead code in
> __gfn_to_memslot, and will use it to thread jumps in x86_gfn_to_memslot.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> Documentation/virtual/kvm/api.txt | 18 ++++++++++++------
> arch/x86/include/uapi/asm/kvm.h | 3 +++
> arch/x86/kvm/smram.c | 25 +++++++++++++++++++++++--
> include/linux/kvm_host.h | 14 ++++++++++----
> virt/kvm/kvm_main.c | 4 ++++
> 5 files changed, 52 insertions(+), 12 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 51523b70b6b2..2bc99ae040da 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -933,18 +933,24 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
> be identical. This allows large pages in the guest to be backed by large
> pages in the host.
>
> -The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
> -KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of
> -writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to
> -use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
> -to make a new slot read-only. In this case, writes to this memory will be
> -posted to userspace as KVM_EXIT_MMIO exits.
> +The flags field supports two architecture-independent flags:
> +KVM_MEM_LOG_DIRTY_PAGES and KVM_MEM_READONLY. The former can be set
> +to instruct KVM to keep track of writes to memory within the slot.
> +See KVM_GET_DIRTY_LOG ioctl to know how to use it. The latter can
> +be set, if KVM_CAP_READONLY_MEM capability allows it, to make a new
> +slot read-only. In this case, writes to this memory will be posted to
> +userspace as KVM_EXIT_MMIO exits.
>
> When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
> the memory region are automatically reflected into the guest. For example, an
> mmap() that affects the region will be made visible immediately. Another
> example is madvise(MADV_DROP).
>
> +Each architecture can support other specific flags. Right now there is
> +only one defined. On x86, if KVM_CAP_X86_SMM is available, the
> +KVM_MEM_X86_SMRAM flag will hide the slot from VCPUs that are not
> +in system management mode.
Is this generic enough? For example, a system could configure itself so
that an SMRAM region goes to MMIO, hiding real RAM.
I see two alternatives:
- have three states: SMM, !SMM, both
- define two memslot tables, one for SMM and one for !SMM, each spanning
  the entire address space
You should probably document how dirty bitmap handling happens in the
presence of SMM.
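As a usage illustration only (hypothetical VMM code, not part of the
patch): this is how userspace would register a slot that is visible only
in SMM, assuming the new flag is exported through the uapi headers as in
the hunk below; the slot number and guest-physical placement are
arbitrary examples:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int map_smram(int vm_fd, void *host_mem)
    {
            struct kvm_userspace_memory_region region;

            memset(&region, 0, sizeof(region));
            region.slot = 10;                        /* any unused slot */
            region.flags = KVM_MEM_X86_SMRAM;        /* hidden outside SMM */
            region.guest_phys_addr = 0xa0000;        /* e.g. legacy SMRAM window */
            region.memory_size = 0x20000;            /* 128 KiB */
            region.userspace_addr = (unsigned long)host_mem;

            return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
    }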
> +
> It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
> The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
> allocation and is deprecated.
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 30100a3c1bed..46df15bc844f 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -45,6 +45,9 @@
> #define __KVM_HAVE_XCRS
> #define __KVM_HAVE_READONLY_MEM
>
> +#define __KVM_ARCH_VALID_FLAGS KVM_MEM_X86_SMRAM
> +#define KVM_MEM_X86_SMRAM (1 << 24)
> +
> /* Architectural interrupt line count. */
> #define KVM_NR_INTERRUPTS 256
>
> diff --git a/arch/x86/kvm/smram.c b/arch/x86/kvm/smram.c
> index 73616edab631..e7dd933673a4 100644
> --- a/arch/x86/kvm/smram.c
> +++ b/arch/x86/kvm/smram.c
> @@ -19,10 +19,23 @@
>
> #include <linux/module.h>
> #include <linux/kvm_host.h>
> +#include "kvm_cache_regs.h"
>
> struct kvm_memory_slot *x86_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
> {
> - struct kvm_memory_slot *slot = gfn_to_memslot(vcpu->kvm, gfn);
> + /* By using search_memslots directly the compiler can optimize away
> + * the "if (found)" check below.
> + *
> + * It cannot do the same for gfn_to_memslot because it is not inlined,
> + * and it also cannot do the same for __gfn_to_memslot because the
> + * kernel is compiled with -fno-delete-null-pointer-checks.
> + */
> + bool found;
> + struct kvm_memslots *memslots = kvm_memslots(vcpu->kvm);
> + struct kvm_memory_slot *slot = search_memslots(memslots, gfn, &found);
> +
> + if (found && unlikely(slot->flags & KVM_MEM_X86_SMRAM) && !is_smm(vcpu))
> + return NULL;
>
> return slot;
> }
> @@ -112,7 +125,15 @@ EXPORT_SYMBOL_GPL(x86_read_guest);
> int x86_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
> gpa_t gpa, unsigned long len)
> {
> - return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
> + int r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, len);
> +
> + if (r < 0)
> + return r;
> +
> + /* Use slow path for reads and writes to SMRAM. */
> + if (ghc->memslot && (ghc->memslot->flags & KVM_MEM_X86_SMRAM))
> + ghc->memslot = NULL;
> + return r;
> }
> EXPORT_SYMBOL_GPL(x86_gfn_to_hva_cache_init);
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 19d09a08885b..ae7c60262369 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -810,16 +810,18 @@ static inline void kvm_guest_exit(void)
> * gfn_to_memslot() itself isn't here as an inline because that would
> * bloat other code too much.
> */
> -static inline struct kvm_memory_slot *
> -search_memslots(struct kvm_memslots *slots, gfn_t gfn)
> +static __always_inline struct kvm_memory_slot *
> +search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool *found)
> {
> int start = 0, end = slots->used_slots;
> int slot = atomic_read(&slots->lru_slot);
> struct kvm_memory_slot *memslots = slots->memslots;
>
> if (gfn >= memslots[slot].base_gfn &&
> - gfn < memslots[slot].base_gfn + memslots[slot].npages)
> + gfn < memslots[slot].base_gfn + memslots[slot].npages) {
> + *found = true;
> return &memslots[slot];
> + }
>
> while (start < end) {
> slot = start + (end - start) / 2;
> @@ -833,16 +835,20 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn)
> if (gfn >= memslots[start].base_gfn &&
> gfn < memslots[start].base_gfn + memslots[start].npages) {
> atomic_set(&slots->lru_slot, start);
> + *found = true;
> return &memslots[start];
> }
>
> + *found = false;
> return NULL;
> }
>
> static inline struct kvm_memory_slot *
> __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
> {
> - return search_memslots(slots, gfn);
> + bool found;
> +
> + return search_memslots(slots, gfn, &found);
> }
>
> static inline unsigned long
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0fcc5d28f3a9..46bff2082479 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -716,6 +716,10 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
> #ifdef __KVM_HAVE_READONLY_MEM
> valid_flags |= KVM_MEM_READONLY;
> #endif
> +#ifdef __KVM_ARCH_VALID_FLAGS
> + BUILD_BUG_ON(__KVM_ARCH_VALID_FLAGS & KVM_MEMSLOT_INVALID);
> + valid_flags |= __KVM_ARCH_VALID_FLAGS;
> +#endif
>
> if (mem->flags & ~valid_flags)
> return -EINVAL;