2017-08-17 11:59:11

by Yu Zhang

Subject: [PATCH v2 0/5] KVM: MMU: 5 level EPT/shadow support

Intel's existing processors limit the maximum linear address width to
48 bits, and the maximum physical address width to 46 bits. Upcoming
processors will extend the maximum linear address width to 57 bits,
and the maximum physical address width can go up to 52 bits in practice.

With a linear address width greater than 48 bits, a new paging mode is
introduced in IA-32e - 5-level paging (also known as LA57). To support
VMs with this feature, the KVM MMU code needs to be extended.

To achieve this, this patchset:
1> leverages two QEMU parameters, +la57 and phys-bits, to expose a wider
linear address width and physical address width to the VM;
2> extends the shadow paging logic to construct 5-level shadow page
tables for VMs running in LA57 mode;
3> extends the EPT logic to construct 5-level EPT tables for VMs whose
maximum physical address width exceeds 48 bits (see the sketch below).
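
For orientation, a minimal sketch (illustrative only; not code from this
series) of how the guest's paging mode maps to the page-table root level
that the patches select:

	static int root_level_sketch(bool long_mode, bool la57, bool pae)
	{
		if (long_mode)
			return la57 ? 5 : 4;	/* LA57 -> 5-level paging */
		return pae ? 3 : 2;		/* PAE vs. classic 32-bit */
	}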

Changes in v2:
- Address comments from Paolo Bonzini and Jim Mattson: add a new patch to let
kvm_cpuid() return false when cpuid entry is not found;
- Address comments from Paolo Bonzini: fix a typo in check_cr_write() and use
62 as the upper limit when checking reserved bits for a physical address;
- Address comments from Paolo Bonzini: move definition of PT64_ROOT_MAX_LEVEL
into kvm_host.h;
- Address comments from Paolo Bonzini: add checking for shadow_root_level in
mmu_free_roots();
- Address comments from Paolo Bonzini: set root_level & shadow_root_level both
to PT64_ROOT_4LEVEL for shadow ept situation.

Yu Zhang (5):
KVM: x86: Add return value to kvm_cpuid().
KVM: MMU: check guest CR3 reserved bits based on its physical address
width.
KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.
KVM: MMU: Add 5 level EPT & Shadow page table support.
KVM: MMU: Expose the LA57 feature to VM.

arch/x86/include/asm/kvm_emulate.h | 4 +--
arch/x86/include/asm/kvm_host.h | 31 ++++++--------------
arch/x86/include/asm/vmx.h | 1 +
arch/x86/kvm/cpuid.c | 39 ++++++++++++++++++-------
arch/x86/kvm/cpuid.h | 9 +++++-
arch/x86/kvm/emulate.c | 42 +++++++++++++++++----------
arch/x86/kvm/kvm_cache_regs.h | 2 +-
arch/x86/kvm/mmu.c | 59 ++++++++++++++++++++++++--------------
arch/x86/kvm/mmu.h | 6 +++-
arch/x86/kvm/mmu_audit.c | 4 +--
arch/x86/kvm/svm.c | 8 +++---
arch/x86/kvm/trace.h | 11 ++++---
arch/x86/kvm/vmx.c | 27 ++++++++++-------
arch/x86/kvm/x86.c | 21 ++++++++------
arch/x86/kvm/x86.h | 44 ++++++++++++++++++++++++++++
15 files changed, 205 insertions(+), 103 deletions(-)

--
2.5.0


2017-08-17 11:59:13

by Yu Zhang

Subject: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

Return false in kvm_cpuid() when it fails to find the cpuid
entry. Also, this routine (and its callers) is extended with a
new argument - check_limit - so that the fallback to
check_cpuid_limit() can be avoided when it is not wanted.
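
For illustration, a hypothetical caller under the new contract
(NO_CHECK_LIMIT is the enum value added to cpuid.h by this patch):

	u32 eax = 0x80000001, ebx, ecx = 0, edx;

	if (!kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT)) {
		/*
		 * Leaf 0x80000001 is absent and no fallback to
		 * check_cpuid_limit() was attempted; the caller must
		 * treat the feature bits as unavailable.
		 */
	}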

Signed-off-by: Yu Zhang <[email protected]>
---
arch/x86/include/asm/kvm_emulate.h | 4 ++--
arch/x86/kvm/cpuid.c | 18 ++++++++++++++----
arch/x86/kvm/cpuid.h | 9 ++++++++-
arch/x86/kvm/emulate.c | 17 ++++++++++-------
arch/x86/kvm/svm.c | 2 +-
arch/x86/kvm/trace.h | 11 +++++++----
arch/x86/kvm/x86.c | 6 +++---
7 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index fde36f1..0e51a07 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -219,8 +219,8 @@ struct x86_emulate_ops {
struct x86_instruction_info *info,
enum x86_intercept_stage stage);

- void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+ bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, int check_limit);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);

unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2ee..989ba4e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -853,15 +853,22 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
}

-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, int check_limit)
{
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
+ bool entry_found = true;

best = kvm_find_cpuid_entry(vcpu, function, index);

- if (!best)
+ if (!best) {
+ entry_found = false;
+ if (!check_limit)
+ goto out;
+
best = check_cpuid_limit(vcpu, function, index);
+ }

if (best) {
*eax = best->eax;
@@ -870,7 +877,10 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
*edx = best->edx;
} else
*eax = *ebx = *ecx = *edx = 0;
- trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
+
+out:
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
+ return entry_found;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);

@@ -883,7 +893,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)

eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ac15193..3e759cf 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+
+enum {
+ NO_CHECK_LIMIT = 0,
+ CHECK_LIMIT = 1,
+};
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, int check_limit);

int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb00559..46daa37 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@

#include "x86.h"
#include "tss.h"
+#include "cpuid.h"

/*
* Operand types
@@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)

eax = 0x80000001;
ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
- return edx & bit(X86_FEATURE_LM);
+ if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT))
+ return edx & bit(X86_FEATURE_LM);
+ else
+ return 0;
}

#define GET_SMSTATE(type, smbase, offset) \
@@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;

eax = ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
@@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)

eax = 0x00000000;
ecx = 0x00000000;
- ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
/*
* Intel ("GenuineIntel")
* remark: Intel CPUs only support "syscall" in 64bit
@@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
/*
* Check MOVBE is set in the guest-visible CPUID leaf.
*/
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
if (!(ecx & FFL(MOVBE)))
return emulate_ud(ctxt);

@@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)

eax = reg_read(ctxt, VCPU_REGS_RAX);
ecx = reg_read(ctxt, VCPU_REGS_RCX);
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
*reg_write(ctxt, VCPU_REGS_RAX) = eax;
*reg_write(ctxt, VCPU_REGS_RBX) = ebx;
*reg_write(ctxt, VCPU_REGS_RCX) = ecx;
@@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
{
u32 eax = 1, ebx, ecx = 0, edx;

- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
if (!(edx & FFL(FXSR)))
return emulate_ud(ctxt);

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1fa9ee5..9def4a8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1580,7 +1580,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
}
init_vmcb(svm);

- kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
+ kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, CHECK_LIMIT);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);

if (kvm_vcpu_apicv_active(vcpu) && !init_event)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc67..8a202c4 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -151,8 +151,8 @@ TRACE_EVENT(kvm_fast_mmio,
*/
TRACE_EVENT(kvm_cpuid,
TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
- unsigned long rcx, unsigned long rdx),
- TP_ARGS(function, rax, rbx, rcx, rdx),
+ unsigned long rcx, unsigned long rdx, bool found),
+ TP_ARGS(function, rax, rbx, rcx, rdx, found),

TP_STRUCT__entry(
__field( unsigned int, function )
@@ -160,6 +160,7 @@ TRACE_EVENT(kvm_cpuid,
__field( unsigned long, rbx )
__field( unsigned long, rcx )
__field( unsigned long, rdx )
+ __field( bool, found )
),

TP_fast_assign(
@@ -168,11 +169,13 @@ TRACE_EVENT(kvm_cpuid,
__entry->rbx = rbx;
__entry->rcx = rcx;
__entry->rdx = rdx;
+ __entry->found = found;
),

- TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
__entry->function, __entry->rax,
- __entry->rbx, __entry->rcx, __entry->rdx)
+ __entry->rbx, __entry->rcx, __entry->rdx,
+ __entry->found ? "found" : "not found")
);

#define AREG(x) { APIC_##x, "APIC_" #x }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e40a779..ee99fc1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5213,10 +5213,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}

-static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, int check_limit)
{
- kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
}

static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
--
2.5.0

2017-08-17 11:59:18

by Yu Zhang

Subject: [PATCH v2 4/5] KVM: MMU: Add 5 level EPT & Shadow page table support.

Extends the shadow paging code so that a 5-level shadow page
table can be constructed if the VM is running in 5-level paging
mode.

Also extends the EPT code so that a 5-level EPT table can be
constructed if the VM's maxphyaddr exceeds 48 bits. Unlike the
shadow logic, KVM should still use a 4-level EPT table for a VM
whose physical address width does not exceed 48 bits, even when
the VM is running in 5-level paging mode, since a 4-level EPT
walk already covers such a VM's entire guest-physical address space.
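
As an illustrative summary of that policy (mirroring the
get_ept_level()/construct_eptp() changes below; eptp and vcpu are
assumed to be in scope):

	/* 5-level EPT only when the CPU supports it and the guest's
	 * MAXPHYADDR exceeds 48 bits; the EPTP guest-address-width
	 * field encodes (walk levels - 1), i.e. 3 or 4.
	 */
	int levels = (cpu_has_vmx_ept_5levels() &&
		      cpuid_maxphyaddr(vcpu) > 48) ? 5 : 4;
	eptp |= (levels - 1) << VMX_EPT_GAW_EPTP_SHIFT;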

Signed-off-by: Yu Zhang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 10 +++++-----
arch/x86/include/asm/vmx.h | 1 +
arch/x86/kvm/cpuid.c | 5 +++++
arch/x86/kvm/mmu.c | 43 +++++++++++++++++++++++++++--------------
arch/x86/kvm/mmu.h | 1 +
arch/x86/kvm/mmu_audit.c | 4 ++--
arch/x86/kvm/svm.c | 4 ++--
arch/x86/kvm/vmx.c | 19 ++++++++++++------
arch/x86/kvm/x86.h | 10 ++++++++++
9 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7f70b8a..34b0313 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,7 +315,7 @@ struct kvm_pio_request {
int size;
};

-#define PT64_ROOT_MAX_LEVEL 4
+#define PT64_ROOT_MAX_LEVEL 5

struct rsvd_bits_validate {
u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
@@ -323,9 +323,9 @@ struct rsvd_bits_validate {
};

/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
*/
struct kvm_mmu {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
@@ -981,7 +981,7 @@ struct kvm_x86_ops {
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
- int (*get_tdp_level)(void);
+ int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5f63a2e..a0fb025 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -453,6 +453,7 @@ enum vmcs_field {

#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
#define VMX_EPTP_UC_BIT (1ull << 8)
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 989ba4e..65da75d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -137,6 +137,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);

+#ifdef CONFIG_X86_64
+ if (vcpu->arch.maxphyaddr > 48)
+ kvm_mmu_reset_context(vcpu);
+#endif
+
kvm_pmu_refresh(vcpu);
return 0;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cd4d2cc..c392ae7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3323,8 +3323,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
- (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+ if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;

@@ -3376,10 +3376,11 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;

- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
+ sp = kvm_mmu_get_page(vcpu, 0, 0,
+ vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3420,15 +3421,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;

MMU_WARN_ON(VALID_PAGE(root));

spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
- 0, ACC_ALL);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3520,7 +3521,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)

vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -4022,6 +4023,12 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
case PT64_ROOT_4LEVEL:
rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
@@ -4063,6 +4070,8 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
{
u64 bad_mt_xwr;

+ rsvd_check->rsvd_bits_mask[0][4] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][2] =
@@ -4072,6 +4081,7 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);

/* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
rsvd_check->rsvd_bits_mask[1][2] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
@@ -4332,7 +4342,10 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
static void paging64_init_context(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
- paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL);
+ int root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+
+ paging64_init_context_common(vcpu, context, root_level);
}

static void paging32_init_context(struct kvm_vcpu *vcpu,
@@ -4373,7 +4386,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
@@ -4387,7 +4400,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
context->nx = is_nx(vcpu);
- context->root_level = PT64_ROOT_4LEVEL;
+ context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
reset_rsvds_bits_mask(vcpu, context);
context->gva_to_gpa = paging64_gva_to_gpa;
} else if (is_pae(vcpu)) {
@@ -4444,7 +4458,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,

MMU_WARN_ON(VALID_PAGE(context->root_hpa));

- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->shadow_root_level = PT64_ROOT_4LEVEL;

context->nx = true;
context->ept_ad = accessed_dirty;
@@ -4453,7 +4467,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
context->update_pte = ept_update_pte;
- context->root_level = context->shadow_root_level;
+ context->root_level = PT64_ROOT_4LEVEL;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
context->base_role.ad_disabled = !accessed_dirty;
@@ -4498,7 +4512,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
} else if (is_long_mode(vcpu)) {
g_context->nx = is_nx(vcpu);
- g_context->root_level = PT64_ROOT_4LEVEL;
+ g_context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
reset_rsvds_bits_mask(vcpu, g_context);
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
} else if (is_pae(vcpu)) {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 57f80c5..4e6f1ee 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,7 @@
#define PT32_DIR_PSE36_MASK \
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

+#define PT64_ROOT_5LEVEL 5
#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2e6996d..d22ddbd 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -62,11 +62,11 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

- if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;

sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL);
+ __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
return;
}

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 99d4539..efd7538 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -567,7 +567,7 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}

-static int get_npt_level(void)
+static int get_npt_level(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
return PT64_ROOT_4LEVEL;
@@ -2389,7 +2389,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu.shadow_root_level = get_npt_level();
+ vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed1074e..614ade7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1200,6 +1200,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}

+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -4296,13 +4301,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}

+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+ if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ return VMX_EPT_MAX_GAW + 1;
+ return VMX_EPT_DEFAULT_GAW + 1;
+}
+
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
u64 eptp;

/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
- VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+ (get_ept_level(vcpu) - 1) << VMX_EPT_GAW_EPTP_SHIFT;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPT_AD_ENABLE_BIT;
@@ -9505,11 +9517,6 @@ static void __init vmx_check_processor_compat(void *rtn)
}
}

-static int get_ept_level(void)
-{
- return VMX_EPT_DEFAULT_GAW + 1;
-}
-
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6120670..0107ab7 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -62,6 +62,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
return cs_l;
}

+static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return (vcpu->arch.efer & EFER_LMA) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+#else
+ return 0;
+#endif
+}
+
static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
{
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
--
2.5.0

2017-08-17 11:59:20

by Yu Zhang

Subject: [PATCH v2 3/5] KVM: MMU: Rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL.

Now that 64-bit long mode can use both 4-level and 5-level page
tables, rename PT64_ROOT_LEVEL to PT64_ROOT_4LEVEL, so that
PT64_ROOT_5LEVEL can later be used for the 5-level page table;
this makes the code clearer.

Also introduce PT64_ROOT_MAX_LEVEL, defined as 4 for now, so that
it can simply be redefined to 5 when 5-level paging support is
added.

Signed-off-by: Yu Zhang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 +++-
arch/x86/kvm/mmu.c | 36 ++++++++++++++++++------------------
arch/x86/kvm/mmu.h | 2 +-
arch/x86/kvm/mmu_audit.c | 4 ++--
arch/x86/kvm/svm.c | 2 +-
5 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 018300e..7f70b8a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,8 +315,10 @@ struct kvm_pio_request {
int size;
};

+#define PT64_ROOT_MAX_LEVEL 4
+
struct rsvd_bits_validate {
- u64 rsvd_bits_mask[2][4];
+ u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
u64 bad_mt_xwr;
};

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7ee21c0..cd4d2cc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2167,8 +2167,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
}

struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
- unsigned int idx[PT64_ROOT_LEVEL];
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
};

#define for_each_sp(pvec, sp, parents, i) \
@@ -2383,8 +2383,8 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;

- if (iterator->level == PT64_ROOT_LEVEL &&
- vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+ if (iterator->level == PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu.direct_map)
--iterator->level;

@@ -3323,8 +3323,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
- (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
+ (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;

@@ -3376,10 +3376,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;

- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3420,14 +3420,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;

MMU_WARN_ON(VALID_PAGE(root));

spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
@@ -3442,7 +3442,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

for (i = 0; i < 4; ++i) {
@@ -3475,7 +3475,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* If we shadow a 32 bit page table with a long mode page
* table we enter this path.
*/
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
if (vcpu->arch.mmu.lm_root == NULL) {
/*
* The additional page necessary for this is only
@@ -3520,7 +3520,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)

vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -3596,7 +3596,7 @@ static bool
walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
struct kvm_shadow_walk_iterator iterator;
- u64 sptes[PT64_ROOT_LEVEL], spte = 0ull;
+ u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
int root, leaf;
bool reserved = false;

@@ -4022,7 +4022,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
- case PT64_ROOT_LEVEL:
+ case PT64_ROOT_4LEVEL:
rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
rsvd_bits(maxphyaddr, 51);
@@ -4332,7 +4332,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
static void paging64_init_context(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
- paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+ paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL);
}

static void paging32_init_context(struct kvm_vcpu *vcpu,
@@ -4387,7 +4387,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
context->nx = is_nx(vcpu);
- context->root_level = PT64_ROOT_LEVEL;
+ context->root_level = PT64_ROOT_4LEVEL;
reset_rsvds_bits_mask(vcpu, context);
context->gva_to_gpa = paging64_gva_to_gpa;
} else if (is_pae(vcpu)) {
@@ -4498,7 +4498,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
} else if (is_long_mode(vcpu)) {
g_context->nx = is_nx(vcpu);
- g_context->root_level = PT64_ROOT_LEVEL;
+ g_context->root_level = PT64_ROOT_4LEVEL;
reset_rsvds_bits_mask(vcpu, g_context);
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
} else if (is_pae(vcpu)) {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1cd0fcb..57f80c5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,7 +37,7 @@
#define PT32_DIR_PSE36_MASK \
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

-#define PT64_ROOT_LEVEL 4
+#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index dcce533..2e6996d 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -62,11 +62,11 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;

sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL);
return;
}

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9def4a8..99d4539 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -570,7 +570,7 @@ static inline void invlpga(unsigned long addr, u32 asid)
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
+ return PT64_ROOT_4LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
--
2.5.0

2017-08-17 11:59:48

by Yu Zhang

Subject: [PATCH v2 5/5] KVM: MMU: Expose the LA57 feature to VM.

This patch exposes the 5-level page table feature (LA57) to the VM.
At the same time, the canonical virtual address checking is extended
to support both 48-bit and 57-bit address widths.
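
As a worked example of the widened check (illustrative values;
get_canonical() is the helper this patch adds to x86.h):

	u64 la = 0x00ffffffffffffffULL;	/* bits 0-55 set, bit 56 clear */

	get_canonical(la, 48);	/* sign-extends bit 47: result != la,
				 * so la is non-canonical for 48-bit
				 * (4-level) addressing */
	get_canonical(la, 57);	/* sign-extends bit 56: result == la,
				 * so la is canonical under LA57 */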

Signed-off-by: Yu Zhang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 18 ++----------------
arch/x86/kvm/cpuid.c | 16 ++++++++++------
arch/x86/kvm/emulate.c | 12 ++++++------
arch/x86/kvm/kvm_cache_regs.h | 2 +-
arch/x86/kvm/vmx.c | 8 ++++----
arch/x86/kvm/x86.c | 7 +++++--
arch/x86/kvm/x86.h | 34 ++++++++++++++++++++++++++++++++++
7 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 34b0313..7a0f12b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,8 +85,8 @@
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
- | X86_CR4_PKE))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)

@@ -1299,20 +1299,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}

-static inline u64 get_canonical(u64 la)
-{
- return ((int64_t)la << 16) >> 16;
-}
-
-static inline bool is_noncanonical_address(u64 la)
-{
-#ifdef CONFIG_X86_64
- return get_canonical(la) != la;
-#else
- return false;
-#endif
-}
-
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 65da75d..126b726 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -126,13 +126,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);

/*
- * The existing code assumes virtual address is 48-bit in the canonical
- * address checks; exit if it is ever changed.
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
*/
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best && ((best->eax & 0xff00) >> 8) != 48 &&
- ((best->eax & 0xff00) >> 8) != 0)
- return -EINVAL;
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }

/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -388,7 +391,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,

/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+ F(AVX512VBMI) | F(LA57) | F(PKU) |
+ 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);

/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3e534d..03b462f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -696,7 +696,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
- if (is_noncanonical_address(la))
+ if (emul_is_noncanonical_address(la, ctxt))
goto bad;

*max_size = min_t(u64, ~0u, (1ull << 48) - la);
@@ -1750,8 +1750,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
- if (is_noncanonical_address(get_desc_base(&seg_desc) |
- ((u64)base3 << 32)))
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt))
return emulate_gp(ctxt, 0);
}
load:
@@ -2844,8 +2844,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
- if (is_noncanonical_address(rcx) ||
- is_noncanonical_address(rdx))
+ if (emul_is_noncanonical_address(rcx, ctxt) ||
+ emul_is_noncanonical_address(rdx, ctxt))
return emulate_gp(ctxt, 0);
break;
}
@@ -3760,7 +3760,7 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
if (rc != X86EMUL_CONTINUE)
return rc;
if (ctxt->mode == X86EMUL_MODE_PROT64 &&
- is_noncanonical_address(desc_ptr.address))
+ emul_is_noncanonical_address(desc_ptr.address, ctxt))
return emulate_gp(ctxt, 0);
if (lgdt)
ctxt->ops->set_gdt(ctxt, &desc_ptr);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 762cdf2..0052317 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -4,7 +4,7 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)

static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 614ade7..ee0e5b7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -122,7 +122,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -3368,7 +3368,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
(!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK) ||
+ if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
@@ -7034,7 +7034,7 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
- exn = is_noncanonical_address(*ret);
+ exn = is_noncanonical_address(*ret, vcpu);
} else if (is_protmode(vcpu)) {
/* Protected mode: apply checks for segment validity in the
* following order:
@@ -7839,7 +7839,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)

switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
- if (is_noncanonical_address(operand.gla)) {
+ if (is_noncanonical_address(operand.gla, vcpu)) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa3041f..7fb969c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -769,6 +769,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
return 1;

+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
+ return 1;
+
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
@@ -1074,7 +1077,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_KERNEL_GS_BASE:
case MSR_CSTAR:
case MSR_LSTAR:
- if (is_noncanonical_address(msr->data))
+ if (is_noncanonical_address(msr->data, vcpu))
return 1;
break;
case MSR_IA32_SYSENTER_EIP:
@@ -1091,7 +1094,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- msr->data = get_canonical(msr->data);
+ msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
}
return kvm_x86_ops->set_msr(vcpu, msr);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 0107ab7..63ed0d0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -97,6 +97,40 @@ static inline u32 bit(int bitno)
return 1 << (bitno & 31);
}

+static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u64 get_canonical(u64 la, u8 vaddr_bits)
+{
+ return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+#else
+ return false;
+#endif
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+#else
+ return false;
+#endif
+}
+
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
gva_t gva, gfn_t gfn, unsigned access)
{
--
2.5.0

2017-08-17 12:00:33

by Yu Zhang

Subject: [PATCH v2 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

Currently, KVM uses CR3_L_MODE_RESERVED_BITS to check the
reserved bits in CR3. Yet the range of reserved bits in the
guest's CR3 should be based on the physical address width
exposed to the VM. This patch changes the CR3 check logic to
calculate the reserved bits at runtime.
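
For example (illustrative numbers, using the rsvd_bits() helper
below): a guest with MAXPHYADDR = 46 would be checked against

	rsvd_bits(46, 62)	/* bits 46-62, i.e. 0x7fffc00000000000 */

rather than the fixed CR3_L_MODE_RESERVED_BITS mask.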

Signed-off-by: Yu Zhang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 -
arch/x86/kvm/emulate.c | 13 +++++++++++--
arch/x86/kvm/mmu.h | 3 +++
arch/x86/kvm/x86.c | 8 ++++----
4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e4862e..018300e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,6 @@
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))

-#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
#define CR3_PCID_INVD BIT_64(63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 46daa37..f3e534d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -29,6 +29,7 @@
#include "x86.h"
#include "tss.h"
#include "cpuid.h"
+#include "mmu.h"

/*
* Operand types
@@ -4100,8 +4101,16 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
u64 rsvd = 0;

ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+ if (efer & EFER_LMA) {
+ u64 maxphyaddr;
+ u32 eax = 0x80000008;
+
+ if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
+ NO_CHECK_LIMIT)) {
+ maxphyaddr = eax & 0xff;
+ rsvd = rsvd_bits(maxphyaddr, 62);
+ }
+ }

if (new_val & rsvd)
return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a..1cd0fcb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,9 @@

static inline u64 rsvd_bits(int s, int e)
{
+ if (e < s)
+ return 0;
+
return ((1ULL << (e - s + 1)) - 1) << s;
}

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ee99fc1..fa3041f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -813,10 +813,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}

- if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
- } else if (is_pae(vcpu) && is_paging(vcpu) &&
+ if (is_long_mode(vcpu) &&
+ (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
+ return 1;
+ else if (is_pae(vcpu) && is_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;

--
2.5.0

2017-08-17 12:29:19

by Paolo Bonzini

Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

On 17/08/2017 21:52, Yu Zhang wrote:
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index ac15193..3e759cf 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
> int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
> struct kvm_cpuid2 *cpuid,
> struct kvm_cpuid_entry2 __user *entries);
> -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
> +
> +enum {
> + NO_CHECK_LIMIT = 0,
> + CHECK_LIMIT = 1,
> +};

emulate.c should not include cpuid.h. The argument can be simply a
bool, though.
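
I.e., roughly (an illustrative signature only):

	bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
		       u32 *ecx, u32 *edx, bool check_limit);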

> +bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
> + u32 *ecx, u32 *edx, int check_limit);
>
> int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index fb00559..46daa37 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -28,6 +28,7 @@
>
> #include "x86.h"
> #include "tss.h"
> +#include "cpuid.h"
>
> /*
> * Operand types
> @@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
>
> eax = 0x80000001;
> ecx = 0;
> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
> - return edx & bit(X86_FEATURE_LM);
> + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT))
> + return edx & bit(X86_FEATURE_LM);
> + else
> + return 0;
> }
>
> #define GET_SMSTATE(type, smbase, offset) \
> @@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
> u32 eax, ebx, ecx, edx;
>
> eax = ecx = 0;
> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
> return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
> && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
> && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
> @@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
>
> eax = 0x00000000;
> ecx = 0x00000000;
> - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
> + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
> /*
> * Intel ("GenuineIntel")
> * remark: Intel CPUs only support "syscall" in 64bit
> @@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
> /*
> * Check MOVBE is set in the guest-visible CPUID leaf.
> */
> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);

This should be NO_CHECK_LIMIT.

Otherwise okay!

Paolo


2017-08-17 12:31:23

by Paolo Bonzini

Subject: Re: [PATCH v2 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.

On 17/08/2017 21:52, Yu Zhang wrote:
> + if (efer & EFER_LMA) {
> + u64 maxphyaddr;
> + u32 eax = 0x80000008;
> +
> + if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
> + NO_CHECK_LIMIT)) {
> + maxphyaddr = eax & 0xff;
> + rsvd = rsvd_bits(maxphyaddr, 62);
> + }

You should use 36 here if ctxt->ops->get_cpuid returns false, for
consistency with cpuid_query_maxphyaddr.
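
Illustratively, the suggested fallback might look like this (an
untested sketch of the suggestion, not literal patch code):

	if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
				 NO_CHECK_LIMIT))
		maxphyaddr = eax & 0xff;
	else
		maxphyaddr = 36;	/* cpuid_query_maxphyaddr() fallback */
	rsvd = rsvd_bits(maxphyaddr, 62);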

Paolo

2017-08-17 12:45:49

by Yu Zhang

Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().



On 8/17/2017 8:29 PM, Paolo Bonzini wrote:
> On 17/08/2017 21:52, Yu Zhang wrote:
>> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>> index ac15193..3e759cf 100644
>> --- a/arch/x86/kvm/cpuid.h
>> +++ b/arch/x86/kvm/cpuid.h
>> @@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
>> int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
>> struct kvm_cpuid2 *cpuid,
>> struct kvm_cpuid_entry2 __user *entries);
>> -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
>> +
>> +enum {
>> + NO_CHECK_LIMIT = 0,
>> + CHECK_LIMIT = 1,
>> +};
> emulate.c should not include cpuid.h. The argument can be simply a
> bool, though.

Thanks, Paolo.
So we just use true/false in emulate.c & svm.c, is this OK?
BTW could you please

>> +bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
>> + u32 *ecx, u32 *edx, int check_limit);
>>
>> int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
>>
>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
>> index fb00559..46daa37 100644
>> --- a/arch/x86/kvm/emulate.c
>> +++ b/arch/x86/kvm/emulate.c
>> @@ -28,6 +28,7 @@
>>
>> #include "x86.h"
>> #include "tss.h"
>> +#include "cpuid.h"
>>
>> /*
>> * Operand types
>> @@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
>>
>> eax = 0x80000001;
>> ecx = 0;
>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>> - return edx & bit(X86_FEATURE_LM);
>> + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT))
>> + return edx & bit(X86_FEATURE_LM);
>> + else
>> + return 0;
>> }
>>
>> #define GET_SMSTATE(type, smbase, offset) \
>> @@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
>> u32 eax, ebx, ecx, edx;
>>
>> eax = ecx = 0;
>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
>> return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
>> && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
>> && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
>> @@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
>>
>> eax = 0x00000000;
>> ecx = 0x00000000;
>> - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>> + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
>> /*
>> * Intel ("GenuineIntel")
>> * remark: Intel CPUs only support "syscall" in 64bit
>> @@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
>> /*
>> * Check MOVBE is set in the guest-visible CPUID leaf.
>> */
>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
> This should be NO_CHECK_LIMIT.
>
> Otherwise okay!

Then I guess check_fxsr() should also use NO_CHECK_LIMIT ('false' for a
bool argument), because it is also for eax=1?

And what about svm_vcpu_reset()?

I am not sure whether leaf 1 is always available. And if the answer is
yes, I do not think any of these 3 places
(em_movbe/check_fxsr/svm_vcpu_reset) will need to fall back to
check_cpuid_limit(), nor do we need to check the return value of
get_cpuid(). Do you agree? :-)

Yu


2017-08-17 12:47:36

by Yu Zhang

Subject: Re: [PATCH v2 2/5] KVM: MMU: check guest CR3 reserved bits based on its physical address width.



On 8/17/2017 8:31 PM, Paolo Bonzini wrote:
> On 17/08/2017 21:52, Yu Zhang wrote:
>> + if (efer & EFER_LMA) {
>> + u64 maxphyaddr;
>> + u32 eax = 0x80000008;
>> +
>> + if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, NULL,
>> + NO_CHECK_LIMIT)) {
>> + maxphyaddr = eax & 0xff;
>> + rsvd = rsvd_bits(maxphyaddr, 62);
>> + }
> You should use 36 here if ctxt->ops->get_cpuid returns false, for
> consistency with cpuid_query_maxphyaddr.

Oh, right. Thanks! :-)

Yu

2017-08-17 12:56:00

by Yu Zhang

Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().



On 8/17/2017 8:23 PM, Yu Zhang wrote:
>
>
> On 8/17/2017 8:29 PM, Paolo Bonzini wrote:
>> On 17/08/2017 21:52, Yu Zhang wrote:
>>> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>>> index ac15193..3e759cf 100644
>>> --- a/arch/x86/kvm/cpuid.h
>>> +++ b/arch/x86/kvm/cpuid.h
>>> @@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
>>> int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
>>> struct kvm_cpuid2 *cpuid,
>>> struct kvm_cpuid_entry2 __user *entries);
>>> -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx,
>>> u32 *edx);
>>> +
>>> +enum {
>>> + NO_CHECK_LIMIT = 0,
>>> + CHECK_LIMIT = 1,
>>> +};
>> emulate.c should not include cpuid.h. The argument can be simply a
>> bool, though.
>
> Thanks, Paolo.
> So we just use true/false in emulate.c & svm.c, is this OK?
> BTW could you please
Sorry for the unfinished line. I was wondering, why can't emulate.c
include cpuid.h?

Yu

2017-08-17 13:17:34

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

On 17/08/2017 14:23, Yu Zhang wrote:
>
>
> On 8/17/2017 8:29 PM, Paolo Bonzini wrote:
>> On 17/08/2017 21:52, Yu Zhang wrote:
>>> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>>> index ac15193..3e759cf 100644
>>> --- a/arch/x86/kvm/cpuid.h
>>> +++ b/arch/x86/kvm/cpuid.h
>>> @@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
>>> int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
>>> struct kvm_cpuid2 *cpuid,
>>> struct kvm_cpuid_entry2 __user *entries);
>>> -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx,
>>> u32 *edx);
>>> +
>>> +enum {
>>> + NO_CHECK_LIMIT = 0,
>>> + CHECK_LIMIT = 1,
>>> +};
>> emulate.c should not include cpuid.h. The argument can be simply a
>> bool, though.
>
> Thanks, Paolo.
> So we just use true/false in emulate.c & svm.c, is this OK?
> BTW could you please
>
>>> +bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
>>> + u32 *ecx, u32 *edx, int check_limit);
>>> int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
>>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
>>> index fb00559..46daa37 100644
>>> --- a/arch/x86/kvm/emulate.c
>>> +++ b/arch/x86/kvm/emulate.c
>>> @@ -28,6 +28,7 @@
>>> #include "x86.h"
>>> #include "tss.h"
>>> +#include "cpuid.h"
>>> /*
>>> * Operand types
>>> @@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct
>>> x86_emulate_ctxt *ctxt)
>>> eax = 0x80000001;
>>> ecx = 0;
>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>> - return edx & bit(X86_FEATURE_LM);
>>> + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx,
>>> NO_CHECK_LIMIT))
>>> + return edx & bit(X86_FEATURE_LM);
>>> + else
>>> + return 0;
>>> }
>>> #define GET_SMSTATE(type, smbase, offset) \
>>> @@ -2636,7 +2639,7 @@ static bool vendor_intel(struct
>>> x86_emulate_ctxt *ctxt)
>>> u32 eax, ebx, ecx, edx;
>>> eax = ecx = 0;
>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
>>> return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
>>> && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
>>> && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
>>> @@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct
>>> x86_emulate_ctxt *ctxt)
>>> eax = 0x00000000;
>>> ecx = 0x00000000;
>>> - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>> + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
>>> /*
>>> * Intel ("GenuineIntel")
>>> * remark: Intel CPUs only support "syscall" in 64bit
>>> @@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
>>> /*
>>> * Check MOVBE is set in the guest-visible CPUID leaf.
>>> */
>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
>> This should be NO_CHECK_LIMIT.
>>
>> Otherwise okay!
>
> Then I guess check_fxsr() should also use NO_CHECK_LIMIT ('false' for a
> bool argument), because it's also for eax=1?

Good point.

> And what about svm_vcpu_reset()?

No, this one should be left as is, it's just writing a register and not
checking a feature.

> I am not sure if leaf 1 is always available. And if the answer is yes, I
> do not think any of these 3 places (em_movbe/check_fxsr/svm_vcpu_reset) will
> need to fall back to check_cpuid_limit(),
> nor do we need to check the return value of get_cpuid(). Do you agree? :-)

I think the answer is no, but you don't need to check the return value
because testing against 0 is okay (if best is NULL, get_cpuid returns 0
for eax/ebx/ecx/edx).
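
So a feature test can stay as simple as, say (an illustrative sketch
only, reusing the v2 flag name):

	u32 eax = 0x80000001, ebx, ecx = 0, edx;

	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
	/* If the leaf is absent, edx is 0 and the bit test simply fails. */
	return edx & bit(X86_FEATURE_LM);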

Paolo

>
> Yu
>
>>
>> Paolo
>>
>>> if (!(ecx & FFL(MOVBE)))
>>> return emulate_ud(ctxt);
>>> @@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt
>>> *ctxt)
>>> eax = reg_read(ctxt, VCPU_REGS_RAX);
>>> ecx = reg_read(ctxt, VCPU_REGS_RCX);
>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
>>> *reg_write(ctxt, VCPU_REGS_RAX) = eax;
>>> *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
>>> *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
>>> @@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt
>>> *ctxt)
>>> {
>>> u32 eax = 1, ebx, ecx = 0, edx;
>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
>>> if (!(edx & FFL(FXSR)))
>>> return emulate_ud(ctxt);
>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>> index 1fa9ee5..9def4a8 100644
>>> --- a/arch/x86/kvm/svm.c
>>> +++ b/arch/x86/kvm/svm.c
>>> @@ -1580,7 +1580,7 @@ static void svm_vcpu_reset(struct kvm_vcpu
>>> *vcpu, bool init_event)
>>> }
>>> init_vmcb(svm);
>>> - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
>>> + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, CHECK_LIMIT);
>>> kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
>>> if (kvm_vcpu_apicv_active(vcpu) && !init_event)
>>> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
>>> index 0a6cc67..8a202c4 100644
>>> --- a/arch/x86/kvm/trace.h
>>> +++ b/arch/x86/kvm/trace.h
>>> @@ -151,8 +151,8 @@ TRACE_EVENT(kvm_fast_mmio,
>>> */
>>> TRACE_EVENT(kvm_cpuid,
>>> TP_PROTO(unsigned int function, unsigned long rax, unsigned
>>> long rbx,
>>> - unsigned long rcx, unsigned long rdx),
>>> - TP_ARGS(function, rax, rbx, rcx, rdx),
>>> + unsigned long rcx, unsigned long rdx, bool found),
>>> + TP_ARGS(function, rax, rbx, rcx, rdx, found),
>>> TP_STRUCT__entry(
>>> __field( unsigned int, function )
>>> @@ -160,6 +160,7 @@ TRACE_EVENT(kvm_cpuid,
>>> __field( unsigned long, rbx )
>>> __field( unsigned long, rcx )
>>> __field( unsigned long, rdx )
>>> + __field( bool, found )
>>> ),
>>> TP_fast_assign(
>>> @@ -168,11 +169,13 @@ TRACE_EVENT(kvm_cpuid,
>>> __entry->rbx = rbx;
>>> __entry->rcx = rcx;
>>> __entry->rdx = rdx;
>>> + __entry->found = found;
>>> ),
>>> - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
>>> + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry
>>> %s",
>>> __entry->function, __entry->rax,
>>> - __entry->rbx, __entry->rcx, __entry->rdx)
>>> + __entry->rbx, __entry->rcx, __entry->rdx,
>>> + __entry->found ? "found" : "not found")
>>> );
>>> #define AREG(x) { APIC_##x, "APIC_" #x }
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index e40a779..ee99fc1 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -5213,10 +5213,10 @@ static int emulator_intercept(struct
>>> x86_emulate_ctxt *ctxt,
>>> return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info,
>>> stage);
>>> }
>>> -static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
>>> - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
>>> +static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
>>> + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, int check_limit)
>>> {
>>> - kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
>>> + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx,
>>> check_limit);
>>> }
>>> static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt,
>>> unsigned reg)
>>>
>>
>

2017-08-17 13:18:29

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

On 17/08/2017 14:33, Yu Zhang wrote:
>>>>
>>>> +enum {
>>>> + NO_CHECK_LIMIT = 0,
>>>> + CHECK_LIMIT = 1,
>>>> +};
>>> emulate.c should not include cpuid.h. The argument can be simply a
>>> bool, though.
>>
>> Thanks, Paolo.
>> So we just use true/false in emulate.c & svm.c, is this OK?

I would use true/false everywhere.
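
That is, something like this for the prototype (a sketch):

	bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
		       u32 *ecx, u32 *edx, bool check_limit);

Callers that want the check_cpuid_limit() fallback pass true, the others
pass false.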

>> BTW could you please
> Sorry for the unfinished line. I was wondering, why can't emulate.c
> include cpuid.h?

The emulator should be separate from the rest of KVM, in principle it
could be used by userspace too. So its interface should be as limited
as possible.

Paolo

2017-08-17 13:43:08

by Yu Zhang

[permalink] [raw]
Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().



On 8/17/2017 9:17 PM, Paolo Bonzini wrote:
> On 17/08/2017 14:23, Yu Zhang wrote:
>>
>> On 8/17/2017 8:29 PM, Paolo Bonzini wrote:
>>> On 17/08/2017 21:52, Yu Zhang wrote:
>>>> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>>>> index ac15193..3e759cf 100644
>>>> --- a/arch/x86/kvm/cpuid.h
>>>> +++ b/arch/x86/kvm/cpuid.h
>>>> @@ -21,7 +21,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
>>>> int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
>>>> struct kvm_cpuid2 *cpuid,
>>>> struct kvm_cpuid_entry2 __user *entries);
>>>> -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx,
>>>> u32 *edx);
>>>> +
>>>> +enum {
>>>> + NO_CHECK_LIMIT = 0,
>>>> + CHECK_LIMIT = 1,
>>>> +};
>>> emulate.c should not include cpuid.h. The argument can be simply a
>>> bool, though.
>> Thanks, Paolo.
>> So we just use true/false in emulate.c & svm.c, is this OK?
>> BTW could you please
>>
>>>> +bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
>>>> + u32 *ecx, u32 *edx, int check_limit);
>>>> int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
>>>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
>>>> index fb00559..46daa37 100644
>>>> --- a/arch/x86/kvm/emulate.c
>>>> +++ b/arch/x86/kvm/emulate.c
>>>> @@ -28,6 +28,7 @@
>>>> #include "x86.h"
>>>> #include "tss.h"
>>>> +#include "cpuid.h"
>>>> /*
>>>> * Operand types
>>>> @@ -2333,8 +2334,10 @@ static int emulator_has_longmode(struct
>>>> x86_emulate_ctxt *ctxt)
>>>> eax = 0x80000001;
>>>> ecx = 0;
>>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>>> - return edx & bit(X86_FEATURE_LM);
>>>> + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx,
>>>> NO_CHECK_LIMIT))
>>>> + return edx & bit(X86_FEATURE_LM);
>>>> + else
>>>> + return 0;
>>>> }
>>>> #define GET_SMSTATE(type, smbase, offset) \
>>>> @@ -2636,7 +2639,7 @@ static bool vendor_intel(struct
>>>> x86_emulate_ctxt *ctxt)
>>>> u32 eax, ebx, ecx, edx;
>>>> eax = ecx = 0;
>>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
>>>> return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
>>>> && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
>>>> && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
>>>> @@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct
>>>> x86_emulate_ctxt *ctxt)
>>>> eax = 0x00000000;
>>>> ecx = 0x00000000;
>>>> - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>>> + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, NO_CHECK_LIMIT);
>>>> /*
>>>> * Intel ("GenuineIntel")
>>>> * remark: Intel CPUs only support "syscall" in 64bit
>>>> @@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
>>>> /*
>>>> * Check MOVBE is set in the guest-visible CPUID leaf.
>>>> */
>>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
>>> This should be NO_CHECK_LIMIT.
>>>
>>> Otherwise okay!
>> Then I guess check_fxsr() should also use NO_CHECK_LIMIT ('false' for a
>> bool argument), because it's also for eax=1?
> Good point.
>
>> And what about svm_vcpu_reset()?
> No, this one should be left as is, it's just writing a register and not
> checking a feature.

Got it. Thanks.

>
>> I am not sure if leaf 1 is always available. And if the answer is yes, I
>> do not think any of these 3 places (em_movbe/check_fxsr/svm_vcpu_reset) will
>> need to fall back to check_cpuid_limit(),
>> nor do we need to check the return value of get_cpuid(). Do you agree? :-)
> I think the answer is no, but you don't need to check the return value
> because testing against 0 is okay (if best is NULL, get_cpuid returns 0
> for eax/ebx/ecx/edx).

OK. And to still return 0 for eax/ebx/ecx/edx when check_cpuid_limit() is
skipped, I'd better refactor this patch and move the "out:" label before
the if statement. :-)

		best = check_cpuid_limit(vcpu, function, index);
	}

+out:
	if (best) {
		*eax = best->eax;
		*ebx = best->ebx;
@@ -887,7 +888,6 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
	} else
		*eax = *ebx = *ecx = *edx = 0;

-out:
	trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
	return entry_found;
}
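
That is, with the label moved up, the not-found path shares the register
zeroing and the tracepoint with the found path, so the tail of
kvm_cpuid() would end up looking like (sketch):

out:
	if (best) {
		*eax = best->eax;
		*ebx = best->ebx;
		*ecx = best->ecx;
		*edx = best->edx;
	} else
		*eax = *ebx = *ecx = *edx = 0;

	trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
	return entry_found;
}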

And for all get_cpuid() callers that are testing for the existence of a
feature, we do not need to check the return value; just checking the flag
in the register should be fine, correct?

Yu

>
> Paolo
>
>> Yu
>>
>>> Paolo
>>>
>>>> if (!(ecx & FFL(MOVBE)))
>>>> return emulate_ud(ctxt);
>>>> @@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt
>>>> *ctxt)
>>>> eax = reg_read(ctxt, VCPU_REGS_RAX);
>>>> ecx = reg_read(ctxt, VCPU_REGS_RCX);
>>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
>>>> *reg_write(ctxt, VCPU_REGS_RAX) = eax;
>>>> *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
>>>> *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
>>>> @@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt
>>>> *ctxt)
>>>> {
>>>> u32 eax = 1, ebx, ecx = 0, edx;
>>>> - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>>>> + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, CHECK_LIMIT);
>>>> if (!(edx & FFL(FXSR)))
>>>> return emulate_ud(ctxt);
>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>> index 1fa9ee5..9def4a8 100644
>>>> --- a/arch/x86/kvm/svm.c
>>>> +++ b/arch/x86/kvm/svm.c
>>>> @@ -1580,7 +1580,7 @@ static void svm_vcpu_reset(struct kvm_vcpu
>>>> *vcpu, bool init_event)
>>>> }
>>>> init_vmcb(svm);
>>>> - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
>>>> + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, CHECK_LIMIT);
>>>> kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
>>>> if (kvm_vcpu_apicv_active(vcpu) && !init_event)
>>>> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
>>>> index 0a6cc67..8a202c4 100644
>>>> --- a/arch/x86/kvm/trace.h
>>>> +++ b/arch/x86/kvm/trace.h
>>>> @@ -151,8 +151,8 @@ TRACE_EVENT(kvm_fast_mmio,
>>>> */
>>>> TRACE_EVENT(kvm_cpuid,
>>>> TP_PROTO(unsigned int function, unsigned long rax, unsigned
>>>> long rbx,
>>>> - unsigned long rcx, unsigned long rdx),
>>>> - TP_ARGS(function, rax, rbx, rcx, rdx),
>>>> + unsigned long rcx, unsigned long rdx, bool found),
>>>> + TP_ARGS(function, rax, rbx, rcx, rdx, found),
>>>> TP_STRUCT__entry(
>>>> __field( unsigned int, function )
>>>> @@ -160,6 +160,7 @@ TRACE_EVENT(kvm_cpuid,
>>>> __field( unsigned long, rbx )
>>>> __field( unsigned long, rcx )
>>>> __field( unsigned long, rdx )
>>>> + __field( bool, found )
>>>> ),
>>>> TP_fast_assign(
>>>> @@ -168,11 +169,13 @@ TRACE_EVENT(kvm_cpuid,
>>>> __entry->rbx = rbx;
>>>> __entry->rcx = rcx;
>>>> __entry->rdx = rdx;
>>>> + __entry->found = found;
>>>> ),
>>>> - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
>>>> + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry
>>>> %s",
>>>> __entry->function, __entry->rax,
>>>> - __entry->rbx, __entry->rcx, __entry->rdx)
>>>> + __entry->rbx, __entry->rcx, __entry->rdx,
>>>> + __entry->found ? "found" : "not found")
>>>> );
>>>> #define AREG(x) { APIC_##x, "APIC_" #x }
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index e40a779..ee99fc1 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -5213,10 +5213,10 @@ static int emulator_intercept(struct
>>>> x86_emulate_ctxt *ctxt,
>>>> return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info,
>>>> stage);
>>>> }
>>>> -static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
>>>> - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
>>>> +static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
>>>> + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, int check_limit)
>>>> {
>>>> - kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
>>>> + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx,
>>>> check_limit);
>>>> }
>>>> static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt,
>>>> unsigned reg)
>>>>
>

2017-08-17 14:30:11

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2 1/5] KVM: x86: Add return value to kvm_cpuid().

On 17/08/2017 15:20, Yu Zhang wrote:
>>
>
> OK. And to still return 0 for eax/ebx/ecx/edx when check_cpuid_limit() is
> skipped, I'd better refactor this patch and move the "out:" label before
> the if statement. :-)
>
> 		best = check_cpuid_limit(vcpu, function, index);
> 	}
>
> +out:
> 	if (best) {
> 		*eax = best->eax;
> 		*ebx = best->ebx;
> @@ -887,7 +888,6 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
> 	} else
> 		*eax = *ebx = *ecx = *edx = 0;
>
> -out:
> 	trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
> 	return entry_found;
> }
>
> And for all get_cpuid() callers that are testing for the existence of a
> feature, we do not need to check the return value; just checking the flag
> in the register should be fine, correct?

Yes, correct!

Paolo