2018-02-05 07:00:44

by Wanpeng Li

Subject: [PATCH v2 1/2] KVM: X86: Add per-VM no-HLT-exiting capability

From: Wanpeng Li <[email protected]>

If host CPUs are dedicated to a VM, we can avoid VM exits on HLT.
This patch adds the per-VM non-HLT-exiting capability.

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
v1 -> v2:
* vmx_clear_hlt() around INIT handling
* vmx_clear_hlt() upon SMI and implement auto halt restart
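
For reference, a minimal userspace sketch of how a VMM could enable this
(the helper name and the lack of error handling are illustrative only;
the ioctls and the args[0] convention are the ones this patch implements):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: ask KVM to stop intercepting HLT for this VM. */
static int enable_hlt_in_guest(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_GUEST_HLT,
	};

	cap.args[0] = 1;	/* non-zero: do not exit on guest HLT */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}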

Documentation/virtual/kvm/api.txt | 11 +++++++++++
arch/x86/include/asm/kvm_emulate.h | 1 +
arch/x86/include/asm/kvm_host.h | 7 +++++++
arch/x86/kvm/emulate.c | 2 ++
arch/x86/kvm/vmx.c | 38 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 27 +++++++++++++++++++++++----
arch/x86/kvm/x86.h | 5 +++++
include/uapi/linux/kvm.h | 1 +
8 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 023da07..865b029 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4302,6 +4302,17 @@ enables QEMU to build error log and branch to guest kernel registered
machine check handling routine. Without this capability KVM will
branch to guests' 0x200 interrupt vector.

+7.13 KVM_CAP_X86_GUEST_HLT
+
+Architectures: x86
+Parameters: args[0] - non-zero to disable HLT VM exits
+Returns: 0 on success
+
+This capability indicates that a guest using HLT to stop a virtual CPU
+will not cause a VM exit. As such, time spent while a virtual CPU is
+halted in this way is accounted as guest running time on the host;
+when it is enabled, KVM_FEATURE_PV_UNHALT should be disabled.
+
8. Other capabilities.
----------------------

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b24b1c8..78cfe8ca 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -225,6 +225,7 @@ struct x86_emulate_ops {
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+ void (*smm_auto_halt_restart)(struct x86_emulate_ctxt *ctxt);

};

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8f0f09a..95b2c44 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -623,6 +623,11 @@ struct kvm_vcpu_arch {
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
+ /*
+ * bit 0 is set if "auto HALT restart" was true on entry to SMM
+ * bit 1 is set if "auto HALT restart" is true when exiting SMM
+ */
+ int smm_auto_halt_restart;

struct kvm_mtrr mtrr_state;
u64 pat;
@@ -806,6 +811,8 @@ struct kvm_arch {

gpa_t wall_clock;

+ bool hlt_in_guest;
+
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d91eaeb..ee5bc65 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2597,6 +2597,8 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)

smbase = ctxt->ops->get_smbase(ctxt);

+ if (GET_SMSTATE(u16, smbase, 0x7f02) & 0x1)
+ ctxt->ops->smm_auto_halt_restart(ctxt);
/*
* Give pre_leave_smm() a chance to make ISA-specific changes to the
* vCPU state (e.g. enter guest mode) before loading state from the SMM
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3e71086..23789c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2474,6 +2474,24 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
return 0;
}

+static bool vmx_need_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ return kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT;
+}
+
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (vmx_need_clear_hlt(vcpu))
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2504,6 +2522,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
intr_info |= INTR_TYPE_HARD_EXCEPTION;

vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
}

static bool vmx_rdtscp_supported(void)
@@ -5359,6 +5379,8 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
exec_control |= CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_INVLPG_EXITING;
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
return exec_control;
}

@@ -5716,6 +5738,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
update_exception_bitmap(vcpu);

vpid_sync_context(vmx->vpid);
+ if (init_event)
+ vmx_clear_hlt(vcpu);
}

/*
@@ -5787,6 +5811,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
}

static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -5817,6 +5843,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)

vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
}

static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -12048,6 +12076,10 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)

vmx->nested.smm.vmxon = vmx->nested.vmxon;
vmx->nested.vmxon = false;
+ if (vmx_need_clear_hlt(vcpu)) {
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vcpu->arch.smm_auto_halt_restart = 0x1;
+ }
return 0;
}

@@ -12056,6 +12088,12 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;

+ if ((vcpu->arch.smm_auto_halt_restart & 0x3) == 0x3)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
+ else if (vcpu->arch.smm_auto_halt_restart & 0x1)
+ skip_emulated_instruction(vcpu);
+ vcpu->arch.smm_auto_halt_restart = 0;
+
if (vmx->nested.smm.vmxon) {
vmx->nested.vmxon = true;
vmx->nested.smm.vmxon = false;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05dbdba..1bdfdcf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2785,6 +2785,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_X86_GUEST_HLT:
r = 1;
break;
case KVM_CAP_ADJUST_CLOCK:
@@ -4106,6 +4107,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,

r = 0;
break;
+ case KVM_CAP_X86_GUEST_HLT:
+ kvm->arch.hlt_in_guest = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -5417,6 +5422,11 @@ static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
}

+static void emulator_smm_auto_halt_restart(struct x86_emulate_ctxt *ctxt)
+{
+ emul_to_vcpu(ctxt)->arch.smm_auto_halt_restart |= 0x2;
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5457,6 +5467,7 @@ static const struct x86_emulate_ops emulate_ops = {
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
.pre_leave_smm = emulator_pre_leave_smm,
+ .smm_auto_halt_restart = emulator_smm_auto_halt_restart,
};

static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6757,6 +6768,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)

put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));

+ if (vcpu->arch.smm_auto_halt_restart)
+ put_smstate(u16, buf, 0x7f02, 0x1);
+
/* revision id */
put_smstate(u32, buf, 0x7efc, 0x00020000);
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
@@ -6785,6 +6799,9 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));

+ if (vcpu->arch.smm_auto_halt_restart)
+ put_smstate(u16, buf, 0x7f02, 0x1);
+
put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);

/* revision id */
@@ -6828,10 +6845,6 @@ static void enter_smm(struct kvm_vcpu *vcpu)

trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- enter_smm_save_state_64(vcpu, buf);
- else
- enter_smm_save_state_32(vcpu, buf);

/*
* Give pre_enter_smm() a chance to make ISA-specific changes to the
@@ -6840,6 +6853,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
*/
kvm_x86_ops->pre_enter_smm(vcpu, buf);

+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, buf);
+ else
+ enter_smm_save_state_32(vcpu, buf);
+
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));

@@ -8029,6 +8047,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)

vcpu->arch.smi_pending = 0;
vcpu->arch.smi_count = 0;
+ vcpu->arch.smm_auto_halt_restart = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b91215d..96fe84e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -270,4 +270,9 @@ static inline bool kvm_mwait_in_guest(void)
!boot_cpu_has_bug(X86_BUG_MONITOR);
}

+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.hlt_in_guest;
+}
+
#endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ed5fb32..1a2b2da 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -935,6 +935,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_PPC_GET_CPU_CHAR 151
#define KVM_CAP_S390_BPB 152
#define KVM_CAP_HYPERV_EVENTFD 153
+#define KVM_CAP_X86_GUEST_HLT 154

#ifdef KVM_CAP_IRQ_ROUTING

--
2.7.4



2018-02-05 07:00:22

by Wanpeng Li

Subject: [PATCH v2 2/2] KVM: X86: Avoid traversing all the cpus for pv tlb flush when steal time is disabled

From: Wanpeng Li <[email protected]>

Avoid traversing all the CPUs for PV TLB flush when steal time
is disabled, since PV TLB flush depends on a field in the steal
time structure for its host/guest shared data.

Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kernel/kvm.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
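
For context, a simplified sketch of the existing PV flush path in
arch/x86/kernel/kvm.c of this era (not code added by this patch, and the
function name here is illustrative): kvm_flush_tlb_others() reads the
preempted byte of each destination CPU's kvm_steal_time record, and that
record is only registered with the host when steal time is enabled, hence
the added KVM_FEATURE_STEAL_TIME check.

static void pv_flush_prepare(struct cpumask *flushmask)
{
	int cpu;

	for_each_cpu(cpu, flushmask) {
		struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
		u8 state = READ_ONCE(src->preempted);

		/*
		 * A preempted vCPU gets KVM_VCPU_FLUSH_TLB set in its
		 * shared steal-time record; the host then flushes on that
		 * vCPU's next entry, so no IPI is needed for this CPU.
		 */
		if ((state & KVM_VCPU_PREEMPTED) &&
		    try_cmpxchg(&src->preempted, &state,
				state | KVM_VCPU_FLUSH_TLB))
			__cpumask_clear_cpu(cpu, flushmask);
	}
}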

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f2a09cf..4f3c997 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -546,7 +546,8 @@ static void __init kvm_guest_init(void)
}

if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED))
+ !kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;

if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -635,7 +636,8 @@ static __init int kvm_setup_pv_tlb_flush(void)
int cpu;

if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED)) {
+ !kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
GFP_KERNEL, cpu_to_node(cpu));
--
2.7.4


2018-02-13 05:03:48

by Wanpeng Li

Subject: Re: [PATCH v2 1/2] KVM: X86: Add per-VM no-HLT-exiting capability

Ping,
2018-02-05 14:57 GMT+08:00 Wanpeng Li <[email protected]>:
> From: Wanpeng Li <[email protected]>
>
> If host CPUs are dedicated to a VM, we can avoid VM exits on HLT.
> This patch adds the per-VM non-HLT-exiting capability.
>
> [...]

2018-02-13 16:03:52

by Paolo Bonzini

Subject: Re: [PATCH v2 1/2] KVM: X86: Add per-VM no-HLT-exiting capability

On 05/02/2018 07:57, Wanpeng Li wrote:
> From: Wanpeng Li <[email protected]>
>
> If host CPUs are dedicated to a VM, we can avoid VM exits on HLT.
> This patch adds the per-VM non-HLT-exiting capability.
>
> Cc: Paolo Bonzini <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Wanpeng Li <[email protected]>
> ---
> v1 -> v2:
> * vmx_clear_hlt() around INIT handling
> * vmx_clear_hlt() upon SMI and implement auto halt restart

Hi Wanpeng,

sorry I could not answer sooner.

We do not need to implement AutoHALT. It's messy functionality, and
the way it works is much simpler: on RSM the microcode reads AutoHALT's
bit 0 and... decrements RIP if it is 1. All you need to do, however, is
clear the activity state. Guests should expect anyway that "CLI;HLT"
can be interrupted by an NMI, and should follow the HLT with a JMP.
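
As a sketch of that guest-side idiom (not taken from the patch or the
thread): even with interrupts disabled, an NMI, or an SMM handler that
clears the auto-HALT-restart flag, can resume execution at the
instruction after HLT, so robust guests jump back instead of assuming
HLT never returns:

static void __attribute__((noreturn)) halt_forever(void)
{
	for (;;)
		asm volatile("cli; hlt");	/* i.e. cli; 1: hlt; jmp 1b */
}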

Second, I would prefer to implement MWAIT and PAUSE passthrough at the
same time, as in https://www.spinics.net/lists/kvm/msg159517.html:

> The three capabilities are more or less all doing the same thing.
> Perhaps it would make some sense to only leave PAUSE spin loops in
> guest, but not HLT/MWAIT; but apart from that I think users would
> probably enable all of them. So I think we should put in the
> documentation that blindly passing the KVM_CHECK_EXTENSION result to
> KVM_ENABLE_CAP is a valid thing to do when vCPUs are associated to
> dedicated physical CPUs.
>
> Let's get rid of KVM_CAP_X86_GUEST_MWAIT altogether and
> add a new capability. But let's use just one.
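
A sketch of the usage pattern described in the quote, assuming a single
capability whose KVM_CHECK_EXTENSION result is a bitmask of exits that
may be disabled; the name KVM_CAP_X86_DISABLE_EXITS is an assumption
here, since no such unified capability exists at this point:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int disable_exits(int kvm_fd, int vm_fd)
{
	struct kvm_enable_cap cap = { 0 };
	int mask = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);

	if (mask <= 0)
		return mask;		/* not supported */

	cap.cap = KVM_CAP_X86_DISABLE_EXITS;
	cap.args[0] = mask;		/* blindly enable everything offered */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}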

Thanks again for your work, and sorry for slightly contradicting Radim's
review. I've rebased and applied patch 2.

Paolo

2018-02-14 03:27:55

by Wanpeng Li

Subject: Re: [PATCH v2 1/2] KVM: X86: Add per-VM no-HLT-exiting capability

2018-02-14 0:02 GMT+08:00 Paolo Bonzini <[email protected]>:
> On 05/02/2018 07:57, Wanpeng Li wrote:
>> From: Wanpeng Li <[email protected]>
>>
>> If host CPUs are dedicated to a VM, we can avoid VM exits on HLT.
>> This patch adds the per-VM non-HLT-exiting capability.
>>
>> Cc: Paolo Bonzini <[email protected]>
>> Cc: Radim Krčmář <[email protected]>
>> Signed-off-by: Wanpeng Li <[email protected]>
>> ---
>> v1 -> v2:
>> * vmx_clear_hlt() around INIT handling
>> * vmx_clear_hlt() upon SMI and implement auto halt restart
>
> Hi Wanpeng,
>
> sorry I could not answer sooner.
>
> We do not need to implement AutoHALT. It's messy functionality, and
> the way it works is much simpler: on RSM the microcode reads AutoHALT's
> bit 0 and... decrements RIP if it is 1. All you need to do, however, is
> clear the activity state. Guests should expect anyway that "CLI;HLT"
> can be interrupted by an NMI, and should follow the HLT with a JMP.

Thanks for pointing that out.

>
> Second, I would prefer to implement MWAIT and PAUSE passthrough at the
> same time, as in https://www.spinics.net/lists/kvm/msg159517.html:

Understood.

>
>> The three capabilities are more or less all doing the same thing.
>> Perhaps it would make some sense to only leave PAUSE spin loops in
>> guest, but not HLT/MWAIT; but apart from that I think users would
>> probably enable all of them. So I think we should put in the
>> documentation that blindly passing the KVM_CHECK_EXTENSION result to
>> KVM_ENABLE_CAP is a valid thing to do when vCPUs are associated to
>> dedicated physical CPUs.
>>
>> Let's get rid of KVM_CAP_X86_GUEST_MWAIT altogether and
>> add a new capability. But let's use just one.
>
> Thanks again for your work, and sorry for slightly contradicting Radim's
> review. I've rebased and applied patch 2.

No problem. Your and Radim's reviews are always appreciated and helpful.

Regards,
Wanpeng Li