UMONITOR, UMWAIT and TPAUSE are a set of user wait instructions.
UMONITOR arms address monitoring hardware using an address. A store
to an address within the specified address range triggers the
monitoring hardware to wake up the processor waiting in umwait.
UMWAIT instructs the processor to enter an implementation-dependent
optimized state while monitoring a range of addresses. The optimized
state may be either a light-weight power/performance optimized state
(c0.1 state) or an improved power/performance optimized state
(c0.2 state).
TPAUSE instructs the processor to enter an implementation-dependent
optimized state c0.1 or c0.2 state and wake up when time-stamp counter
reaches specified timeout.
Availability of the user wait instructions is indicated by the presence
of the CPUID feature flag WAITPKG CPUID.0x07.0x0:ECX[5].
The patches enable the umonitor, umwait and tpause features in KVM.
Because umwait and tpause can put a (physical) CPU into a power saving
state, by default we don't expose it to kvm and provide a capability to
enable it. With this capability enabled, a VM can use UMONITOR, UMWAIT
and TPAUSE instructions. If the instruction causes a delay, the amount
of time delayed is called here the physical delay. The physical delay is
first computed by determining the virtual delay (the time to delay
relative to the VM’s timestamp counter). If the capability is not enabled,
UMONITOR, UMWAIT and TPAUSE cause an invalid-opcode exception (#UD).
The reference documentation is available at the link below:
https://software.intel.com/sites/default/files/\
managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf
This patch has a dependency on https://lkml.org/lkml/2019/1/16/909
Changelog:
v2:
Separated from the series https://lkml.org/lkml/2018/7/10/160
Provide a capability to enable UMONITOR, UMWAIT and TPAUSE
v1:
Sent out with MOVDIRI/MOVDIR64B instructions patches
Tao Xu (3):
KVM: x86: add support for user wait instructions
KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL
KVM: vmx: handle vm-exit for UMWAIT and TPAUSE
Documentation/virtual/kvm/api.txt | 12 ++++++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/asm/vmx.h | 1 +
arch/x86/include/uapi/asm/vmx.h | 6 ++-
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx/vmx.c | 62 +++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.h | 1 +
arch/x86/kvm/x86.c | 8 ++++
arch/x86/kvm/x86.h | 5 +++
include/uapi/linux/kvm.h | 1 +
10 files changed, 97 insertions(+), 2 deletions(-)
--
2.20.1
UMWAIT and TPAUSE instructions use IA32_UMWAIT_CONTROL at MSR index E1H
to determine the maximum time in TSC-quanta that the processor can reside
in either C0.1 or C0.2.
This patch emulates MSR IA32_UMWAIT_CONTROL in the guest and
differentiates IA32_UMWAIT_CONTROL between host and guest.
Co-developed-by: Jingqi Liu <[email protected]>
Signed-off-by: Jingqi Liu <[email protected]>
Signed-off-by: Tao Xu <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 42 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.h | 1 +
arch/x86/kvm/x86.c | 1 +
3 files changed, 44 insertions(+)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a65ee7ea47b4..49e107692aee 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1676,6 +1676,14 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_enable_usr_wait_pause(vmx->vcpu.kvm) ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG)))
+ return 1;
+
+ msr_info->data = vmx->msr_ia32_umwait_control;
+ break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -1838,6 +1846,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_enable_usr_wait_pause(vmx->vcpu.kvm) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG))
+ return 1;
+
+ if (!data)
+ break;
+
+ vmx->msr_ia32_umwait_control = data;
+ break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -4085,6 +4103,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
+ vmx->msr_ia32_umwait_control = 0;
+
vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -4123,6 +4143,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
+ vmx->msr_ia32_umwait_control = 0;
+
vcpu->arch.microcode_version = 0x100000000ULL;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(vcpu, 0);
@@ -6327,6 +6349,24 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
+static void atomic_switch_ia32_umwait_control(struct vcpu_vmx *vmx)
+{
+ u64 host_umwait_control;
+
+ if (!kvm_enable_usr_wait_pause(vmx->vcpu.kvm))
+ return;
+
+ if (rdmsrl_safe(MSR_IA32_UMWAIT_CONTROL, &host_umwait_control))
+ return;
+
+ if (vmx->msr_ia32_umwait_control != host_umwait_control)
+ add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
+ vmx->msr_ia32_umwait_control,
+ host_umwait_control, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
+}
+
static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
@@ -6435,6 +6475,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
+ atomic_switch_ia32_umwait_control(vmx);
+
vmx_update_hv_timer(vcpu);
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 63d37ccce3dc..7b779f8816fb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -194,6 +194,7 @@ struct vcpu_vmx {
#endif
u64 spec_ctrl;
+ u64 msr_ia32_umwait_control;
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38a89c878c5d..245ed4a63765 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1138,6 +1138,7 @@ static u32 msrs_to_save[] = {
MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
};
static unsigned num_msrs_to_save;
--
2.20.1
According to the latest Intel 64 and IA-32 Architectures Software Developer's
Manual, UMWAIT and TPAUSE instructions cause a VM exit if the
“RDTSC exiting” and “enable user wait and pause” VM-execution controls
are both 1.
This patch is to handle the vm-exit for UMWAIT and TPAUSE as invalid_op.
Co-developed-by: Jingqi Liu <[email protected]>
Signed-off-by: Jingqi Liu <[email protected]>
Signed-off-by: Tao Xu <[email protected]>
---
arch/x86/include/uapi/asm/vmx.h | 6 +++++-
arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d213ec5c3766..d88d7a68849b 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -85,6 +85,8 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_UMWAIT 67
+#define EXIT_REASON_TPAUSE 68
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -142,7 +144,9 @@
{ EXIT_REASON_RDSEED, "RDSEED" }, \
{ EXIT_REASON_PML_FULL, "PML_FULL" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
- { EXIT_REASON_XRSTORS, "XRSTORS" }
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_UMWAIT, "UMWAIT" }, \
+ { EXIT_REASON_TPAUSE, "TPAUSE" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 49e107692aee..0743a4ac2b61 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5337,6 +5337,20 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
}
+static int handle_umwait(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: Can't use UMWAIT instruction "
+ "when RDTSC exiting VM-execution control is enabled!\n");
+ return handle_invalid_op(vcpu);
+}
+
+static int handle_tpause(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: Can't use TPAUSE instruction "
+ "when RDTSC exiting VM-execution control is enabled!\n");
+ return handle_invalid_op(vcpu);
+}
+
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
u32 vmx_instruction_info;
@@ -5547,6 +5561,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_UMWAIT] = handle_umwait,
+ [EXIT_REASON_TPAUSE] = handle_tpause,
};
static const int kvm_vmx_max_exit_handlers =
--
2.20.1
This patch adds support for UMONITOR, UMWAIT and TPAUSE instructions
in kvm, and by default don't expose it to kvm and provide a capability
to enable it.
Co-developed-by: Jingqi Liu <[email protected]>
Signed-off-by: Jingqi Liu <[email protected]>
Signed-off-by: Tao Xu <[email protected]>
---
Documentation/virtual/kvm/api.txt | 12 ++++++++++++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/asm/vmx.h | 1 +
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx/vmx.c | 4 ++++
arch/x86/kvm/x86.c | 7 +++++++
arch/x86/kvm/x86.h | 5 +++++
include/uapi/linux/kvm.h | 1 +
8 files changed, 32 insertions(+), 1 deletion(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index ba6c42c576dd..3d0196220486 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4997,6 +4997,18 @@ it hard or impossible to use it correctly. The availability of
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed.
Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT.
+7.19 KVM_CAP_ENABLE_USR_WAIT_PAUSE
+
+Architectures: x86
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability enabled, a VM can use UMONITOR, UMWAIT and TPAUSE
+instructions. If the instruction causes a delay, the amount of
+time delayed is called here the physical delay. The physical delay is
+first computed by determining the virtual delay (the time to delay
+relative to the VM’s timestamp counter). Otherwise, UMONITOR, UMWAIT
+and TPAUSE cause an invalid-opcode exception(#UD).
+
8. Other capabilities.
----------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 450d69a1e6fa..0da87c2e1c4d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -882,6 +882,7 @@ struct kvm_arch {
bool mwait_in_guest;
bool hlt_in_guest;
bool pause_in_guest;
+ bool enable_usr_wait_pause;
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 4e4133e86484..1c94b1009288 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -82,6 +82,7 @@
#define SECONDARY_EXEC_PT_USE_GPA 0x01000000
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
+#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 80a642a0143d..1cc001870a9d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -405,7 +405,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1ac167614032..a65ee7ea47b4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2247,6 +2247,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC |
@@ -3880,6 +3881,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (kvm_pause_in_guest(vmx->vcpu.kvm))
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!kvm_enable_usr_wait_pause(vmx->vcpu.kvm) ||
+ (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_RDTSC_EXITING))
+ exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
if (!kvm_vcpu_apicv_active(vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 536b78c4af6e..38a89c878c5d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3141,6 +3141,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_x86_ops->get_nested_state ?
kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
break;
+ case KVM_CAP_ENABLE_USR_WAIT_PAUSE:
+ r = boot_cpu_has(X86_FEATURE_WAITPKG);
+ break;
default:
break;
}
@@ -4622,6 +4625,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
+ case KVM_CAP_ENABLE_USR_WAIT_PAUSE:
+ kvm->arch.enable_usr_wait_pause = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0868c5..37685e6679f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -333,6 +333,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm)
return kvm->arch.pause_in_guest;
}
+static inline bool kvm_enable_usr_wait_pause(struct kvm *kvm)
+{
+ return kvm->arch.enable_usr_wait_pause;
+}
+
DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2fe12b40d503..5a19a5984c57 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -993,6 +993,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_ARM_SVE 170
#define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
#define KVM_CAP_ARM_PTRAUTH_GENERIC 172
+#define KVM_CAP_ENABLE_USR_WAIT_PAUSE 173
#ifdef KVM_CAP_IRQ_ROUTING
--
2.20.1
On Fri, May 24, 2019 at 03:56:35PM +0800, Tao Xu wrote:
> This patch adds support for UMONITOR, UMWAIT and TPAUSE instructions
> in kvm, and by default dont't expose it to kvm and provide a capability
> to enable it.
I'm thinking this should be conditional on the guest being a 1:1 guest,
and I also seem to remember we have bits for that already -- they were
used to disable paravirt spinlocks for example.
On 27/05/2019 18:30, Peter Zijlstra wrote:
> On Fri, May 24, 2019 at 03:56:35PM +0800, Tao Xu wrote:
>> This patch adds support for UMONITOR, UMWAIT and TPAUSE instructions
>> in kvm, and by default dont't expose it to kvm and provide a capability
>> to enable it.
>
> I'm thinking this should be conditional on the guest being a 1:1 guest,
> and I also seem to remember we have bits for that already -- they were
> used to disable paravirt spinlocks for example.
>
Hi Peter,
I am wondering if "1:1 guest" means different guests in the same host
should have different settings on user wait instructions?
User wait instructions (UMONITOR, UMWAIT and TPAUSE) can be used in a guest
only when the VMCS Secondary Processor-Based VM-Execution Control bit 26
is 1, otherwise any execution of TPAUSE, UMONITOR, or UMWAIT causes a #UD.
So with a capability to enable it, we use qemu kvm_vm_ioctl_enable_cap()
to enable it. The qemu link is below:
https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg05810.html
By using different QEMU parameters, different guests in the same host
would have different features with or without user wait instructions.
About "disable paravirt spinlocks" case, I am wondering if it uses
kernel parameters? If it uses kernel parameters, different guests in the
same host may have same settings on user wait instructions.
Or, if we use kernel parameters to disable user wait instructions, then for
a host that chooses to enable user wait instructions, should we do some work
in QEMU to choose whether to disable or enable user wait instructions?
Thanks
Tao
On Tue, 28 May 2019 at 13:16, Tao Xu <[email protected]> wrote:
>
>
> On 27/05/2019 18:30, Peter Zijlstra wrote:
> > On Fri, May 24, 2019 at 03:56:35PM +0800, Tao Xu wrote:
> >> This patch adds support for UMONITOR, UMWAIT and TPAUSE instructions
> >> in kvm, and by default dont't expose it to kvm and provide a capability
> >> to enable it.
> >
> > I'm thinking this should be conditional on the guest being a 1:1 guest,
> > and I also seem to remember we have bits for that already -- they were
> > used to disable paravirt spinlocks for example.
> >
>
> Hi Peter,
>
> I am wondering if "1:1 guest" means different guests in the same host
> should have different settings on user wait instructions?
>
> User wait instructions(UMONITOR, UMWAIT and TPAUSE) can use in guest
> only when the VMCS Secondary Processor-Based VM-Execution Control bit 26
> is 1, otherwise any execution of TPAUSE, UMONITOR, or UMWAIT causes a #UD.
>
> So with a capability to enable it, we use qemu kvm_vm_ioctl_enable_cap()
> to enable it. The qemu link is blew:
> https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg05810.html
>
> By using different QEMU parameters, different guests in the same host
> would have different features with or without user wait instructions.
>
> About "disable paravirt spinlocks" case, I am wondering if it uses
Please refer to a4429e53c9 (KVM: Introduce paravirtualization hints
and KVM_HINTS_DEDICATED) and b2798ba0b87 (KVM: X86: Choose qspinlock
when dedicated physical CPUs are available).
> kernel parameters? If it uses kernel parameters, different guests in the
> same host may have same settings on user wait instructions.
>
> Or when we uses kernel parameters to disable user wait instructions, for
> a host chooses to enable user wait instructions, we should do some work
> on QEMU to choose disable or enable user wait instructions?
>
> Thanks
>
> Tao
On 28/05/2019 14:11, Wanpeng Li wrote:
> On Tue, 28 May 2019 at 13:16, Tao Xu <[email protected]> wrote:
>>
>>
>> On 27/05/2019 18:30, Peter Zijlstra wrote:
>>> On Fri, May 24, 2019 at 03:56:35PM +0800, Tao Xu wrote:
>>>> This patch adds support for UMONITOR, UMWAIT and TPAUSE instructions
>>>> in kvm, and by default dont't expose it to kvm and provide a capability
>>>> to enable it.
>>>
>>> I'm thinking this should be conditional on the guest being a 1:1 guest,
>>> and I also seem to remember we have bits for that already -- they were
>>> used to disable paravirt spinlocks for example.
>>>
>>
>> Hi Peter,
>>
>> I am wondering if "1:1 guest" means different guests in the same host
>> should have different settings on user wait instructions?
>>
>> User wait instructions(UMONITOR, UMWAIT and TPAUSE) can use in guest
>> only when the VMCS Secondary Processor-Based VM-Execution Control bit 26
>> is 1, otherwise any execution of TPAUSE, UMONITOR, or UMWAIT causes a #UD.
>>
>> So with a capability to enable it, we use qemu kvm_vm_ioctl_enable_cap()
>> to enable it. The qemu link is blew:
>> https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg05810.html
>>
>> By using different QEMU parameters, different guests in the same host
>> would have different features with or without user wait instructions.
>>
>> About "disable paravirt spinlocks" case, I am wondering if it uses
>
> Please refer to a4429e53c9 (KVM: Introduce paravirtualization hints
> and KVM_HINTS_DEDICATED) and b2798ba0b87 (KVM: X86: Choose qspinlock
> when dedicated physical CPUs are available)
Hi Wanpeng,
Thank you! This information really helped me. After I read the code in
KVM/QEMU, I was wondering that with qemu command-line "-cpu
host,+kvm-hint-dedicated", then in KVM,
"kvm_hint_has_feature(KVM_HINTS_DEDICATED)" will be true, am I right?
Tao
On 24/05/19 09:56, Tao Xu wrote:
> +7.19 KVM_CAP_ENABLE_USR_WAIT_PAUSE
> +
> +Architectures: x86
> +Parameters: args[0] whether feature should be enabled or not
> +
> +With this capability enabled, a VM can use UMONITOR, UMWAIT and TPAUSE
> +instructions. If the instruction causes a delay, the amount of
> +time delayed is called here the physical delay. The physical delay is
> +first computed by determining the virtual delay (the time to delay
> +relative to the VM’s timestamp counter). Otherwise, UMONITOR, UMWAIT
> +and TPAUSE cause an invalid-opcode exception(#UD).
> +
There is no need to make it a capability. You can just check the guest
CPUID and see if it includes X86_FEATURE_WAITPKG.
Paolo
On 27/05/19 12:30, Peter Zijlstra wrote:
>> This patch adds support for UMONITOR, UMWAIT and TPAUSE instructions
>> in kvm, and by default dont't expose it to kvm and provide a capability
>> to enable it.
>
> I'm thinking this should be conditional on the guest being a 1:1 guest,
> and I also seem to remember we have bits for that already -- they were
> used to disable paravirt spinlocks for example.
This should be userspace's choice. It would indeed be silly to enable
this while overcommitted, but KVM doesn't really care.
Paolo
On 28/05/19 09:19, Tao Xu wrote:
>
> Thank you! This information really helped me. After I read the code in
> KVM/QEMU, I was wondering that with qemu command-line "-cpu
> host,+kvm-hint-dedicated", then in KVM,
> "kvm_hint_has_feature(KVM_HINTS_DEDICATED)" will be true, am I right?
Yes, but it doesn't matter for this patch series.
Paolo
On 24/05/19 09:56, Tao Xu wrote:
> As the latest Intel 64 and IA-32 Architectures Software Developer's
> Manual, UMWAIT and TPAUSE instructions cause a VM exit if the
> “RDTSC exiting” and “enable user wait and pause” VM-execution controls
> are both 1.
>
> This patch is to handle the vm-exit for UMWAIT and TPAUSE as invalid_op.
KVM never enables RDTSC exiting, so this is not necessary.
Paolo
On 24/05/19 09:56, Tao Xu wrote:
> +
> + if (rdmsrl_safe(MSR_IA32_UMWAIT_CONTROL, &host_umwait_control))
> + return;
> +
Does the host value ever change? If not, this can perhaps be read once
when kvm_intel is loaded. And if it changes often, it should be
shadowed into a percpu variable.
Paolo
On 29/05/2019 09:29, Paolo Bonzini wrote:
> On 24/05/19 09:56, Tao Xu wrote:
>> +
>> + if (rdmsrl_safe(MSR_IA32_UMWAIT_CONTROL, &host_umwait_control))
>> + return;
>> +
>
> Does the host value ever change? If not, this can perhaps be read once
> when kvm_intel is loaded. And if it changes often, it should be
> shadowed into a percpu variable.
>
> Paolo
>
Yes, the host value may change. We have contacted the host patch author,
Fenghua, to add a shadow in the host that is updated when the host MSR value
changes. We will improve this in the next version of the patch.
On 29/05/2019 09:24, Paolo Bonzini wrote:
> On 24/05/19 09:56, Tao Xu wrote:
>> +7.19 KVM_CAP_ENABLE_USR_WAIT_PAUSE
>> +
>> +Architectures: x86
>> +Parameters: args[0] whether feature should be enabled or not
>> +
>> +With this capability enabled, a VM can use UMONITOR, UMWAIT and TPAUSE
>> +instructions. If the instruction causes a delay, the amount of
>> +time delayed is called here the physical delay. The physical delay is
>> +first computed by determining the virtual delay (the time to delay
>> +relative to the VM’s timestamp counter). Otherwise, UMONITOR, UMWAIT
>> +and TPAUSE cause an invalid-opcode exception(#UD).
>> +
>
> There is no need to make it a capability. You can just check the guest
> CPUID and see if it includes X86_FEATURE_WAITPKG.
>
> Paolo
>
Thank you Paolo, but I have another question. I was wondering if it is
appropriate to enable X86_FEATURE_WAITPKG when QEMU uses "-overcommit
cpu-pm=on"? Or just enable X86_FEATURE_WAITPKG when QEMU adds the feature
"-cpu host,+waitpkg"? The user wait instructions are wait or pause
instructions that may be executed at any privilege level, but
IA32_UMWAIT_CONTROL can be used to set the maximum wait time.
On 29/05/2019 09:28, Paolo Bonzini wrote:
> On 24/05/19 09:56, Tao Xu wrote:
>> As the latest Intel 64 and IA-32 Architectures Software Developer's
>> Manual, UMWAIT and TPAUSE instructions cause a VM exit if the
>> “RDTSC exiting” and “enable user wait and pause” VM-execution controls
>> are both 1.
>>
>> This patch is to handle the vm-exit for UMWAIT and TPAUSE as invalid_op.
>
> KVM never enables RDTSC exiting, so this is not necessary.
>
> Paolo
>
OK, but should we just drop this patch?
Or add the VMX_EXIT_REASONS bits of UMWAIT and TPAUSE and handle like
XSAVES/XRSTORS:
"kvm_skip_emulated_instruction(vcpu);"
"WARN(1, "this should never happen\n");"
Looking forward to your reply.
Tao
On 29/05/19 04:05, Tao Xu wrote:
>>
>
> Thank you Paolo, but I have another question. I was wondering if it is
> appropriate to enable X86_FEATURE_WAITPKG when QEMU uses "-overcommit
> cpu-pm=on"?
"-overcommit" only establishes the behavior of KVM, it doesn't change
the cpuid bits. So you'd need "-cpu" as well.
Paolo
> Or just enable X86_FEATURE_WAITPKG when QEMU add the feature
> "-cpu host,+waitpkg"? User wait instructions is the wait or pause
> instructions may be executed at any privilege level, but can use
> IA32_UMWAIT_CONTROL to set the maximum time.
On 29/05/19 04:25, Tao Xu wrote:
>>
> OK, but should we just drop this patch?
> Or add the VMX_EXIT_REASONS bits of UMWAIT and TPAUSE and handle like
> XSAVES/XRSTORS:
> "kvm_skip_emulated_instruction(vcpu);"
> "WARN(1, "this should never happen\n");"
Yes, this sounds good to me.
Paolo
On 5/29/2019 10:38 AM, Paolo Bonzini wrote:
> On 29/05/19 04:05, Tao Xu wrote:
>>>
>>
>> Thank you Paolo, but I have another question. I was wondering if it is
>> appropriate to enable X86_FEATURE_WAITPKG when QEMU uses "-overcommit
>> cpu-pm=on"?
>
> "-overcommit" only establishes the behavior of KVM, it doesn't change
> the cpuid bits. So you'd need "-cpu" as well.
>
> Paolo
>
OK I got it. Thank you for your review.