QEMU uses SIGUSR1 to force an exit to userspace and also to queue an
early exit before calling KVM_RUN -- the signal is blocked in userspace
and temporarily unblocked during KVM_RUN.
The temporary unblocking by sigprocmask() in kvm_arch_vcpu_ioctl_run()
takes a shared siglock, which leads to cacheline bouncing on NUMA
systems.
This series achieves the same with a new request bit and a VM ioctl that
marks and kicks the target VCPU, so the signal no longer has to be
unblocked.
The inl_from_{pmtimer,qemu} vmexit benchmarks from kvm-unit-tests show a
~5% speedup for 1-4 VCPUs (300-2000 saved cycles) without noticeably
regressing kernel VM exits.
(Paolo did a quick run of an older version of this series on a NUMA
system and the speedup was around 35% when utilizing more nodes.)
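For context, the signal-based mechanism being replaced looks roughly
like this on the userspace side (an illustrative sketch, not actual QEMU
code; names are made up and error handling is omitted):

#include <signal.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void sig_handler(int sig) { (void)sig; }  /* only interrupts KVM_RUN */

static void vcpu_thread_setup(int vcpu_fd)
{
        sigset_t blocked, run_mask;
        struct kvm_signal_mask *kmask;

        signal(SIGUSR1, sig_handler);

        /* Keep SIGUSR1 blocked in userspace so it stays pending when sent. */
        sigemptyset(&blocked);
        sigaddset(&blocked, SIGUSR1);
        pthread_sigmask(SIG_BLOCK, &blocked, &run_mask);

        /*
         * Ask KVM to run the guest with a mask that does not block SIGUSR1;
         * the temporary sigprocmask() in kvm_arch_vcpu_ioctl_run() that
         * applies it is what takes the shared siglock on every KVM_RUN.
         */
        kmask = malloc(sizeof(*kmask) + sizeof(run_mask));
        kmask->len = 8;                      /* kernel sigset size on x86-64 */
        memcpy(kmask->sigset, &run_mask, 8);
        ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, kmask);
        free(kmask);
}

/* An I/O thread then kicks the VCPU out of guest mode with: */
static void kick_vcpu(pthread_t vcpu_thread)
{
        pthread_kill(vcpu_thread, SIGUSR1);
}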
Radim Krčmář (5):
KVM: add kvm_has_request wrapper
KVM: add KVM_REQ_EXIT request for userspace exit
KVM: add KVM_USER_EXIT vm ioctl for userspace exit
KVM: optimize common cases in KVM_USER_EXIT
KVM: x86: add request_exits debug counter
Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/vmx.c | 4 ++--
arch/x86/kvm/x86.c | 9 +++++++++
include/linux/kvm_host.h | 15 ++++++++++++--
include/uapi/linux/kvm.h | 9 +++++++++
virt/kvm/kvm_main.c | 41 ++++++++++++++++++++++++++++++++++++++-
7 files changed, 104 insertions(+), 5 deletions(-)
--
2.5.0
We want to have requests abstracted from bit operations.
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/vmx.c | 2 +-
include/linux/kvm_host.h | 7 ++++++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 217f66343dc8..17514fe7d2cb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5879,7 +5879,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
- if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+ if (kvm_has_request(KVM_REQ_EVENT, vcpu))
return 1;
err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 27ccdf91a465..52e388367a26 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1089,9 +1089,14 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
set_bit(req, &vcpu->requests);
}
+static inline bool kvm_has_request(int req, struct kvm_vcpu *vcpu)
+{
+ return test_bit(req, &vcpu->requests);
+}
+
static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{
- if (test_bit(req, &vcpu->requests)) {
+ if (kvm_has_request(req, vcpu)) {
clear_bit(req, &vcpu->requests);
return true;
} else {
--
2.5.0
When userspace wants KVM to exit to userspace, it sends a signal.
This has the disadvantage of requiring a change to the signal mask,
because the signal needs to be blocked in userspace to stay pending when
it is sent to self.
Using a request flag allows us to shave 200-300 cycles from every
userspace exit, and the speedup grows on NUMA systems because unblocking
touches a shared spinlock.
The disadvantage is that it adds the overhead of one bit check to all
kernel exits.  Quick tracing shows that the ratio of userspace exits
during boot is about 1/5, and a subsequent run of nmap and a kernel
compile shows about 1/60, so the check should not regress overall
performance.
All signal_pending() calls are userspace exit requests, so we add a
check for KVM_REQ_EXIT there.  There is one omitted call, in
kvm_vcpu_run, because KVM_REQ_EXIT is implied by the earlier check for
requests.
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/vmx.c | 2 +-
arch/x86/kvm/x86.c | 6 ++++++
include/linux/kvm_host.h | 8 +++++++-
include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 2 +-
5 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 17514fe7d2cb..496f6ba1c812 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5903,7 +5903,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (signal_pending(current))
+ if (kvm_need_exit(vcpu))
goto out;
if (need_resched())
schedule();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec7bf2b16948..c5d790fdfc2e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6547,6 +6547,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
++vcpu->stat.signal_exits;
break;
}
+ if (unlikely(kvm_has_request(KVM_REQ_EXIT, vcpu))) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_REQUEST;
+ break;
+ }
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
cond_resched();
@@ -6683,6 +6688,7 @@ out:
post_kvm_run_save(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ clear_bit(KVM_REQ_EXIT, &vcpu->requests);
return r;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 52e388367a26..dcc57171e3ec 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -121,7 +121,7 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7
#define KVM_REQ_CLOCK_UPDATE 8
-#define KVM_REQ_KICK 9
+#define KVM_REQ_EXIT 9
#define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_REQ_EVENT 11
#define KVM_REQ_APF_HALT 12
@@ -1104,6 +1104,12 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
}
}
+static inline bool kvm_need_exit(struct kvm_vcpu *vcpu)
+{
+ return signal_pending(current) ||
+ kvm_has_request(KVM_REQ_EXIT, vcpu);
+}
+
extern bool kvm_rebooting;
struct kvm_device {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 26daafbba9ec..d996a7cdb4d2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -184,6 +184,7 @@ struct kvm_s390_skeys {
#define KVM_EXIT_SYSTEM_EVENT 24
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
+#define KVM_EXIT_REQUEST 27
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 093b3d10b411..b34a328fdac1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1914,7 +1914,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
}
if (kvm_cpu_has_pending_timer(vcpu))
return -EINTR;
- if (signal_pending(current))
+ if (kvm_need_exit(vcpu))
return -EINTR;
return 0;
--
2.5.0
Userspace can use KVM_USER_EXIT instead of signal-based exiting to
userspace.  Availability depends on KVM_CAP_USER_EXIT.
Only x86 is implemented so far.
It would be cleaner to use 'unsigned long' to store the vcpu_id, but I
really don't like its variable size, and 'u64' will be the same size or
bigger for a few more years.
Signed-off-by: Radim Krčmář <[email protected]>
---
Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 1 +
include/uapi/linux/kvm.h | 8 ++++++++
virt/kvm/kvm_main.c | 35 +++++++++++++++++++++++++++++++++++
4 files changed, 74 insertions(+)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3c714d43a717..5cf25a15fc6f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3020,6 +3020,36 @@ Returns: 0 on success, -1 on error
Queues an SMI on the thread's vcpu.
+
+4.97 KVM_USER_EXIT
+
+Capability: KVM_CAP_USER_EXIT
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_user_exit (in)
+Returns: 0 on success,
+ -EFAULT if the parameter couldn't be read,
+ -EINVAL if 'reserved' is not zeroed,
+ -ENOENT if VCPU with 'vcpu_id' is not present
+
+struct kvm_user_exit {
+ __u64 vcpu_id;
+ __u32 reserved[14];
+};
+
+Make the VCPU specified by 'vcpu_id' exit to userspace as soon as possible.
+If the VCPU is not running in the kernel at the time, it will exit early on
+the next call to KVM_RUN.  If the VCPU was going to exit for other reasons
+when KVM_USER_EXIT was issued, it will keep the original exit reason and not
+exit early on the next KVM_RUN.  If the VCPU exited because of
+KVM_USER_EXIT, the exit reason is KVM_EXIT_REQUEST.
+
+This ioctl has a very similar effect (the same, except for some races on
+userspace exit) to sending a signal (one that is blocked in userspace and
+set in KVM_SET_SIGNAL_MASK) to the VCPU thread.
+
+
+
5. The kvm_run structure
------------------------
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5d790fdfc2e..61f35944dd53 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2465,6 +2465,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
#endif
+ case KVM_CAP_USER_EXIT:
r = 1;
break;
case KVM_CAP_X86_SMM:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d996a7cdb4d2..79316489346c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -826,6 +826,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_X86_SMM 117
#define KVM_CAP_MULTI_ADDRESS_SPACE 118
#define KVM_CAP_SPLIT_IRQCHIP 119
+#define KVM_CAP_USER_EXIT 120
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1008,6 +1009,11 @@ struct kvm_device_attr {
__u64 addr; /* userspace address of attr data */
};
+struct kvm_user_exit {
+ __u64 vcpu_id; /* the 'unsigned long' used in KVM_CREATE_VCPU */
+ __u32 reserved[14];
+};
+
#define KVM_DEV_VFIO_GROUP 1
#define KVM_DEV_VFIO_GROUP_ADD 1
#define KVM_DEV_VFIO_GROUP_DEL 2
@@ -1119,6 +1125,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr)
/* Available with KVM_CAP_PPC_RTAS */
#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_USER_EXIT */
+#define KVM_USER_EXIT _IOW(KVMIO, 0xad, struct kvm_user_exit)
/* ioctl for vm fd */
#define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b34a328fdac1..024428b64812 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2644,6 +2644,32 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return kvm_vm_ioctl_check_extension(kvm, arg);
}
+int kvm_vm_ioctl_user_exit(struct kvm *kvm, struct kvm_user_exit *info)
+{
+ /* Casting vcpu_id to int is intended and matches the behavior of
+ * KVM_CREATE_VCPU, where we cast from unsigned long.
+ */
+ int vcpu_id = info->vcpu_id;
+ int idx;
+ struct kvm_vcpu *vcpu;
+ const struct kvm_user_exit valid = {.vcpu_id = info->vcpu_id};
+
+ BUILD_BUG_ON(sizeof(struct kvm_user_exit) != 64);
+
+ if (memcmp(info, &valid, sizeof(valid)))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(idx, vcpu, kvm)
+ if (vcpu->vcpu_id == vcpu_id) {
+ kvm_make_request(KVM_REQ_EXIT, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2779,6 +2805,15 @@ out_free_irq_routing:
vfree(entries);
break;
}
+ case KVM_USER_EXIT: {
+ struct kvm_user_exit info;
+
+ r = -EFAULT;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ goto out;
+ r = kvm_vm_ioctl_user_exit(kvm, &info);
+ break;
+ }
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
--
2.5.0
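From the VMM side, the intended use of the new ioctl is minimal; a
hypothetical sketch (not part of the series, error handling omitted):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>   /* provides KVM_USER_EXIT with this series applied */

/*
 * Called from an I/O thread.  vm_fd is the VM file descriptor and vcpu_id
 * is the id that was passed to KVM_CREATE_VCPU.  Callers should first check
 * KVM_CHECK_EXTENSION(KVM_CAP_USER_EXIT).
 */
static int request_user_exit(int vm_fd, unsigned long vcpu_id)
{
        struct kvm_user_exit info;

        memset(&info, 0, sizeof(info));     /* 'reserved' must be zeroed */
        info.vcpu_id = vcpu_id;

        return ioctl(vm_fd, KVM_USER_EXIT, &info);
}

/*
 * The VCPU thread then sees run->exit_reason == KVM_EXIT_REQUEST when
 * KVM_RUN returns, instead of -EINTR with a pending signal.
 */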
A VCPU with a given vcpu_id has the highest probability of being stored
in kvm->vcpus[vcpu_id].  The other common case, sparse sequential
vcpu_ids, is more likely to find a match downwards from vcpu_id.
A random distribution does not favor any order, so we first search slots
[vcpu_id..0] and then slots (vcpu_id..kvm->online_vcpus).
If we valued cycles over memory, a direct map from vcpu_id to the vcpu
would be better.
(Like kvm_for_each_vcpu, the code avoids taking kvm->lock by presuming
that kvm->online_vcpus doesn't shrink and that the vcpu pointer is set
up before online_vcpus is incremented.  kvm_free_vcpus() breaks that
presumption, but the VM is destroyed only after its fd has been
released.)
Signed-off-by: Radim Krčmář <[email protected]>
---
virt/kvm/kvm_main.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 024428b64812..7d532591d5af 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2650,7 +2650,7 @@ int kvm_vm_ioctl_user_exit(struct kvm *kvm, struct kvm_user_exit *info)
* KVM_CREATE_VCPU, where we cast from unsigned long.
*/
int vcpu_id = info->vcpu_id;
- int idx;
+ int idx, first;
struct kvm_vcpu *vcpu;
const struct kvm_user_exit valid = {.vcpu_id = info->vcpu_id};
@@ -2659,7 +2659,11 @@ int kvm_vm_ioctl_user_exit(struct kvm *kvm, struct kvm_user_exit *info)
if (memcmp(info, &valid, sizeof(valid)))
return -EINVAL;
- kvm_for_each_vcpu(idx, vcpu, kvm)
+ for (idx = first = min(vcpu_id, atomic_read(&kvm->online_vcpus) - 1);
+ idx >= 0 ? (vcpu = kvm_get_vcpu(kvm, idx)) != NULL
+ : ++first < atomic_read(&kvm->online_vcpus) &&
+ (vcpu = kvm_get_vcpu(kvm, first)) != NULL;
+ idx--)
if (vcpu->vcpu_id == vcpu_id) {
kvm_make_request(KVM_REQ_EXIT, vcpu);
kvm_vcpu_kick(vcpu);
--
2.5.0
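The combined loop condition in the patch above is fairly dense; a rough
two-loop equivalent, for illustration only (not part of the series, and
it omits the original's early stop on a NULL slot):

/* Relies on the usual definitions from linux/kvm_host.h. */
static struct kvm_vcpu *find_vcpu_by_id(struct kvm *kvm, int vcpu_id)
{
        struct kvm_vcpu *vcpu;
        int start, idx;

        if (vcpu_id < 0)
                return NULL;

        start = min(vcpu_id, atomic_read(&kvm->online_vcpus) - 1);

        /* Most likely hit is kvm->vcpus[vcpu_id]; then walk downwards. */
        for (idx = start; idx >= 0; idx--) {
                vcpu = kvm_get_vcpu(kvm, idx);
                if (vcpu && vcpu->vcpu_id == vcpu_id)
                        return vcpu;
        }

        /* Then the remaining slots above the starting point. */
        for (idx = start + 1; idx < atomic_read(&kvm->online_vcpus); idx++) {
                vcpu = kvm_get_vcpu(kvm, idx);
                if (vcpu && vcpu->vcpu_id == vcpu_id)
                        return vcpu;
        }

        return NULL;
}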
We are still interested in the number of exits that userspace requested,
and signal_exits doesn't cover that anymore.
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e851d518b24..c875f70df02a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -717,6 +717,7 @@ struct kvm_vcpu_stat {
u32 hypercalls;
u32 irq_injections;
u32 nmi_injections;
+ u32 request_exits;
};
struct x86_instruction_info;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 61f35944dd53..8f72be681e36 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "io_exits", VCPU_STAT(io_exits) },
{ "mmio_exits", VCPU_STAT(mmio_exits) },
{ "signal_exits", VCPU_STAT(signal_exits) },
+ { "request_exits", VCPU_STAT(request_exits) },
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
@@ -6551,6 +6552,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(kvm_has_request(KVM_REQ_EXIT, vcpu))) {
r = 0;
vcpu->run->exit_reason = KVM_EXIT_REQUEST;
+ ++vcpu->stat.request_exits;
break;
}
if (need_resched()) {
--
2.5.0
On 05/08/2015 15:21, Radim Krčmář wrote:
> + kvm_for_each_vcpu(idx, vcpu, kvm)
> + if (vcpu->vcpu_id == vcpu_id) {
> + kvm_make_request(KVM_REQ_EXIT, vcpu);
> + kvm_vcpu_kick(vcpu);
> +
> + return 0;
> + }
> +
Why not a vcpu ioctl? kvm_for_each_vcpu can become pretty expensive if
you have a few dozen VCPUs.
Paolo
2015-08-05 15:29+0200, Paolo Bonzini:
> On 05/08/2015 15:21, Radim Krčmář wrote:
>> + kvm_for_each_vcpu(idx, vcpu, kvm)
>> + if (vcpu->vcpu_id == vcpu_id) {
>> + kvm_make_request(KVM_REQ_EXIT, vcpu);
>> + kvm_vcpu_kick(vcpu);
>> +
>> + return 0;
>> + }
>> +
>
> Why not a vcpu ioctl? kvm_for_each_vcpu can become pretty expensive if
> you have a few dozen VCPUs.
Yeah, it will be slow, the only bright side is low frequency of calls.
vcpu ioctl should only be issued by the vcpu thread so it would
significantly limit use.
On 05/08/2015 15:34, Radim Krčmář wrote:
> 2015-08-05 15:29+0200, Paolo Bonzini:
>> On 05/08/2015 15:21, Radim Krčmář wrote:
>>> + kvm_for_each_vcpu(idx, vcpu, kvm)
>>> + if (vcpu->vcpu_id == vcpu_id) {
>>> + kvm_make_request(KVM_REQ_EXIT, vcpu);
>>> + kvm_vcpu_kick(vcpu);
>>> +
>>> + return 0;
>>> + }
>>> +
>>
>> Why not a vcpu ioctl? kvm_for_each_vcpu can become pretty expensive if
>> you have a few dozen VCPUs.
>
> Yeah, it will be slow, the only bright side is low frequency of calls.
>
> vcpu ioctl should only be issued by the vcpu thread so it would
> significantly limit use.
That's a general limitation, but you can lift it for particular ioctls.
See in particular this:
#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
* so vcpu_load() would break it.
*/
if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif
You can add an "if (ioctl == KVM_USER_EXIT)" before.
Paolo
2015-08-05 15:38+0200, Paolo Bonzini:
> On 05/08/2015 15:34, Radim Krčmář wrote:
>> vcpu ioctl should only be issued by the vcpu thread so it would
>> significantly limit use.
>
> That's a general limitation, but you can lift it for particular ioctls.
>
> See in particular this:
>
> #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
> /*
> * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
> * so vcpu_load() would break it.
> */
> if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
> return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
> #endif
>
> You can add an "if (ioctl == KVM_USER_EXIT)" before.
Thanks, it looks to be safe, I'll put it in v2.
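For reference, the special case might look roughly like this (a
hypothetical sketch, not the actual v2 change; kvm_vcpu_ioctl_user_exit()
is an assumed vcpu-scoped variant of the VM ioctl handler from this
series):

	/*
	 * In kvm_vcpu_ioctl(), before vcpu_load() and next to the existing
	 * asynchronous special cases quoted above.
	 */
	if (ioctl == KVM_USER_EXIT)
		return kvm_vcpu_ioctl_user_exit(vcpu, (void __user *)arg);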