2023-12-05 10:39:17

by Maxim Levitsky

Subject: [PATCH v4 1/4] KVM: x86: refactor req_immediate_exit logic

- Move the req_immediate_exit variable from arch-specific to common code.
- Remove the arch-specific callback .request_immediate_exit and move its code
  down into each arch's vcpu_run().

No functional change is intended.

Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 -
arch/x86/include/asm/kvm_host.h | 5 ++---
arch/x86/kvm/svm/svm.c | 7 ++++---
arch/x86/kvm/vmx/vmx.c | 18 ++++++-----------
arch/x86/kvm/vmx/vmx.h | 2 --
arch/x86/kvm/x86.c | 31 +++++++++++++-----------------
6 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 26b628d84594b93..3aeb7c669a0b09b 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -101,7 +101,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d7036982332e33d..044b4f9265c5427 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1021,6 +1021,8 @@ struct kvm_vcpu_arch {
*/
bool pdptrs_from_userspace;

+ bool req_immediate_exit;
+
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
#endif
@@ -1700,8 +1702,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);

- void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
-
void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);

/*
@@ -2187,7 +2187,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);

int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);

void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1855a6d7c976ad2..d2c6ff9036009dd 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4137,9 +4137,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is enough to force an immediate vmexit.
*/
disable_nmi_singlestep(svm);
- smp_send_reschedule(vcpu->cpu);
+ vcpu->arch.req_immediate_exit = true;
}

+ if (vcpu->arch.req_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
pre_svm_run(vcpu);

sync_lapic_to_cr8(vcpu);
@@ -4995,8 +4998,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,

- .request_immediate_exit = __kvm_request_immediate_exit,
-
.sched_in = svm_sched_in,

.nested_ops = &svm_nested_ops,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index be20a60047b1f29..b8fa16f9e621878 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -67,6 +67,8 @@
#include "x86.h"
#include "smm.h"

+#include <trace/events/ipi.h>
+
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

@@ -1288,8 +1290,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;

- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -5996,7 +5996,7 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);

- if (!vmx->req_immediate_exit &&
+ if (!vcpu->arch.req_immediate_exit &&
!unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
kvm_lapic_expired_hv_timer(vcpu);
return EXIT_FASTPATH_REENTER_GUEST;
@@ -7154,7 +7154,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
u64 tscl;
u32 delta_tsc;

- if (vmx->req_immediate_exit) {
+ if (vcpu->arch.req_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -7357,6 +7357,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)

if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
+ else if (vcpu->arch.req_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);

kvm_wait_lapic_expire(vcpu);

@@ -7899,11 +7901,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}

-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
-}
-
static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info)
{
@@ -8312,8 +8309,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,

- .request_immediate_exit = vmx_request_immediate_exit,
-
.sched_in = vmx_sched_in,

.cpu_dirty_log_size = PML_ENTITY_NUM,
@@ -8571,7 +8566,6 @@ static __init int hardware_setup(void)
if (!enable_preemption_timer) {
vmx_x86_ops.set_hv_timer = NULL;
vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}

kvm_caps.supported_mce_cap |= MCG_LMCE_P;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index c2130d2c8e24bb5..4dabd16a3d7180e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -330,8 +330,6 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;

- bool req_immediate_exit;
-
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2c924075f6f112a..2089a0b08ce08c0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10179,8 +10179,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
* ordering between that side effect, the instruction completing, _and_ the
* delivery of the asynchronous event.
*/
-static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
- bool *req_immediate_exit)
+static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu)
{
bool can_inject;
int r;
@@ -10357,8 +10356,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,

if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
- *req_immediate_exit = true;
+ kvm_x86_ops.nested_ops->has_events(vcpu)) {
+ vcpu->arch.req_immediate_exit = true;
+ }

/*
* KVM must never queue a new exception while injecting an event; KVM
@@ -10375,10 +10375,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
WARN_ON_ONCE(vcpu->arch.exception.pending ||
vcpu->arch.exception_vmexit.pending);
return 0;
-
out:
if (r == -EBUSY) {
- *req_immediate_exit = true;
+ vcpu->arch.req_immediate_exit = true;
r = 0;
}
return r;
@@ -10605,12 +10604,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}

-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10625,7 +10618,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;

- bool req_immediate_exit = false;

if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
@@ -10787,7 +10779,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}

- r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
+ r = kvm_check_and_inject_events(vcpu);
if (r < 0) {
r = 0;
goto out;
@@ -10856,10 +10848,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}

- if (req_immediate_exit) {
+
+ if (vcpu->arch.req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_request_immediate_exit)(vcpu);
- }

fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10891,6 +10882,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));

exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;

@@ -10906,6 +10898,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.exits;
}

+ vcpu->arch.req_immediate_exit = false;
/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
@@ -10993,8 +10986,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
return r;

cancel_injection:
- if (req_immediate_exit)
+ if (vcpu->arch.req_immediate_exit) {
+ vcpu->arch.req_immediate_exit = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
static_call(kvm_x86_cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
--
2.26.3


2023-12-05 17:24:04

by Sean Christopherson

Subject: Re: [PATCH v4 1/4] KVM: x86: refactor req_immediate_exit logic

On Tue, Dec 05, 2023, Maxim Levitsky wrote:
> - Move the req_immediate_exit variable from arch-specific to common code.
> - Remove the arch-specific callback .request_immediate_exit and move its code
>   down into each arch's vcpu_run().
>
> No functional change is intended.

Ooh, but there is a very subtle change here that could impact functionality. This
moves the sending of the self-IPI much closer to VM-Enter. In practice that is
extremely unlikely to matter, but in theory it ever so slightly increases the
danger of the IRQ not being pended before VM-Enter. I am not saying we can't do
the move, but it should be done in a fairly isolated patch.
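
To make that concrete, here's roughly where the self-IPI lands before and after
this patch, condensed from the diffs above (illustrative sketch, not drop-in code):

        /* Old: vcpu_enter_guest() fires the self-IPI well before VM-Enter. */
        if (req_immediate_exit) {
                kvm_make_request(KVM_REQ_EVENT, vcpu);
                static_call(kvm_x86_request_immediate_exit)(vcpu); /* smp_send_reschedule() */
        }
        ...
        exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);

        /* New: svm_vcpu_run()/vmx_vcpu_run() fire it right before entering the guest. */
        if (vcpu->arch.req_immediate_exit)
                smp_send_reschedule(vcpu->cpu);
        ... /* VM-Enter happens shortly after this point */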

And I am strongly opposed to moving req_immediate_exit into kvm_vcpu_arch. The
flag in vcpu_vmx is bad enough, but at least that one is decently well guarded
in the sense that it's unconditionally cleared in vmx_prepare_switch_to_guest().
I would much rather go in the opposite direction and turn req_immediate_exit into
a local variable that is passed down the stack as needed.
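
Roughly the shape I have in mind, i.e. keep the flag on the stack and pass it
down explicitly (untested sketch; the vcpu_run() plumbing matches the diff at
the bottom of this mail):

        static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        {
                bool req_immediate_exit = false;
                ...
                r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
                ...
                if (req_immediate_exit)
                        kvm_make_request(KVM_REQ_EVENT, vcpu);
                ...
                exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
                ...
        }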

Somewhat of a side topic, I think we should name the variable that's passed down
the stack "force_immediate_exit". I suspect the "request" verbiage came from the
fact that the immediate exit is triggered in order to service a request. But as
far as the the vendor's .vcpu_run() is concerned, an immediate exit isn't being
requested, it's being forced.

The VMX preemption timer code in particular can benefit from related cleanups, as
the fastpath vs. slowpath code is super confusing (the slowpath _looks_ redundant,
but it's subtly necessary to handle exits from L2). With ~5 patches of cleanup,
the VMX code can be made to look like this, i.e. it can eliminate the vcpu_vmx flag
and also handle forced exits from L2 in the fastpath.

static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
                                                   bool force_immediate_exit)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * In the *extremely* unlikely scenario that this is a spurious VM-Exit
         * due to the timer expiring while it was "soft" disabled, just eat the
         * exit and re-enter the guest.
         */
        if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
                return EXIT_FASTPATH_REENTER_GUEST;

        /*
         * If the timer expired because KVM used it to force an immediate exit,
         * then mission accomplished.
         */
        if (force_immediate_exit)
                return EXIT_FASTPATH_EXIT_HANDLED;

        /*
         * If L2 is active, go down the slow path as emulating the guest timer
         * expiration likely requires synthesizing a nested VM-Exit.
         */
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;

        kvm_lapic_expired_hv_timer(vcpu);
        return EXIT_FASTPATH_REENTER_GUEST;
}

static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
        /*
         * This non-fastpath handler is reached if and only if the preemption
         * timer was being used to emulate a guest timer while L2 is active.
         * All other scenarios are supposed to be handled in the fastpath.
         */
        WARN_ON_ONCE(!is_guest_mode(vcpu));
        kvm_lapic_expired_hv_timer(vcpu);
        return 1;
}
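
For context, the fastpath helper above would be reached from the VM-Exit fastpath
dispatch with the flag plumbed through, something along these lines (assumed
wiring, not code from this series):

        static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
                                                     bool force_immediate_exit)
        {
                switch (to_vmx(vcpu)->exit_reason.basic) {
                case EXIT_REASON_MSR_WRITE:
                        return handle_fastpath_set_msr_irqoff(vcpu);
                case EXIT_REASON_PREEMPTION_TIMER:
                        return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
                default:
                        return EXIT_FASTPATH_NONE;
                }
        }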

For this series, the minimal change is to plumb the flag into trace_kvm_entry()
in a standalone patch. The below is compile tested only.

Holler if you want me to post an RFC for the full series I have in mind, e.g. if
it's hard to see how the below leads to a sane end result. I have the code written
and compile tested, but I'm going to be buried in non-upstream stuff for a few
days and probably won't get around to fully testing and writing changelogs until
later this week.

---
arch/x86/include/asm/kvm_host.h | 3 ++-
arch/x86/kvm/svm/svm.c | 5 +++--
arch/x86/kvm/trace.h | 9 ++++++---
arch/x86/kvm/vmx/vmx.c | 4 ++--
arch/x86/kvm/x86.c | 2 +-
5 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8c7e2475a18..aee423e6c6f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1652,7 +1652,8 @@ struct kvm_x86_ops {
void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);

int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c46f07b28230..33f3db2058fd 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4112,12 +4112,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}

-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);

- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);

svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 83843379813e..88659de4d2a7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -15,20 +15,23 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(struct kvm_vcpu *vcpu),
- TP_ARGS(vcpu),
+ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
+ TP_ARGS(vcpu, force_immediate_exit),

TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
+ __field( bool, immediate_exit )
),

TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
+ __entry->immediate_exit = force_immediate_exit;
),

- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
+ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+ __entry->immediate_exit ? "[immediate exit]" : "")
);

/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d30df9b3fe3e..dc0d986e35c3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7270,7 +7270,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_exit_irqoff();
}

-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7297,7 +7297,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}

- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);

if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1983947b8965..f1d9828abeba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10916,7 +10916,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));

- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;


base-commit: 1ab097653e4dd8d23272d028a61352c23486fd4a
--