INFO: task gnome-terminal-:1734 blocked for more than 120 seconds.
Not tainted 4.12.0-rc4+ #8
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
gnome-terminal- D 0 1734 1015 0x00000000
Call Trace:
__schedule+0x3cd/0xb30
schedule+0x40/0x90
kvm_async_pf_task_wait+0x1cc/0x270
? __vfs_read+0x37/0x150
? prepare_to_swait+0x22/0x70
do_async_page_fault+0x77/0xb0
? do_async_page_fault+0x77/0xb0
async_page_fault+0x28/0x30
This is triggered by running both win7 and win2016 on L1 KVM simultaneously,
and then putting memory pressure on L1; I can observe this hang on L1 when
at least ~70% of the swap area is occupied on L0.
This is due to an async pf being injected into L2 when it should have been
injected into L1: the L2 guest starts receiving page faults with a bogus %cr2
(actually the apf token from the host), and the L1 guest starts accumulating
tasks stuck in D state in kvm_async_pf_task_wait() because of the missing
PAGE_READY async_pfs.
This patchset fixes it according to Radim's proposal "force a nested VM exit
from nested_vmx_check_exception if the injected #PF is async_pf and handle
the #PF VM exit in L1". https://www.spinics.net/lists/kvm/msg142498.html
Note: The patchset barely touches SVM since I don't have an AMD CPU to verify
the modification.
v1 -> v2:
* remove nested_vmx_check_exception nr parameter
* construct a simple special vm-exit information field for async pf
* introduce nested_apf_token to vcpu->arch.apf to avoid change the CR2
visible in L2 guest
* avoid pass the apf directed towards it (L1) into L2 if there is L3
at the moment
Wanpeng Li (4):
KVM: x86: Simple kvm_x86_ops->queue_exception parameter
KVM: async_pf: Add L1 guest async_pf #PF vmexit handler
KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf
KVM: async_pf: Let host know whether the guest support delivery async_pf as #PF vmexit
Documentation/virtual/kvm/msr.txt | 5 +--
arch/x86/include/asm/kvm_emulate.h | 1 +
arch/x86/include/asm/kvm_host.h | 7 +++--
arch/x86/include/uapi/asm/kvm_para.h | 1 +
arch/x86/kernel/kvm.c | 1 +
arch/x86/kvm/mmu.c | 2 +-
arch/x86/kvm/svm.c | 8 +++--
arch/x86/kvm/vmx.c | 61 ++++++++++++++++++++++++++++--------
arch/x86/kvm/x86.c | 18 ++++++-----
9 files changed, 75 insertions(+), 29 deletions(-)
--
2.7.4
From: Wanpeng Li <[email protected]>
Add an async_page_fault field to vcpu->arch.exception to identify an async
page fault, and constructs the expected vm-exit information fields. Force
a nested VM exit from nested_vmx_check_exception() if the injected #PF
is async page fault.
Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/include/asm/kvm_emulate.h | 1 +
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/vmx.c | 17 ++++++++++++++---
arch/x86/kvm/x86.c | 8 +++++++-
4 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0559626..b5bcad9 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -23,6 +23,7 @@ struct x86_exception {
u16 error_code;
bool nested_page_fault;
u64 address; /* cr2 or nested page fault gpa */
+ bool async_page_fault;
};
/*
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1f01bfb..100ad9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -545,6 +545,7 @@ struct kvm_vcpu_arch {
bool reinject;
u8 nr;
u32 error_code;
+ bool async_page_fault;
} exception;
struct kvm_queued_interrupt {
@@ -645,6 +646,7 @@ struct kvm_vcpu_arch {
u64 msr_val;
u32 id;
bool send_user_only;
+ unsigned long nested_apf_token;
} apf;
/* OSVW MSRs (AMD only) */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f533cc1..e7b9844 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2419,13 +2419,24 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
*/
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
- if (!(vmcs12->exception_bitmap & (1u << nr)))
+ if (!((vmcs12->exception_bitmap & (1u << nr)) ||
+ (nr == PF_VECTOR && vcpu->arch.exception.async_page_fault)))
return 0;
+ if (vcpu->arch.exception.async_page_fault) {
+ vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ vcpu->arch.apf.nested_apf_token);
+ return 1;
+ }
+
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
@@ -2442,7 +2453,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
u32 intr_info = nr | INTR_INFO_VALID_MASK;
if (!reinject && is_guest_mode(vcpu) &&
- nested_vmx_check_exception(vcpu, nr))
+ nested_vmx_check_exception(vcpu))
return;
if (has_error_code) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b28a31..5931ce7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = fault->address;
+ vcpu->arch.exception.async_page_fault = fault->async_page_fault;
+ if (is_guest_mode(vcpu) && vcpu->arch.exception.async_page_fault)
+ vcpu->arch.apf.nested_apf_token = fault->address;
+ else
+ vcpu->arch.cr2 = fault->address;
kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
@@ -8571,6 +8575,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
+ fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
}
}
@@ -8593,6 +8598,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
+ fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
}
vcpu->arch.apf.halted = false;
--
2.7.4
From: Wanpeng Li <[email protected]>
Adds another flag bit (bit 2) to MSR_KVM_ASYNC_PF_EN. If bit 2 is 1, async
page faults are delivered to L1 as #PF vmexits; if bit 2 is 0, kvm_can_do_async_pf
returns 0 if in guest mode.
Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
Documentation/virtual/kvm/msr.txt | 5 +++--
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/uapi/asm/kvm_para.h | 1 +
arch/x86/kernel/kvm.c | 1 +
arch/x86/kvm/mmu.c | 2 +-
arch/x86/kvm/vmx.c | 2 +-
arch/x86/kvm/x86.c | 5 +++--
7 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 0a9ea51..1ebecc1 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -166,10 +166,11 @@ MSR_KVM_SYSTEM_TIME: 0x12
MSR_KVM_ASYNC_PF_EN: 0x4b564d02
data: Bits 63-6 hold 64-byte aligned physical address of a
64 byte memory area which must be in guest RAM and must be
- zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
+ zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1
when asynchronous page faults are enabled on the vcpu 0 when
disabled. Bit 1 is 1 if asynchronous page faults can be injected
- when vcpu is in cpl == 0.
+ when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults
+ are delivered to L1 as #PF vmexits.
First 4 byte of 64 byte memory location will be written to by
the hypervisor at the time of asynchronous page fault (APF)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 100ad9a..9e18de4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -647,6 +647,7 @@ struct kvm_vcpu_arch {
u32 id;
bool send_user_only;
unsigned long nested_apf_token;
+ bool delivery_as_pf_vmexit;
} apf;
/* OSVW MSRs (AMD only) */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index cff0bb6..a965e5b 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -67,6 +67,7 @@ struct kvm_clock_pairing {
#define KVM_ASYNC_PF_ENABLED (1 << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2)
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 43e10d6..1e29a77 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -330,6 +330,7 @@ static void kvm_guest_cpu_init(void)
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
+ pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
__this_cpu_write(apf_reason.enabled, 1);
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb82259..c49aecd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3704,7 +3704,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
kvm_event_needs_reinjection(vcpu)))
return false;
- if (is_guest_mode(vcpu))
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
return false;
return kvm_x86_ops->interrupt_allowed(vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e7b9844..2e906cf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8025,7 +8025,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
if (is_nmi(intr_info))
return false;
else if (is_page_fault(intr_info))
- return enable_ept;
+ return !vmx->apf_reason && enable_ept;
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
return false;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5931ce7..ad67980 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2064,8 +2064,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
- /* Bits 2:5 are reserved, Should be zero */
- if (data & 0x3c)
+ /* Bits 3:5 are reserved, Should be zero */
+ if (data & 0x38)
return 1;
vcpu->arch.apf.msr_val = data;
@@ -2081,6 +2081,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
kvm_async_pf_wakeup_all(vcpu);
return 0;
}
--
2.7.4
From: Wanpeng Li <[email protected]>
This patch adds the L1 guest async page fault #PF vmexit handler: such a #PF
is converted into a vmexit from L2 to L1, which is then handled by L1
similarly to an ordinary async page fault.
Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/vmx.c | 34 ++++++++++++++++++++++++++++------
1 file changed, 28 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index df825bb..f533cc1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,6 +616,7 @@ struct vcpu_vmx {
bool emulation_required;
u32 exit_reason;
+ u32 apf_reason;
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -5648,14 +5649,31 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_page_fault(intr_info)) {
- /* EPT won't cause page fault directly */
- BUG_ON(enable_ept);
cr2 = vmcs_readl(EXIT_QUALIFICATION);
- trace_kvm_page_fault(cr2, error_code);
+ switch (vmx->apf_reason) {
+ default:
+ /* EPT won't cause page fault directly */
+ BUG_ON(enable_ept);
+ trace_kvm_page_fault(cr2, error_code);
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
+ if (kvm_event_needs_reinjection(vcpu))
+ kvm_mmu_unprotect_page_virt(vcpu, cr2);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ vmx->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(cr2);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ vmx->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(cr2);
+ local_irq_enable();
+ break;
+ }
+ return 0;
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -8602,6 +8620,10 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
exit_intr_info = vmx->exit_intr_info;
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(exit_intr_info))
+ vmx->apf_reason = kvm_read_and_reset_pf_reason();
+
/* Handle machine checks before interrupts are enabled */
if (is_machine_check(exit_intr_info))
kvm_machine_check();
--
2.7.4
From: Wanpeng Li <[email protected]>
This patch removes all arguments except the first in kvm_x86_ops->queue_exception,
since the remaining arguments can be extracted from vcpu->arch.exception by the
callbacks themselves; the same is done in nested_{vmx,svm}_check_exception.
Cc: Paolo Bonzini <[email protected]>
Cc: Radim Krčmář <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 +---
arch/x86/kvm/svm.c | 8 +++++---
arch/x86/kvm/vmx.c | 8 +++++---
arch/x86/kvm/x86.c | 5 +----
4 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605e..1f01bfb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -948,9 +948,7 @@ struct kvm_x86_ops {
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu);
void (*set_nmi)(struct kvm_vcpu *vcpu);
- void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject);
+ void (*queue_exception)(struct kvm_vcpu *vcpu);
void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ba9891a..e1f8e89 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -631,11 +631,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm_set_interrupt_shadow(vcpu, 0);
}
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
+static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ bool reinject = vcpu->arch.exception.reinject;
+ u32 error_code = vcpu->arch.exception.error_code;
/*
* If we are within a nested VM we'd better #VMEXIT and let the guest
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b9..df825bb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2431,11 +2431,13 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
return 1;
}
-static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ bool reinject = vcpu->arch.exception.reinject;
+ u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
if (!reinject && is_guest_mode(vcpu) &&
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d3cb9..1b28a31 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6345,10 +6345,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_update_dr7(vcpu);
}
- kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code,
- vcpu->arch.exception.reinject);
+ kvm_x86_ops->queue_exception(vcpu);
return 0;
}
--
2.7.4
2017-06-14 19:26-0700, Wanpeng Li:
> From: Wanpeng Li <[email protected]>
>
> Add an async_page_fault field to vcpu->arch.exception to identify an async
> page fault, and constructs the expected vm-exit information fields. Force
> a nested VM exit from nested_vmx_check_exception() if the injected #PF
> is async page fault.
>
> Cc: Paolo Bonzini <[email protected]>
> Cc: Radim Krčmář <[email protected]>
> Signed-off-by: Wanpeng Li <[email protected]>
> ---
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
> {
> ++vcpu->stat.pf_guest;
> - vcpu->arch.cr2 = fault->address;
> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
I think we need to act as if arch.exception.async_page_fault was not
pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
migrate with pending async_page_fault exception, we'd inject it as a
normal #PF, which could confuse/kill the nested guest.
And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
sanity as well.
Thanks.
2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
> 2017-06-14 19:26-0700, Wanpeng Li:
>> From: Wanpeng Li <[email protected]>
>>
>> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> page fault, and constructs the expected vm-exit information fields. Force
>> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> is async page fault.
>>
>> Cc: Paolo Bonzini <[email protected]>
>> Cc: Radim Krčmář <[email protected]>
>> Signed-off-by: Wanpeng Li <[email protected]>
>> ---
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
>> {
>> ++vcpu->stat.pf_guest;
>> - vcpu->arch.cr2 = fault->address;
>> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>
> I think we need to act as if arch.exception.async_page_fault was not
> pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
> migrate with pending async_page_fault exception, we'd inject it as a
> normal #PF, which could confuse/kill the nested guest.
>
> And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
> sanity as well.
Do you mean we should add a field like async_page_fault to
kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
restores events->exception.async_page_fault to
arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
Regards,
Wanpeng Li
2017-06-16 22:24+0800, Wanpeng Li:
> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
> > 2017-06-14 19:26-0700, Wanpeng Li:
> >> From: Wanpeng Li <[email protected]>
> >>
> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
> >> page fault, and constructs the expected vm-exit information fields. Force
> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
> >> is async page fault.
> >>
> >> Cc: Paolo Bonzini <[email protected]>
> >> Cc: Radim Krčmář <[email protected]>
> >> Signed-off-by: Wanpeng Li <[email protected]>
> >> ---
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
> >> {
> >> ++vcpu->stat.pf_guest;
> >> - vcpu->arch.cr2 = fault->address;
> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
> >
> > I think we need to act as if arch.exception.async_page_fault was not
> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
> > migrate with pending async_page_fault exception, we'd inject it as a
> > normal #PF, which could confuse/kill the nested guest.
> >
> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
> > sanity as well.
>
> Do you mean we should add a field like async_page_fault to
> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
> restores events->exception.async_page_fault to
> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
No, I thought we could get away with a disgusting hack of hiding the
exception from userspace, which would work for migration, but not if
local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
Extending the userspace interface would work, but I'd do it as a last
resort, after all conservative solutions have failed.
async_pf migration is very crude, so exposing the exception is just an
ugly workaround for the local case. Adding the flag would also require
userspace configuration of async_pf features for the guest to keep
compatibility.
I see two options that might be simpler than adding the userspace flag:
1) do the nested VM exit sooner, at the place where we now queue #PF,
2) queue the #PF later, save the async_pf in some intermediate
structure and consume it at the place where you proposed the nested
VM exit.
2017-06-16 23:38 GMT+08:00 Radim Krčmář <[email protected]>:
> 2017-06-16 22:24+0800, Wanpeng Li:
>> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
>> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> From: Wanpeng Li <[email protected]>
>> >>
>> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> is async page fault.
>> >>
>> >> Cc: Paolo Bonzini <[email protected]>
>> >> Cc: Radim Krčmář <[email protected]>
>> >> Signed-off-by: Wanpeng Li <[email protected]>
>> >> ---
>> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
>> >> {
>> >> ++vcpu->stat.pf_guest;
>> >> - vcpu->arch.cr2 = fault->address;
>> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >
>> > I think we need to act as if arch.exception.async_page_fault was not
>> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
>> > migrate with pending async_page_fault exception, we'd inject it as a
>> > normal #PF, which could confuse/kill the nested guest.
>> >
>> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> > sanity as well.
>>
>> Do you mean we should add a field like async_page_fault to
>> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> restores events->exception.async_page_fault to
>> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>
> No, I thought we could get away with a disgusting hack of hiding the
> exception from userspace, which would work for migration, but not if
> local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>
> Extending the userspace interface would work, but I'd do it as a last
> resort, after all conservative solutions have failed.
> async_pf migration is very crude, so exposing the exception is just an
> ugly workaround for the local case. Adding the flag would also require
> userspace configuration of async_pf features for the guest to keep
> compatibility.
>
> I see two options that might be simpler than adding the userspace flag:
>
> 1) do the nested VM exit sooner, at the place where we now queue #PF,
> 2) queue the #PF later, save the async_pf in some intermediate
> structure and consume it at the place where you proposed the nested
> VM exit.
Yeah, this hiding looks rational to me: even if we lose a
PAGE_NOT_PRESENT async_pf to L1, that is just a hint for the L1 reschedule
optimization, and the PAGE_READY async_pf will be guaranteed by the
wakeup-all after live migration.
Regards,
Wanpeng Li
2017-06-16 23:38 GMT+08:00 Radim Krčmář <[email protected]>:
> 2017-06-16 22:24+0800, Wanpeng Li:
>> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
>> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> From: Wanpeng Li <[email protected]>
>> >>
>> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> is async page fault.
>> >>
>> >> Cc: Paolo Bonzini <[email protected]>
>> >> Cc: Radim Krčmář <[email protected]>
>> >> Signed-off-by: Wanpeng Li <[email protected]>
>> >> ---
>> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
>> >> {
>> >> ++vcpu->stat.pf_guest;
>> >> - vcpu->arch.cr2 = fault->address;
>> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >
>> > I think we need to act as if arch.exception.async_page_fault was not
>> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
>> > migrate with pending async_page_fault exception, we'd inject it as a
>> > normal #PF, which could confuse/kill the nested guest.
>> >
>> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> > sanity as well.
>>
>> Do you mean we should add a field like async_page_fault to
>> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> restores events->exception.async_page_fault to
>> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>
> No, I thought we could get away with a disgusting hack of hiding the
> exception from userspace, which would work for migration, but not if
> local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>
> Extending the userspace interface would work, but I'd do it as a last
> resort, after all conservative solutions have failed.
> async_pf migration is very crude, so exposing the exception is just an
> ugly workaround for the local case. Adding the flag would also require
> userspace configuration of async_pf features for the guest to keep
> compatibility.
>
> I see two options that might be simpler than adding the userspace flag:
>
> 1) do the nested VM exit sooner, at the place where we now queue #PF,
> 2) queue the #PF later, save the async_pf in some intermediate
> structure and consume it at the place where you proposed the nested
> VM exit.
How about something like this, to not report the exception event when
"is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
vcpu->arch.exception.async_page_fault" holds, since losing a reschedule
optimization is not that important in L1.
@@ -3072,13 +3074,16 @@ static void
kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
process_nmi(vcpu);
- events->exception.injected =
- vcpu->arch.exception.pending &&
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
- events->exception.nr = vcpu->arch.exception.nr;
- events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.pad = 0;
- events->exception.error_code = vcpu->arch.exception.error_code;
+ if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
+ vcpu->arch.exception.async_page_fault)) {
+ events->exception.injected =
+ vcpu->arch.exception.pending &&
+ !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ events->exception.nr = vcpu->arch.exception.nr;
+ events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+ events->exception.pad = 0;
+ events->exception.error_code = vcpu->arch.exception.error_code;
+ }
Regards,
Wanpeng Li
2017-06-17 13:52+0800, Wanpeng Li:
> 2017-06-16 23:38 GMT+08:00 Radim Krčmář <[email protected]>:
> > 2017-06-16 22:24+0800, Wanpeng Li:
> >> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
> >> > 2017-06-14 19:26-0700, Wanpeng Li:
> >> >> From: Wanpeng Li <[email protected]>
> >> >>
> >> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
> >> >> page fault, and constructs the expected vm-exit information fields. Force
> >> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
> >> >> is async page fault.
> >> >>
> >> >> Cc: Paolo Bonzini <[email protected]>
> >> >> Cc: Radim Krčmář <[email protected]>
> >> >> Signed-off-by: Wanpeng Li <[email protected]>
> >> >> ---
> >> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
> >> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
> >> >> {
> >> >> ++vcpu->stat.pf_guest;
> >> >> - vcpu->arch.cr2 = fault->address;
> >> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
> >> >
> >> > I think we need to act as if arch.exception.async_page_fault was not
> >> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
> >> > migrate with pending async_page_fault exception, we'd inject it as a
> >> > normal #PF, which could confuse/kill the nested guest.
> >> >
> >> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
> >> > sanity as well.
> >>
> >> Do you mean we should add a field like async_page_fault to
> >> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
> >> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
> >> restores events->exception.async_page_fault to
> >> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
> >
> > No, I thought we could get away with a disgusting hack of hiding the
> > exception from userspace, which would work for migration, but not if
> > local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
> >
> > Extending the userspace interface would work, but I'd do it as a last
> > resort, after all conservative solutions have failed.
> > async_pf migration is very crude, so exposing the exception is just an
> > ugly workaround for the local case. Adding the flag would also require
> > userspace configuration of async_pf features for the guest to keep
> > compatibility.
> >
> > I see two options that might be simpler than adding the userspace flag:
> >
> > 1) do the nested VM exit sooner, at the place where we now queue #PF,
> > 2) queue the #PF later, save the async_pf in some intermediate
> > structure and consume it at the place where you proposed the nested
> > VM exit.
>
> How about something like this to not get exception events if it is
> "is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
> vcpu->arch.exception.async_page_fault" since losing a reschedule
> optimization is not that important in L1.
>
> @@ -3072,13 +3074,16 @@ static void
> kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> struct kvm_vcpu_events *events)
> {
> process_nmi(vcpu);
> - events->exception.injected =
> - vcpu->arch.exception.pending &&
> - !kvm_exception_is_soft(vcpu->arch.exception.nr);
> - events->exception.nr = vcpu->arch.exception.nr;
> - events->exception.has_error_code = vcpu->arch.exception.has_error_code;
> - events->exception.pad = 0;
> - events->exception.error_code = vcpu->arch.exception.error_code;
> + if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
> + vcpu->arch.exception.async_page_fault)) {
> + events->exception.injected =
> + vcpu->arch.exception.pending &&
> + !kvm_exception_is_soft(vcpu->arch.exception.nr);
> + events->exception.nr = vcpu->arch.exception.nr;
> + events->exception.has_error_code = vcpu->arch.exception.has_error_code;
> + events->exception.pad = 0;
> + events->exception.error_code = vcpu->arch.exception.error_code;
> + }
This adds a bug when userspace does KVM_GET_VCPU_EVENTS and
KVM_SET_VCPU_EVENTS without migration -- KVM would drop the async_pf and
a L1 process gets stuck as a result.
We'd need to add a similar condition to
kvm_vcpu_ioctl_x86_set_vcpu_events(), so userspace SET doesn't drop it,
but that is far beyond the realm of acceptable code.
I realized this bug only after the first mail, sorry for the confusing
paragraph.
2017-06-19 22:51 GMT+08:00 Radim Krčmář <[email protected]>:
> 2017-06-17 13:52+0800, Wanpeng Li:
>> 2017-06-16 23:38 GMT+08:00 Radim Krčmář <[email protected]>:
>> > 2017-06-16 22:24+0800, Wanpeng Li:
>> >> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
>> >> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> >> From: Wanpeng Li <[email protected]>
>> >> >>
>> >> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> >> is async page fault.
>> >> >>
>> >> >> Cc: Paolo Bonzini <[email protected]>
>> >> >> Cc: Radim Krčmář <[email protected]>
>> >> >> Signed-off-by: Wanpeng Li <[email protected]>
>> >> >> ---
>> >> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
>> >> >> {
>> >> >> ++vcpu->stat.pf_guest;
>> >> >> - vcpu->arch.cr2 = fault->address;
>> >> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >> >
>> >> > I think we need to act as if arch.exception.async_page_fault was not
>> >> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
>> >> > migrate with pending async_page_fault exception, we'd inject it as a
>> >> > normal #PF, which could confuse/kill the nested guest.
>> >> >
>> >> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> >> > sanity as well.
>> >>
>> >> Do you mean we should add a field like async_page_fault to
>> >> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> >> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> >> restores events->exception.async_page_fault to
>> >> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>> >
>> > No, I thought we could get away with a disgusting hack of hiding the
>> > exception from userspace, which would work for migration, but not if
>> > local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>> >
>> > Extending the userspace interface would work, but I'd do it as a last
>> > resort, after all conservative solutions have failed.
>> > async_pf migration is very crude, so exposing the exception is just an
>> > ugly workaround for the local case. Adding the flag would also require
>> > userspace configuration of async_pf features for the guest to keep
>> > compatibility.
>> >
>> > I see two options that might be simpler than adding the userspace flag:
>> >
>> > 1) do the nested VM exit sooner, at the place where we now queue #PF,
>> > 2) queue the #PF later, save the async_pf in some intermediate
>> > structure and consume it at the place where you proposed the nested
>> > VM exit.
>>
>> How about something like this to not get exception events if it is
>> "is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
>> vcpu->arch.exception.async_page_fault" since losing a reschedule
>> optimization is not that important in L1.
>>
>> @@ -3072,13 +3074,16 @@ static void
>> kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
>> struct kvm_vcpu_events *events)
>> {
>> process_nmi(vcpu);
>> - events->exception.injected =
>> - vcpu->arch.exception.pending &&
>> - !kvm_exception_is_soft(vcpu->arch.exception.nr);
>> - events->exception.nr = vcpu->arch.exception.nr;
>> - events->exception.has_error_code = vcpu->arch.exception.has_error_code;
>> - events->exception.pad = 0;
>> - events->exception.error_code = vcpu->arch.exception.error_code;
>> + if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
>> + vcpu->arch.exception.async_page_fault)) {
>> + events->exception.injected =
>> + vcpu->arch.exception.pending &&
>> + !kvm_exception_is_soft(vcpu->arch.exception.nr);
>> + events->exception.nr = vcpu->arch.exception.nr;
>> + events->exception.has_error_code = vcpu->arch.exception.has_error_code;
>> + events->exception.pad = 0;
>> + events->exception.error_code = vcpu->arch.exception.error_code;
>> + }
>
> This adds a bug when userspace does KVM_GET_VCPU_EVENTS and
> KVM_SET_VCPU_EVENTS without migration -- KVM would drop the async_pf and
> a L1 process gets stuck as a result.
>
> We'd need to add a similar condition to
> kvm_vcpu_ioctl_x86_set_vcpu_events(), so userspace SET doesn't drop it,
> but that is far beyond the realm of acceptable code.
Do you mean current status of the patchset v2 can be accepted?
Otherwise, what should be done next?
Regards,
Wanpeng Li
2017-06-20 05:47+0800, Wanpeng Li:
> 2017-06-19 22:51 GMT+08:00 Radim Krčmář <[email protected]>:
> > 2017-06-17 13:52+0800, Wanpeng Li:
> >> 2017-06-16 23:38 GMT+08:00 Radim Krčmář <[email protected]>:
> >> > 2017-06-16 22:24+0800, Wanpeng Li:
> >> >> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
> >> >> > 2017-06-14 19:26-0700, Wanpeng Li:
> >> >> >> From: Wanpeng Li <[email protected]>
> >> >> >>
> >> >> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
> >> >> >> page fault, and constructs the expected vm-exit information fields. Force
> >> >> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
> >> >> >> is async page fault.
> >> >> >>
> >> >> >> Cc: Paolo Bonzini <[email protected]>
> >> >> >> Cc: Radim Krčmář <[email protected]>
> >> >> >> Signed-off-by: Wanpeng Li <[email protected]>
> >> >> >> ---
> >> >> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> >> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
> >> >> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
> >> >> >> {
> >> >> >> ++vcpu->stat.pf_guest;
> >> >> >> - vcpu->arch.cr2 = fault->address;
> >> >> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
> >> >> >
> >> >> > I think we need to act as if arch.exception.async_page_fault was not
> >> >> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
> >> >> > migrate with pending async_page_fault exception, we'd inject it as a
> >> >> > normal #PF, which could confuse/kill the nested guest.
> >> >> >
> >> >> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
> >> >> > sanity as well.
> >> >>
> >> >> Do you mean we should add a field like async_page_fault to
> >> >> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
> >> >> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
> >> >> restores events->exception.async_page_fault to
> >> >> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
> >> >
> >> > No, I thought we could get away with a disgusting hack of hiding the
> >> > exception from userspace, which would work for migration, but not if
> >> > local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
> >> >
> >> > Extending the userspace interface would work, but I'd do it as a last
> >> > resort, after all conservative solutions have failed.
> >> > async_pf migration is very crude, so exposing the exception is just an
> >> > ugly workaround for the local case. Adding the flag would also require
> >> > userspace configuration of async_pf features for the guest to keep
> >> > compatibility.
> >> >
> >> > I see two options that might be simpler than adding the userspace flag:
> >> >
> >> > 1) do the nested VM exit sooner, at the place where we now queue #PF,
> >> > 2) queue the #PF later, save the async_pf in some intermediate
> >> > structure and consume it at the place where you proposed the nested
> >> > VM exit.
> >>
> >> How about something like this to not get exception events if it is
> >> "is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
> >> vcpu->arch.exception.async_page_fault" since losing a reschedule
> >> optimization is not that important in L1.
> >>
> >> @@ -3072,13 +3074,16 @@ static void
> >> kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> >> struct kvm_vcpu_events *events)
> >> {
> >> process_nmi(vcpu);
> >> - events->exception.injected =
> >> - vcpu->arch.exception.pending &&
> >> - !kvm_exception_is_soft(vcpu->arch.exception.nr);
> >> - events->exception.nr = vcpu->arch.exception.nr;
> >> - events->exception.has_error_code = vcpu->arch.exception.has_error_code;
> >> - events->exception.pad = 0;
> >> - events->exception.error_code = vcpu->arch.exception.error_code;
> >> + if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
> >> + vcpu->arch.exception.async_page_fault)) {
> >> + events->exception.injected =
> >> + vcpu->arch.exception.pending &&
> >> + !kvm_exception_is_soft(vcpu->arch.exception.nr);
> >> + events->exception.nr = vcpu->arch.exception.nr;
> >> + events->exception.has_error_code = vcpu->arch.exception.has_error_code;
> >> + events->exception.pad = 0;
> >> + events->exception.error_code = vcpu->arch.exception.error_code;
> >> + }
> >
> > This adds a bug when userspace does KVM_GET_VCPU_EVENTS and
> > KVM_SET_VCPU_EVENTS without migration -- KVM would drop the async_pf and
> > a L1 process gets stuck as a result.
> >
> > We'd need to add a similar condition to
> > kvm_vcpu_ioctl_x86_set_vcpu_events(), so userspace SET doesn't drop it,
> > but that is far beyond the realm of acceptable code.
>
> Do you mean current status of the patchset v2 can be accepted?
> Otherwise, what should be done next?
No, sorry, that one has the migration bug (the async_page_fault gets
dropped on destination).
You proposed to add the flag to the userspace interface, which is a
sound solution. I was asking to look for a different one, because the
flag is a work-around for an implementation detail, which isn't a good
thing to put into a userspace interface ...
Still, I looked at the early VM exit (1) and it doesn't fit well into
SVM's model of single nested VM exit location, so it's out.
The remaining contender is to add a paravirtualized event for apf and
only convert it into nested VM exit or #PF in inject_pending_event().
The end result would likely be a slightly better version of the
exception flag ...
I guess that doing a prototype of the userspace interface extension is a
good follow up.
2017-06-21 0:12 GMT+08:00 Radim Krčmář <[email protected]>:
> 2017-06-20 05:47+0800, Wanpeng Li:
>> 2017-06-19 22:51 GMT+08:00 Radim Krčmář <[email protected]>:
>> > 2017-06-17 13:52+0800, Wanpeng Li:
>> >> 2017-06-16 23:38 GMT+08:00 Radim Krčmář <[email protected]>:
>> >> > 2017-06-16 22:24+0800, Wanpeng Li:
>> >> >> 2017-06-16 21:37 GMT+08:00 Radim Krčmář <[email protected]>:
>> >> >> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> >> >> From: Wanpeng Li <[email protected]>
>> >> >> >>
>> >> >> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> >> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> >> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> >> >> is async page fault.
>> >> >> >>
>> >> >> >> Cc: Paolo Bonzini <[email protected]>
>> >> >> >> Cc: Radim Krčmář <[email protected]>
>> >> >> >> Signed-off-by: Wanpeng Li <[email protected]>
>> >> >> >> ---
>> >> >> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> >> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >> >> >> void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
>> >> >> >> {
>> >> >> >> ++vcpu->stat.pf_guest;
>> >> >> >> - vcpu->arch.cr2 = fault->address;
>> >> >> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >> >> >
>> >> >> > I think we need to act as if arch.exception.async_page_fault was not
>> >> >> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events(). Otherwise, if we
>> >> >> > migrate with pending async_page_fault exception, we'd inject it as a
>> >> >> > normal #PF, which could confuse/kill the nested guest.
>> >> >> >
>> >> >> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> >> >> > sanity as well.
>> >> >>
>> >> >> Do you mean we should add a field like async_page_fault to
>> >> >> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> >> >> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> >> >> restores events->exception.async_page_fault to
>> >> >> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>> >> >
>> >> > No, I thought we could get away with a disgusting hack of hiding the
>> >> > exception from userspace, which would work for migration, but not if
>> >> > local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>> >> >
>> >> > Extending the userspace interface would work, but I'd do it as a last
>> >> > resort, after all conservative solutions have failed.
>> >> > async_pf migration is very crude, so exposing the exception is just an
>> >> > ugly workaround for the local case. Adding the flag would also require
>> >> > userspace configuration of async_pf features for the guest to keep
>> >> > compatibility.
>> >> >
>> >> > I see two options that might be simpler than adding the userspace flag:
>> >> >
>> >> > 1) do the nested VM exit sooner, at the place where we now queue #PF,
>> >> > 2) queue the #PF later, save the async_pf in some intermediate
>> >> > structure and consume it at the place where you proposed the nested
>> >> > VM exit.
>> >>
>> >> How about something like this to not get exception events if it is
>> >> "is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
>> >> vcpu->arch.exception.async_page_fault" since losing a reschedule
>> >> optimization is not that important in L1.
>> >>
>> >> @@ -3072,13 +3074,16 @@ static void
>> >> kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
>> >> struct kvm_vcpu_events *events)
>> >> {
>> >> process_nmi(vcpu);
>> >> - events->exception.injected =
>> >> - vcpu->arch.exception.pending &&
>> >> - !kvm_exception_is_soft(vcpu->arch.exception.nr);
>> >> - events->exception.nr = vcpu->arch.exception.nr;
>> >> - events->exception.has_error_code = vcpu->arch.exception.has_error_code;
>> >> - events->exception.pad = 0;
>> >> - events->exception.error_code = vcpu->arch.exception.error_code;
>> >> + if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
>> >> + vcpu->arch.exception.async_page_fault)) {
>> >> + events->exception.injected =
>> >> + vcpu->arch.exception.pending &&
>> >> + !kvm_exception_is_soft(vcpu->arch.exception.nr);
>> >> + events->exception.nr = vcpu->arch.exception.nr;
>> >> + events->exception.has_error_code = vcpu->arch.exception.has_error_code;
>> >> + events->exception.pad = 0;
>> >> + events->exception.error_code = vcpu->arch.exception.error_code;
>> >> + }
>> >
>> > This adds a bug when userspace does KVM_GET_VCPU_EVENTS and
>> > KVM_SET_VCPU_EVENTS without migration -- KVM would drop the async_pf and
>> > a L1 process gets stuck as a result.
>> >
>> > We'd need to add a similar condition to
>> > kvm_vcpu_ioctl_x86_set_vcpu_events(), so userspace SET doesn't drop it,
>> > but that is far beyond the realm of acceptable code.
>>
>> Do you mean current status of the patchset v2 can be accepted?
>> Otherwise, what should be done next?
>
> No, sorry, that one has the migration bug (the async_page_fault gets
> dropped on destination).
>
> You proposed to add the flag to the userspace interface, which is a
> sound solution. I was asking to look for a different one, because the
> flag is a work-around for an implementation detail, which isn't a good
> thing to put into a userspace interface ...
>
> Still, I looked at the early VM exit (1) and it doesn't fit well into
> SVM's model of single nested VM exit location, so it's out.
>
> The remaining contender is to add a paravirtualized event for apf and
> only convert it into nested VM exit or #PF in inject_pending_event().
> The end result would likely be a slightly better version of the
> exception flag ...
>
> I guess that doing a prototype of the userspace interface extension is a
> good follow up.
Yeah, I just do this in patch 3/4 v3 and another qemu patch. Please
have a review. :)
Regards,
Wanpeng Li