svm->vmcb will now point to either a separate L1 vmcb (not nested) or the
L2 vmcb (nested).
Issues:
1) There is some wholesale copying of vmcb.save and vmcb.control
areas which will need to be refined.
2) There is a workaround in nested_svm_vmexit() where

	if (svm->vmcb01->control.asid == 0)
		svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
This was done as a result of the kvm selftest 'state_test'. In that
test svm_set_nested_state() is called before svm_vcpu_run(). The asid
is assigned by svm_vcpu_run -> pre_svm_run for the current vmcb, which
is now vmcb02 since we are in nested mode; consequently
vmcb01.control.asid is never set as it should be.
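For reference, a condensed sketch of that assignment path (simplified
from pre_svm_run()/new_asid() in arch/x86/kvm/svm/svm.c; SEV and asid
generation wraparound handling elided). Only the currently active
svm->vmcb receives an asid, which in this scenario is vmcb02:

	static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
	{
		svm->asid_generation = sd->asid_generation;
		/* Only the currently active vmcb is updated ... */
		svm->vmcb->control.asid = sd->next_asid++;
	}

	static void pre_svm_run(struct vcpu_svm *svm)
	{
		struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);

		if (svm->asid_generation != sd->asid_generation)
			new_asid(svm, sd); /* ... so vmcb01 keeps asid == 0 */
	}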
Tested:
kvm-unit-tests
kvm selftests
Signed-off-by: Cathy Avery <[email protected]>
---
arch/x86/kvm/svm/nested.c | 116 ++++++++++++++++++--------------------
arch/x86/kvm/svm/svm.c | 41 +++++++-------
arch/x86/kvm/svm/svm.h | 10 ++--
3 files changed, 81 insertions(+), 86 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e90bc436f584..0a06e62010d8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
+ svm->vmcb01->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
+ h = &svm->vmcb01->control;
g = &svm->nested.ctl;
svm->nested.host_intercept_exceptions = h->intercept_exceptions;
@@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
- (svm->nested.hsave->control.int_ctl & mask);
+ (svm->vmcb01->control.int_ctl & mask);
svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
@@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
int ret;
svm->nested.vmcb = vmcb_gpa;
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+ svm->nested.vmcb02->control = svm->vmcb01->control;
+ svm->vmcb = svm->nested.vmcb02;
+ svm->vmcb_pa = svm->nested.vmcb02_pa;
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
@@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
{
int ret;
struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb_gpa;
@@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(&hsave->control, &vmcb->control);
+
+ /* Update vmcb01. We will restore everything when a VMEXIT occurs */
+
+ svm->vmcb01->save.efer = svm->vcpu.arch.efer;
+ svm->vmcb01->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ svm->vmcb01->save.cr4 = svm->vcpu.arch.cr4;
+ svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
+ svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
+
+ if (!npt_enabled)
+ svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
svm->nested.nested_run_pending = 1;
@@ -564,7 +556,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
{
int rc;
struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
@@ -628,8 +619,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.pause_filter_thresh =
svm->vmcb->control.pause_filter_thresh;
- /* Restore the original control entries */
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ if (svm->vmcb01->control.asid == 0)
+ svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
+
+ svm->vmcb = svm->vmcb01;
+ svm->vmcb_pa = svm->nested.vmcb01_pa;
/* On vmexit the GIF is set to false */
svm_set_gif(svm, false);
@@ -640,19 +634,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->nested.ctl.nested_cr3 = 0;
/* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- kvm_rax_write(&svm->vcpu, hsave->save.rax);
- kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
- kvm_rip_write(&svm->vcpu, hsave->save.rip);
+ kvm_set_rflags(&svm->vcpu, svm->vmcb->save.rflags);
+ svm_set_efer(&svm->vcpu, svm->vmcb->save.efer);
+ svm_set_cr0(&svm->vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(&svm->vcpu, svm->vmcb->save.cr4);
+ kvm_rax_write(&svm->vcpu, svm->vmcb->save.rax);
+ kvm_rsp_write(&svm->vcpu, svm->vmcb->save.rsp);
+ kvm_rip_write(&svm->vcpu, svm->vmcb->save.rip);
svm->vmcb->save.dr7 = 0;
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
@@ -670,12 +658,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_uninit_mmu_context(&svm->vcpu);
- rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+ rc = nested_svm_load_cr3(&svm->vcpu, svm->vmcb->save.cr3, false);
if (rc)
return 1;
- if (npt_enabled)
- svm->vmcb->save.cr3 = hsave->save.cr3;
+ if (!npt_enabled)
+ svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
@@ -694,12 +682,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
void svm_leave_nested(struct vcpu_svm *svm)
{
if (is_guest_mode(&svm->vcpu)) {
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
-
svm->nested.nested_run_pending = 0;
leave_guest_mode(&svm->vcpu);
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ svm->vmcb = svm->vmcb01;
+ svm->vmcb_pa = svm->nested.vmcb01_pa;
nested_svm_uninit_mmu_context(&svm->vcpu);
}
}
@@ -1046,10 +1032,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
sizeof(user_vmcb->control)))
return -EFAULT;
- if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01->save,
sizeof(user_vmcb->save)))
return -EFAULT;
-
out:
return kvm_state.size;
}
@@ -1059,7 +1044,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state *kvm_state)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area ctl;
@@ -1121,16 +1105,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
if (!(save.cr0 & X86_CR0_PG))
return -EINVAL;
+ svm->nested.vmcb02->control = svm->vmcb01->control;
+ svm->nested.vmcb02->save = svm->vmcb01->save;
+ svm->vmcb01->save = save;
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+ svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
+
+ svm->vmcb = svm->nested.vmcb02;
+ svm->vmcb_pa = svm->nested.vmcb02_pa;
+
/*
- * All checks done, we can enter guest mode. L1 control fields
- * come from the nested save state. Guest state is already
- * in the registers, the save area of the nested state instead
- * contains saved L1 state.
+ * All checks done, we can enter guest mode. L2 control fields will
+ * be the result of a combination of L1 and userspace indicated
+ * L12.control. The save area of L1 vmcb now contains the userspace
+ * indicated L1.save.
*/
- copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = save;
- svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
load_nested_vmcb_control(svm, &ctl);
nested_prepare_vmcb_control(svm);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5764b87379cf..d8022f989ffb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -971,8 +971,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
if (is_guest_mode(vcpu)) {
/* Write L1's TSC offset. */
g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
+ svm->vmcb01->control.tsc_offset;
+ svm->vmcb01->control.tsc_offset = offset;
}
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1171,9 +1171,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *page;
+ struct page *vmcb01_page;
+ struct page *vmcb02_page;
struct page *msrpm_pages;
- struct page *hsave_page;
struct page *nested_msrpm_pages;
int err;
@@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!vmcb01_page)
goto out;
msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
@@ -1193,8 +1193,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (!nested_msrpm_pages)
goto free_page2;
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!hsave_page)
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!vmcb02_page)
goto free_page3;
err = avic_init_vcpu(svm);
@@ -1207,8 +1207,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;
- svm->nested.hsave = page_address(hsave_page);
- clear_page(svm->nested.hsave);
+ svm->nested.vmcb02 = page_address(vmcb02_page);
+ clear_page(svm->nested.vmcb02);
+ svm->nested.vmcb02_pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
@@ -1216,9 +1217,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->nested.msrpm = page_address(nested_msrpm_pages);
svm_vcpu_init_msrpm(svm->nested.msrpm);
- svm->vmcb = page_address(page);
+ svm->vmcb = svm->vmcb01 = page_address(vmcb01_page);
clear_page(svm->vmcb);
- svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+ svm->vmcb_pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm->nested.vmcb01_pa = svm->vmcb_pa;
+
svm->asid_generation = 0;
init_vmcb(svm);
@@ -1228,13 +1231,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
return 0;
free_page4:
- __free_page(hsave_page);
+ __free_page(vmcb02_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
- __free_page(page);
+ __free_page(vmcb01_page);
out:
return err;
}
@@ -1256,11 +1259,11 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
* svm_vcpu_load(). So, ensure that no logical CPU has this
* vmcb page recorded as its current vmcb.
*/
- svm_clear_current_vmcb(svm->vmcb);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
+ svm_clear_current_vmcb(svm->vmcb);
+ __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb01_pa) >> PAGE_SHIFT));
+ __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb02_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
}
@@ -1393,7 +1396,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
if (is_guest_mode(&svm->vcpu)) {
- svm->nested.hsave->control.int_ctl &= mask;
+ svm->vmcb01->control.int_ctl &= mask;
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -3127,7 +3130,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
- ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+ ? !(svm->vmcb01->save.rflags & X86_EFLAGS_IF)
: !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
return true;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a798e1731709..e908b83bfa69 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -82,7 +82,9 @@ struct kvm_svm {
struct kvm_vcpu;
struct svm_nested_state {
- struct vmcb *hsave;
+ struct vmcb *vmcb02;
+ unsigned long vmcb01_pa;
+ unsigned long vmcb02_pa;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb;
@@ -102,6 +104,7 @@ struct svm_nested_state {
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
+ struct vmcb *vmcb01;
unsigned long vmcb_pa;
struct svm_cpu_data *svm_data;
uint64_t asid_generation;
@@ -208,10 +211,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
+ return svm->vmcb01;
}
static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
--
2.20.1
Cathy,
Thanks for the patches. They clean up the code nicely.
But there are some issues with the patch. I was able to bring up the L1 guest
with your patch, but when I tried to load the L2 guest it crashed. I think it
is mostly due to the save/restore part of the vmcb. A few comments below.
> -----Original Message-----
> From: Cathy Avery <[email protected]>
> Sent: Thursday, September 17, 2020 2:23 PM
> To: [email protected]; [email protected]; [email protected]
> Cc: [email protected]; Huang2, Wei <[email protected]>
> Subject: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest
> [...]
> @@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> kvm_clear_exception_queue(&svm->vcpu);
> kvm_clear_interrupt_queue(&svm->vcpu);
>
> - /*
> - * Save the old vmcb, so we don't need to pick what we save, but can
> - * restore everything when a VMEXIT occurs
> - */
> - hsave->save.es = vmcb->save.es;
> - hsave->save.cs = vmcb->save.cs;
> - hsave->save.ss = vmcb->save.ss;
> - hsave->save.ds = vmcb->save.ds;
> - hsave->save.gdtr = vmcb->save.gdtr;
> - hsave->save.idtr = vmcb->save.idtr;
> - hsave->save.efer = svm->vcpu.arch.efer;
> - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
> - hsave->save.cr4 = svm->vcpu.arch.cr4;
> - hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
> - hsave->save.rip = kvm_rip_read(&svm->vcpu);
> - hsave->save.rsp = vmcb->save.rsp;
> - hsave->save.rax = vmcb->save.rax;
> - if (npt_enabled)
> - hsave->save.cr3 = vmcb->save.cr3;
> - else
> - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
> -
> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
You may have to carefully check the above cleanup.
> [...]
> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) {
> - if (is_guest_mode(&svm->vcpu))
> - return svm->nested.hsave;
> - else
> - return svm->vmcb;
> + return svm->vmcb01;
Shouldn't it return svm->vmcb? That is what your commit message says.
> }
On 9/18/20 11:16 AM, Babu Moger wrote:
> Cathy,
> Thanks for the patches. They clean up the code nicely.
> But there are some issues with the patch. I was able to bring up the L1 guest
> with your patch, but when I tried to load the L2 guest it crashed. I think it
> is mostly due to the save/restore part of the vmcb. A few comments below.
>> [...]
>> @@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
>> kvm_clear_exception_queue(&svm->vcpu);
>> kvm_clear_interrupt_queue(&svm->vcpu);
>>
>> - /*
>> - * Save the old vmcb, so we don't need to pick what we save, but can
>> - * restore everything when a VMEXIT occurs
>> - */
>> - hsave->save.es = vmcb->save.es;
>> - hsave->save.cs = vmcb->save.cs;
>> - hsave->save.ss = vmcb->save.ss;
>> - hsave->save.ds = vmcb->save.ds;
>> - hsave->save.gdtr = vmcb->save.gdtr;
>> - hsave->save.idtr = vmcb->save.idtr;
>> - hsave->save.efer = svm->vcpu.arch.efer;
>> - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
>> - hsave->save.cr4 = svm->vcpu.arch.cr4;
>> - hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
>> - hsave->save.rip = kvm_rip_read(&svm->vcpu);
>> - hsave->save.rsp = vmcb->save.rsp;
>> - hsave->save.rax = vmcb->save.rax;
>> - if (npt_enabled)
>> - hsave->save.cr3 = vmcb->save.cr3;
>> - else
>> - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
>> -
>> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
> You may have to carefully check the above cleanup.
Thanks, I'll check it out. I did not see a crash when running the tests.
Could you send me more information about your test and test setup, stack
trace, etc.?
>> [...]
>> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) {
>> - if (is_guest_mode(&svm->vcpu))
>> - return svm->nested.hsave;
>> - else
>> - return svm->vmcb;
>> + return svm->vmcb01;
> Shouldn't it return svm->vmcb? That is what your commit message says.
I believe this is correct. The function is designed to return the host
vmcb, which will always be vmcb01.
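For example (roughly, from svm.h in this same tree; comments added for
illustration), the intercept helpers always operate on the host vmcb and
then let recalc_intercepts() fold the result into the active vmcb:

	static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
	{
		struct vmcb *vmcb = get_host_vmcb(svm);

		/* Record L1's intercept in the host vmcb (vmcb01) ... */
		vmcb->control.intercept_cr |= (1U << bit);

		/* ... and recompute svm->vmcb if we are in guest mode. */
		recalc_intercepts(svm);
	}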
> -----Original Message-----
> From: Cathy Avery <[email protected]>
> Sent: Friday, September 18, 2020 10:27 AM
> To: Moger, Babu <[email protected]>; [email protected];
> [email protected]; [email protected]
> Cc: [email protected]; Huang2, Wei <[email protected]>
> Subject: Re: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest
>
> On 9/18/20 11:16 AM, Babu Moger wrote:
> > Cathy,
> > Thanks for the patches. They clean up the code nicely.
> > But there are some issues with the patch. I was able to bring up the L1
> > guest with your patch, but when I tried to load the L2 guest it crashed. I
> > think it is mostly due to the save/restore part of the vmcb. A few comments
> > below.
> >> [...]
> >> -
> >> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
> > You may have to carefully check the above cleanup.
> Thanks, I'll check it out. I did not see a crash when running the tests.
> Could you send me more information about your test and test setup, stack
> trace, etc.?
There is no stack trace. It basically does not load the L2 guest.

1. Bare metal setup with RH 8.2 (any AMD EPYC system should be fine).
   Download the latest kvm tree. Mine is kernel 5.9.0-rc1 + your patch.
   # uname -r
   5.9.0-rc1+
   (Just unloading/reloading the kvm modules should also work.)

2. Now bring up the L1 guest with the following command.
   # qemu-system-x86_64 -name rhel8 -m 16384 \
       -smp cores=16,threads=1,sockets=1 -hda vdisk.qcow2 -enable-kvm \
       -net nic -net bridge,br=virbr0,helper=/usr/libexec/qemu-bridge-helper \
       -cpu EPYC -nographic
   Download the latest kvm kernel on the L1 guest. Mine is kernel
   5.9.0-rc1 + your patch.
   # uname -r
   5.9.0-rc1+

3. Try to bring up the L2 guest with the same command.
   # qemu-system-x86_64 -name rhel8 -m 16384 \
       -smp cores=16,threads=1,sockets=1 -hda vdisk.qcow2 -enable-kvm \
       -net nic -net bridge,br=virbr0,helper=/usr/libexec/qemu-bridge-helper \
       -cpu EPYC -nographic

The L2 guest fails to start. Note that the L2 guest comes up fine without
your patch.
Let me know if you need any more information. Thanks.
> >
> >> +
> >> + /* Update vmcb0. We will restore everything when a VMEXIT occurs */
> >> +
> >> + svm->vmcb01->save.efer = svm->vcpu.arch.efer;
> >> + svm->vmcb01->save.cr0 = kvm_read_cr0(&svm->vcpu);
> >> + svm->vmcb01->save.cr4 = svm->vcpu.arch.cr4;
> >> + svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
> >> + svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
> >> +
> >> + if (!npt_enabled)
> >> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
> >>
> >> svm->nested.nested_run_pending = 1;
> >>
> >> @@ -564,7 +556,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> >> {
> >> int rc;
> >> struct vmcb *nested_vmcb;
> >> - struct vmcb *hsave = svm->nested.hsave;
> >> struct vmcb *vmcb = svm->vmcb;
> >> struct kvm_host_map map;
> >>
> >> @@ -628,8 +619,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> >> nested_vmcb->control.pause_filter_thresh =
> >> svm->vmcb->control.pause_filter_thresh;
> >>
> >> - /* Restore the original control entries */
> >> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> >> + if (svm->vmcb01->control.asid == 0)
> >> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> >> +
> >> + svm->vmcb = svm->vmcb01;
> >> + svm->vmcb_pa = svm->nested.vmcb01_pa;
> >>
> >> /* On vmexit the GIF is set to false */
> >> svm_set_gif(svm, false);
> >> @@ -640,19 +634,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> >> svm->nested.ctl.nested_cr3 = 0;
> >>
> >> /* Restore selected save entries */
> >> - svm->vmcb->save.es = hsave->save.es;
> >> - svm->vmcb->save.cs = hsave->save.cs;
> >> - svm->vmcb->save.ss = hsave->save.ss;
> >> - svm->vmcb->save.ds = hsave->save.ds;
> >> - svm->vmcb->save.gdtr = hsave->save.gdtr;
> >> - svm->vmcb->save.idtr = hsave->save.idtr;
> >> - kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
> >> - svm_set_efer(&svm->vcpu, hsave->save.efer);
> >> - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
> >> - svm_set_cr4(&svm->vcpu, hsave->save.cr4);
> >> - kvm_rax_write(&svm->vcpu, hsave->save.rax);
> >> - kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
> >> - kvm_rip_write(&svm->vcpu, hsave->save.rip);
> >> + kvm_set_rflags(&svm->vcpu, svm->vmcb->save.rflags);
> >> + svm_set_efer(&svm->vcpu, svm->vmcb->save.efer);
> >> + svm_set_cr0(&svm->vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
> >> + svm_set_cr4(&svm->vcpu, svm->vmcb->save.cr4);
> >> + kvm_rax_write(&svm->vcpu, svm->vmcb->save.rax);
> >> + kvm_rsp_write(&svm->vcpu, svm->vmcb->save.rsp);
> >> + kvm_rip_write(&svm->vcpu, svm->vmcb->save.rip);
> >> svm->vmcb->save.dr7 = 0;
> >> svm->vmcb->save.cpl = 0;
> >> svm->vmcb->control.exit_int_info = 0;
> >> @@ -670,12 +658,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> >>
> >> nested_svm_uninit_mmu_context(&svm->vcpu);
> >>
> >> - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
> >> + rc = nested_svm_load_cr3(&svm->vcpu, svm->vmcb->save.cr3, false);
> >> if (rc)
> >> return 1;
> >>
> >> - if (npt_enabled)
> >> - svm->vmcb->save.cr3 = hsave->save.cr3;
> >> + if (!npt_enabled)
> >> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
> >>
> >> /*
> >> * Drop what we picked up for L2 via svm_complete_interrupts() so it
> >> @@ -694,12 +682,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> >> void svm_leave_nested(struct vcpu_svm *svm)
> >> {
> >> if (is_guest_mode(&svm->vcpu)) {
> >> - struct vmcb *hsave = svm->nested.hsave;
> >> - struct vmcb *vmcb = svm->vmcb;
> >> -
> >> svm->nested.nested_run_pending = 0;
> >> leave_guest_mode(&svm->vcpu);
> >> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> >> + svm->vmcb = svm->vmcb01;
> >> + svm->vmcb_pa = svm->nested.vmcb01_pa;
> >> nested_svm_uninit_mmu_context(&svm->vcpu);
> >> }
> >> }
> >> @@ -1046,10 +1032,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
> >> if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
> >> sizeof(user_vmcb->control)))
> >> return -EFAULT;
> >> - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
> >> + if (copy_to_user(&user_vmcb->save, &svm->vmcb01->save,
> >> sizeof(user_vmcb->save)))
> >> return -EFAULT;
> >> -
> >> out:
> >> return kvm_state.size;
> >> }
> >> @@ -1059,7 +1044,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> >> struct kvm_nested_state *kvm_state)
> >> {
> >> struct vcpu_svm *svm = to_svm(vcpu);
> >> - struct vmcb *hsave = svm->nested.hsave;
> >> struct vmcb __user *user_vmcb = (struct vmcb __user *)
> >> &user_kvm_nested_state->data.svm[0];
> >> struct vmcb_control_area ctl;
> >> @@ -1121,16 +1105,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> >> if (!(save.cr0 & X86_CR0_PG))
> >> return -EINVAL;
> >>
> >> + svm->nested.vmcb02->control = svm->vmcb01->control;
> >> + svm->nested.vmcb02->save = svm->vmcb01->save;
> >> + svm->vmcb01->save = save;
> >> +
> >> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> >> +
> >> + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> >> +
> >> + svm->vmcb = svm->nested.vmcb02;
> >> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> >> +
> >> /*
> >> - * All checks done, we can enter guest mode. L1 control fields
> >> - * come from the nested save state. Guest state is already
> >> - * in the registers, the save area of the nested state instead
> >> - * contains saved L1 state.
> >> + * All checks done, we can enter guest mode. L2 control fields will
> >> + * be the result of a combination of L1 and userspace indicated
> >> + * L12.control. The save area of L1 vmcb now contains the userspace
> >> + * indicated L1.save.
> >> */
> >> - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
> >> - hsave->save = save;
> >>
> >> - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> >> load_nested_vmcb_control(svm, &ctl);
> >> nested_prepare_vmcb_control(svm);
> >>
> >> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> >> index 5764b87379cf..d8022f989ffb 100644
> >> --- a/arch/x86/kvm/svm/svm.c
> >> +++ b/arch/x86/kvm/svm/svm.c
> >> @@ -971,8 +971,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> >> if (is_guest_mode(vcpu)) {
> >> /* Write L1's TSC offset. */
> >> g_tsc_offset = svm->vmcb->control.tsc_offset -
> >> - svm->nested.hsave->control.tsc_offset;
> >> - svm->nested.hsave->control.tsc_offset = offset;
> >> + svm->vmcb01->control.tsc_offset;
> >> + svm->vmcb01->control.tsc_offset = offset;
> >> }
> >>
> >> trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> >> @@ -1171,9 +1171,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> >> static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> >> {
> >> struct vcpu_svm *svm;
> >> - struct page *page;
> >> + struct page *vmcb01_page;
> >> + struct page *vmcb02_page;
> >> struct page *msrpm_pages;
> >> - struct page *hsave_page;
> >> struct page *nested_msrpm_pages;
> >> int err;
> >>
> >> @@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> >> svm = to_svm(vcpu);
> >>
> >> err = -ENOMEM;
> >> - page = alloc_page(GFP_KERNEL_ACCOUNT);
> >> - if (!page)
> >> + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT);
> >> + if (!vmcb01_page)
> >> goto out;
> >>
> >> msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
> >> @@ -1193,8 +1193,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> >> if (!nested_msrpm_pages)
> >> goto free_page2;
> >>
> >> - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
> >> - if (!hsave_page)
> >> + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT);
> >> + if (!vmcb02_page)
> >> goto free_page3;
> >>
> >> err = avic_init_vcpu(svm);
> >> @@ -1207,8 +1207,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> >> if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
> >> svm->avic_is_running = true;
> >>
> >> - svm->nested.hsave = page_address(hsave_page);
> >> - clear_page(svm->nested.hsave);
> >> + svm->nested.vmcb02 = page_address(vmcb02_page);
> >> + clear_page(svm->nested.vmcb02);
> >> + svm->nested.vmcb02_pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
> >>
> >> svm->msrpm = page_address(msrpm_pages);
> >> svm_vcpu_init_msrpm(svm->msrpm);
> >> @@ -1216,9 +1217,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> >> svm->nested.msrpm = page_address(nested_msrpm_pages);
> >> svm_vcpu_init_msrpm(svm->nested.msrpm);
> >>
> >> - svm->vmcb = page_address(page);
> >> + svm->vmcb = svm->vmcb01 = page_address(vmcb01_page);
> >> clear_page(svm->vmcb);
> >> - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
> >> + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
> >> + svm->nested.vmcb01_pa = svm->vmcb_pa;
> >> +
> >> svm->asid_generation = 0;
> >> init_vmcb(svm);
> >>
> >> @@ -1228,13 +1231,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> >> return 0;
> >>
> >> free_page4:
> >> - __free_page(hsave_page);
> >> + __free_page(vmcb02_page);
> >> free_page3:
> >> __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
> >> free_page2:
> >> __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
> >> free_page1:
> >> - __free_page(page);
> >> + __free_page(vmcb01_page);
> >> out:
> >> return err;
> >> }
> >> @@ -1256,11 +1259,11 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
> >> * svm_vcpu_load(). So, ensure that no logical CPU has this
> >> * vmcb page recorded as its current vmcb.
> >> */
> >> - svm_clear_current_vmcb(svm->vmcb);
> >>
> >> - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
> >> + svm_clear_current_vmcb(svm->vmcb);
> >> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb01_pa) >> PAGE_SHIFT));
> >> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb02_pa) >> PAGE_SHIFT));
> >> __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
> >> - __free_page(virt_to_page(svm->nested.hsave));
> >> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> >> }
> >>
> >> @@ -1393,7 +1396,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
> >> /* Drop int_ctl fields related to VINTR injection. */
> >> svm->vmcb->control.int_ctl &= mask;
> >> if (is_guest_mode(&svm->vcpu)) {
> >> - svm->nested.hsave->control.int_ctl &= mask;
> >> + svm->vmcb01->control.int_ctl &= mask;
> >>
> >> WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
> >> (svm->nested.ctl.int_ctl & V_TPR_MASK));
> >> @@ -3127,7 +3130,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
> >> if (is_guest_mode(vcpu)) {
> >> /* As long as interrupts are being delivered... */
> >> if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
> >> - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
> >> + ? !(svm->vmcb01->save.rflags & X86_EFLAGS_IF)
> >> : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
> >> return true;
> >>
> >> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> >> index a798e1731709..e908b83bfa69 100644
> >> --- a/arch/x86/kvm/svm/svm.h
> >> +++ b/arch/x86/kvm/svm/svm.h
> >> @@ -82,7 +82,9 @@ struct kvm_svm {
> >> struct kvm_vcpu;
> >>
> >> struct svm_nested_state {
> >> - struct vmcb *hsave;
> >> + struct vmcb *vmcb02;
> >> + unsigned long vmcb01_pa;
> >> + unsigned long vmcb02_pa;
> >> u64 hsave_msr;
> >> u64 vm_cr_msr;
> >> u64 vmcb;
> >> @@ -102,6 +104,7 @@ struct svm_nested_state {
> >> struct vcpu_svm {
> >> struct kvm_vcpu vcpu;
> >> struct vmcb *vmcb;
> >> + struct vmcb *vmcb01;
> >> unsigned long vmcb_pa;
> >> struct svm_cpu_data *svm_data;
> >> uint64_t asid_generation;
> >> @@ -208,10 +211,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
> >>
> >> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
> >> {
> >> - if (is_guest_mode(&svm->vcpu))
> >> - return svm->nested.hsave;
> >> - else
> >> - return svm->vmcb;
> >> + return svm->vmcb01;
> > Shouldn't it return svm->vmcb? That is what your commit message says.
> I believe this is correct. The function is designed to return the host
> vmcb, which will always be vmcb01.
> >
> >> }
> >>
> >> static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
> >> --
> >> 2.20.1
>
On 09/17 03:23, Cathy Avery wrote:
> svm->vmcb will now point to either a separate L1 vmcb (not nested) or an L2 vmcb (nested).
>
> Issues:
>
> 1) There is some wholesale copying of vmcb.save and vmcb.control
> areas, which will need to be refined.
>
> 2) There is a workaround in nested_svm_vmexit() where
>
> if (svm->vmcb01->control.asid == 0)
> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> This was done as a result of the kvm selftest 'state_test'. In that
> test svm_set_nested_state() is called before svm_vcpu_run().
> The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
> vmcb, which is now vmcb02 since we are in nested mode; consequently,
> vmcb01.control.asid is never set as it should be.
>
> Tested:
> kvm-unit-tests
> kvm self tests
I was able to run some basic nested SVM tests using this patch. A full L2 VM
(Fedora) had some problems booting after grub loaded the kernel.
Comments below.
>
> Signed-off-by: Cathy Avery <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 116 ++++++++++++++++++--------------------
> arch/x86/kvm/svm/svm.c | 41 +++++++-------
> arch/x86/kvm/svm/svm.h | 10 ++--
> 3 files changed, 81 insertions(+), 86 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e90bc436f584..0a06e62010d8 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
> static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct vmcb *hsave = svm->nested.hsave;
>
> WARN_ON(mmu_is_nested(vcpu));
>
> vcpu->arch.mmu = &vcpu->arch.guest_mmu;
> - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
> + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
> + svm->vmcb01->save.efer,
> svm->nested.ctl.nested_cr3);
> vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
> vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
> @@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
> return;
>
> c = &svm->vmcb->control;
> - h = &svm->nested.hsave->control;
> + h = &svm->vmcb01->control;
> g = &svm->nested.ctl;
>
> svm->nested.host_intercept_exceptions = h->intercept_exceptions;
> @@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
>
> svm->vmcb->control.int_ctl =
> (svm->nested.ctl.int_ctl & ~mask) |
> - (svm->nested.hsave->control.int_ctl & mask);
> + (svm->vmcb01->control.int_ctl & mask);
>
> svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
> svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
> @@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> int ret;
>
> svm->nested.vmcb = vmcb_gpa;
> +
> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> +
> + svm->nested.vmcb02->control = svm->vmcb01->control;
This part is a bit confusing. svm->vmcb01->control contains the control
info from the L0 hypervisor to the L1 VM. Shouldn't vmcb02->control instead
use the control info contained in nested_vmcb?
> + svm->vmcb = svm->nested.vmcb02;
> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> load_nested_vmcb_control(svm, &nested_vmcb->control);
> nested_prepare_vmcb_save(svm, nested_vmcb);
> nested_prepare_vmcb_control(svm);
> @@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> {
> int ret;
> struct vmcb *nested_vmcb;
> - struct vmcb *hsave = svm->nested.hsave;
> - struct vmcb *vmcb = svm->vmcb;
> struct kvm_host_map map;
> u64 vmcb_gpa;
>
> @@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> kvm_clear_exception_queue(&svm->vcpu);
> kvm_clear_interrupt_queue(&svm->vcpu);
>
> - /*
> - * Save the old vmcb, so we don't need to pick what we save, but can
> - * restore everything when a VMEXIT occurs
> - */
> - hsave->save.es = vmcb->save.es;
> - hsave->save.cs = vmcb->save.cs;
> - hsave->save.ss = vmcb->save.ss;
> - hsave->save.ds = vmcb->save.ds;
> - hsave->save.gdtr = vmcb->save.gdtr;
> - hsave->save.idtr = vmcb->save.idtr;
> - hsave->save.efer = svm->vcpu.arch.efer;
> - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
> - hsave->save.cr4 = svm->vcpu.arch.cr4;
> - hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
> - hsave->save.rip = kvm_rip_read(&svm->vcpu);
> - hsave->save.rsp = vmcb->save.rsp;
> - hsave->save.rax = vmcb->save.rax;
> - if (npt_enabled)
> - hsave->save.cr3 = vmcb->save.cr3;
> - else
> - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
> -
> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
> +
> + /* Update vmcb0. We will restore everything when a VMEXIT occurs */
> +
> + svm->vmcb01->save.efer = svm->vcpu.arch.efer;
> + svm->vmcb01->save.cr0 = kvm_read_cr0(&svm->vcpu);
> + svm->vmcb01->save.cr4 = svm->vcpu.arch.cr4;
> + svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
> + svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
> +
> + if (!npt_enabled)
> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
We do save some copying time here, which is always a good thing, especially
for nested virt.
>
> svm->nested.nested_run_pending = 1;
>
> @@ -564,7 +556,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> int rc;
> struct vmcb *nested_vmcb;
> - struct vmcb *hsave = svm->nested.hsave;
> struct vmcb *vmcb = svm->vmcb;
> struct kvm_host_map map;
>
> @@ -628,8 +619,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> nested_vmcb->control.pause_filter_thresh =
> svm->vmcb->control.pause_filter_thresh;
>
> - /* Restore the original control entries */
> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> + if (svm->vmcb01->control.asid == 0)
> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> +
> + svm->vmcb = svm->vmcb01;
> + svm->vmcb_pa = svm->nested.vmcb01_pa;
>
> /* On vmexit the GIF is set to false */
> svm_set_gif(svm, false);
> @@ -640,19 +634,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> svm->nested.ctl.nested_cr3 = 0;
>
> /* Restore selected save entries */
> - svm->vmcb->save.es = hsave->save.es;
> - svm->vmcb->save.cs = hsave->save.cs;
> - svm->vmcb->save.ss = hsave->save.ss;
> - svm->vmcb->save.ds = hsave->save.ds;
> - svm->vmcb->save.gdtr = hsave->save.gdtr;
> - svm->vmcb->save.idtr = hsave->save.idtr;
> - kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
> - svm_set_efer(&svm->vcpu, hsave->save.efer);
> - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
> - svm_set_cr4(&svm->vcpu, hsave->save.cr4);
> - kvm_rax_write(&svm->vcpu, hsave->save.rax);
> - kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
> - kvm_rip_write(&svm->vcpu, hsave->save.rip);
> + kvm_set_rflags(&svm->vcpu, svm->vmcb->save.rflags);
> + svm_set_efer(&svm->vcpu, svm->vmcb->save.efer);
> + svm_set_cr0(&svm->vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
> + svm_set_cr4(&svm->vcpu, svm->vmcb->save.cr4);
> + kvm_rax_write(&svm->vcpu, svm->vmcb->save.rax);
> + kvm_rsp_write(&svm->vcpu, svm->vmcb->save.rsp);
> + kvm_rip_write(&svm->vcpu, svm->vmcb->save.rip);
> svm->vmcb->save.dr7 = 0;
> svm->vmcb->save.cpl = 0;
> svm->vmcb->control.exit_int_info = 0;
> @@ -670,12 +658,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>
> nested_svm_uninit_mmu_context(&svm->vcpu);
>
> - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
> + rc = nested_svm_load_cr3(&svm->vcpu, svm->vmcb->save.cr3, false);
> if (rc)
> return 1;
>
> - if (npt_enabled)
> - svm->vmcb->save.cr3 = hsave->save.cr3;
> + if (!npt_enabled)
> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
Does this mean the original code is missing the following?
else
svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
>
> /*
> * Drop what we picked up for L2 via svm_complete_interrupts() so it
> @@ -694,12 +682,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> void svm_leave_nested(struct vcpu_svm *svm)
> {
> if (is_guest_mode(&svm->vcpu)) {
> - struct vmcb *hsave = svm->nested.hsave;
> - struct vmcb *vmcb = svm->vmcb;
> -
> svm->nested.nested_run_pending = 0;
> leave_guest_mode(&svm->vcpu);
> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> + svm->vmcb = svm->vmcb01;
> + svm->vmcb_pa = svm->nested.vmcb01_pa;
> nested_svm_uninit_mmu_context(&svm->vcpu);
> }
> }
> @@ -1046,10 +1032,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
> if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
> sizeof(user_vmcb->control)))
> return -EFAULT;
> - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
> + if (copy_to_user(&user_vmcb->save, &svm->vmcb01->save,
> sizeof(user_vmcb->save)))
> return -EFAULT;
> -
> out:
> return kvm_state.size;
> }
> @@ -1059,7 +1044,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> struct kvm_nested_state *kvm_state)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct vmcb *hsave = svm->nested.hsave;
> struct vmcb __user *user_vmcb = (struct vmcb __user *)
> &user_kvm_nested_state->data.svm[0];
> struct vmcb_control_area ctl;
> @@ -1121,16 +1105,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> if (!(save.cr0 & X86_CR0_PG))
> return -EINVAL;
>
> + svm->nested.vmcb02->control = svm->vmcb01->control;
> + svm->nested.vmcb02->save = svm->vmcb01->save;
> + svm->vmcb01->save = save;
> +
> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> +
> + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> +
> + svm->vmcb = svm->nested.vmcb02;
> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> +
> /*
> - * All checks done, we can enter guest mode. L1 control fields
> - * come from the nested save state. Guest state is already
> - * in the registers, the save area of the nested state instead
> - * contains saved L1 state.
> + * All checks done, we can enter guest mode. L2 control fields will
> + * be the result of a combination of L1 and userspace indicated
> + * L12.control. The save area of L1 vmcb now contains the userspace
> + * indicated L1.save.
> */
> - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
> - hsave->save = save;
>
> - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> load_nested_vmcb_control(svm, &ctl);
> nested_prepare_vmcb_control(svm);
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 5764b87379cf..d8022f989ffb 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -971,8 +971,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> if (is_guest_mode(vcpu)) {
> /* Write L1's TSC offset. */
> g_tsc_offset = svm->vmcb->control.tsc_offset -
> - svm->nested.hsave->control.tsc_offset;
> - svm->nested.hsave->control.tsc_offset = offset;
> + svm->vmcb01->control.tsc_offset;
> + svm->vmcb01->control.tsc_offset = offset;
> }
>
> trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> @@ -1171,9 +1171,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm;
> - struct page *page;
> + struct page *vmcb01_page;
> + struct page *vmcb02_page;
> struct page *msrpm_pages;
> - struct page *hsave_page;
> struct page *nested_msrpm_pages;
> int err;
>
> @@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> svm = to_svm(vcpu);
>
> err = -ENOMEM;
> - page = alloc_page(GFP_KERNEL_ACCOUNT);
> - if (!page)
> + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT);
> + if (!vmcb01_page)
> goto out;
>
> msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
> @@ -1193,8 +1193,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> if (!nested_msrpm_pages)
> goto free_page2;
>
> - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
> - if (!hsave_page)
> + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT);
> + if (!vmcb02_page)
> goto free_page3;
>
> err = avic_init_vcpu(svm);
> @@ -1207,8 +1207,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
> svm->avic_is_running = true;
>
> - svm->nested.hsave = page_address(hsave_page);
> - clear_page(svm->nested.hsave);
> + svm->nested.vmcb02 = page_address(vmcb02_page);
> + clear_page(svm->nested.vmcb02);
> + svm->nested.vmcb02_pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
>
> svm->msrpm = page_address(msrpm_pages);
> svm_vcpu_init_msrpm(svm->msrpm);
> @@ -1216,9 +1217,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> svm->nested.msrpm = page_address(nested_msrpm_pages);
> svm_vcpu_init_msrpm(svm->nested.msrpm);
>
> - svm->vmcb = page_address(page);
> + svm->vmcb = svm->vmcb01 = page_address(vmcb01_page);
> clear_page(svm->vmcb);
> - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
> + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
> + svm->nested.vmcb01_pa = svm->vmcb_pa;
> +
> svm->asid_generation = 0;
> init_vmcb(svm);
>
> @@ -1228,13 +1231,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> return 0;
>
> free_page4:
> - __free_page(hsave_page);
> + __free_page(vmcb02_page);
> free_page3:
> __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page2:
> __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page1:
> - __free_page(page);
> + __free_page(vmcb01_page);
> out:
> return err;
> }
> @@ -1256,11 +1259,11 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
> * svm_vcpu_load(). So, ensure that no logical CPU has this
> * vmcb page recorded as its current vmcb.
> */
> - svm_clear_current_vmcb(svm->vmcb);
>
> - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
> + svm_clear_current_vmcb(svm->vmcb);
> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb01_pa) >> PAGE_SHIFT));
> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb02_pa) >> PAGE_SHIFT));
> __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
> - __free_page(virt_to_page(svm->nested.hsave));
> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> }
>
> @@ -1393,7 +1396,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
> /* Drop int_ctl fields related to VINTR injection. */
> svm->vmcb->control.int_ctl &= mask;
> if (is_guest_mode(&svm->vcpu)) {
> - svm->nested.hsave->control.int_ctl &= mask;
> + svm->vmcb01->control.int_ctl &= mask;
>
> WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
> (svm->nested.ctl.int_ctl & V_TPR_MASK));
> @@ -3127,7 +3130,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
> if (is_guest_mode(vcpu)) {
> /* As long as interrupts are being delivered... */
> if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
> - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
> + ? !(svm->vmcb01->save.rflags & X86_EFLAGS_IF)
> : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
> return true;
>
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index a798e1731709..e908b83bfa69 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -82,7 +82,9 @@ struct kvm_svm {
> struct kvm_vcpu;
>
> struct svm_nested_state {
> - struct vmcb *hsave;
> + struct vmcb *vmcb02;
> + unsigned long vmcb01_pa;
Any reason that vmcb01_pa can't be placed in "struct vcpu_svm" below, along
with vmcb01?
> + unsigned long vmcb02_pa;
> u64 hsave_msr;
> u64 vm_cr_msr;
> u64 vmcb;
> @@ -102,6 +104,7 @@ struct svm_nested_state {
> struct vcpu_svm {
> struct kvm_vcpu vcpu;
> struct vmcb *vmcb;
> + struct vmcb *vmcb01;
> unsigned long vmcb_pa;
> struct svm_cpu_data *svm_data;
> uint64_t asid_generation;
> @@ -208,10 +211,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
>
> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
> {
> - if (is_guest_mode(&svm->vcpu))
> - return svm->nested.hsave;
> - else
> - return svm->vmcb;
> + return svm->vmcb01;
> }
>
> static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
> --
> 2.20.1
>
On 9/18/20 5:11 PM, Wei Huang wrote:
> On 09/17 03:23, Cathy Avery wrote:
>> svm->vmcb will now point to either a separate L1 vmcb (not nested) or an L2 vmcb (nested).
>>
>> Issues:
>>
>> 1) There is some wholesale copying of vmcb.save and vmcb.control
>> areas, which will need to be refined.
>>
>> 2) There is a workaround in nested_svm_vmexit() where
>>
>> if (svm->vmcb01->control.asid == 0)
>> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>>
>> This was done as a result of the kvm selftest 'state_test'. In that
>> test svm_set_nested_state() is called before svm_vcpu_run().
>> The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
>> vmcb, which is now vmcb02 since we are in nested mode; consequently,
>> vmcb01.control.asid is never set as it should be.
>>
>> Tested:
>> kvm-unit-tests
>> kvm self tests
> I was able to run some basic nested SVM tests using this patch. A full L2 VM
> (Fedora) had some problems booting after grub loaded the kernel.
Thanks for the feedback; my responses are below. I am looking into
the boot issue.
>
> Comments below.
>
>> Signed-off-by: Cathy Avery <[email protected]>
>> ---
>> arch/x86/kvm/svm/nested.c | 116 ++++++++++++++++++--------------------
>> arch/x86/kvm/svm/svm.c | 41 +++++++-------
>> arch/x86/kvm/svm/svm.h | 10 ++--
>> 3 files changed, 81 insertions(+), 86 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>> index e90bc436f584..0a06e62010d8 100644
>> --- a/arch/x86/kvm/svm/nested.c
>> +++ b/arch/x86/kvm/svm/nested.c
>> @@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
>> static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
>> {
>> struct vcpu_svm *svm = to_svm(vcpu);
>> - struct vmcb *hsave = svm->nested.hsave;
>>
>> WARN_ON(mmu_is_nested(vcpu));
>>
>> vcpu->arch.mmu = &vcpu->arch.guest_mmu;
>> - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
>> + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
>> + svm->vmcb01->save.efer,
>> svm->nested.ctl.nested_cr3);
>> vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
>> vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
>> @@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
>> return;
>>
>> c = &svm->vmcb->control;
>> - h = &svm->nested.hsave->control;
>> + h = &svm->vmcb01->control;
>> g = &svm->nested.ctl;
>>
>> svm->nested.host_intercept_exceptions = h->intercept_exceptions;
>> @@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
>>
>> svm->vmcb->control.int_ctl =
>> (svm->nested.ctl.int_ctl & ~mask) |
>> - (svm->nested.hsave->control.int_ctl & mask);
>> + (svm->vmcb01->control.int_ctl & mask);
>>
>> svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
>> svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
>> @@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
>> int ret;
>>
>> svm->nested.vmcb = vmcb_gpa;
>> +
>> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
>> +
>> + svm->nested.vmcb02->control = svm->vmcb01->control;
> This part is a bit confusing. svm->vmcb01->control contains the control
> info from the L0 hypervisor to the L1 VM. Shouldn't vmcb02->control instead
> use the control info contained in nested_vmcb?
It does, during nested_prepare_vmcb_control(). Now that there are separate
vmcb01 and vmcb02 structures, vmcb02.control still relies on the original
contents of vmcb01.control, so vmcb02.control is initialized with a
wholesale copy of vmcb01.control. That is admittedly excessive (see the
patch's issues section) and needs to be refined to just what is needed.
>
>> + svm->vmcb = svm->nested.vmcb02;
>> + svm->vmcb_pa = svm->nested.vmcb02_pa;
>> load_nested_vmcb_control(svm, &nested_vmcb->control);
>> nested_prepare_vmcb_save(svm, nested_vmcb);
>> nested_prepare_vmcb_control(svm);
>> @@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
>> {
>> int ret;
>> struct vmcb *nested_vmcb;
>> - struct vmcb *hsave = svm->nested.hsave;
>> - struct vmcb *vmcb = svm->vmcb;
>> struct kvm_host_map map;
>> u64 vmcb_gpa;
>>
>> @@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
>> kvm_clear_exception_queue(&svm->vcpu);
>> kvm_clear_interrupt_queue(&svm->vcpu);
>>
>> - /*
>> - * Save the old vmcb, so we don't need to pick what we save, but can
>> - * restore everything when a VMEXIT occurs
>> - */
>> - hsave->save.es = vmcb->save.es;
>> - hsave->save.cs = vmcb->save.cs;
>> - hsave->save.ss = vmcb->save.ss;
>> - hsave->save.ds = vmcb->save.ds;
>> - hsave->save.gdtr = vmcb->save.gdtr;
>> - hsave->save.idtr = vmcb->save.idtr;
>> - hsave->save.efer = svm->vcpu.arch.efer;
>> - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
>> - hsave->save.cr4 = svm->vcpu.arch.cr4;
>> - hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
>> - hsave->save.rip = kvm_rip_read(&svm->vcpu);
>> - hsave->save.rsp = vmcb->save.rsp;
>> - hsave->save.rax = vmcb->save.rax;
>> - if (npt_enabled)
>> - hsave->save.cr3 = vmcb->save.cr3;
>> - else
>> - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
>> -
>> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
>> +
>> + /* Update vmcb0. We will restore everything when a VMEXIT occurs */
>> +
>> + svm->vmcb01->save.efer = svm->vcpu.arch.efer;
>> + svm->vmcb01->save.cr0 = kvm_read_cr0(&svm->vcpu);
>> + svm->vmcb01->save.cr4 = svm->vcpu.arch.cr4;
>> + svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
>> + svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
>> +
>> + if (!npt_enabled)
>> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
> We do save some copying time here, which is always a good thing, especially
> for nested virt.
>
>> svm->nested.nested_run_pending = 1;
>>
>> @@ -564,7 +556,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>> {
>> int rc;
>> struct vmcb *nested_vmcb;
>> - struct vmcb *hsave = svm->nested.hsave;
>> struct vmcb *vmcb = svm->vmcb;
>> struct kvm_host_map map;
>>
>> @@ -628,8 +619,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>> nested_vmcb->control.pause_filter_thresh =
>> svm->vmcb->control.pause_filter_thresh;
>>
>> - /* Restore the original control entries */
>> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
>> + if (svm->vmcb01->control.asid == 0)
>> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>> +
>> + svm->vmcb = svm->vmcb01;
>> + svm->vmcb_pa = svm->nested.vmcb01_pa;
>>
>> /* On vmexit the GIF is set to false */
>> svm_set_gif(svm, false);
>> @@ -640,19 +634,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>> svm->nested.ctl.nested_cr3 = 0;
>>
>> /* Restore selected save entries */
>> - svm->vmcb->save.es = hsave->save.es;
>> - svm->vmcb->save.cs = hsave->save.cs;
>> - svm->vmcb->save.ss = hsave->save.ss;
>> - svm->vmcb->save.ds = hsave->save.ds;
>> - svm->vmcb->save.gdtr = hsave->save.gdtr;
>> - svm->vmcb->save.idtr = hsave->save.idtr;
>> - kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
>> - svm_set_efer(&svm->vcpu, hsave->save.efer);
>> - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
>> - svm_set_cr4(&svm->vcpu, hsave->save.cr4);
>> - kvm_rax_write(&svm->vcpu, hsave->save.rax);
>> - kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
>> - kvm_rip_write(&svm->vcpu, hsave->save.rip);
>> + kvm_set_rflags(&svm->vcpu, svm->vmcb->save.rflags);
>> + svm_set_efer(&svm->vcpu, svm->vmcb->save.efer);
>> + svm_set_cr0(&svm->vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
>> + svm_set_cr4(&svm->vcpu, svm->vmcb->save.cr4);
>> + kvm_rax_write(&svm->vcpu, svm->vmcb->save.rax);
>> + kvm_rsp_write(&svm->vcpu, svm->vmcb->save.rsp);
>> + kvm_rip_write(&svm->vcpu, svm->vmcb->save.rip);
>> svm->vmcb->save.dr7 = 0;
>> svm->vmcb->save.cpl = 0;
>> svm->vmcb->control.exit_int_info = 0;
>> @@ -670,12 +658,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>>
>> nested_svm_uninit_mmu_context(&svm->vcpu);
>>
>> - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
>> + rc = nested_svm_load_cr3(&svm->vcpu, svm->vmcb->save.cr3, false);
>> if (rc)
>> return 1;
>>
>> - if (npt_enabled)
>> - svm->vmcb->save.cr3 = hsave->save.cr3;
>> + if (!npt_enabled)
>> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
> Does this mean the original code is missing the following?
>
> else
> svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
No, it means I made an assumption here. I'll look at this again.
>
>>
>> /*
>> * Drop what we picked up for L2 via svm_complete_interrupts() so it
>> @@ -694,12 +682,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>> void svm_leave_nested(struct vcpu_svm *svm)
>> {
>> if (is_guest_mode(&svm->vcpu)) {
>> - struct vmcb *hsave = svm->nested.hsave;
>> - struct vmcb *vmcb = svm->vmcb;
>> -
>> svm->nested.nested_run_pending = 0;
>> leave_guest_mode(&svm->vcpu);
>> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
>> + svm->vmcb = svm->vmcb01;
>> + svm->vmcb_pa = svm->nested.vmcb01_pa;
>> nested_svm_uninit_mmu_context(&svm->vcpu);
>> }
>> }
>> @@ -1046,10 +1032,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
>> if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
>> sizeof(user_vmcb->control)))
>> return -EFAULT;
>> - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
>> + if (copy_to_user(&user_vmcb->save, &svm->vmcb01->save,
>> sizeof(user_vmcb->save)))
>> return -EFAULT;
>> -
>> out:
>> return kvm_state.size;
>> }
>> @@ -1059,7 +1044,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
>> struct kvm_nested_state *kvm_state)
>> {
>> struct vcpu_svm *svm = to_svm(vcpu);
>> - struct vmcb *hsave = svm->nested.hsave;
>> struct vmcb __user *user_vmcb = (struct vmcb __user *)
>> &user_kvm_nested_state->data.svm[0];
>> struct vmcb_control_area ctl;
>> @@ -1121,16 +1105,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
>> if (!(save.cr0 & X86_CR0_PG))
>> return -EINVAL;
>>
>> + svm->nested.vmcb02->control = svm->vmcb01->control;
>> + svm->nested.vmcb02->save = svm->vmcb01->save;
>> + svm->vmcb01->save = save;
>> +
>> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
>> +
>> + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
>> +
>> + svm->vmcb = svm->nested.vmcb02;
>> + svm->vmcb_pa = svm->nested.vmcb02_pa;
>> +
>> /*
>> - * All checks done, we can enter guest mode. L1 control fields
>> - * come from the nested save state. Guest state is already
>> - * in the registers, the save area of the nested state instead
>> - * contains saved L1 state.
>> + * All checks done, we can enter guest mode. L2 control fields will
>> + * be the result of a combination of L1 and userspace indicated
>> + * L12.control. The save area of L1 vmcb now contains the userspace
>> + * indicated L1.save.
>> */
>> - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
>> - hsave->save = save;
>>
>> - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
>> load_nested_vmcb_control(svm, &ctl);
>> nested_prepare_vmcb_control(svm);
>>
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 5764b87379cf..d8022f989ffb 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -971,8 +971,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> if (is_guest_mode(vcpu)) {
>> /* Write L1's TSC offset. */
>> g_tsc_offset = svm->vmcb->control.tsc_offset -
>> - svm->nested.hsave->control.tsc_offset;
>> - svm->nested.hsave->control.tsc_offset = offset;
>> + svm->vmcb01->control.tsc_offset;
>> + svm->vmcb01->control.tsc_offset = offset;
>> }
>>
>> trace_kvm_write_tsc_offset(vcpu->vcpu_id,
>> @@ -1171,9 +1171,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>> static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>> {
>> struct vcpu_svm *svm;
>> - struct page *page;
>> + struct page *vmcb01_page;
>> + struct page *vmcb02_page;
>> struct page *msrpm_pages;
>> - struct page *hsave_page;
>> struct page *nested_msrpm_pages;
>> int err;
>>
>> @@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>> svm = to_svm(vcpu);
>>
>> err = -ENOMEM;
>> - page = alloc_page(GFP_KERNEL_ACCOUNT);
>> - if (!page)
>> + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT);
>> + if (!vmcb01_page)
>> goto out;
>>
>> msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
>> @@ -1193,8 +1193,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>> if (!nested_msrpm_pages)
>> goto free_page2;
>>
>> - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
>> - if (!hsave_page)
>> + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT);
>> + if (!vmcb02_page)
>> goto free_page3;
>>
>> err = avic_init_vcpu(svm);
>> @@ -1207,8 +1207,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>> if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
>> svm->avic_is_running = true;
>>
>> - svm->nested.hsave = page_address(hsave_page);
>> - clear_page(svm->nested.hsave);
>> + svm->nested.vmcb02 = page_address(vmcb02_page);
>> + clear_page(svm->nested.vmcb02);
>> + svm->nested.vmcb02_pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
>>
>> svm->msrpm = page_address(msrpm_pages);
>> svm_vcpu_init_msrpm(svm->msrpm);
>> @@ -1216,9 +1217,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>> svm->nested.msrpm = page_address(nested_msrpm_pages);
>> svm_vcpu_init_msrpm(svm->nested.msrpm);
>>
>> - svm->vmcb = page_address(page);
>> + svm->vmcb = svm->vmcb01 = page_address(vmcb01_page);
>> clear_page(svm->vmcb);
>> - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
>> + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
>> + svm->nested.vmcb01_pa = svm->vmcb_pa;
>> +
>> svm->asid_generation = 0;
>> init_vmcb(svm);
>>
>> @@ -1228,13 +1231,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>> return 0;
>>
>> free_page4:
>> - __free_page(hsave_page);
>> + __free_page(vmcb02_page);
>> free_page3:
>> __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
>> free_page2:
>> __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
>> free_page1:
>> - __free_page(page);
>> + __free_page(vmcb01_page);
>> out:
>> return err;
>> }
>> @@ -1256,11 +1259,11 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
>> * svm_vcpu_load(). So, ensure that no logical CPU has this
>> * vmcb page recorded as its current vmcb.
>> */
>> - svm_clear_current_vmcb(svm->vmcb);
>>
>> - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
>> + svm_clear_current_vmcb(svm->vmcb);
>> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb01_pa) >> PAGE_SHIFT));
>> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb02_pa) >> PAGE_SHIFT));
>> __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
>> - __free_page(virt_to_page(svm->nested.hsave));
>> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
>> }
>>
>> @@ -1393,7 +1396,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
>> /* Drop int_ctl fields related to VINTR injection. */
>> svm->vmcb->control.int_ctl &= mask;
>> if (is_guest_mode(&svm->vcpu)) {
>> - svm->nested.hsave->control.int_ctl &= mask;
>> + svm->vmcb01->control.int_ctl &= mask;
>>
>> WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
>> (svm->nested.ctl.int_ctl & V_TPR_MASK));
>> @@ -3127,7 +3130,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
>> if (is_guest_mode(vcpu)) {
>> /* As long as interrupts are being delivered... */
>> if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
>> - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
>> + ? !(svm->vmcb01->save.rflags & X86_EFLAGS_IF)
>> : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
>> return true;
>>
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index a798e1731709..e908b83bfa69 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -82,7 +82,9 @@ struct kvm_svm {
>> struct kvm_vcpu;
>>
>> struct svm_nested_state {
>> - struct vmcb *hsave;
>> + struct vmcb *vmcb02;
>> + unsigned long vmcb01_pa;
> Any reason that vmcb01_pa can't be placed in "struct vcpu_svm" below, along
> with vmcb01?
I just grouped it with the other nesting components. I can move it.
>
>> + unsigned long vmcb02_pa;
>> u64 hsave_msr;
>> u64 vm_cr_msr;
>> u64 vmcb;
>> @@ -102,6 +104,7 @@ struct svm_nested_state {
>> struct vcpu_svm {
>> struct kvm_vcpu vcpu;
>> struct vmcb *vmcb;
>> + struct vmcb *vmcb01;
>> unsigned long vmcb_pa;
>> struct svm_cpu_data *svm_data;
>> uint64_t asid_generation;
>> @@ -208,10 +211,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
>>
>> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
>> {
>> - if (is_guest_mode(&svm->vcpu))
>> - return svm->nested.hsave;
>> - else
>> - return svm->vmcb;
>> + return svm->vmcb01;
>> }
>>
>> static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
>> --
>> 2.20.1
>>
On 21/09/20 16:07, Cathy Avery wrote:
>>> - if (npt_enabled)
>>> - svm->vmcb->save.cr3 = hsave->save.cr3;
>>> + if (!npt_enabled)
>>> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
>> Does this mean the original code is missing the following?
>>
>> else
>> svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
> No it means I made an assumption here. I'll look at this again.
This should not be needed; nested_svm_load_cr3's call to kvm_init_mmu
should write to svm->vmcb->save.cr3.
>>>
>>> + unsigned long vmcb01_pa;
>> Any reason that vmcb01_pa can't be placed in "struct vcpu_svm" below, along
>> with vmcb01?
> I just grouped it with the other nesting components. I can move it.
Please do it; vmcb01 is not part of nesting.
> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
> {
> - if (is_guest_mode(&svm->vcpu))
> - return svm->nested.hsave;
> - else
> - return svm->vmcb;
> + return svm->vmcb01;
You can remove the function altogether (in a second patch).
Paolo
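For illustration, that second patch would mostly reduce callers to using
vmcb01 directly. A minimal sketch, assuming set_cr_intercept keeps its
current shape (untested):

	static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
	{
		struct vmcb *vmcb = svm->vmcb01;	/* was get_host_vmcb(svm) */

		vmcb->control.intercept_cr |= (1U << bit);

		recalc_intercepts(svm);
	}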
On 17/09/20 21:23, Cathy Avery wrote:
>
> 2) There is a workaround in nested_svm_vmexit() where
>
> if (svm->vmcb01->control.asid == 0)
> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> This was done as a result of the kvm selftest 'state_test'. In that
> test svm_set_nested_state() is called before svm_vcpu_run().
> The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
> vmcb, which is now vmcb02 since we are in nested mode; consequently,
> vmcb01.control.asid is never set as it should be.
I think the asid should be kept in svm->asid, and copied to
svm->vmcb->control.asid in svm_vcpu_run. It's slightly overkill for the
non-nested case, but it simplifies the nested case a lot, so it's worth it.
That would be a first patch in the series, placed before this one.
Paolo
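As an illustration of that suggestion, a rough sketch of such a preparatory
patch (hunks hypothetical and untested; only the svm->asid field is new,
the rest follows the existing new_asid/pre_svm_run/svm_vcpu_run flow):

	@@ struct vcpu_svm {
	 	struct kvm_vcpu vcpu;
	 	struct vmcb *vmcb;
	+	u32 asid;
	 	unsigned long vmcb_pa;

	@@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
	 	svm->asid_generation = sd->asid_generation;
	-	svm->vmcb->control.asid = sd->next_asid++;
	+	svm->asid = sd->next_asid++;

	@@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
	+	/* Apply the cached asid to whichever vmcb is current,
	+	 * vmcb01 or vmcb02, so neither can be left with asid == 0.
	+	 */
	+	svm->vmcb->control.asid = svm->asid;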
On Thu, 2020-09-17 at 15:23 -0400, Cathy Avery wrote:
> svm->vmcb will now point to either a separate L1 vmcb (not nested) or an L2 vmcb (nested).
>
> Issues:
>
> 1) There is some wholesale copying of vmcb.save and vmcb.control
> areas, which will need to be refined.
>
> 2) There is a workaround in nested_svm_vmexit() where
>
> if (svm->vmcb01->control.asid == 0)
> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> This was done as a result of the kvm selftest 'state_test'. In that
> test svm_set_nested_state() is called before svm_vcpu_run().
> The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
> vmcb, which is now vmcb02 since we are in nested mode; consequently,
> vmcb01.control.asid is never set as it should be.
>
> Tested:
> kvm-unit-tests
> kvm self tests
>
> Signed-off-by: Cathy Avery <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 116 ++++++++++++++++++--------------------
> arch/x86/kvm/svm/svm.c | 41 +++++++-------
> arch/x86/kvm/svm/svm.h | 10 ++--
> 3 files changed, 81 insertions(+), 86 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e90bc436f584..0a06e62010d8 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
> static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct vmcb *hsave = svm->nested.hsave;
>
> WARN_ON(mmu_is_nested(vcpu));
>
> vcpu->arch.mmu = &vcpu->arch.guest_mmu;
> - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
> + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
> + svm->vmcb01->save.efer,
> svm->nested.ctl.nested_cr3);
> vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
> vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
> @@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
> return;
>
> c = &svm->vmcb->control;
> - h = &svm->nested.hsave->control;
> + h = &svm->vmcb01->control;
> g = &svm->nested.ctl;
>
> svm->nested.host_intercept_exceptions = h->intercept_exceptions;
> @@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
>
> svm->vmcb->control.int_ctl =
> (svm->nested.ctl.int_ctl & ~mask) |
> - (svm->nested.hsave->control.int_ctl & mask);
> + (svm->vmcb01->control.int_ctl & mask);
>
> svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
> svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
> @@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> int ret;
>
> svm->nested.vmcb = vmcb_gpa;
> +
> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> +
> + svm->nested.vmcb02->control = svm->vmcb01->control;
> + svm->vmcb = svm->nested.vmcb02;
> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> load_nested_vmcb_control(svm, &nested_vmcb->control);
> nested_prepare_vmcb_save(svm, nested_vmcb);
> nested_prepare_vmcb_control(svm);
> @@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> {
> int ret;
> struct vmcb *nested_vmcb;
> - struct vmcb *hsave = svm->nested.hsave;
> - struct vmcb *vmcb = svm->vmcb;
> struct kvm_host_map map;
> u64 vmcb_gpa;
>
> @@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> kvm_clear_exception_queue(&svm->vcpu);
> kvm_clear_interrupt_queue(&svm->vcpu);
>
> - /*
> - * Save the old vmcb, so we don't need to pick what we save, but can
> - * restore everything when a VMEXIT occurs
> - */
> - hsave->save.es = vmcb->save.es;
> - hsave->save.cs = vmcb->save.cs;
> - hsave->save.ss = vmcb->save.ss;
> - hsave->save.ds = vmcb->save.ds;
> - hsave->save.gdtr = vmcb->save.gdtr;
> - hsave->save.idtr = vmcb->save.idtr;
> - hsave->save.efer = svm->vcpu.arch.efer;
> - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
> - hsave->save.cr4 = svm->vcpu.arch.cr4;
> - hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
> - hsave->save.rip = kvm_rip_read(&svm->vcpu);
> - hsave->save.rsp = vmcb->save.rsp;
> - hsave->save.rax = vmcb->save.rax;
> - if (npt_enabled)
> - hsave->save.cr3 = vmcb->save.cr3;
> - else
> - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
> -
> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
> +
> + /* Update vmcb0. We will restore everything when a VMEXIT occurs */
> +
> + svm->vmcb01->save.efer = svm->vcpu.arch.efer;
> + svm->vmcb01->save.cr0 = kvm_read_cr0(&svm->vcpu);
> + svm->vmcb01->save.cr4 = svm->vcpu.arch.cr4;
> + svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
> + svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
> +
> + if (!npt_enabled)
> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
>
> svm->nested.nested_run_pending = 1;
>
> @@ -564,7 +556,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> int rc;
> struct vmcb *nested_vmcb;
> - struct vmcb *hsave = svm->nested.hsave;
> struct vmcb *vmcb = svm->vmcb;
> struct kvm_host_map map;
>
> @@ -628,8 +619,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> nested_vmcb->control.pause_filter_thresh =
> svm->vmcb->control.pause_filter_thresh;
>
> - /* Restore the original control entries */
> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> + if (svm->vmcb01->control.asid == 0)
> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> +
> + svm->vmcb = svm->vmcb01;
> + svm->vmcb_pa = svm->nested.vmcb01_pa;
>
> /* On vmexit the GIF is set to false */
> svm_set_gif(svm, false);
> @@ -640,19 +634,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> svm->nested.ctl.nested_cr3 = 0;
>
> /* Restore selected save entries */
> - svm->vmcb->save.es = hsave->save.es;
> - svm->vmcb->save.cs = hsave->save.cs;
> - svm->vmcb->save.ss = hsave->save.ss;
> - svm->vmcb->save.ds = hsave->save.ds;
> - svm->vmcb->save.gdtr = hsave->save.gdtr;
> - svm->vmcb->save.idtr = hsave->save.idtr;
> - kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
> - svm_set_efer(&svm->vcpu, hsave->save.efer);
> - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
> - svm_set_cr4(&svm->vcpu, hsave->save.cr4);
> - kvm_rax_write(&svm->vcpu, hsave->save.rax);
> - kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
> - kvm_rip_write(&svm->vcpu, hsave->save.rip);
> + kvm_set_rflags(&svm->vcpu, svm->vmcb->save.rflags);
> + svm_set_efer(&svm->vcpu, svm->vmcb->save.efer);
> + svm_set_cr0(&svm->vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
> + svm_set_cr4(&svm->vcpu, svm->vmcb->save.cr4);
> + kvm_rax_write(&svm->vcpu, svm->vmcb->save.rax);
> + kvm_rsp_write(&svm->vcpu, svm->vmcb->save.rsp);
> + kvm_rip_write(&svm->vcpu, svm->vmcb->save.rip);
> svm->vmcb->save.dr7 = 0;
> svm->vmcb->save.cpl = 0;
> svm->vmcb->control.exit_int_info = 0;
> @@ -670,12 +658,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>
> nested_svm_uninit_mmu_context(&svm->vcpu);
>
> - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
> + rc = nested_svm_load_cr3(&svm->vcpu, svm->vmcb->save.cr3, false);
> if (rc)
> return 1;
>
> - if (npt_enabled)
> - svm->vmcb->save.cr3 = hsave->save.cr3;
> + if (!npt_enabled)
> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
>
> /*
> * Drop what we picked up for L2 via svm_complete_interrupts() so it
> @@ -694,12 +682,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> void svm_leave_nested(struct vcpu_svm *svm)
> {
> if (is_guest_mode(&svm->vcpu)) {
> - struct vmcb *hsave = svm->nested.hsave;
> - struct vmcb *vmcb = svm->vmcb;
> -
> svm->nested.nested_run_pending = 0;
> leave_guest_mode(&svm->vcpu);
> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> + svm->vmcb = svm->vmcb01;
> + svm->vmcb_pa = svm->nested.vmcb01_pa;
> nested_svm_uninit_mmu_context(&svm->vcpu);
> }
> }
> @@ -1046,10 +1032,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
> if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
> sizeof(user_vmcb->control)))
> return -EFAULT;
> - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
> + if (copy_to_user(&user_vmcb->save, &svm->vmcb01->save,
> sizeof(user_vmcb->save)))
> return -EFAULT;
> -
> out:
> return kvm_state.size;
> }
> @@ -1059,7 +1044,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> struct kvm_nested_state *kvm_state)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct vmcb *hsave = svm->nested.hsave;
> struct vmcb __user *user_vmcb = (struct vmcb __user *)
> &user_kvm_nested_state->data.svm[0];
> struct vmcb_control_area ctl;
> @@ -1121,16 +1105,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> if (!(save.cr0 & X86_CR0_PG))
> return -EINVAL;
>
> + svm->nested.vmcb02->control = svm->vmcb01->control;
> + svm->nested.vmcb02->save = svm->vmcb01->save;
> + svm->vmcb01->save = save;
> +
> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> +
> + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> +
> + svm->vmcb = svm->nested.vmcb02;
> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> +
> /*
> - * All checks done, we can enter guest mode. L1 control fields
> - * come from the nested save state. Guest state is already
> - * in the registers, the save area of the nested state instead
> - * contains saved L1 state.
> + * All checks done, we can enter guest mode. L2 control fields will
> + * be the result of a combination of L1 and userspace indicated
> + * L12.control. The save area of L1 vmcb now contains the userspace
> + * indicated L1.save.
> */
> - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
> - hsave->save = save;
>
> - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> load_nested_vmcb_control(svm, &ctl);
> nested_prepare_vmcb_control(svm);
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 5764b87379cf..d8022f989ffb 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -971,8 +971,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> if (is_guest_mode(vcpu)) {
> /* Write L1's TSC offset. */
> g_tsc_offset = svm->vmcb->control.tsc_offset -
> - svm->nested.hsave->control.tsc_offset;
> - svm->nested.hsave->control.tsc_offset = offset;
> + svm->vmcb01->control.tsc_offset;
> + svm->vmcb01->control.tsc_offset = offset;
> }
>
> trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> @@ -1171,9 +1171,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm;
> - struct page *page;
> + struct page *vmcb01_page;
> + struct page *vmcb02_page;
> struct page *msrpm_pages;
> - struct page *hsave_page;
> struct page *nested_msrpm_pages;
> int err;
>
> @@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> svm = to_svm(vcpu);
>
> err = -ENOMEM;
> - page = alloc_page(GFP_KERNEL_ACCOUNT);
> - if (!page)
> + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT);
> + if (!vmcb01_page)
> goto out;
>
> msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
> @@ -1193,8 +1193,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> if (!nested_msrpm_pages)
> goto free_page2;
>
> - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
> - if (!hsave_page)
> + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT);
> + if (!vmcb02_page)
> goto free_page3;
>
> err = avic_init_vcpu(svm);
> @@ -1207,8 +1207,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
> svm->avic_is_running = true;
>
> - svm->nested.hsave = page_address(hsave_page);
> - clear_page(svm->nested.hsave);
> + svm->nested.vmcb02 = page_address(vmcb02_page);
> + clear_page(svm->nested.vmcb02);
> + svm->nested.vmcb02_pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
>
> svm->msrpm = page_address(msrpm_pages);
> svm_vcpu_init_msrpm(svm->msrpm);
> @@ -1216,9 +1217,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> svm->nested.msrpm = page_address(nested_msrpm_pages);
> svm_vcpu_init_msrpm(svm->nested.msrpm);
>
> - svm->vmcb = page_address(page);
> + svm->vmcb = svm->vmcb01 = page_address(vmcb01_page);
> clear_page(svm->vmcb);
> - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
> + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
> + svm->nested.vmcb01_pa = svm->vmcb_pa;
> +
> svm->asid_generation = 0;
> init_vmcb(svm);
>
> @@ -1228,13 +1231,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> return 0;
>
> free_page4:
> - __free_page(hsave_page);
> + __free_page(vmcb02_page);
> free_page3:
> __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page2:
> __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page1:
> - __free_page(page);
> + __free_page(vmcb01_page);
> out:
> return err;
> }
> @@ -1256,11 +1259,11 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
> * svm_vcpu_load(). So, ensure that no logical CPU has this
> * vmcb page recorded as its current vmcb.
> */
> - svm_clear_current_vmcb(svm->vmcb);
>
> - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
> + svm_clear_current_vmcb(svm->vmcb);
> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb01_pa) >> PAGE_SHIFT));
> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb02_pa) >> PAGE_SHIFT));
> __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
> - __free_page(virt_to_page(svm->nested.hsave));
> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> }
>
> @@ -1393,7 +1396,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
> /* Drop int_ctl fields related to VINTR injection. */
> svm->vmcb->control.int_ctl &= mask;
> if (is_guest_mode(&svm->vcpu)) {
> - svm->nested.hsave->control.int_ctl &= mask;
> + svm->vmcb01->control.int_ctl &= mask;
>
> WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
> (svm->nested.ctl.int_ctl & V_TPR_MASK));
> @@ -3127,7 +3130,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
> if (is_guest_mode(vcpu)) {
> /* As long as interrupts are being delivered... */
> if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
> - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
> + ? !(svm->vmcb01->save.rflags & X86_EFLAGS_IF)
> : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
> return true;
>
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index a798e1731709..e908b83bfa69 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -82,7 +82,9 @@ struct kvm_svm {
> struct kvm_vcpu;
>
> struct svm_nested_state {
> - struct vmcb *hsave;
> + struct vmcb *vmcb02;
> + unsigned long vmcb01_pa;
> + unsigned long vmcb02_pa;
> u64 hsave_msr;
> u64 vm_cr_msr;
> u64 vmcb;
> @@ -102,6 +104,7 @@ struct svm_nested_state {
> struct vcpu_svm {
> struct kvm_vcpu vcpu;
> struct vmcb *vmcb;
> + struct vmcb *vmcb01;
> unsigned long vmcb_pa;
> struct svm_cpu_data *svm_data;
> uint64_t asid_generation;
> @@ -208,10 +211,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
>
> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
> {
> - if (is_guest_mode(&svm->vcpu))
> - return svm->nested.hsave;
> - else
> - return svm->vmcb;
> + return svm->vmcb01;
> }
>
> static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
I was kind of busy this week, but I'll review and test this patch very soon.
Best regards,
Maxim Levitsky
On Thu, 2020-09-17 at 15:23 -0400, Cathy Avery wrote:
> svm->vmcb will now point to either a separate L1 vmcb (not nested) or an L2 vmcb (nested).
>
> Issues:
>
> 1) There is some wholesale copying of vmcb.save and vmcb.contol
> areas which will need to be refined.
>
> 2) There is a workaround in nested_svm_vmexit() where
>
> if (svm->vmcb01->control.asid == 0)
> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> This was done as a result of the kvm selftest 'state_test'. In that
> test svm_set_nested_state() is called before svm_vcpu_run().
> The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
> vmcb which is now vmcb02 as we are in nested mode subsequently
> vmcb01.control.asid is never set as it should be.
>
> Tested:
> kvm-unit-tests
> kvm self tests
>
> Signed-off-by: Cathy Avery <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 116 ++++++++++++++++++--------------------
> arch/x86/kvm/svm/svm.c | 41 +++++++-------
> arch/x86/kvm/svm/svm.h | 10 ++--
> 3 files changed, 81 insertions(+), 86 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e90bc436f584..0a06e62010d8 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
> static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct vmcb *hsave = svm->nested.hsave;
>
> WARN_ON(mmu_is_nested(vcpu));
>
> vcpu->arch.mmu = &vcpu->arch.guest_mmu;
> - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
> + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
> + svm->vmcb01->save.efer,
> svm->nested.ctl.nested_cr3);
> vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
> vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
> @@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
> return;
>
> c = &svm->vmcb->control;
> - h = &svm->nested.hsave->control;
> + h = &svm->vmcb01->control;
> g = &svm->nested.ctl;
>
> svm->nested.host_intercept_exceptions = h->intercept_exceptions;
> @@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
>
> svm->vmcb->control.int_ctl =
> (svm->nested.ctl.int_ctl & ~mask) |
> - (svm->nested.hsave->control.int_ctl & mask);
> + (svm->vmcb01->control.int_ctl & mask);
>
> svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
> svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
> @@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> int ret;
>
> svm->nested.vmcb = vmcb_gpa;
> +
> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> +
> + svm->nested.vmcb02->control = svm->vmcb01->control;
> + svm->vmcb = svm->nested.vmcb02;
> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> load_nested_vmcb_control(svm, &nested_vmcb->control);
> nested_prepare_vmcb_save(svm, nested_vmcb);
> nested_prepare_vmcb_control(svm);
> @@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> {
> int ret;
> struct vmcb *nested_vmcb;
> - struct vmcb *hsave = svm->nested.hsave;
> - struct vmcb *vmcb = svm->vmcb;
> struct kvm_host_map map;
> u64 vmcb_gpa;
>
> @@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> kvm_clear_exception_queue(&svm->vcpu);
> kvm_clear_interrupt_queue(&svm->vcpu);
>
> - /*
> - * Save the old vmcb, so we don't need to pick what we save, but can
> - * restore everything when a VMEXIT occurs
> - */
> - hsave->save.es = vmcb->save.es;
> - hsave->save.cs = vmcb->save.cs;
> - hsave->save.ss = vmcb->save.ss;
> - hsave->save.ds = vmcb->save.ds;
> - hsave->save.gdtr = vmcb->save.gdtr;
> - hsave->save.idtr = vmcb->save.idtr;
> - hsave->save.efer = svm->vcpu.arch.efer;
> - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
> - hsave->save.cr4 = svm->vcpu.arch.cr4;
> - hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
> - hsave->save.rip = kvm_rip_read(&svm->vcpu);
> - hsave->save.rsp = vmcb->save.rsp;
> - hsave->save.rax = vmcb->save.rax;
> - if (npt_enabled)
> - hsave->save.cr3 = vmcb->save.cr3;
> - else
> - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
> -
> - copy_vmcb_control_area(&hsave->control, &vmcb->control);
> +
> + /* Update vmcb0. We will restore everything when a VMEXIT occurs */
I think that the above comment is wrong.
What is happening here is that we write back some state to vmcb01 that changed
while the vmrun instruction was being skipped, something that we normally
(for a non-nested run) do in the next svm_vcpu_run. I am not sure that we even need
to update anything here but rip, because rip is written by kvm_rip_write, which writes
only vcpu->arch.regs and marks the register as dirty, which we don't seem to
notice in the svm code. Instead, svm_vcpu_run just always syncs rax/rsp/rip to the current vmcb.
Best would be to check the arch register dirty bits and update only those that are set; no need
to do this now, of course.
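A minimal sketch of that idea (assuming the generic kvm_register_is_dirty() helper
is usable at this point; untested):

	/* Write back only the registers the emulated VMRUN actually dirtied. */
	if (kvm_register_is_dirty(&svm->vcpu, VCPU_REGS_RIP))
		svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
	if (kvm_register_is_dirty(&svm->vcpu, VCPU_REGS_RSP))
		svm->vmcb01->save.rsp = kvm_rsp_read(&svm->vcpu);
	if (kvm_register_is_dirty(&svm->vcpu, VCPU_REGS_RAX))
		svm->vmcb01->save.rax = kvm_rax_read(&svm->vcpu);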
> +
> + svm->vmcb01->save.efer = svm->vcpu.arch.efer;
> + svm->vmcb01->save.cr0 = kvm_read_cr0(&svm->vcpu);
> + svm->vmcb01->save.cr4 = svm->vcpu.arch.cr4;
> + svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
> + svm->vmcb01->save.rip = kvm_rip_read(&svm->vcpu);
In addition to this, I *think* I would refer to svm->vmcb instead of svm->vmcb01,
to mark the fact that we work on the current vmcb (the switch to vmcb02 hasn't happened yet).
But this is a purely cosmetic thing.
Note though that, besides the comment, the code is not wrong here (i.e. there is no bug).
> +
> + if (!npt_enabled)
> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
>
> svm->nested.nested_run_pending = 1;
>
> @@ -564,7 +556,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> int rc;
> struct vmcb *nested_vmcb;
> - struct vmcb *hsave = svm->nested.hsave;
> struct vmcb *vmcb = svm->vmcb;
> struct kvm_host_map map;
>
> @@ -628,8 +619,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> nested_vmcb->control.pause_filter_thresh =
> svm->vmcb->control.pause_filter_thresh;
>
> - /* Restore the original control entries */
> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> + if (svm->vmcb01->control.asid == 0)
> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
I think that the above should be done always. The asid field is currently host
controlled only (that is, the L2 value is ignored, selective ASID TLB flush is not
advertised to the guest, and invlpga is emulated as invlpg).
So if we have a TLB flush, the asid in vmcb02 will change, and that change
should propagate to vmcb01 (this can also be done where it is changed, in 'new_asid').
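In other words, the workaround's condition would simply go away (a sketch):

	/* The ASID is host-controlled, so propagate it back unconditionally. */
	svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;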
> +
> + svm->vmcb = svm->vmcb01;
> + svm->vmcb_pa = svm->nested.vmcb01_pa;
>
> /* On vmexit the GIF is set to false */
> svm_set_gif(svm, false);
> @@ -640,19 +634,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> svm->nested.ctl.nested_cr3 = 0;
>
> /* Restore selected save entries */
> - svm->vmcb->save.es = hsave->save.es;
> - svm->vmcb->save.cs = hsave->save.cs;
> - svm->vmcb->save.ss = hsave->save.ss;
> - svm->vmcb->save.ds = hsave->save.ds;
> - svm->vmcb->save.gdtr = hsave->save.gdtr;
> - svm->vmcb->save.idtr = hsave->save.idtr;
> - kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
> - svm_set_efer(&svm->vcpu, hsave->save.efer);
> - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
> - svm_set_cr4(&svm->vcpu, hsave->save.cr4);
> - kvm_rax_write(&svm->vcpu, hsave->save.rax);
> - kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
> - kvm_rip_write(&svm->vcpu, hsave->save.rip);
> + kvm_set_rflags(&svm->vcpu, svm->vmcb->save.rflags);
> + svm_set_efer(&svm->vcpu, svm->vmcb->save.efer);
> + svm_set_cr0(&svm->vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
> + svm_set_cr4(&svm->vcpu, svm->vmcb->save.cr4);
> + kvm_rax_write(&svm->vcpu, svm->vmcb->save.rax);
> + kvm_rsp_write(&svm->vcpu, svm->vmcb->save.rsp);
> + kvm_rip_write(&svm->vcpu, svm->vmcb->save.rip);
> svm->vmcb->save.dr7 = 0;
> svm->vmcb->save.cpl = 0;
> svm->vmcb->control.exit_int_info = 0;
> @@ -670,12 +658,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>
> nested_svm_uninit_mmu_context(&svm->vcpu);
>
> - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
> + rc = nested_svm_load_cr3(&svm->vcpu, svm->vmcb->save.cr3, false);
> if (rc)
> return 1;
>
> - if (npt_enabled)
> - svm->vmcb->save.cr3 = hsave->save.cr3;
> + if (!npt_enabled)
> + svm->vmcb01->save.cr3 = kvm_read_cr3(&svm->vcpu);
Probably should be vmcb instead (also only a cosmetic change, since vmcb should be vmcb01
at that point).
>
> /*
> * Drop what we picked up for L2 via svm_complete_interrupts() so it
> @@ -694,12 +682,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> void svm_leave_nested(struct vcpu_svm *svm)
> {
> if (is_guest_mode(&svm->vcpu)) {
> - struct vmcb *hsave = svm->nested.hsave;
> - struct vmcb *vmcb = svm->vmcb;
> -
> svm->nested.nested_run_pending = 0;
> leave_guest_mode(&svm->vcpu);
> - copy_vmcb_control_area(&vmcb->control, &hsave->control);
> + svm->vmcb = svm->vmcb01;
> + svm->vmcb_pa = svm->nested.vmcb01_pa;
> nested_svm_uninit_mmu_context(&svm->vcpu);
> }
> }
> @@ -1046,10 +1032,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
> if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
> sizeof(user_vmcb->control)))
> return -EFAULT;
> - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
> + if (copy_to_user(&user_vmcb->save, &svm->vmcb01->save,
> sizeof(user_vmcb->save)))
> return -EFAULT;
> -
> out:
> return kvm_state.size;
> }
> @@ -1059,7 +1044,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> struct kvm_nested_state *kvm_state)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct vmcb *hsave = svm->nested.hsave;
> struct vmcb __user *user_vmcb = (struct vmcb __user *)
> &user_kvm_nested_state->data.svm[0];
> struct vmcb_control_area ctl;
> @@ -1121,16 +1105,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> if (!(save.cr0 & X86_CR0_PG))
> return -EINVAL;
>
> + svm->nested.vmcb02->control = svm->vmcb01->control;
> + svm->nested.vmcb02->save = svm->vmcb01->save;
> + svm->vmcb01->save = save;
> +
> + WARN_ON(svm->vmcb == svm->nested.vmcb02);
> +
> + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> +
> + svm->vmcb = svm->nested.vmcb02;
> + svm->vmcb_pa = svm->nested.vmcb02_pa;
> +
> /*
> - * All checks done, we can enter guest mode. L1 control fields
> - * come from the nested save state. Guest state is already
> - * in the registers, the save area of the nested state instead
> - * contains saved L1 state.
> + * All checks done, we can enter guest mode. L2 control fields will
> + * be the result of a combination of L1 and userspace indicated
> + * L12.control. The save area of L1 vmcb now contains the userspace
> + * indicated L1.save.
> */
> - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
> - hsave->save = save;
>
> - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
> load_nested_vmcb_control(svm, &ctl);
> nested_prepare_vmcb_control(svm);
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 5764b87379cf..d8022f989ffb 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -971,8 +971,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> if (is_guest_mode(vcpu)) {
> /* Write L1's TSC offset. */
> g_tsc_offset = svm->vmcb->control.tsc_offset -
> - svm->nested.hsave->control.tsc_offset;
> - svm->nested.hsave->control.tsc_offset = offset;
> + svm->vmcb01->control.tsc_offset;
> + svm->vmcb01->control.tsc_offset = offset;
> }
>
> trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> @@ -1171,9 +1171,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm;
> - struct page *page;
> + struct page *vmcb01_page;
> + struct page *vmcb02_page;
> struct page *msrpm_pages;
> - struct page *hsave_page;
> struct page *nested_msrpm_pages;
> int err;
>
> @@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> svm = to_svm(vcpu);
>
> err = -ENOMEM;
> - page = alloc_page(GFP_KERNEL_ACCOUNT);
> - if (!page)
> + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT);
> + if (!vmcb01_page)
> goto out;
>
> msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
> @@ -1193,8 +1193,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> if (!nested_msrpm_pages)
> goto free_page2;
>
> - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
> - if (!hsave_page)
> + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT);
> + if (!vmcb02_page)
> goto free_page3;
>
> err = avic_init_vcpu(svm);
> @@ -1207,8 +1207,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
> svm->avic_is_running = true;
>
> - svm->nested.hsave = page_address(hsave_page);
> - clear_page(svm->nested.hsave);
> + svm->nested.vmcb02 = page_address(vmcb02_page);
> + clear_page(svm->nested.vmcb02);
> + svm->nested.vmcb02_pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
>
> svm->msrpm = page_address(msrpm_pages);
> svm_vcpu_init_msrpm(svm->msrpm);
> @@ -1216,9 +1217,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> svm->nested.msrpm = page_address(nested_msrpm_pages);
> svm_vcpu_init_msrpm(svm->nested.msrpm);
>
> - svm->vmcb = page_address(page);
> + svm->vmcb = svm->vmcb01 = page_address(vmcb01_page);
> clear_page(svm->vmcb);
> - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
> + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
> + svm->nested.vmcb01_pa = svm->vmcb_pa;
> +
> svm->asid_generation = 0;
> init_vmcb(svm);
>
> @@ -1228,13 +1231,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
> return 0;
>
> free_page4:
> - __free_page(hsave_page);
> + __free_page(vmcb02_page);
> free_page3:
> __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page2:
> __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page1:
> - __free_page(page);
> + __free_page(vmcb01_page);
> out:
> return err;
> }
> @@ -1256,11 +1259,11 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
> * svm_vcpu_load(). So, ensure that no logical CPU has this
> * vmcb page recorded as its current vmcb.
> */
> - svm_clear_current_vmcb(svm->vmcb);
>
> - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
> + svm_clear_current_vmcb(svm->vmcb);
> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb01_pa) >> PAGE_SHIFT));
> + __free_page(pfn_to_page(__sme_clr(svm->nested.vmcb02_pa) >> PAGE_SHIFT));
> __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
> - __free_page(virt_to_page(svm->nested.hsave));
> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> }
>
> @@ -1393,7 +1396,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
> /* Drop int_ctl fields related to VINTR injection. */
> svm->vmcb->control.int_ctl &= mask;
> if (is_guest_mode(&svm->vcpu)) {
> - svm->nested.hsave->control.int_ctl &= mask;
> + svm->vmcb01->control.int_ctl &= mask;
>
> WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
> (svm->nested.ctl.int_ctl & V_TPR_MASK));
> @@ -3127,7 +3130,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
> if (is_guest_mode(vcpu)) {
> /* As long as interrupts are being delivered... */
> if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
> - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
> + ? !(svm->vmcb01->save.rflags & X86_EFLAGS_IF)
> : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
> return true;
>
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index a798e1731709..e908b83bfa69 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -82,7 +82,9 @@ struct kvm_svm {
> struct kvm_vcpu;
>
> struct svm_nested_state {
> - struct vmcb *hsave;
> + struct vmcb *vmcb02;
> + unsigned long vmcb01_pa;
> + unsigned long vmcb02_pa;
> u64 hsave_msr;
> u64 vm_cr_msr;
> u64 vmcb;
> @@ -102,6 +104,7 @@ struct svm_nested_state {
> struct vcpu_svm {
> struct kvm_vcpu vcpu;
> struct vmcb *vmcb;
> + struct vmcb *vmcb01;
> unsigned long vmcb_pa;
> struct svm_cpu_data *svm_data;
> uint64_t asid_generation;
> @@ -208,10 +211,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
>
> static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
> {
> - if (is_guest_mode(&svm->vcpu))
> - return svm->nested.hsave;
> - else
> - return svm->vmcb;
> + return svm->vmcb01;
> }
>
> static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
Honestly, I can't find anything seriously wrong with this patch. I tried it,
and I was actually able to boot a Fedora guest. The L2 guest is just SLOW.
I mean as slow as drawing a single character while moving in the grub menu,
and later you have to wait a few minutes to just start seeing output
from the kernel on the serial line. It is so slow that systemd times out
on most services, so I wasn't able to get to the GUI. Interestingly, the L1
continues to work as if nothing happened. Nothing in the kernel dmesg in either
L1 or L2.
I must say I have never had such a puzzling issue.
I debugged it a bit but without any luck. I guess this could be brute-force
debugged by comparing traces, etc., but that is probably out of scope for
me for now.
My guesses on why this could happen:
1. Something wrong with memory types - like the guest is using UC memory for everything.
I can't completely rule that out yet.
2. Something wrong with TLB/MMU - I played a bit with asid-related things, but I don't see
anything significantly wrong.
3. Dirty bits of the vmcb - one test would be to always set them (without this patch applied)
and see if that tanks performance (I didn't do this; see the sketch after this list).
4. Something with interrupts/int_ctl. I tested that the NMI single-step code is not involved;
vgif=0 doesn't help, and avic=0 doesn't help either (tested just in case).
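A sketch of the test from guess 3 (assuming the vmcb_mark_all_dirty() helper; untested):

	/* At the top of svm_vcpu_run(), before entering the guest: clear all
	 * VMCB clean bits so the CPU reloads every field, then compare the
	 * guest's performance against a normal run. */
	vmcb_mark_all_dirty(svm->vmcb);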
I applied this patch to Linus's mainline branch (it doesn't apply to kvm/queue).
Best regards,
Maxim Levitsky
On 08/10/20 00:14, Maxim Levitsky wrote:
>>
>> + if (svm->vmcb01->control.asid == 0)
>> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> I think that the above should be done always. The asid field is currently host
> controlled only (that is L2 value is ignored, selective ASID tlb flush is not
> advertized to the guest and lnvlpga is emulated as invlpg).
Yes, in fact I suggested that ASID should be in svm->asid and moved to
svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
it in nested code.
This should be a patch coming before this one.
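A sketch of that ordering (svm->asid is an assumed new field that pre_svm_run/new_asid
would store into, instead of writing a vmcb directly; untested):

	/* In svm_vcpu_run(), just before VMRUN: whichever vmcb is current
	 * (vmcb01 or vmcb02) inherits the host-chosen ASID. */
	if (svm->vmcb->control.asid != svm->asid) {
		svm->vmcb->control.asid = svm->asid;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}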
>
> 1. Something wrong with memory types - like guest is using UC memory for everything.
> I can't completely rule that out yet
You can print g_pat and see if it is all zeroes.
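For example, a throwaway debug line (placement in nested_prepare_vmcb_save() assumed):

	pr_info("vmcb02 g_pat = 0x%llx\n", svm->vmcb->save.g_pat);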
In general I think it's better to be explicit with vmcb01 vs. vmcb02,
like Cathy did, but I can see it's a matter of personal preference to
some extent.
Paolo
On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
> On 08/10/20 00:14, Maxim Levitsky wrote:
> > > + if (svm->vmcb01->control.asid == 0)
> > > + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> >
> > I think that the above should be done always. The asid field is currently host
> > controlled only (that is L2 value is ignored, selective ASID tlb flush is not
> > advertized to the guest and lnvlpga is emulated as invlpg).
>
> Yes, in fact I suggested that ASID should be in svm->asid and moved to
> svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
> it in nested code.
This makes a lot of sense!
>
> This should be a patch coming before this one.
>
> > 1. Something wrong with memory types - like guest is using UC memory for everything.
> > I can't completely rule that out yet
>
> You can print g_pat and see if it is all zeroes.
I don't even need to print it. I know that it is never set anywhere unless the guest writes it,
but now that I look at it, we set it to a default value, and there is no code to set it to a
default value for vmcb02. That is it. Now my Fedora guest boots just fine!
I played a lot with g_pat, and yet this didn't occur to me. I was that close :-(
I knew that it had to be something with memory types, but it never occurred to me
that the guest just doesn't write IA32_PAT and uses our value, which we set in init_vmcb.
>
> In general I think it's better to be explicit with vmcb01 vs. vmcb02,
> like Cathy did, but I can see it's a matter of personal preference to
> some extent.
I also think so in general, but in code that is outside 'is_guest_mode',
IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
it, it's not wrong to do so either.
My Windows Hyper-V guest doesn't boot though, and I know why.
As we know, the vmcb save area has extra state which vmrun/vmexit don't touch.
Now suppose a nested hypervisor wants to enter a nested guest.
It will do vmload, which will load the extra state from the nested vmcb (vmcb12,
or as I would say, the vmcb that the nested hypervisor thinks it is using)
into the CPU. This can cause some vmexits I think, but that doesn't matter much.
Now the nested hypervisor does vmrun. The extra state of the L2 guest is in CPU registers,
and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
we do vmload on vmcb02 prior to vmentering it, which loads stale state from it.
The same issue happens the other way around on nested vmexit.
I fixed this by doing nested_svm_vmloadsave, but that should probably be
optimized with dirty bits. For now though, I guess the goal is to make
it work first.
With this fixed, Hyper-V boots fine, and it even passes the 'works' test of booting
Windows 10 with Hyper-V enabled nested inside it and starting a VM there,
which makes that VM L3 (in addition to Windows itself, which runs as L3 in relation to Hyper-V)
https://i.imgur.com/sRYqtVV.png
In summary this is the diff of fixes (just pasted to email, probably mangled):
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 0a06e62010d8c..7293ba23b3cbc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	WARN_ON(svm->vmcb == svm->nested.vmcb02);
 
 	svm->nested.vmcb02->control = svm->vmcb01->control;
+
+	nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
+
 	svm->vmcb = svm->nested.vmcb02;
 	svm->vmcb_pa = svm->nested.vmcb02_pa;
 	load_nested_vmcb_control(svm, &nested_vmcb->control);
@@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (svm->vmcb01->control.asid == 0)
 		svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
 
+	nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
 	svm->vmcb = svm->vmcb01;
 	svm->vmcb_pa = svm->nested.vmcb01_pa;
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b66239b26885d..ee9f87fe611f2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
 		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
 		save->g_pat = svm->vcpu.arch.pat;
+		svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
 		save->cr3 = 0;
 		save->cr4 = 0;
 	}
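For reference, nested_svm_vmloadsave() is the existing helper in nested.c that copies the
vmload/vmsave-managed state between two vmcbs; roughly (from memory, check the tree):

	void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
	{
		/* Copy only the fields that VMLOAD/VMSAVE manage. */
		to_vmcb->save.fs = from_vmcb->save.fs;
		to_vmcb->save.gs = from_vmcb->save.gs;
		to_vmcb->save.tr = from_vmcb->save.tr;
		to_vmcb->save.ldtr = from_vmcb->save.ldtr;
		to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;

		to_vmcb->save.star = from_vmcb->save.star;
		to_vmcb->save.lstar = from_vmcb->save.lstar;
		to_vmcb->save.cstar = from_vmcb->save.cstar;
		to_vmcb->save.sfmask = from_vmcb->save.sfmask;
		to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
		to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
		to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
	}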
Best regards,
Maxim Levitsky
>
> Paolo
>
On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:
> On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:
> > On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
> > > On 08/10/20 00:14, Maxim Levitsky wrote:
> > > > > + if (svm->vmcb01->control.asid == 0)
> > > > > + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> > > >
> > > > I think that the above should be done always. The asid field is currently host
> > > > controlled only (that is L2 value is ignored, selective ASID tlb flush is not
> > > > advertized to the guest and lnvlpga is emulated as invlpg).
> > >
> > > Yes, in fact I suggested that ASID should be in svm->asid and moved to
> > > svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
> > > it in nested code.
> > This makes lot of sense!
> > > This should be a patch coming before this one.
> > >
> > > > 1. Something wrong with memory types - like guest is using UC memory for everything.
> > > > I can't completely rule that out yet
> > >
> > > You can print g_pat and see if it is all zeroes.
> > I don't even need to print it. I know that it is never set anywhere, unless guest writes it,
> > but now that I look at it, we set it to a default value and there is no code to set it to
> > default value for vmcb02. This is it. now my fedora guest boots just fine!
> >
> > I played a lot with g_pat, and yet this didn't occur to me . I was that close :-(
> > I knew that it has to be something with memory types, but it never occured to me
> > that guest just doesn't write IA32_PAT and uses our value which we set in init_vmcb
> >
> >
> > > In general I think it's better to be explicit with vmcb01 vs. vmcb02,
> > > like Cathy did, but I can see it's a matter of personal preference to
> > > some extent.
> > I also think so in general, but in the code that is outside 'is_guest_mode'
> > IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
> > it, its not wrong to do so either.
> >
> >
> > My windows hyper-v guest doesn't boot though and I know why.
> >
> > As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
> > Now suppose a nested hypervisor wants to enter a nested guest.
> >
> > It will do vmload, which will load the extra state from the nested vmcb (vmcb12
> > or as I woudl say the vmcb that nested hypervisor thinks that it is using),
> > to the CPU. This can cause some vmexits I think, but this doesn't matter much.
> >
> > Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU registers,
> > and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
> > when we do vmload on vmcb02 prior to vmenter on it, which loads stale state from it.
> > The same issue happens the other way around on nested vmexit.
> >
> > I fixed this by doing nested_svm_vmloadsave, but that should be probably be
> > optimized with dirty bits. Now though I guess the goal it to make
> > it work first.
> >
> > With this fixed HyperV boots fine, and even passes the 'works' test of booting
> > the windows 10 with hyperv enabled nested itself and starting the vm inside,
> > which makes that VM L3 (in addition to windows itself that runs as L3 in relation to hyper-v)
> >
> > https://i.imgur.com/sRYqtVV.png
> >
> > In summary this is the diff of fixes (just pasted to email, probably mangled):
> >
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 0a06e62010d8c..7293ba23b3cbc 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> > WARN_ON(svm->vmcb == svm->nested.vmcb02);
> >
> > svm->nested.vmcb02->control = svm->vmcb01->control;
> > +
> > + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
> > +
> > svm->vmcb = svm->nested.vmcb02;
> > svm->vmcb_pa = svm->nested.vmcb02_pa;
> > load_nested_vmcb_control(svm, &nested_vmcb->control);
> > @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> > if (svm->vmcb01->control.asid == 0)
> > svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> >
> > + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
> > svm->vmcb = svm->vmcb01;
> > svm->vmcb_pa = svm->nested.vmcb01_pa;
> >
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index b66239b26885d..ee9f87fe611f2 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
> > clr_cr_intercept(svm, INTERCEPT_CR3_READ);
> > clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
> > save->g_pat = svm->vcpu.arch.pat;
> > + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
> > save->cr3 = 0;
> > save->cr4 = 0;
> > }
> >
> >
> >
> > Best regards,
> > Maxim Levitsky
> >
> > > Paolo
> > >
> And another thing I spotted before I forget.
>
> If we set up a TLB flush in ctl.tlb_ctl of vmcb01 just prior to a nested vmentry,
> then this field will be copied to vmcb02, but on the next vmexit we clear it in the
> current vmcb (that is, vmcb02) and that change will not propagate to vmcb01.
>
> I am not sure if this is a theoretical issue or not. We should probably apply the same
> treatment to it as what Paolo suggested to do with the asid -
> set it just prior to vmentry if a TLB flush is needed, and clear it afterwards as we do.
And yet another thing to note is that we currently ignore L2's g_pat. However, it _seems_ that
we practically ignore L1's PAT as well in regard to the shadow NPT MMU. I am not 100% sure
about this. I'll dig into that area eventually.
Best regards,
Maxim Levitsky
>
> Best regards,
> Maxim Levitsky
On 10/8/20 6:54 AM, Maxim Levitsky wrote:
> On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:
>> On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:
>>> On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
>>>> On 08/10/20 00:14, Maxim Levitsky wrote:
>>>>>> + if (svm->vmcb01->control.asid == 0)
>>>>>> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>>>>> I think that the above should be done always. The asid field is currently host
>>>>> controlled only (that is L2 value is ignored, selective ASID tlb flush is not
>>>>> advertized to the guest and lnvlpga is emulated as invlpg).
>>>> Yes, in fact I suggested that ASID should be in svm->asid and moved to
>>>> svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
>>>> it in nested code.
>>> This makes lot of sense!
>>>> This should be a patch coming before this one.
>>>>
>>>>> 1. Something wrong with memory types - like guest is using UC memory for everything.
>>>>> I can't completely rule that out yet
>>>> You can print g_pat and see if it is all zeroes.
>>> I don't even need to print it. I know that it is never set anywhere, unless guest writes it,
>>> but now that I look at it, we set it to a default value and there is no code to set it to
>>> default value for vmcb02. This is it. now my fedora guest boots just fine!
>>>
>>> I played a lot with g_pat, and yet this didn't occur to me . I was that close :-(
>>> I knew that it has to be something with memory types, but it never occured to me
>>> that guest just doesn't write IA32_PAT and uses our value which we set in init_vmcb
>>>
>>>
>>>> In general I think it's better to be explicit with vmcb01 vs. vmcb02,
>>>> like Cathy did, but I can see it's a matter of personal preference to
>>>> some extent.
>>> I also think so in general, but in the code that is outside 'is_guest_mode'
>>> IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
>>> it, its not wrong to do so either.
>>>
>>>
>>> My windows hyper-v guest doesn't boot though and I know why.
>>>
>>> As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
>>> Now suppose a nested hypervisor wants to enter a nested guest.
>>>
>>> It will do vmload, which will load the extra state from the nested vmcb (vmcb12
>>> or as I woudl say the vmcb that nested hypervisor thinks that it is using),
>>> to the CPU. This can cause some vmexits I think, but this doesn't matter much.
>>>
>>> Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU registers,
>>> and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
>>> when we do vmload on vmcb02 prior to vmenter on it, which loads stale state from it.
>>> The same issue happens the other way around on nested vmexit.
>>>
>>> I fixed this by doing nested_svm_vmloadsave, but that should be probably be
>>> optimized with dirty bits. Now though I guess the goal it to make
>>> it work first.
>>>
>>> With this fixed HyperV boots fine, and even passes the 'works' test of booting
>>> the windows 10 with hyperv enabled nested itself and starting the vm inside,
>>> which makes that VM L3 (in addition to windows itself that runs as L3 in relation to hyper-v)
>>>
>>> https://i.imgur.com/sRYqtVV.png
>>>
>>> In summary this is the diff of fixes (just pasted to email, probably mangled):
>>>
>>>
>>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>>> index 0a06e62010d8c..7293ba23b3cbc 100644
>>> --- a/arch/x86/kvm/svm/nested.c
>>> +++ b/arch/x86/kvm/svm/nested.c
>>> @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
>>> WARN_ON(svm->vmcb == svm->nested.vmcb02);
>>>
>>> svm->nested.vmcb02->control = svm->vmcb01->control;
>>> +
>>> + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
>>> +
>>> svm->vmcb = svm->nested.vmcb02;
>>> svm->vmcb_pa = svm->nested.vmcb02_pa;
>>> load_nested_vmcb_control(svm, &nested_vmcb->control);
>>> @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>>> if (svm->vmcb01->control.asid == 0)
>>> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>>>
>>> + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
>>> svm->vmcb = svm->vmcb01;
>>> svm->vmcb_pa = svm->nested.vmcb01_pa;
>>>
>>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>>> index b66239b26885d..ee9f87fe611f2 100644
>>> --- a/arch/x86/kvm/svm/svm.c
>>> +++ b/arch/x86/kvm/svm/svm.c
>>> @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
>>> clr_cr_intercept(svm, INTERCEPT_CR3_READ);
>>> clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
>>> save->g_pat = svm->vcpu.arch.pat;
>>> + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
I had noticed that g_pat was not set in vmcb02 and set it to vmcb01's
value, which was not helpful, but I did not try the current vcpu value.
I am getting a #UD which I suspected had something to do with cr3, but
I'll know more after I add your suggestions.
emu-system-x86-1647 [033] .... 3167.589402: kvm_nested_vmexit_inject:
reason: UD excp ext_inf1: 0x0000000000000000 ext_inf2:
0x0000000000000000 ext_int: 0x00000000 ext_int_err: 0x00000000
>>> save->cr3 = 0;
>>> save->cr4 = 0;
>>> }
>>>
>>>
>>>
>>> Best regards,
>>> Maxim Levitsky
>>>
>>>> Paolo
>>>>
>> And another thing I spotted before I forget.
>>
>> If we setup a tlb flush in ctl.tlb_ctl of vmcb01, just prior to nested vmentry
>> then this field will be copied to vmcb02 but on next vmexit we clear it in current
>> (that is vmcb02) and that change will not propogate to vmcb01.
ctl.tlb_ctl depends on the value of save.cr4, which was not being
set in vmcb02.
>>
>> I am not sure if this is a theorerical issue or not. We probably should apply the same treatment to
>> it as what Paulo suggested to do with asid -
>> set it just prior to vmentry if tlb flush is needed, and clear it afterwards as we do.
> And yet another thing to note is that we curently ignore L2's g_pat. However it _seems_ that we practically
> ignore L1 PAT as well in regard to shadowing NPT mmu. I am not 100% sure about this.
> I'll dig that area eventually.
>
> Best regards,
> Maxim Levitsky
>
>> Best regards,
>> Maxim Levitsky
>
Thanks,
Cathy
On Thu, 2020-10-08 at 08:46 -0400, Cathy Avery wrote:
> On 10/8/20 6:54 AM, Maxim Levitsky wrote:
> > On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:
> > > On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:
> > > > On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
> > > > > On 08/10/20 00:14, Maxim Levitsky wrote:
> > > > > > > + if (svm->vmcb01->control.asid == 0)
> > > > > > > + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> > > > > > I think that the above should be done always. The asid field is currently host
> > > > > > controlled only (that is L2 value is ignored, selective ASID tlb flush is not
> > > > > > advertized to the guest and lnvlpga is emulated as invlpg).
> > > > > Yes, in fact I suggested that ASID should be in svm->asid and moved to
> > > > > svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
> > > > > it in nested code.
> > > > This makes lot of sense!
> > > > > This should be a patch coming before this one.
> > > > >
> > > > > > 1. Something wrong with memory types - like guest is using UC memory for everything.
> > > > > > I can't completely rule that out yet
> > > > > You can print g_pat and see if it is all zeroes.
> > > > I don't even need to print it. I know that it is never set anywhere, unless guest writes it,
> > > > but now that I look at it, we set it to a default value and there is no code to set it to
> > > > default value for vmcb02. This is it. now my fedora guest boots just fine!
> > > >
> > > > I played a lot with g_pat, and yet this didn't occur to me . I was that close :-(
> > > > I knew that it has to be something with memory types, but it never occured to me
> > > > that guest just doesn't write IA32_PAT and uses our value which we set in init_vmcb
> > > >
> > > >
> > > > > In general I think it's better to be explicit with vmcb01 vs. vmcb02,
> > > > > like Cathy did, but I can see it's a matter of personal preference to
> > > > > some extent.
> > > > I also think so in general, but in the code that is outside 'is_guest_mode'
> > > > IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
> > > > it, its not wrong to do so either.
> > > >
> > > >
> > > > My windows hyper-v guest doesn't boot though and I know why.
> > > >
> > > > As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
> > > > Now suppose a nested hypervisor wants to enter a nested guest.
> > > >
> > > > It will do vmload, which will load the extra state from the nested vmcb (vmcb12
> > > > or as I woudl say the vmcb that nested hypervisor thinks that it is using),
> > > > to the CPU. This can cause some vmexits I think, but this doesn't matter much.
> > > >
> > > > Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU registers,
> > > > and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
> > > > when we do vmload on vmcb02 prior to vmenter on it, which loads stale state from it.
> > > > The same issue happens the other way around on nested vmexit.
> > > >
> > > > I fixed this by doing nested_svm_vmloadsave, but that should be probably be
> > > > optimized with dirty bits. Now though I guess the goal it to make
> > > > it work first.
> > > >
> > > > With this fixed HyperV boots fine, and even passes the 'works' test of booting
> > > > the windows 10 with hyperv enabled nested itself and starting the vm inside,
> > > > which makes that VM L3 (in addition to windows itself that runs as L3 in relation to hyper-v)
> > > >
> > > > https://i.imgur.com/sRYqtVV.png
> > > >
> > > > In summary this is the diff of fixes (just pasted to email, probably mangled):
> > > >
> > > >
> > > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > > index 0a06e62010d8c..7293ba23b3cbc 100644
> > > > --- a/arch/x86/kvm/svm/nested.c
> > > > +++ b/arch/x86/kvm/svm/nested.c
> > > > @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> > > > WARN_ON(svm->vmcb == svm->nested.vmcb02);
> > > >
> > > > svm->nested.vmcb02->control = svm->vmcb01->control;
> > > > +
> > > > + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
> > > > +
> > > > svm->vmcb = svm->nested.vmcb02;
> > > > svm->vmcb_pa = svm->nested.vmcb02_pa;
> > > > load_nested_vmcb_control(svm, &nested_vmcb->control);
> > > > @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> > > > if (svm->vmcb01->control.asid == 0)
> > > > svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> > > >
> > > > + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
> > > > svm->vmcb = svm->vmcb01;
> > > > svm->vmcb_pa = svm->nested.vmcb01_pa;
> > > >
> > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > > index b66239b26885d..ee9f87fe611f2 100644
> > > > --- a/arch/x86/kvm/svm/svm.c
> > > > +++ b/arch/x86/kvm/svm/svm.c
> > > > @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
> > > > clr_cr_intercept(svm, INTERCEPT_CR3_READ);
> > > > clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
> > > > save->g_pat = svm->vcpu.arch.pat;
> > > > + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
>
> I had noticed that g_pat was not set in vmcb02 and set it to vmcb01's
> value which was not helpful but I did not try the current vcpu value.
>
> I am getting a #UD which I suspected had something to do with cr3 but
> I'll know more after I add your suggestions.
>
> emu-system-x86-1647 [033] .... 3167.589402: kvm_nested_vmexit_inject:
> reason: UD excp ext_inf1: 0x0000000000000000 ext_inf2:
> 0x0000000000000000 ext_int: 0x00000000 ext_int_err: 0x00000000
>
>
> > > > save->cr3 = 0;
> > > > save->cr4 = 0;
> > > > }
> > > >
> > > >
> > > >
> > > > Best regards,
> > > > Maxim Levitsky
> > > >
> > > > > Paolo
> > > > >
> > > And another thing I spotted before I forget.
> > >
> > > If we setup a tlb flush in ctl.tlb_ctl of vmcb01, just prior to nested vmentry
> > > then this field will be copied to vmcb02 but on next vmexit we clear it in current
> > > (that is vmcb02) and that change will not propogate to vmcb01.
> ctl.tlb_ctl is dependent on the value of save.cr4 which was not being
> set in vmcb02.
Not sure I understand. Could you explain?
The vmcb02.save.cr4 is set in nested_prepare_vmcb_save, which does
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4),
which eventually does 'to_svm(vcpu)->vmcb->save.cr4 = cr4;'
And at this point vmcb points to vmcb02.
Besides SEV, it seems like ctl.tlb_ctl is set by svm_flush_tlb,
which is indeed called from svm_set_cr4, but it is also exposed
via the .tlb_flush* callbacks from the KVM core.
What I meant is that we can create vcpu->arch.needs_tlb_flush, set it in svm_flush_tlb instead of tlb_ctl,
and then have svm_vcpu_run set tlb_ctl in the current vmcb's control to TLB_CONTROL_FLUSH_ASID when
vcpu->arch.needs_tlb_flush is set.
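A sketch of that (needs_tlb_flush is an assumed new flag; untested):

	/* Record the request instead of touching tlb_ctl directly... */
	static void svm_flush_tlb(struct kvm_vcpu *vcpu)
	{
		vcpu->arch.needs_tlb_flush = true;
	}

	/* ...and resolve it in svm_vcpu_run(), just before VMRUN, so the
	 * flush lands in whichever vmcb (01 or 02) is current. */
	svm->vmcb->control.tlb_ctl = vcpu->arch.needs_tlb_flush ?
				     TLB_CONTROL_FLUSH_ASID : TLB_CONTROL_DO_NOTHING;
	vcpu->arch.needs_tlb_flush = false;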
Best regards,
Maxim Levitsky
> > > I am not sure if this is a theorerical issue or not. We probably should apply the same treatment to
> > > it as what Paulo suggested to do with asid -
> > > set it just prior to vmentry if tlb flush is needed, and clear it afterwards as we do.
> > And yet another thing to note is that we curently ignore L2's g_pat. However it _seems_ that we practically
> > ignore L1 PAT as well in regard to shadowing NPT mmu. I am not 100% sure about this.
> > I'll dig that area eventually.
> >
> > Best regards,
> > Maxim Levitsky
> >
> > > Best regards,
> > > Maxim Levitsky
> Thanks,
>
> Cathy
>
On 10/8/20 9:11 AM, Maxim Levitsky wrote:
> On Thu, 2020-10-08 at 08:46 -0400, Cathy Avery wrote:
>> On 10/8/20 6:54 AM, Maxim Levitsky wrote:
>>> On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:
>>>> On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:
>>>>> On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
>>>>>> On 08/10/20 00:14, Maxim Levitsky wrote:
>>>>>>>> + if (svm->vmcb01->control.asid == 0)
>>>>>>>> + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>>>>>>> I think that the above should be done always. The asid field is currently host
>>>>>>> controlled only (that is L2 value is ignored, selective ASID tlb flush is not
>>>>>>> advertized to the guest and lnvlpga is emulated as invlpg).
>>>>>> Yes, in fact I suggested that ASID should be in svm->asid and moved to
>>>>>> svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
>>>>>> it in nested code.
>>>>> This makes lot of sense!
>>>>>> This should be a patch coming before this one.
>>>>>>
>>>>>>> 1. Something wrong with memory types - like guest is using UC memory for everything.
>>>>>>> I can't completely rule that out yet
>>>>>> You can print g_pat and see if it is all zeroes.
>>>>> I don't even need to print it. I know that it is never set anywhere, unless guest writes it,
>>>>> but now that I look at it, we set it to a default value and there is no code to set it to
>>>>> default value for vmcb02. This is it. now my fedora guest boots just fine!
>>>>>
>>>>> I played a lot with g_pat, and yet this didn't occur to me . I was that close :-(
>>>>> I knew that it has to be something with memory types, but it never occured to me
>>>>> that guest just doesn't write IA32_PAT and uses our value which we set in init_vmcb
>>>>>
>>>>>
>>>>>> In general I think it's better to be explicit with vmcb01 vs. vmcb02,
>>>>>> like Cathy did, but I can see it's a matter of personal preference to
>>>>>> some extent.
>>>>> I also think so in general, but in the code that is outside 'is_guest_mode'
>>>>> IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
>>>>> it, its not wrong to do so either.
>>>>>
>>>>>
>>>>> My windows hyper-v guest doesn't boot though and I know why.
>>>>>
>>>>> As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
>>>>> Now suppose a nested hypervisor wants to enter a nested guest.
>>>>>
>>>>> It will do vmload, which will load the extra state from the nested vmcb (vmcb12
>>>>> or as I woudl say the vmcb that nested hypervisor thinks that it is using),
>>>>> to the CPU. This can cause some vmexits I think, but this doesn't matter much.
>>>>>
>>>>> Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU registers,
>>>>> and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
>>>>> when we do vmload on vmcb02 prior to vmenter on it, which loads stale state from it.
>>>>> The same issue happens the other way around on nested vmexit.
>>>>>
>>>>> I fixed this by doing nested_svm_vmloadsave, but that should be probably be
>>>>> optimized with dirty bits. Now though I guess the goal it to make
>>>>> it work first.
>>>>>
>>>>> With this fixed HyperV boots fine, and even passes the 'works' test of booting
>>>>> the windows 10 with hyperv enabled nested itself and starting the vm inside,
>>>>> which makes that VM L3 (in addition to windows itself that runs as L3 in relation to hyper-v)
>>>>>
>>>>> https://i.imgur.com/sRYqtVV.png
>>>>>
>>>>> In summary this is the diff of fixes (just pasted to email, probably mangled):
>>>>>
>>>>>
>>>>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>>>>> index 0a06e62010d8c..7293ba23b3cbc 100644
>>>>> --- a/arch/x86/kvm/svm/nested.c
>>>>> +++ b/arch/x86/kvm/svm/nested.c
>>>>> @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
>>>>> WARN_ON(svm->vmcb == svm->nested.vmcb02);
>>>>>
>>>>> svm->nested.vmcb02->control = svm->vmcb01->control;
>>>>> +
>>>>> + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
>>>>> +
>>>>> svm->vmcb = svm->nested.vmcb02;
>>>>> svm->vmcb_pa = svm->nested.vmcb02_pa;
>>>>> load_nested_vmcb_control(svm, &nested_vmcb->control);
>>>>> @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>>>>> if (svm->vmcb01->control.asid == 0)
>>>>> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>>>>>
>>>>> + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
>>>>> svm->vmcb = svm->vmcb01;
>>>>> svm->vmcb_pa = svm->nested.vmcb01_pa;
>>>>>
>>>>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>>>>> index b66239b26885d..ee9f87fe611f2 100644
>>>>> --- a/arch/x86/kvm/svm/svm.c
>>>>> +++ b/arch/x86/kvm/svm/svm.c
>>>>> @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
>>>>> clr_cr_intercept(svm, INTERCEPT_CR3_READ);
>>>>> clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
>>>>> save->g_pat = svm->vcpu.arch.pat;
>>>>> + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
>> I had noticed that g_pat was not set in vmcb02 and set it to vmcb01's
>> value which was not helpful but I did not try the current vcpu value.
>>
>> I am getting a #UD which I suspected had something to do with cr3 but
>> I'll know more after I add your suggestions.
>>
>> emu-system-x86-1647 [033] .... 3167.589402: kvm_nested_vmexit_inject:
>> reason: UD excp ext_inf1: 0x0000000000000000 ext_inf2:
>> 0x0000000000000000 ext_int: 0x00000000 ext_int_err: 0x00000000
>>
>>
>>>>> save->cr3 = 0;
>>>>> save->cr4 = 0;
>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> Best regards,
>>>>> Maxim Levitsky
>>>>>
>>>>>> Paolo
>>>>>>
>>>> And another thing I spotted before I forget.
>>>>
>>>> If we setup a tlb flush in ctl.tlb_ctl of vmcb01, just prior to nested vmentry
>>>> then this field will be copied to vmcb02 but on next vmexit we clear it in current
>>>> (that is vmcb02) and that change will not propogate to vmcb01.
>> ctl.tlb_ctl is dependent on the value of save.cr4 which was not being
>> set in vmcb02.
> Not sure I understand. Could you explain?
>
> The vmcb02.save.cr4 is set in nested_prepare_vmcb_save, which does
> svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4),
> which eventually does 'to_svm(vcpu)->vmcb->save.cr4 = cr4;'
> And in this point vmcb points to vmcb02.
Yes, it points to vmcb02, but:
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;   <---

This vmcb02 field was never initialized and will always be zero, whereas
previously it would have contained L1's save.cr4; as a result, tlb_ctl gets set to
something other than TLB_CONTROL_DO_NOTHING. I saw this in my traces.
>
> Besides SEV, it seems like ctl.tlb_ctl is set by svm_flush_tlb,
> which is indeed called from svm_set_cr4, but it is also exposed
> but .tlb_flush* callbacks from KVM core.
>
> What I meant is that we can create vcpu->arch->needs_tlb_flush, set it in svm_flush_tlb instead of tlb_ctl,
> and then svm_vcpu_run can set tlb_ctl in the current ctl to that TLB_CONTROL_FLUSH_ASID when
> vcpu->arch->needs_tlb_flush is set.
>
> Best regards,
> Maxim Levitsky
>
>
>>>> I am not sure if this is a theorerical issue or not. We probably should apply the same treatment to
>>>> it as what Paulo suggested to do with asid -
>>>> set it just prior to vmentry if tlb flush is needed, and clear it afterwards as we do.
>>> And yet another thing to note is that we curently ignore L2's g_pat. However it _seems_ that we practically
>>> ignore L1 PAT as well in regard to shadowing NPT mmu. I am not 100% sure about this.
>>> I'll dig that area eventually.
>>>
>>> Best regards,
>>> Maxim Levitsky
>>>
>>>> Best regards,
>>>> Maxim Levitsky
>> Thanks,
>>
>> Cathy
>>
>
On Thu, 2020-10-08 at 09:52 -0400, Cathy Avery wrote:
> On 10/8/20 9:11 AM, Maxim Levitsky wrote:
> > On Thu, 2020-10-08 at 08:46 -0400, Cathy Avery wrote:
> > > On 10/8/20 6:54 AM, Maxim Levitsky wrote:
> > > > On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:
> > > > > On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:
> > > > > > On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
> > > > > > > On 08/10/20 00:14, Maxim Levitsky wrote:
> > > > > > > > > + if (svm->vmcb01->control.asid == 0)
> > > > > > > > > + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> > > > > > > > I think that the above should be done always. The asid field is currently host
> > > > > > > > controlled only (that is L2 value is ignored, selective ASID tlb flush is not
> > > > > > > > advertized to the guest and lnvlpga is emulated as invlpg).
> > > > > > > Yes, in fact I suggested that ASID should be in svm->asid and moved to
> > > > > > > svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
> > > > > > > it in nested code.
> > > > > > This makes lot of sense!
> > > > > > > This should be a patch coming before this one.
> > > > > > >
> > > > > > > > 1. Something wrong with memory types - like guest is using UC memory for everything.
> > > > > > > > I can't completely rule that out yet
> > > > > > > You can print g_pat and see if it is all zeroes.
> > > > > > I don't even need to print it. I know that it is never set anywhere, unless guest writes it,
> > > > > > but now that I look at it, we set it to a default value and there is no code to set it to
> > > > > > default value for vmcb02. This is it. now my fedora guest boots just fine!
> > > > > >
> > > > > > I played a lot with g_pat, and yet this didn't occur to me . I was that close :-(
> > > > > > I knew that it has to be something with memory types, but it never occured to me
> > > > > > that guest just doesn't write IA32_PAT and uses our value which we set in init_vmcb
> > > > > >
> > > > > >
> > > > > > > In general I think it's better to be explicit with vmcb01 vs. vmcb02,
> > > > > > > like Cathy did, but I can see it's a matter of personal preference to
> > > > > > > some extent.
> > > > > > I also think so in general, but in the code that is outside 'is_guest_mode'
> > > > > > IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
> > > > > > it, its not wrong to do so either.
> > > > > >
> > > > > >
> > > > > > My windows hyper-v guest doesn't boot though and I know why.
> > > > > >
> > > > > > As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
> > > > > > Now suppose a nested hypervisor wants to enter a nested guest.
> > > > > >
> > > > > > It will do vmload, which will load the extra state from the nested vmcb (vmcb12
> > > > > > or as I woudl say the vmcb that nested hypervisor thinks that it is using),
> > > > > > to the CPU. This can cause some vmexits I think, but this doesn't matter much.
> > > > > >
> > > > > > Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU registers,
> > > > > > and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
> > > > > > when we do vmload on vmcb02 prior to vmenter on it, which loads stale state from it.
> > > > > > The same issue happens the other way around on nested vmexit.
> > > > > >
> > > > > > I fixed this by doing nested_svm_vmloadsave, but that should be probably be
> > > > > > optimized with dirty bits. Now though I guess the goal it to make
> > > > > > it work first.
> > > > > >
> > > > > > With this fixed HyperV boots fine, and even passes the 'works' test of booting
> > > > > > the windows 10 with hyperv enabled nested itself and starting the vm inside,
> > > > > > which makes that VM L3 (in addition to windows itself that runs as L3 in relation to hyper-v)
> > > > > >
> > > > > > https://i.imgur.com/sRYqtVV.png
> > > > > >
> > > > > > In summary this is the diff of fixes (just pasted to email, probably mangled):
> > > > > >
> > > > > >
> > > > > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > > > > index 0a06e62010d8c..7293ba23b3cbc 100644
> > > > > > --- a/arch/x86/kvm/svm/nested.c
> > > > > > +++ b/arch/x86/kvm/svm/nested.c
> > > > > > @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> > > > > > WARN_ON(svm->vmcb == svm->nested.vmcb02);
> > > > > >
> > > > > > svm->nested.vmcb02->control = svm->vmcb01->control;
> > > > > > +
> > > > > > + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
> > > > > > +
> > > > > > svm->vmcb = svm->nested.vmcb02;
> > > > > > svm->vmcb_pa = svm->nested.vmcb02_pa;
> > > > > > load_nested_vmcb_control(svm, &nested_vmcb->control);
> > > > > > @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> > > > > > if (svm->vmcb01->control.asid == 0)
> > > > > > svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> > > > > >
> > > > > > + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
> > > > > > svm->vmcb = svm->vmcb01;
> > > > > > svm->vmcb_pa = svm->nested.vmcb01_pa;
> > > > > >
> > > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > > > > index b66239b26885d..ee9f87fe611f2 100644
> > > > > > --- a/arch/x86/kvm/svm/svm.c
> > > > > > +++ b/arch/x86/kvm/svm/svm.c
> > > > > > @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
> > > > > > clr_cr_intercept(svm, INTERCEPT_CR3_READ);
> > > > > > clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
> > > > > > save->g_pat = svm->vcpu.arch.pat;
> > > > > > + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
> > > I had noticed that g_pat was not set in vmcb02 and set it to vmcb01's
> > > value which was not helpful but I did not try the current vcpu value.
> > >
> > > I am getting a #UD which I suspected had something to do with cr3 but
> > > I'll know more after I add your suggestions.
> > >
> > > emu-system-x86-1647 [033] .... 3167.589402: kvm_nested_vmexit_inject:
> > > reason: UD excp ext_inf1: 0x0000000000000000 ext_inf2:
> > > 0x0000000000000000 ext_int: 0x00000000 ext_int_err: 0x00000000
> > >
> > >
> > > > > > save->cr3 = 0;
> > > > > > save->cr4 = 0;
> > > > > > }
> > > > > >
> > > > > >
> > > > > >
> > > > > > Best regards,
> > > > > > Maxim Levitsky
> > > > > >
> > > > > > > Paolo
> > > > > > >
> > > > > And another thing I spotted before I forget.
> > > > >
> > > > > If we setup a tlb flush in ctl.tlb_ctl of vmcb01, just prior to nested vmentry
> > > > > then this field will be copied to vmcb02 but on next vmexit we clear it in current
> > > > > (that is vmcb02) and that change will not propogate to vmcb01.
> > > ctl.tlb_ctl is dependent on the value of save.cr4 which was not being
> > > set in vmcb02.
> > Not sure I understand. Could you explain?
> >
> > The vmcb02.save.cr4 is set in nested_prepare_vmcb_save, which does
> > svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4),
> > which eventually does 'to_svm(vcpu)->vmcb->save.cr4 = cr4;'
> > And in this point vmcb points to vmcb02.
>
> Yes it points to vmcb02 but
>
> int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> {
>         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
>         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; <---
>
> This vmcb02 save.cr4 was never initialized and will always be zero, where
> previously it would have contained L1's save.cr4, and so tlb_ctl gets set to
> something other than TLB_CONTROL_DO_NOTHING. I saw this in my traces.
Now I understand, this is another bug, thanks!
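(For reference, if I read svm_set_cr4 right, the way a stale old_cr4 turns
into tlb_ctl is the PGE check a few lines further down:

        /* a change of the global-page-enable bit forces a TLB flush */
        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
                svm_flush_tlb(vcpu);

so a zero old_cr4 makes the XOR see a PGE change whenever L2's cr4 has PGE
set, and svm_flush_tlb then sets ctl.tlb_ctl.)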
I guess a simple fix, something like below, could work, but I don't know for
sure. I need to understand a bit better the relation between the state saved
in vcpu->arch and in the vmcb.
        /* Seed vmcb02's save.cr4 with L1's value so that svm_set_cr4()
         * sees the correct old_cr4 before it writes L2's value.
         */
        svm->vmcb->save.cr4 = svm->vmcb01->save.cr4;
        svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
Best regards,
Maxim Levitsky
>
> > Besides SEV, it seems like ctl.tlb_ctl is set by svm_flush_tlb,
> > which is indeed called from svm_set_cr4, but it is also exposed via
> > the .tlb_flush* callbacks from the KVM core.
> >
> > What I meant is that we can create vcpu->arch.needs_tlb_flush, set it in svm_flush_tlb instead of tlb_ctl,
> > and then svm_vcpu_run can set tlb_ctl in the current ctl to TLB_CONTROL_FLUSH_ASID when
> > vcpu->arch.needs_tlb_flush is set.
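> >
> > A minimal untested sketch of what I mean (the needs_tlb_flush flag is
> > hypothetical, and I'm ignoring the !FLUSHBYASID fallback for brevity):
> >
> > static void svm_flush_tlb(struct kvm_vcpu *vcpu)
> > {
> >         /* Only record that a flush is needed; touch no vmcb here. */
> >         vcpu->arch.needs_tlb_flush = true;
> > }
> >
> > and then in svm_vcpu_run, just before entering the guest:
> >
> >         /* Apply any pending flush to whichever vmcb is current. */
> >         if (svm->vcpu.arch.needs_tlb_flush) {
> >                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
> >                 svm->vcpu.arch.needs_tlb_flush = false;
> >         } else {
> >                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
> >         }
> >
> > That way a flush request can never be stranded in the wrong vmcb.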
> >
> > Best regards,
> > Maxim Levitsky
> >
> >
> > > > > I am not sure if this is a theoretical issue or not. We should probably apply the same treatment to
> > > > > it as what Paolo suggested to do with the asid -
> > > > > set it just prior to vmentry if a tlb flush is needed, and clear it afterwards as we do.
> > > > And yet another thing to note is that we currently ignore L2's g_pat. However it _seems_ that we practically
> > > > ignore L1's PAT as well in regard to the shadow NPT mmu. I am not 100% sure about this.
> > > > I'll dig into that area eventually.
> > > >
> > > > Best regards,
> > > > Maxim Levitsky
> > > >
> > > > > Best regards,
> > > > > Maxim Levitsky
> > > Thanks,
> > >
> > > Cathy
> > >
On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:
> On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:
> > On 08/10/20 00:14, Maxim Levitsky wrote:
> > > > + if (svm->vmcb01->control.asid == 0)
> > > > + svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
> > >
> > > I think that the above should be done always. The asid field is currently host
> > > controlled only (that is, the L2 value is ignored, selective ASID tlb flush is not
> > > advertised to the guest, and invlpga is emulated as invlpg).
> >
> > Yes, in fact I suggested that ASID should be in svm->asid and moved to
> > svm->vmcb->asid in svm_vcpu_run. Then there's no need to special case
> > it in nested code.
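> >
> > Something like this minimal sketch (untested; svm->asid as a field of
> > struct vcpu_svm is the new bit):
> >
> >         /* pre_svm_run/new_asid: only track the assigned ASID per vcpu */
> >         svm->asid = sd->next_asid++;
> >
> >         /* svm_vcpu_run: whichever vmcb is current picks it up */
> >         svm->vmcb->control.asid = svm->asid;
> >
> > Then neither vmcb01 nor vmcb02 ever has to inherit the other's asid.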
> This makes a lot of sense!
> > This should be a patch coming before this one.
> >
> > > 1. Something wrong with memory types - like the guest is using UC memory for everything.
> > > I can't completely rule that out yet.
> >
> > You can print g_pat and see if it is all zeroes.
> I don't even need to print it. I know that it is never set anywhere unless the guest writes it,
> but now that I look at it, we set it to a default value in init_vmcb and there is no code to set
> the default value for vmcb02. This is it. Now my fedora guest boots just fine!
>
> I played a lot with g_pat, and yet this didn't occur to me. I was that close :-(
> I knew that it had to be something with memory types, but it never occurred to me
> that the guest just doesn't write IA32_PAT and uses our value, which we set in init_vmcb.
>
>
> > In general I think it's better to be explicit with vmcb01 vs. vmcb02,
> > like Cathy did, but I can see it's a matter of personal preference to
> > some extent.
> I also think so in general, but in code that is outside 'is_guest_mode'
> IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
> it, it's not wrong to be explicit there either.
>
>
> My windows hyper-v guest doesn't boot though and I know why.
>
> As we know, the vmcb save area has extra state which vmrun/vmexit don't touch.
> Now suppose a nested hypervisor wants to enter a nested guest.
>
> It will do vmload, which will load the extra state from the nested vmcb (vmcb12,
> or as I would say, the vmcb that the nested hypervisor thinks it is using)
> into the CPU. This can cause some vmexits I think, but this doesn't matter much.
>
> Now the nested hypervisor does vmrun. The extra state of the L2 guest is in CPU registers,
> and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
> we do vmload on vmcb02 prior to vmentering it, which loads stale state from vmcb02.
> The same issue happens the other way around on nested vmexit.
>
> I fixed this by doing nested_svm_vmloadsave, but that should probably be
> optimized with dirty bits. For now though I guess the goal is to make
> it work first.
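>
> (nested_svm_vmloadsave is the helper the vmload/vmsave intercepts already
> use; from memory it just copies the vmload/vmsave-managed fields between
> two vmcbs, roughly:
>
> void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
> {
>         /* segment state that vmrun/vmexit don't switch */
>         to_vmcb->save.fs = from_vmcb->save.fs;
>         to_vmcb->save.gs = from_vmcb->save.gs;
>         to_vmcb->save.tr = from_vmcb->save.tr;
>         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
>         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
>
>         /* syscall/sysenter MSR state */
>         to_vmcb->save.star = from_vmcb->save.star;
>         to_vmcb->save.lstar = from_vmcb->save.lstar;
>         to_vmcb->save.cstar = from_vmcb->save.cstar;
>         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
>         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
>         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
>         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
> }
>
> so calling it at nested vmentry/vmexit moves exactly that extra state.)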
>
> With this fixed HyperV boots fine, and it even passes the 'works' test: booting
> windows 10 with hyperv enabled nested inside itself and starting a vm inside that,
> which makes that VM L3 (in addition to windows itself, which runs as L3 in relation to hyper-v).
>
> https://i.imgur.com/sRYqtVV.png
>
> In summary this is the diff of fixes (just pasted to email, probably mangled):
>
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 0a06e62010d8c..7293ba23b3cbc 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> WARN_ON(svm->vmcb == svm->nested.vmcb02);
>
> svm->nested.vmcb02->control = svm->vmcb01->control;
> +
> + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
> +
> svm->vmcb = svm->nested.vmcb02;
> svm->vmcb_pa = svm->nested.vmcb02_pa;
> load_nested_vmcb_control(svm, &nested_vmcb->control);
> @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> if (svm->vmcb01->control.asid == 0)
> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
> svm->vmcb = svm->vmcb01;
> svm->vmcb_pa = svm->nested.vmcb01_pa;
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b66239b26885d..ee9f87fe611f2 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
> clr_cr_intercept(svm, INTERCEPT_CR3_READ);
> clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
> save->g_pat = svm->vcpu.arch.pat;
> + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
> save->cr3 = 0;
> save->cr4 = 0;
> }
>
>
>
> Best regards,
> Maxim Levitsky
>
> > Paolo
> >
>
>
And another thing I spotted before I forget.
If we set up a tlb flush in ctl.tlb_ctl of vmcb01 just prior to nested vmentry,
then this field will be copied to vmcb02, but on the next vmexit we clear it in the current
vmcb (that is, vmcb02) and that change will not propagate to vmcb01.
I am not sure if this is a theoretical issue or not. We should probably apply the same treatment to
it as what Paolo suggested to do with the asid -
set it just prior to vmentry if a tlb flush is needed, and clear it afterwards as we do.
Best regards,
Maxim Levitsky
On 10/8/20 6:23 AM, Maxim Levitsky wrote:
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 0a06e62010d8c..7293ba23b3cbc 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
> WARN_ON(svm->vmcb == svm->nested.vmcb02);
>
> svm->nested.vmcb02->control = svm->vmcb01->control;
> +
> + nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
> +
> svm->vmcb = svm->nested.vmcb02;
> svm->vmcb_pa = svm->nested.vmcb02_pa;
> load_nested_vmcb_control(svm, &nested_vmcb->control);
> @@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> if (svm->vmcb01->control.asid == 0)
> svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
>
> + nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);
> svm->vmcb = svm->vmcb01;
> svm->vmcb_pa = svm->nested.vmcb01_pa;
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b66239b26885d..ee9f87fe611f2 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
> clr_cr_intercept(svm, INTERCEPT_CR3_READ);
> clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
> save->g_pat = svm->vcpu.arch.pat;
> + svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
> save->cr3 = 0;
> save->cr4 = 0;
> }
OK, this worked for me. Thanks!