This is the next version of this patch series.
In V5 I adopted Sean Christopherson's suggestion to make .set_efer return
a negative error (-ENOMEM in this case), which in most cases in KVM
propagates to userspace.
I noticed though that the wrmsr emulation code doesn't do this; instead
it injects a #GP into the guest on _any_ error.
So I fixed the wrmsr code to behave in a similar way to the rest of
the KVM code: inject #GP only on a positive error value, and forward
a negative error to userspace.
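In other words, after this series the wrmsr paths follow roughly this
pattern (an illustrative sketch of the convention, condensed from the
patches below):

        ret = kvm_set_msr(vcpu, ecx, data);
        if (ret < 0)
                /* internal error (e.g. -ENOMEM): forward to userspace */
                return ret;
        if (ret > 0) {
                /* the guest did something wrong: inject #GP */
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
        /* success: continue */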
I had to adjust one wrmsr handler (xen_hvm_config) to stop it from returning
negative values, so that the new WRMSR emulation behavior doesn't break it.
That patch was only compile tested.
The memory allocation failure was tested by always returning -ENOMEM
from svm_allocate_nested.
The nested allocation itself was tested by countless attempts to run
nested guests and do nested migration on both my AMD and Intel machines.
I wasn't able to break it.
Changes from V5: addressed Sean Christopherson's review feedback.
Best regards,
Maxim Levitsky
Maxim Levitsky (4):
KVM: x86: xen_hvm_config: cleanup return values
KVM: x86: report negative values from wrmsr emulation to userspace
KVM: x86: allow kvm_x86_ops.set_efer to return an error value
KVM: nSVM: implement on demand allocation of the nested state
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/emulate.c | 7 ++--
arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++
arch/x86/kvm/svm/svm.c | 58 +++++++++++++++++++--------------
arch/x86/kvm/svm/svm.h | 8 ++++-
arch/x86/kvm/vmx/vmx.c | 6 ++--
arch/x86/kvm/x86.c | 37 ++++++++++++---------
7 files changed, 113 insertions(+), 47 deletions(-)
--
2.26.2
This will allow KVM to report such errors (e.g. -ENOMEM)
to userspace.
Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/kvm/emulate.c | 7 +++++--
arch/x86/kvm/x86.c | 6 +++++-
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1d450d7710d63..d855304f5a509 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3702,13 +3702,16 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
{
u64 msr_data;
+ int ret;
msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
- if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+
+ ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
+ if (ret > 0)
return emulate_gp(ctxt, 0);
- return X86EMUL_CONTINUE;
+ return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}
static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 063d70e736f7f..e4b07be450d4e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1612,8 +1612,12 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data = kvm_read_edx_eax(vcpu);
+ int ret = kvm_set_msr(vcpu, ecx, data);
- if (kvm_set_msr(vcpu, ecx, data)) {
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
--
2.26.2
This will be used to signal an error to userspace, in case
the vendor code fails during handling of this MSR (e.g. -ENOMEM).
Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/svm/svm.c | 3 ++-
arch/x86/kvm/svm/svm.h | 2 +-
arch/x86/kvm/vmx/vmx.c | 6 ++++--
arch/x86/kvm/x86.c | 8 +++++++-
5 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5303dbc5c9bce..b273c199b9a55 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1069,7 +1069,7 @@ struct kvm_x86_ops {
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
- void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3da5b2f1b4a19..18f8af55e970a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -263,7 +263,7 @@ static int get_max_npt_level(void)
#endif
}
-void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_svm *svm = to_svm(vcpu);
vcpu->arch.efer = efer;
@@ -283,6 +283,7 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
svm->vmcb->save.efer = efer | EFER_SVME;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ return 0;
}
static int is_external_interrupt(u32 info)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 45496775f0db2..1e1842de0efe7 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -338,7 +338,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
#define MSR_INVALID 0xffffffffU
u32 svm_msrpm_offset(u32 msr);
-void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6f9a0c6d5dc59..7f7451ff80ffc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2835,13 +2835,14 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ /* Nothing to do if hardware doesn't support EFER. */
if (!msr)
- return;
+ return 0;
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
@@ -2853,6 +2854,7 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
msr->data = efer & ~EFER_LME;
}
setup_msrs(vmx);
+ return 0;
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e4b07be450d4e..df53baa0059fe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1456,6 +1456,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 old_efer = vcpu->arch.efer;
u64 efer = msr_info->data;
+ int r;
if (efer & efer_reserved_bits)
return 1;
@@ -1472,7 +1473,12 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- kvm_x86_ops.set_efer(vcpu, efer);
+ r = kvm_x86_ops.set_efer(vcpu, efer);
+
+ if (r) {
+ WARN_ON(r > 0);
+ return r;
+ }
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
--
2.26.2
Return 1 on errors that are caused by wrong guest behavior
(which will inject a #GP into the guest),
and return a negative error value on issues that are
the kernel's fault (e.g. -ENOMEM).
Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/kvm/x86.c | 23 +++++++++--------------
1 file changed, 9 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 17f4995e80a7e..063d70e736f7f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2694,24 +2694,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
u8 *page;
- int r;
- r = -E2BIG;
if (page_num >= blob_size)
- goto out;
- r = -ENOMEM;
+ return 1;
+
page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page)) {
- r = PTR_ERR(page);
- goto out;
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+ kfree(page);
+ return 1;
}
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
- goto out_free;
- r = 0;
-out_free:
- kfree(page);
-out:
- return r;
+ return 0;
}
static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
--
2.26.2
This way we don't waste memory on VMs which don't use nested
virtualization, even when the host enabled it for them.
Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
arch/x86/kvm/svm/svm.c | 55 ++++++++++++++++++++++-----------------
arch/x86/kvm/svm/svm.h | 6 +++++
3 files changed, 79 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 09417f5197410..dd13856818a03 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -467,6 +467,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
vmcb12 = map.hva;
+ if (WARN_ON(!svm->nested.initialized))
+ return 1;
+
if (!nested_vmcb_checks(svm, vmcb12)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
@@ -684,6 +687,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
return 0;
}
+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+ struct page *hsave_page;
+
+ if (svm->nested.initialized)
+ return 0;
+
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!hsave_page)
+ return -ENOMEM;
+
+ svm->nested.hsave = page_address(hsave_page);
+
+ svm->nested.msrpm = svm_vcpu_init_msrpm();
+ if (!svm->nested.msrpm)
+ goto err_free_hsave;
+
+ svm->nested.initialized = true;
+ return 0;
+
+err_free_hsave:
+ __free_page(hsave_page);
+ return -ENOMEM;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+ if (!svm->nested.initialized)
+ return;
+
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
+ __free_page(virt_to_page(svm->nested.hsave));
+ svm->nested.hsave = NULL;
+
+ svm->nested.initialized = false;
+}
+
/*
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
*/
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 18f8af55e970a..d1265c95e2b0b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -266,6 +266,7 @@ static int get_max_npt_level(void)
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u64 old_efer = vcpu->arch.efer;
vcpu->arch.efer = efer;
if (!npt_enabled) {
@@ -276,9 +277,27 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
}
- if (!(efer & EFER_SVME)) {
- svm_leave_nested(svm);
- svm_set_gif(svm, true);
+ if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+ if (!(efer & EFER_SVME)) {
+ svm_leave_nested(svm);
+ svm_set_gif(svm, true);
+
+ /*
+ * Free the nested guest state, unless we are in SMM.
+ * In this case we will return to the nested guest
+ * as soon as we leave SMM.
+ */
+ if (!is_smm(&svm->vcpu))
+ svm_free_nested(svm);
+
+ } else {
+ int ret = svm_allocate_nested(svm);
+
+ if (ret) {
+ vcpu->arch.efer = old_efer;
+ return ret;
+ }
+ }
}
svm->vmcb->save.efer = efer | EFER_SVME;
@@ -610,7 +629,7 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
msrpm[offset] = tmp;
}
-static u32 *svm_vcpu_init_msrpm(void)
+u32 *svm_vcpu_init_msrpm(void)
{
int i;
u32 *msrpm;
@@ -630,7 +649,7 @@ static u32 *svm_vcpu_init_msrpm(void)
return msrpm;
}
-static void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(u32 *msrpm)
{
__free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
}
@@ -1204,7 +1223,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb_page;
- struct page *hsave_page;
int err;
BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
@@ -1215,13 +1233,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (!vmcb_page)
goto out;
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!hsave_page)
- goto error_free_vmcb_page;
-
err = avic_init_vcpu(svm);
if (err)
- goto error_free_hsave_page;
+ goto out;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -1229,15 +1243,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;
- svm->nested.hsave = page_address(hsave_page);
-
svm->msrpm = svm_vcpu_init_msrpm();
if (!svm->msrpm)
- goto error_free_hsave_page;
-
- svm->nested.msrpm = svm_vcpu_init_msrpm();
- if (!svm->nested.msrpm)
- goto error_free_msrpm;
+ goto error_free_vmcb_page;
svm->vmcb = page_address(vmcb_page);
svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
@@ -1249,10 +1257,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
return 0;
-error_free_msrpm:
- svm_vcpu_free_msrpm(svm->msrpm);
-error_free_hsave_page:
- __free_page(hsave_page);
error_free_vmcb_page:
__free_page(vmcb_page);
out:
@@ -1278,10 +1282,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
*/
svm_clear_current_vmcb(svm->vmcb);
+ svm_free_nested(svm);
+
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->nested.hsave));
- __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -3964,6 +3968,9 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
return 1;
+ if (svm_allocate_nested(svm))
+ return 1;
+
ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 1e1842de0efe7..10453abc5bed3 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -96,6 +96,8 @@ struct svm_nested_state {
/* cache for control fields of the guest */
struct vmcb_control_area ctl;
+
+ bool initialized;
};
struct vcpu_svm {
@@ -339,6 +341,8 @@ static inline bool gif_set(struct vcpu_svm *svm)
u32 svm_msrpm_offset(u32 msr);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+u32 *svm_vcpu_init_msrpm(void);
+void svm_vcpu_free_msrpm(u32 *msrpm);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -379,6 +383,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
struct vmcb *nested_vmcb);
void svm_leave_nested(struct vcpu_svm *svm);
+void svm_free_nested(struct vcpu_svm *svm);
+int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct vcpu_svm *svm);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
--
2.26.2
On Wed, Sep 23, 2020 at 12:10:24AM +0300, Maxim Levitsky wrote:
> This will be used to signal an error to userspace, in case
> the vendor code fails during handling of this MSR (e.g. -ENOMEM).
>
> Signed-off-by: Maxim Levitsky <[email protected]>
> ---
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e4b07be450d4e..df53baa0059fe 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1456,6 +1456,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> {
> u64 old_efer = vcpu->arch.efer;
> u64 efer = msr_info->data;
> + int r;
>
> if (efer & efer_reserved_bits)
> return 1;
> @@ -1472,7 +1473,12 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> efer &= ~EFER_LMA;
> efer |= vcpu->arch.efer & EFER_LMA;
>
> - kvm_x86_ops.set_efer(vcpu, efer);
> + r = kvm_x86_ops.set_efer(vcpu, efer);
> +
Nit: IMO, omitting the newline would help the reader make a direct connection
between setting 'r' and checking 'r'.
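I.e. (just the whitespace change):

        r = kvm_x86_ops.set_efer(vcpu, efer);
        if (r) {
                WARN_ON(r > 0);
                return r;
        }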
> + if (r) {
> + WARN_ON(r > 0);
> + return r;
> + }
>
> /* Update reserved bits */
> if ((efer ^ old_efer) & EFER_NX)
> --
> 2.26.2
>
On Wed, Sep 23, 2020 at 12:10:25AM +0300, Maxim Levitsky wrote:
> This way we don't waste memory on VMs which don't use nested
> virtualization, even when the host enabled it for them.
>
> Signed-off-by: Maxim Levitsky <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
> arch/x86/kvm/svm/svm.c | 55 ++++++++++++++++++++++-----------------
> arch/x86/kvm/svm/svm.h | 6 +++++
> 3 files changed, 79 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 09417f5197410..dd13856818a03 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -467,6 +467,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
>
> vmcb12 = map.hva;
>
> + if (WARN_ON(!svm->nested.initialized))
Probably should use WARN_ON_ONCE; if this is somehow hit, userspace could
easily spam the kernel.
Side topic, do we actually need 'initialized'? Wouldn't checking for a
valid nested.msrpm or nested.hsave suffice?
> + return 1;
> +
> if (!nested_vmcb_checks(svm, vmcb12)) {
> vmcb12->control.exit_code = SVM_EXIT_ERR;
> vmcb12->control.exit_code_hi = 0;
> @@ -684,6 +687,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> return 0;
> }
On Mon, 2020-09-28 at 22:15 -0700, Sean Christopherson wrote:
> On Wed, Sep 23, 2020 at 12:10:25AM +0300, Maxim Levitsky wrote:
> > This way we don't waste memory on VMs which don't use nested
> > virtualization, even when the host enabled it for them.
> >
> > Signed-off-by: Maxim Levitsky <[email protected]>
> > ---
> > arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
> > arch/x86/kvm/svm/svm.c | 55 ++++++++++++++++++++++-----------------
> > arch/x86/kvm/svm/svm.h | 6 +++++
> > 3 files changed, 79 insertions(+), 24 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 09417f5197410..dd13856818a03 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -467,6 +467,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
> >
> > vmcb12 = map.hva;
> >
> > + if (WARN_ON(!svm->nested.initialized))
>
> Probably should use WARN_ON_ONCE; if this is somehow hit, userspace could
> easily spam the kernel.
Makes sense.
>
> Side topic, do we actually need 'initialized'? Wouldn't checking for a
> valid nested.msrpm or nested.hsave suffice?
It's a matter of taste - I prefer to have a single variable controlling this,
rather than two.
A WARN_ON(svm->nested.initialized && (!svm->nested.msrpm || !svm->nested.hsave))
would probably be nice to have. IMHO I'd rather leave this as it is, if you
don't object.
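Something like this perhaps (an untested sketch, also picking up your
WARN_ON_ONCE suggestion):

        if (WARN_ON_ONCE(!svm->nested.initialized ||
                         !svm->nested.msrpm || !svm->nested.hsave))
                return 1;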
>
> > + return 1;
> > +
> > if (!nested_vmcb_checks(svm, vmcb12)) {
> > vmcb12->control.exit_code = SVM_EXIT_ERR;
> > vmcb12->control.exit_code_hi = 0;
> > @@ -684,6 +687,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> > return 0;
> > }
Best regards,
Maxim Levitsky
On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> This is the next version of this patch series.
>
> In V5 I adopted Sean Christopherson's suggestion to make .set_efer return
> a negative error (-ENOMEM in this case), which in most cases in KVM
> propagates to userspace.
>
> I noticed though that the wrmsr emulation code doesn't do this; instead
> it injects a #GP into the guest on _any_ error.
>
> So I fixed the wrmsr code to behave in a similar way to the rest of
> the KVM code: inject #GP only on a positive error value, and forward
> a negative error to userspace.
>
> I had to adjust one wrmsr handler (xen_hvm_config) to stop it from returning
> negative values, so that the new WRMSR emulation behavior doesn't break it.
> That patch was only compile tested.
>
> The memory allocation failure was tested by always returning -ENOMEM
> from svm_allocate_nested.
>
> The nested allocation itself was tested by countless attempts to run
> nested guests and do nested migration on both my AMD and Intel machines.
> I wasn't able to break it.
>
> Changes from V5: addressed Sean Christopherson's review feedback.
>
> Best regards,
> Maxim Levitsky
>
> Maxim Levitsky (4):
> KVM: x86: xen_hvm_config: cleanup return values
> KVM: x86: report negative values from wrmsr emulation to userspace
> KVM: x86: allow kvm_x86_ops.set_efer to return an error value
> KVM: nSVM: implement on demand allocation of the nested state
>
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/kvm/emulate.c | 7 ++--
> arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++
> arch/x86/kvm/svm/svm.c | 58 +++++++++++++++++++--------------
> arch/x86/kvm/svm/svm.h | 8 ++++-
> arch/x86/kvm/vmx/vmx.c | 6 ++--
> arch/x86/kvm/x86.c | 37 ++++++++++++---------
> 7 files changed, 113 insertions(+), 47 deletions(-)
>
> --
> 2.26.2
>
Very polite ping on this patch series.
Best regards,
Maxim Levitsky
On Wed, Sep 30, 2020 at 06:35:40PM +0300, Maxim Levitsky wrote:
> On Mon, 2020-09-28 at 22:15 -0700, Sean Christopherson wrote:
> > Side topic, do we actually need 'initialized'? Wouldn't checking for a
> > valid nested.msrpm or nested.hsave suffice?
>
> It's a matter of taste - I prefer to have a single variable controlling this,
> rather than two.
> A WARN_ON(svm->nested.initialized && (!svm->nested.msrpm || !svm->nested.hsave))
> would probably be nice to have. IMHO I'd rather leave this as it is, if you
> don't object.
I don't have a strong preference. I wouldn't bother with the second WARN_ON.
Unless you take action, e.g. bail early, a NULL pointer will likely provide a
stack trace soon enough :-).
On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> This will allow KVM to report such errors (e.g. -ENOMEM)
> to userspace.
>
> Signed-off-by: Maxim Levitsky <[email protected]>
Reverting this and its dependency:
72f211ecaa80 KVM: x86: allow kvm_x86_ops.set_efer to return an error value
on top of linux-next (they have unfortunately also been merged into mainline
at the same time) fixed an issue where a simple Intel KVM guest is unable to
boot, as shown below.
.config: http://people.redhat.com/qcai/x86.config
qemu-kvm-4.2.0-34.module+el8.3.0+7976+077be4ec.x86_64
# /usr/libexec/qemu-kvm -name ubuntu-20.04-server-cloudimg -cpu host -smp 2 -m 2g -hda ./ubuntu-20.04-server-cloudimg.qcow2 -cdrom ./ubuntu-20.04-server-cloudimg.iso -nic user,hostfwd=tcp::2222-:22 -nographic
[ 1.141022] evm: Initialising EVM extended attributes:
[ 1.143344] evm: security.selinux
[ 1.144968] evm: security.SMACK64
[ 1.146574] evm: security.SMACK64EXEC
[ 1.148305] evm: security.SMACK64TRANSMUTE
[ 1.150215] evm: security.SMACK64MMAP
[ 1.151960] evm: security.apparmor
[ 1.153755] evm: security.ima
[ 1.155454] evm: security.capability
[ 1.155456] evm: HMAC attrs: 0x1
[ 1.162331] ata1.00: ATA-7: QEMU HARDDISK, 2.5+, max UDMA/100
[ 1.162635] PM: Magic number: 8:937:635
[ 1.165607] ata1.00: 2147483648 sectors, multi 16: LBA48
[ 1.169799] scsi 0:0:0:0: Direct-Access ATA QEMU HARDDISK 2.5+ PQ: 0 ANSI: 5
[ 1.174196] rtc_cmos 00:00: setting system clock to 2020-10-26T13:38:53 UTC (1603719533)
[ 1.178237] sd 0:0:0:0: Attached scsi generic sg0 type 0
[ 1.178293] sd 0:0:0:0: [sda] 2147483648 512-byte logical blocks: (1.10 TB/1.00 TiB)
[ 1.180567] ata2.00: ATAPI: QEMU DVD-ROM, 2.5+, max UDMA/100
[ 1.183986] sd 0:0:0:0: [sda] Write Protect is off
[error: kvm run failed No such file or directory
RAX=0000000000000000 RBX=0000000000000000 RCX=0000000000000150 RDX=000000008000001c
RSI=0000000000000000 RDI=0000000000000150 RBP=ffffb67840083e40 RSP=ffffb67840083e00
R8 =ffff931dfda17608 R9 =0000000000000000 R10=ffff931dfda17848 R11=0000000000000000
R12=0000000000000000 R13=00000000000000b7 R14=ffff931dfd4013c0 R15=ffffffffaa8f48d0
RIP=ffffffffaa078894 RFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 0000000000000000 ffffffff 00c00000
CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA]
SS =0000 0000000000000000 ffffffff 00c00000
DS =0000 0000000000000000 ffffffff 00c00000
FS =0000 0000000000000000 ffffffff 00c00000
GS =0000 ffff931dfda00000 ffffffff 00c00000
LDT=0000 0000000000000000 ffffffff 00c00000
TR =0040 fffffe0000003000 0000206f 00008b00 DPL=0 TSS64-busy
GDT= fffffe0000001000 0000007f
IDT= fffffe0000000000 00000fff
CR0=80050033 CR2=0000000000000000 CR3=000000002960a001 CR4=00760ef0
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000
DR6=00000000fffe0ff0 DR7=0000000000000400
EFER=0000000000000d01
Code=dc 60 4e 00 4c 89 e0 41 5c 5d c3 0f 1f 44 00 00 89 f0 89 f9 <0f> 30 31 c0 0f 1f 44 00 00 c3 55 48 c1 e2 20 89 f6 48 09 d6 89 c2 48 89 e5 48 83 ec 08 89
> ---
> arch/x86/kvm/emulate.c | 7 +++++--
> arch/x86/kvm/x86.c | 6 +++++-
> 2 files changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 1d450d7710d63..d855304f5a509 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3702,13 +3702,16 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
> static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
> {
> u64 msr_data;
> + int ret;
>
> msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
> | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
> - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
> +
> + ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
> + if (ret > 0)
> return emulate_gp(ctxt, 0);
>
> - return X86EMUL_CONTINUE;
> + return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
> }
>
> static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 063d70e736f7f..e4b07be450d4e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1612,8 +1612,12 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
> {
> u32 ecx = kvm_rcx_read(vcpu);
> u64 data = kvm_read_edx_eax(vcpu);
> + int ret = kvm_set_msr(vcpu, ecx, data);
>
> - if (kvm_set_msr(vcpu, ecx, data)) {
> + if (ret < 0)
> + return ret;
> +
> + if (ret > 0) {
> trace_kvm_msr_write_ex(ecx, data);
> kvm_inject_gp(vcpu, 0);
> return 1;
On Mon, 2020-10-26 at 15:40 -0400, Qian Cai wrote:
> On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> > This will allow KVM to report such errors (e.g. -ENOMEM)
> > to userspace.
> >
> > Signed-off-by: Maxim Levitsky <[email protected]>
>
> Reverting this and its dependency:
>
> 72f211ecaa80 KVM: x86: allow kvm_x86_ops.set_efer to return an error value
>
> on top of linux-next (they have unfortunately also been merged into mainline
> at the same time) fixed an issue where a simple Intel KVM guest is unable to
> boot, as shown below.
So I debugged this a bit more. This also breaks nested virt (VMX). We have here:
[ 345.504403] kvm [1491]: vcpu0 unhandled rdmsr: 0x4e data 0x0
[ 345.758560] kvm [1491]: vcpu0 unhandled rdmsr: 0x1c9 data 0x0
[ 345.758594] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a6 data 0x0
[ 345.758619] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a7 data 0x0
[ 345.758644] kvm [1491]: vcpu0 unhandled rdmsr: 0x3f6 data 0x0
[ 345.951601] kvm [1493]: vcpu1 unhandled rdmsr: 0x4e data 0x0
[ 351.857036] kvm [1493]: vcpu1 unhandled wrmsr: 0xc90 data 0xfffff
After this commit, -ENOENT is returned up to vcpu_enter_guest(), which causes
the userspace to abort:
kvm_msr_ignored_check()
kvm_set_msr()
kvm_emulate_wrmsr()
vmx_handle_exit()
vcpu_enter_guest()
Something like below will unbreak the userspace, but does anyone have a better
idea?
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1748,7 +1748,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
return 0;
/* Signal all other negative errors to userspace */
- if (r < 0)
+ if (r < 0 && r != -ENOENT)
return r;
/* MSR write failed? Inject a #GP */
On Tue, 2020-10-27 at 16:31 -0400, Qian Cai wrote:
> On Mon, 2020-10-26 at 15:40 -0400, Qian Cai wrote:
> > On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> > > This will allow the KVM to report such errors (e.g -ENOMEM)
> > > to the userspace.
> > >
> > > Signed-off-by: Maxim Levitsky <[email protected]>
> >
> > Reverting this and its dependency:
> >
> > 72f211ecaa80 KVM: x86: allow kvm_x86_ops.set_efer to return an error value
> >
> > on top of linux-next (they have unfortunately also been merged into mainline
> > at the same time) fixed an issue where a simple Intel KVM guest is unable to
> > boot, as shown below.
>
> So I debugged this a bit more. This also breaks nested virt (VMX). We have here:
>
> [ 345.504403] kvm [1491]: vcpu0 unhandled rdmsr: 0x4e data 0x0
> [ 345.758560] kvm [1491]: vcpu0 unhandled rdmsr: 0x1c9 data 0x0
> [ 345.758594] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a6 data 0x0
> [ 345.758619] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a7 data 0x0
> [ 345.758644] kvm [1491]: vcpu0 unhandled rdmsr: 0x3f6 data 0x0
> [ 345.951601] kvm [1493]: vcpu1 unhandled rdmsr: 0x4e data 0x0
> [ 351.857036] kvm [1493]: vcpu1 unhandled wrmsr: 0xc90 data 0xfffff
>
> After this commit, -ENOENT is returned up to vcpu_enter_guest(), which causes
> the userspace to abort.
Thank you very much for debugging it!
Yesterday I tried pretty much everything to reproduce this on my Intel laptop,
but I wasn't able to.
I tried kvm/queue, then the latest mainline, then linux-next on my laptop,
and all of them worked and even booted nested guests.
On the qemu side, I even tried RHEL's qemu, the exact version you tested.
I got really unlucky here: it seems that none of my guests ever write
an unknown MSR.
Now with the information you provided, it is trivial to reproduce it even
on my AMD machine - all I need to do is write an unknown MSR, something
like 'wrmsr 0x1234 0x0' using the wrmsr tool.
As for the root cause of this, it is the fallout of a last minute rebase of my code
on top of the userspace MSR feature. I missed this -ENOENT logic, which clashes with mine.
>
> kvm_msr_ignored_check()
> kvm_set_msr()
> kvm_emulate_wrmsr()
> vmx_handle_exit()
> vcpu_enter_guest()
>
> Something like below will unbreak the userspace, but does anyone has a better
> idea?
Checking for -ENOENT might be the right solution, but I'll now check in depth
what other interactions are affected and whether this can be done closer to
the place where it happens.
Also, I am more inclined to add a new positive error code (similar to KVM_MSR_RET_INVALID)
to indicate an 'msr not found' error, since this error condition is arch defined.
My reasoning is that positive error values should be used for error conditions
that cause a #GP in the guest, while negative error values should only be used
for internal errors which are propagated to the qemu userspace and usually kill
the guest.
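To illustrate what I have in mind (a hypothetical sketch only - the name and
the exact plumbing are not final):

        /* positive and arch-defined, so that the caller injects #GP */
        #define KVM_MSR_RET_UNKNOWN     2

        int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
        {
                int r = __kvm_set_msr(vcpu, index, data, false);

                if (r == -ENOENT)
                        r = KVM_MSR_RET_UNKNOWN;
                return r;
        }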
I'll prepare a patch for this very soon.
Best regards,
Maxim Levitsky
>
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1748,7 +1748,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
> return 0;
>
> /* Signal all other negative errors to userspace */
> - if (r < 0)
> + if (r < 0 && r != -ENOENT)
> return r;
>
> /* MSR write failed? Inject a #GP */
>