2020-09-21 13:23:04

by Maxim Levitsky

[permalink] [raw]
Subject: [PATCH v5 0/4] KVM: nSVM: ondemand nested state allocation

This is yet another version of ondemand nested state allocation.



In this version I adopted the suggestion of Sean Christopherson

to make the EFER write return a negative error which then should

propagate to the userspace.



So I fixed the WRMSR code to actually obey this (#GP on positive

return value, exit to userspace when negative error value,

and success on 0 error value), and fixed one user (xen)

that returned negative error code on failures.



The XEN patch is only compile tested. The rest were tested

by always returning -ENOMEM from svm_allocate_nested.



Best regards,

Maxim Levitsky



Maxim Levitsky (4):

KVM: x86: xen_hvm_config cleanup return values

KVM: x86: report negative values from wrmsr to userspace

KVM: x86: allow kvm_x86_ops.set_efer to return a value

KVM: nSVM: implement ondemand allocation of the nested state



arch/x86/include/asm/kvm_host.h | 2 +-

arch/x86/kvm/emulate.c | 7 ++--

arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++

arch/x86/kvm/svm/svm.c | 58 +++++++++++++++++++--------------

arch/x86/kvm/svm/svm.h | 8 ++++-

arch/x86/kvm/vmx/vmx.c | 9 +++--

arch/x86/kvm/x86.c | 36 ++++++++++----------

7 files changed, 113 insertions(+), 49 deletions(-)



--

2.26.2





2020-09-21 13:23:09

by Maxim Levitsky

[permalink] [raw]
Subject: [PATCH v5 2/4] KVM: x86: report negative values from wrmsr to userspace

This will allow us to make some MSR writes fatal to the guest
(e.g when out of memory condition occurs)

Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/kvm/emulate.c | 7 +++++--
arch/x86/kvm/x86.c | 5 +++--
2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1d450d7710d63..d855304f5a509 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3702,13 +3702,16 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
{
u64 msr_data;
+ int ret;

msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
- if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+
+ ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
+ if (ret > 0)
return emulate_gp(ctxt, 0);

- return X86EMUL_CONTINUE;
+ return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}

static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 063d70e736f7f..b6c67ab7c4f34 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1612,15 +1612,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data = kvm_read_edx_eax(vcpu);
+ int ret = kvm_set_msr(vcpu, ecx, data);

- if (kvm_set_msr(vcpu, ecx, data)) {
+ if (ret > 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
}

trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(vcpu);
+ return ret < 0 ? ret : kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);

--
2.26.2

2020-09-21 13:23:47

by Maxim Levitsky

[permalink] [raw]
Subject: [PATCH v5 4/4] KVM: nSVM: implement ondemand allocation of the nested state

This way we don't waste memory on VMs which don't use
nested virtualization even if it is available to them.

If allocation of nested state fails (which should happen
only when the host is about to OOM anyway), fail the EFER write
with -ENOMEM so that the error is reported to userspace.

Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++
arch/x86/kvm/svm/svm.c | 55 ++++++++++++++++++++++-----------------
arch/x86/kvm/svm/svm.h | 6 +++++
3 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 09417f5197410..dd13856818a03 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -467,6 +467,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)

vmcb12 = map.hva;

+ if (WARN_ON(!svm->nested.initialized))
+ return 1;
+
if (!nested_vmcb_checks(svm, vmcb12)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
@@ -684,6 +687,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
return 0;
}

+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+ struct page *hsave_page;
+
+ if (svm->nested.initialized)
+ return 0;
+
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!hsave_page)
+ return -ENOMEM;
+
+ svm->nested.hsave = page_address(hsave_page);
+
+ svm->nested.msrpm = svm_vcpu_init_msrpm();
+ if (!svm->nested.msrpm)
+ goto err_free_hsave;
+
+ svm->nested.initialized = true;
+ return 0;
+
+err_free_hsave:
+ __free_page(hsave_page);
+ return -ENOMEM;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+ if (!svm->nested.initialized)
+ return;
+
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
+ __free_page(virt_to_page(svm->nested.hsave));
+ svm->nested.hsave = NULL;
+
+ svm->nested.initialized = false;
+}
+
/*
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
*/
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 18f8af55e970a..a77a95bff7d0a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -266,6 +266,7 @@ static int get_max_npt_level(void)
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u64 old_efer = vcpu->arch.efer;
vcpu->arch.efer = efer;

if (!npt_enabled) {
@@ -276,9 +277,27 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
}

- if (!(efer & EFER_SVME)) {
- svm_leave_nested(svm);
- svm_set_gif(svm, true);
+ if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+ if (!(efer & EFER_SVME)) {
+ svm_leave_nested(svm);
+ svm_set_gif(svm, true);
+
+ /*
+ * Free the nested state unless we are in SMM, in which
+ * case the exit from SVM mode is only for duration of the SMI
+ * handler
+ */
+ if (!is_smm(&svm->vcpu))
+ svm_free_nested(svm);
+
+ } else {
+ int ret = svm_allocate_nested(svm);
+
+ if (ret) {
+ vcpu->arch.efer = old_efer;
+ return ret;
+ }
+ }
}

svm->vmcb->save.efer = efer | EFER_SVME;
@@ -610,7 +629,7 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
msrpm[offset] = tmp;
}

-static u32 *svm_vcpu_init_msrpm(void)
+u32 *svm_vcpu_init_msrpm(void)
{
int i;
u32 *msrpm;
@@ -630,7 +649,7 @@ static u32 *svm_vcpu_init_msrpm(void)
return msrpm;
}

-static void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(u32 *msrpm)
{
__free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
}
@@ -1204,7 +1223,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb_page;
- struct page *hsave_page;
int err;

BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
@@ -1215,13 +1233,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (!vmcb_page)
goto out;

- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!hsave_page)
- goto error_free_vmcb_page;
-
err = avic_init_vcpu(svm);
if (err)
- goto error_free_hsave_page;
+ goto out;

/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -1229,15 +1243,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;

- svm->nested.hsave = page_address(hsave_page);
-
svm->msrpm = svm_vcpu_init_msrpm();
if (!svm->msrpm)
- goto error_free_hsave_page;
-
- svm->nested.msrpm = svm_vcpu_init_msrpm();
- if (!svm->nested.msrpm)
- goto error_free_msrpm;
+ goto error_free_vmcb_page;

svm->vmcb = page_address(vmcb_page);
svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
@@ -1249,10 +1257,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)

return 0;

-error_free_msrpm:
- svm_vcpu_free_msrpm(svm->msrpm);
-error_free_hsave_page:
- __free_page(hsave_page);
error_free_vmcb_page:
__free_page(vmcb_page);
out:
@@ -1278,10 +1282,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
*/
svm_clear_current_vmcb(svm->vmcb);

+ svm_free_nested(svm);
+
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->nested.hsave));
- __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
}

static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -3964,6 +3968,9 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
return 1;

+ if (svm_allocate_nested(svm))
+ return 1;
+
ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 1e1842de0efe7..10453abc5bed3 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -96,6 +96,8 @@ struct svm_nested_state {

/* cache for control fields of the guest */
struct vmcb_control_area ctl;
+
+ bool initialized;
};

struct vcpu_svm {
@@ -339,6 +341,8 @@ static inline bool gif_set(struct vcpu_svm *svm)

u32 svm_msrpm_offset(u32 msr);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+u32 *svm_vcpu_init_msrpm(void);
+void svm_vcpu_free_msrpm(u32 *msrpm);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -379,6 +383,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
struct vmcb *nested_vmcb);
void svm_leave_nested(struct vcpu_svm *svm);
+void svm_free_nested(struct vcpu_svm *svm);
+int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct vcpu_svm *svm);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
--
2.26.2

2020-09-21 16:09:42

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v5 2/4] KVM: x86: report negative values from wrmsr to userspace

On Mon, Sep 21, 2020 at 04:19:21PM +0300, Maxim Levitsky wrote:
> This will allow us to make some MSR writes fatal to the guest
> (e.g when out of memory condition occurs)
>
> Signed-off-by: Maxim Levitsky <[email protected]>
> ---
> arch/x86/kvm/emulate.c | 7 +++++--
> arch/x86/kvm/x86.c | 5 +++--
> 2 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 1d450d7710d63..d855304f5a509 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3702,13 +3702,16 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
> static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
> {
> u64 msr_data;
> + int ret;
>
> msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
> | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
> - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
> +
> + ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
> + if (ret > 0)
> return emulate_gp(ctxt, 0);
>
> - return X86EMUL_CONTINUE;
> + return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
> }
>
> static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 063d70e736f7f..b6c67ab7c4f34 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1612,15 +1612,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
> {
> u32 ecx = kvm_rcx_read(vcpu);
> u64 data = kvm_read_edx_eax(vcpu);
> + int ret = kvm_set_msr(vcpu, ecx, data);
>
> - if (kvm_set_msr(vcpu, ecx, data)) {
> + if (ret > 0) {
> trace_kvm_msr_write_ex(ecx, data);
> kvm_inject_gp(vcpu, 0);
> return 1;
> }
>
> trace_kvm_msr_write(ecx, data);

Tracing the access as non-faulting feels wrong. The WRMSR has not completed,
e.g. if userspace cleanly handles -ENOMEM and restarts the guest, KVM would
trace the WRMSR twice.

What about:

int ret = kvm_set_msr(vcpu, ecx, data);

if (ret < 0)
return ret;

if (ret) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
}

trace_kvm_msr_write(ecx, data);
return kvm_skip_emulated_instruction(vcpu);

> - return kvm_skip_emulated_instruction(vcpu);
> + return ret < 0 ? ret : kvm_skip_emulated_instruction(vcpu);
> }
> EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
>
> --
> 2.26.2
>

2020-09-22 16:16:42

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v5 2/4] KVM: x86: report negative values from wrmsr to userspace

On Mon, 2020-09-21 at 09:08 -0700, Sean Christopherson wrote:
> On Mon, Sep 21, 2020 at 04:19:21PM +0300, Maxim Levitsky wrote:
> > This will allow us to make some MSR writes fatal to the guest
> > (e.g when out of memory condition occurs)
> >
> > Signed-off-by: Maxim Levitsky <[email protected]>
> > ---
> > arch/x86/kvm/emulate.c | 7 +++++--
> > arch/x86/kvm/x86.c | 5 +++--
> > 2 files changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> > index 1d450d7710d63..d855304f5a509 100644
> > --- a/arch/x86/kvm/emulate.c
> > +++ b/arch/x86/kvm/emulate.c
> > @@ -3702,13 +3702,16 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
> > static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
> > {
> > u64 msr_data;
> > + int ret;
> >
> > msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
> > | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
> > - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
> > +
> > + ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
> > + if (ret > 0)
> > return emulate_gp(ctxt, 0);
> >
> > - return X86EMUL_CONTINUE;
> > + return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
> > }
> >
> > static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 063d70e736f7f..b6c67ab7c4f34 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1612,15 +1612,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
> > {
> > u32 ecx = kvm_rcx_read(vcpu);
> > u64 data = kvm_read_edx_eax(vcpu);
> > + int ret = kvm_set_msr(vcpu, ecx, data);
> >
> > - if (kvm_set_msr(vcpu, ecx, data)) {
> > + if (ret > 0) {
> > trace_kvm_msr_write_ex(ecx, data);
> > kvm_inject_gp(vcpu, 0);
> > return 1;
> > }
> >
> > trace_kvm_msr_write(ecx, data);
>
> Tracing the access as non-faulting feels wrong. The WRMSR has not completed,
> e.g. if userspace cleanly handles -ENOMEM and restarts the guest, KVM would
> trace the WRMSR twice.

I guess you are right. In this case we didn't actually execute the
instruction (an exception can also be thought of as an execution of an
instruction, since it leads to the exception handler); here we just fail
and let userspace do something, so that we can restart from the same point again.

So I'll go with your suggestion.

Thanks for the review,
Best regards,
Maxim Levitsky

>
> What about:
>
> int ret = kvm_set_msr(vcpu, ecx, data);
>
> if (ret < 0)
> return ret;
>
> if (ret) {
> trace_kvm_msr_write_ex(ecx, data);
> kvm_inject_gp(vcpu, 0);
> return 1;
> }
>
> trace_kvm_msr_write(ecx, data);
> return kvm_skip_emulated_instruction(vcpu);
>
> > - return kvm_skip_emulated_instruction(vcpu);
> > + return ret < 0 ? ret : kvm_skip_emulated_instruction(vcpu);
> > }
> > EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
> >
> > --
> > 2.26.2
> >