2024-04-04 12:24:01

by Paolo Bonzini

Subject: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

Some VM types have characteristics in common; in fact, the only use
of VM types right now is kvm_arch_has_private_mem and it assumes that
_all_ nonzero VM types have private memory.

We will soon introduce a VM type for SEV and SEV-ES VMs, and at that
point we will have two special characteristics of confidential VMs
that depend on the VM type: not just if memory is private, but
also whether guest state is protected. For the latter we have
kvm->arch.guest_state_protected, which is only set on a fully initialized
VM.

For VM types with protected guest state, we can actually fix a problem in
the SEV-ES implementation, where ioctls to set registers do not cause an
error even if the VM has been initialized and the guest state encrypted.
Make sure that, when such VM types are used, these ioctls return an error.

Signed-off-by: Paolo Bonzini <[email protected]>
Message-Id: <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 7 ++-
arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++-------
2 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 04c430eb25cf..3d56b5bb10e9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1279,12 +1279,14 @@ enum kvm_apicv_inhibit {
};

struct kvm_arch {
- unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
+ u8 vm_type;
+ bool has_private_mem;
+ bool has_protected_state;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
@@ -2153,8 +2155,9 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);

+
#ifdef CONFIG_KVM_PRIVATE_MEM
-#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#else
#define kvm_arch_has_private_mem(kvm) false
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3934e7682734..d4a8d896798f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5555,11 +5555,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}

-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
{
unsigned int i;

+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
memset(dbgregs, 0, sizeof(*dbgregs));

BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@ -5568,6 +5572,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,

dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
+ return 0;
}

static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -5575,6 +5580,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
{
unsigned int i;

+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (dbgregs->flags)
return -EINVAL;

@@ -5595,8 +5604,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
}


-static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
- u8 *state, unsigned int size)
+static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
{
/*
* Only copy state for features that are enabled for the guest. The
@@ -5614,24 +5623,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
XFEATURE_MASK_FPSSE;

if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;

fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
supported_xcr0, vcpu->arch.pkru);
+ return 0;
}

-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
+static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
{
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}

static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;

return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
guest_xsave->region,
@@ -5639,18 +5649,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
&vcpu->arch.pkru);
}

-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
+static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
- return;
+ return 0;
}

guest_xcrs->nr_xcrs = 1;
guest_xcrs->flags = 0;
guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+ return 0;
}

static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@ -5658,6 +5673,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
{
int i, r = 0;

+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;

@@ -6040,7 +6059,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_DEBUGREGS: {
struct kvm_debugregs dbgregs;

- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ if (r < 0)
+ break;

r = -EFAULT;
if (copy_to_user(argp, &dbgregs,
@@ -6070,7 +6091,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xsave)
break;

- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ if (r < 0)
+ break;

r = -EFAULT;
if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@ -6099,7 +6122,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xsave)
break;

- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ if (r < 0)
+ break;

r = -EFAULT;
if (copy_to_user(argp, u.xsave, size))
@@ -6115,7 +6140,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xcrs)
break;

- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ if (r < 0)
+ break;

r = -EFAULT;
if (copy_to_user(argp, u.xcrs,
@@ -6259,6 +6286,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_GET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
r = -ENOMEM;
if (!u.sregs2)
@@ -6271,6 +6303,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
if (IS_ERR(u.sregs2)) {
r = PTR_ERR(u.sregs2);
@@ -11478,6 +11515,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)

int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11519,6 +11560,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)

int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__set_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11591,6 +11636,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11858,6 +11907,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int ret;

+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
ret = __set_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11975,7 +12028,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;

if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;

vcpu_load(vcpu);

@@ -11998,7 +12051,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;

if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;

vcpu_load(vcpu);

@@ -12524,6 +12577,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return -EINVAL;

kvm->arch.vm_type = type;
+ kvm->arch.has_private_mem =
+ (type == KVM_X86_SW_PROTECTED_VM);

ret = kvm_page_track_init(kvm);
if (ret)
--
2.43.0




2024-04-04 21:39:55

by Isaku Yamahata

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Thu, Apr 04, 2024 at 08:13:17AM -0400,
Paolo Bonzini <[email protected]> wrote:

> Some VM types have characteristics in common; in fact, the only use
> of VM types right now is kvm_arch_has_private_mem and it assumes that
> _all_ nonzero VM types have private memory.
>
> We will soon introduce a VM type for SEV and SEV-ES VMs, and at that
> point we will have two special characteristics of confidential VMs
> that depend on the VM type: not just if memory is private, but
> also whether guest state is protected. For the latter we have
> kvm->arch.guest_state_protected, which is only set on a fully initialized
> VM.
>
> For VM types with protected guest state, we can actually fix a problem in
> the SEV-ES implementation, where ioctls to set registers do not cause an
> error even if the VM has been initialized and the guest state encrypted.
> Make sure that, when such VM types are used, these ioctls return an error.
>
> Signed-off-by: Paolo Bonzini <[email protected]>
> Message-Id: <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 7 ++-
> arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++-------
> 2 files changed, 79 insertions(+), 21 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 04c430eb25cf..3d56b5bb10e9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1279,12 +1279,14 @@ enum kvm_apicv_inhibit {
> };
>
> struct kvm_arch {
> - unsigned long vm_type;
> unsigned long n_used_mmu_pages;
> unsigned long n_requested_mmu_pages;
> unsigned long n_max_mmu_pages;
> unsigned int indirect_shadow_pages;
> u8 mmu_valid_gen;
> + u8 vm_type;
> + bool has_private_mem;
> + bool has_protected_state;
> struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
> struct list_head active_mmu_pages;
> struct list_head zapped_obsolete_pages;
> @@ -2153,8 +2155,9 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
> void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> int tdp_max_root_level, int tdp_huge_page_level);
>
> +
> #ifdef CONFIG_KVM_PRIVATE_MEM
> -#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
> +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
> #else
> #define kvm_arch_has_private_mem(kvm) false
> #endif
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3934e7682734..d4a8d896798f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5555,11 +5555,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
> - struct kvm_debugregs *dbgregs)
> +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
> + struct kvm_debugregs *dbgregs)
> {
> unsigned int i;
>
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> memset(dbgregs, 0, sizeof(*dbgregs));
>
> BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
> @@ -5568,6 +5572,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
>
> dbgregs->dr6 = vcpu->arch.dr6;
> dbgregs->dr7 = vcpu->arch.dr7;
> + return 0;
> }
>
> static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
> @@ -5575,6 +5580,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
> {
> unsigned int i;
>
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> if (dbgregs->flags)
> return -EINVAL;
>
> @@ -5595,8 +5604,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
> }
>
>
> -static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
> - u8 *state, unsigned int size)
> +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
> + u8 *state, unsigned int size)
> {
> /*
> * Only copy state for features that are enabled for the guest. The
> @@ -5614,24 +5623,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
> XFEATURE_MASK_FPSSE;
>
> if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> - return;
> + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>
> fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
> supported_xcr0, vcpu->arch.pkru);
> + return 0;
> }
>
> -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
> - struct kvm_xsave *guest_xsave)
> +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
> + struct kvm_xsave *guest_xsave)
> {
> - kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
> - sizeof(guest_xsave->region));
> + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
> + sizeof(guest_xsave->region));
> }
>
> static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
> struct kvm_xsave *guest_xsave)
> {
> if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> - return 0;
> + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>
> return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
> guest_xsave->region,
> @@ -5639,18 +5649,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
> &vcpu->arch.pkru);
> }
>
> -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
> - struct kvm_xcrs *guest_xcrs)
> +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
> + struct kvm_xcrs *guest_xcrs)
> {
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
> guest_xcrs->nr_xcrs = 0;
> - return;
> + return 0;
> }
>
> guest_xcrs->nr_xcrs = 1;
> guest_xcrs->flags = 0;
> guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
> guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
> + return 0;
> }
>
> static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
> @@ -5658,6 +5673,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
> {
> int i, r = 0;
>
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> if (!boot_cpu_has(X86_FEATURE_XSAVE))
> return -EINVAL;
>
> @@ -6040,7 +6059,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> case KVM_GET_DEBUGREGS: {
> struct kvm_debugregs dbgregs;
>
> - kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
> + r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
> + if (r < 0)
> + break;
>
> r = -EFAULT;
> if (copy_to_user(argp, &dbgregs,
> @@ -6070,7 +6091,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> if (!u.xsave)
> break;
>
> - kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
> + r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
> + if (r < 0)
> + break;
>
> r = -EFAULT;
> if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
> @@ -6099,7 +6122,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> if (!u.xsave)
> break;
>
> - kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
> + r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
> + if (r < 0)
> + break;
>
> r = -EFAULT;
> if (copy_to_user(argp, u.xsave, size))
> @@ -6115,7 +6140,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> if (!u.xcrs)
> break;
>
> - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
> + r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
> + if (r < 0)
> + break;
>
> r = -EFAULT;
> if (copy_to_user(argp, u.xcrs,
> @@ -6259,6 +6286,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> }
> #endif
> case KVM_GET_SREGS2: {
> + r = -EINVAL;
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + goto out;
> +
> u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
> r = -ENOMEM;
> if (!u.sregs2)
> @@ -6271,6 +6303,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> break;
> }
> case KVM_SET_SREGS2: {
> + r = -EINVAL;
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + goto out;
> +
> u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
> if (IS_ERR(u.sregs2)) {
> r = PTR_ERR(u.sregs2);
> @@ -11478,6 +11515,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>
> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> {
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> vcpu_load(vcpu);
> __get_regs(vcpu, regs);
> vcpu_put(vcpu);
> @@ -11519,6 +11560,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>
> int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> {
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> vcpu_load(vcpu);
> __set_regs(vcpu, regs);
> vcpu_put(vcpu);
> @@ -11591,6 +11636,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
> int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> struct kvm_sregs *sregs)
> {
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> vcpu_load(vcpu);
> __get_sregs(vcpu, sregs);
> vcpu_put(vcpu);
> @@ -11858,6 +11907,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> {
> int ret;
>
> + if (vcpu->kvm->arch.has_protected_state &&
> + vcpu->arch.guest_state_protected)
> + return -EINVAL;
> +
> vcpu_load(vcpu);
> ret = __set_sregs(vcpu, sregs);
> vcpu_put(vcpu);
> @@ -11975,7 +12028,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> struct fxregs_state *fxsave;
>
> if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> - return 0;
> + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>
> vcpu_load(vcpu);
>
> @@ -11998,7 +12051,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> struct fxregs_state *fxsave;
>
> if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> - return 0;
> + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>
> vcpu_load(vcpu);
>
> @@ -12524,6 +12577,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> return -EINVAL;
>
> kvm->arch.vm_type = type;
> + kvm->arch.has_private_mem =
> + (type == KVM_X86_SW_PROTECTED_VM);
>
> ret = kvm_page_track_init(kvm);
> if (ret)
> --
> 2.43.0

This works well with TDX KVM patch series.

Reviewed-by: Isaku Yamahata <[email protected]>
--
Isaku Yamahata <[email protected]>

2024-04-05 23:01:48

by Edgecombe, Rick P

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Thu, 2024-04-04 at 08:13 -0400, Paolo Bonzini wrote:
>  
>  struct kvm_arch {
> -       unsigned long vm_type;
>         unsigned long n_used_mmu_pages;
>         unsigned long n_requested_mmu_pages;
>         unsigned long n_max_mmu_pages;
>         unsigned int indirect_shadow_pages;
>         u8 mmu_valid_gen;
> +       u8 vm_type;
> +       bool has_private_mem;
> +       bool has_protected_state;

I'm a little late to this conversation, so hopefully not just complicating things. But why not
deduce has_private_mem and has_protected_state from the vm_type during runtime? Like if
kvm.arch.vm_type was instead a bit mask with the bit position of the KVM_X86_*_VM set,
kvm_arch_has_private_mem() could bitwise-and with a compile time mask of vm_types that have private
memory. This also prevents it from ever transitioning through nonsensical states like vm_type ==
KVM_X86_TDX_VM, but !has_private_memory, so would be a little more robust.
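
Roughly what I'm picturing, as an untested sketch (the mask name below and the
TDX type are just illustrative, not something from this series):

    /* Hypothetical: store the VM type as a bit rather than an index. */
    kvm->arch.vm_type = BIT(type);

    /* Hypothetical compile-time mask of VM types that have private memory. */
    #define KVM_X86_PRIVATE_MEM_VM_TYPES	(BIT(KVM_X86_SW_PROTECTED_VM) | \
						 BIT(KVM_X86_TDX_VM))

    #define kvm_arch_has_private_mem(kvm) \
	(!!((kvm)->arch.vm_type & KVM_X86_PRIVATE_MEM_VM_TYPES))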

Partly why I ask is there is logic in the x86 MMU TDX changes that tries to be generic but still
needs special handling for it. The current solution is to look at kvm_gfn_shared_mask() as TDX is
the only vm type that sets it, but Isaku and I were discussing if we should check something else,
one that doesn't appear to tie together two unrelated concepts:
https://lore.kernel.org/kvm/[email protected]/

Since it's down the mail, the relevant snippet:
"
> > void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> > struct kvm_memory_slot *slot)
> > {
> > - kvm_mmu_zap_all_fast(kvm);
> > + if (kvm_gfn_shared_mask(kvm))
>
> There seems to be an attempt to abstract away the existence of Secure-
> EPT in mmu.c, that is not fully successful. In this case the code
> checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> in a way specific needed by S-EPT. It ends up being a little confusing
> because the actual check is about whether there is a shared bit. It
> only works because only S-EPT is the only thing that has a
> kvm_gfn_shared_mask().
>
> Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> but is more honest about what we are getting up to here. I'm not sure
> though, what do you think?

Right, I attempted and failed in zapping case. This is due to the restriction
that the Secure-EPT pages must be removed from the leaves. the VMX case (also
NPT, even SNP) heavily depends on zapping root entry as optimization.

I can think of
- add TDX check. Looks wrong
- Use kvm_gfn_shared_mask(kvm). confusing
- Give other name for this check like zap_from_leafs (or better name?)
The implementation is same to kvm_gfn_shared_mask() with comment.
- Or we can add a boolean variable to struct kvm
"

With this patch, it seems the convention would be to add and check a "zap_leafs_only" bool. But it
starts to become a lot of bools. If instead we added an arch_zap_leafs_only(struct kvm *kvm) that
checked whether the vm_type was KVM_X86_TDX_VM, it could make the calling code clearer. But then I
wonder why not do the same for has_private_mem and has_protected_state?
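
Something like this, as a completely untested sketch (helper name as above):

    /* Hypothetical helper; keeps the TDX special case out of the callers. */
    static inline bool arch_zap_leafs_only(struct kvm *kvm)
    {
	    return kvm->arch.vm_type == KVM_X86_TDX_VM;
    }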

Of course TDX can adjust for any format of the state. Just seems cleaner to me.

2024-04-09 01:23:51

by Sean Christopherson

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Fri, Apr 05, 2024, Rick P Edgecombe wrote:
> On Thu, 2024-04-04 at 08:13 -0400, Paolo Bonzini wrote:
> >  
> >  struct kvm_arch {
> > -       unsigned long vm_type;
> >         unsigned long n_used_mmu_pages;
> >         unsigned long n_requested_mmu_pages;
> >         unsigned long n_max_mmu_pages;
> >         unsigned int indirect_shadow_pages;
> >         u8 mmu_valid_gen;
> > +       u8 vm_type;
> > +       bool has_private_mem;
> > +       bool has_protected_state;
>
> I'm a little late to this conversation, so hopefully not just complicating
> things. But why not deduce has_private_mem and has_protected_state from the
> vm_type during runtime? Like if kvm.arch.vm_type was instead a bit mask with
> the bit position of the KVM_X86_*_VM set, kvm_arch_has_private_mem() could
> bitwise-and with a compile time mask of vm_types that have private memory.
> This also prevents it from ever transitioning through nonsensical states
> like vm_type == KVM_X86_TDX_VM, but !has_private_memory, so would be a little
> more robust.

LOL, time is a circle, or something like that. Paolo actually did this in v2[*],
and I objected, vociferously.

KVM advertises VM types to userspace via a 32-bit field, one bit per type. So
without more uAPI changes, the VM type needs to be <=31. KVM could embed the
"has private memory" information into the type, but then we cut down on the number
of possible VM types *and* bleed has_private_memory into KVM's ABI.
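
(For reference, a rough sketch of the existing uAPI flow, from memory so take
it with a grain of salt:

    /* KVM_CAP_VM_TYPES returns a bitmask with one bit per supported type. */
    int types = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);

    if (types & (1u << KVM_X86_SW_PROTECTED_VM))
	    vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_SW_PROTECTED_VM);

i.e. the bit position doubles as the type value passed to KVM_CREATE_VM.)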

While it's unlikely KVM will ever support TDX without has_private_memory, it's
entirely possible that KVM could add support for an existing VM "base" type that
doesn't currently support private memory. E.g. with some massaging, KVM could
support private memory for SEV and SEV-ES. And then we need to add an entirely
new VM type just so that KVM can let it use private memory.

Obviously KVM could shove in bits after the fact, e.g. store vm_type as a u64
instead of u32 (or u8 as in this patch), but then what's the point? Burning a
byte instead of a bit for per-VM flag is a complete non-issue, and booleans tend
to yield code that's easier to read and easier to maintain.

[*] https://lore.kernel.org/all/[email protected]

> Partly why I ask is there is logic in the x86 MMU TDX changes that tries to
> be generic but still needs special handling for it. The current solution is
> to look at kvm_gfn_shared_mask() as TDX is the only vm type that sets it, but
> Isaku and I were discussing if we should check something else, that didn't
> appear to be tying together to unrelated concepts:
> https://lore.kernel.org/kvm/[email protected]/
>
> Since it's down the mail, the relevant snippet:
> "
> > > void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> > > struct kvm_memory_slot *slot)
> > > {
> > > - kvm_mmu_zap_all_fast(kvm);
> > > + if (kvm_gfn_shared_mask(kvm))

Whatever you do that is TDX specific and an internal KVM thing is likely the wrong
thing :-)

The main reason KVM doesn't do a targeted zap on memslot removal is because of ABI
baggage that we _think_ is limited to interaction with VFIO. Since KVM doesn't
have any ABI for TDX *or* SNP, I want to at least entertain the option of doing
a target zap for SNP as well as TDX, even though it's only truly "necessary" for
TDX, in quotes because it's not strictly necessary, e.g. KVM could BLOCK the S-EPT
entries without fully removing the mappings.

Whether or not targeted zapping is optimal for SNP (or any VM type) is very much
TBD, and likely highly dependent on use case, but at the same time it would be
nice to not rule it out completely.

E.g. ChromeOS currently has a use case where they frequently delete and recreate
a 2GiB (give or take) memslot. For that use case, zapping _just_ that memslot is
likely far superior to blasting and rebuilding the entire VM. But if userspace
deletes a 1TiB memslot for some reason, e.g. for memory unplug, then the fast zap is
probably better, even though it requires rebuilding all SPTEs.

> > There seems to be an attempt to abstract away the existence of Secure-
> > EPT in mmu.c, that is not fully successful. In this case the code
> > checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> > in a way specific needed by S-EPT. It ends up being a little confusing
> > because the actual check is about whether there is a shared bit. It
> > only works because only S-EPT is the only thing that has a
> > kvm_gfn_shared_mask().
> >
> > Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> > but is more honest about what we are getting up to here. I'm not sure
> > though, what do you think?
>
> Right, I attempted and failed in zapping case. This is due to the restriction
> that the Secure-EPT pages must be removed from the leaves. the VMX case (also
> NPT, even SNP) heavily depends on zapping root entry as optimization.

As above, it's more nuanced than that. KVM has come to depend on the fast zap,
but it got that way *because* KVM has historically zapped everything, and userspace
has (unknowingly) relied on that behavior.

> I can think of
> - add TDX check. Looks wrong
> - Use kvm_gfn_shared_mask(kvm). confusing

Ya, even if we end up making it a hardcoded TDX thing, dress it up a bit. E.g.
even if KVM checks for a shared mask under the hood, add a helper to capture the
logic, e.g. kvm_zap_all_sptes_on_memslot_deletion(kvm).
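
Completely untested, but something like:

    /*
     * Sketch: give the policy a name, even if the implementation is just
     * the shared-bit check for now.
     */
    static inline bool kvm_zap_all_sptes_on_memslot_deletion(struct kvm *kvm)
    {
	    return !kvm_gfn_shared_mask(kvm);
    }

so the caller reads as "zap everything unless this VM can't", instead of a
bare kvm_gfn_shared_mask() check.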

> - Give other name for this check like zap_from_leafs (or better name?)
> The implementation is same to kvm_gfn_shared_mask() with comment.
> - Or we can add a boolean variable to struct kvm

If we _don't_ hardcode the behavior, a per-memslot flag or a per-VM capability
(and thus boolean) is likely the way to go. My off-the-cuff vote is probably for
a per-memslot flag.

2024-04-09 14:06:13

by Edgecombe, Rick P

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Mon, 2024-04-08 at 18:21 -0700, Sean Christopherson wrote:
>
> Whatever you do that is TDX specific and an internal KVM thing is likely
> the wrong thing :-)
>
> The main reason KVM doesn't do a targeted zap on memslot removal is
> because of ABI baggage that we _think_ is limited to interaction with
> VFIO.  Since KVM doesn't have any ABI for TDX *or* SNP, I want to at least
> entertain the option of doing a target zap for SNP as well as TDX, even
> though it's only truly "necessary" for TDX, in quotes because it's not
> strictly necessary, e.g. KVM could BLOCK the S-EPT entries without fully
> removing the mappings.
>
> Whether or not targeted zapping is optimal for SNP (or any VM type) is
> very much TBD, and likely highly dependent on use case, but at the same
> time it would be nice to not rule it out completely.
>
> E.g. ChromeOS currently has a use case where they frequently delete and
> recreate a 2GiB (give or take) memslot.  For that use case, zapping _just_
> that memslot is likely far superior to blasting and rebuilding the entire
> VM.  But if userspace deletes a 1TiB memslot for some reason, e.g. for
> memory unplug, then the fast zap is probably better, even though it
> requires rebuilding all SPTEs.

Interesting, thanks for the history.

>
> > > There seems to be an attempt to abstract away the existence of Secure-
> > > EPT in mmu.c, that is not fully successful. In this case the code
> > > checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> > > in a way specific needed by S-EPT. It ends up being a little confusing
> > > because the actual check is about whether there is a shared bit. It
> > > only works because only S-EPT is the only thing that has a
> > > kvm_gfn_shared_mask().
> > >
> > > Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> > > but is more honest about what we are getting up to here. I'm not sure
> > > though, what do you think?
> >
> > Right, I attempted and failed in zapping case.  This is due to the
> > restriction that the Secure-EPT pages must be removed from the leaves.
> > the VMX case (also NPT, even SNP) heavily depends on zapping root entry
> > as optimization.
>
> As above, it's more nuanced than that.  KVM has come to depend on the fast
> zap, but it got that way *because* KVM has historically zapped everything,
> and userspace has (unknowingly) relied on that behavior.
>
> > I can think of
> > - add TDX check. Looks wrong
> > - Use kvm_gfn_shared_mask(kvm). confusing
>
> Ya, even if we end up making it a hardcoded TDX thing, dress it up a bit.
> E.g. even if KVM checks for a shared mask under the hood, add a helper to
> capture the logic, e.g. kvm_zap_all_sptes_on_memslot_deletion(kvm).
>
> > - Give other name for this check like zap_from_leafs (or better name?)
> >    The implementation is same to kvm_gfn_shared_mask() with comment.
> >    - Or we can add a boolean variable to struct kvm
>
> If we _don't_ hardcode the behavior, a per-memslot flag or a per-VM
> capability (and thus boolean) is likely the way to go.  My off-the-cuff
> vote is probably for a per-memslot flag.

The per-memslot flag is interesting. If we had a per-memslot flag it might be
nice for that 2GB memslot. For TDX, making userspace have to know about zapping
requirements is not ideal. If TDX somehow loses the restriction someday, then
userspace would have to manage that as well. I think the decision belongs inside
KVM, for TDX at least.

We'll have to take a look at how they would come together in the code.

2024-04-09 15:25:51

by Paolo Bonzini

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Tue, Apr 9, 2024 at 3:21 AM Sean Christopherson <[email protected]> wrote:
> > I'm a little late to this conversation, so hopefully not just complicating
> > things. But why not deduce has_private_mem and has_protected_state from the
> > vm_type during runtime? Like if kvm.arch.vm_type was instead a bit mask with
> > the bit position of the KVM_X86_*_VM set, kvm_arch_has_private_mem() could
> > bitwise-and with a compile time mask of vm_types that have private memory.
> > This also prevents it from ever transitioning through nonsensical states
> > like vm_type == KVM_X86_TDX_VM, but !has_private_memory, so would be a little
> > more robust.
>
> LOL, time is a circle, or something like that. Paolo actually did this in v2[*],
> and I objected, vociferously.

To be fair, Rick is asking for something much less hideous - just set

kvm->arch.vm_type = (1 << vm_type);

and then define kvm_has_*(kvm) as !!(kvm->arch.vm_type & SOME_BIT_MASK).

And indeed it makes sense as an alternative. It also feels a little
bit more restrictive and the benefit is small, so I think I'm going to
go with this version.

Paolo


2024-04-09 15:30:17

by Sean Christopherson

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Tue, Apr 09, 2024, Paolo Bonzini wrote:
> On Tue, Apr 9, 2024 at 3:21 AM Sean Christopherson <[email protected]> wrote:
> > > I'm a little late to this conversation, so hopefully not just complicating
> > > things. But why not deduce has_private_mem and has_protected_state from the
> > > vm_type during runtime? Like if kvm.arch.vm_type was instead a bit mask with
> > > the bit position of the KVM_X86_*_VM set, kvm_arch_has_private_mem() could
> > > bitwise-and with a compile time mask of vm_types that have private memory.
> > > This also prevents it from ever transitioning through nonsensical states
> > > like vm_type == KVM_X86_TDX_VM, but !has_private_memory, so would be a little
> > > more robust.
> >
> > LOL, time is a circle, or something like that. Paolo actually did this in v2[*],
> > and I objected, vociferously.
>
> To be fair, Rick is asking for something much less hideous - just set
>
> kvm->arch.vm_type = (1 << vm_type);
>
> and then define kvm_has_*(kvm) as !!(kvm->arch.vm_type & SOME_BIT_MASK).
>
> And indeed it makes sense as an alternative.

Ah, yeah, I'd be fine with that.

> It also feels a little bit more restrictive and the benefit is small, so I
> think I'm going to go with this version.

+1

2024-05-07 23:18:17

by Edgecombe, Rick P

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Mon, 2024-04-08 at 18:21 -0700, Sean Christopherson wrote:
> > - Give other name for this check like zap_from_leafs (or better name?)
> >    The implementation is same to kvm_gfn_shared_mask() with comment.
> >    - Or we can add a boolean variable to struct kvm
>
> If we _don't_ hardcode the behavior, a per-memslot flag or a per-VM capability
> (and thus boolean) is likely the way to go.  My off-the-cuff vote is
> probably for a per-memslot flag.

Hi Sean,

Can you elaborate on the reason for a per-memslot flag? We are discussing this
design point internally, and also the intersection with the previous attempts to
do something similar with a per-vm flag[0].

I'm wondering if the intention is to try to make a memslot flag, so it can be
expanded for the normal VM usage. Because of the discussion on the original
attempts, it seems safer to keep this behavior more limited (TDX only) for now.
And for TDX's usage a struct kvm bool fits best because all memslots need to be
set to zap_leafs_only = true, anyway. It's simpler for userspace, and there are
fewer possible situations to worry about for KVM.

[0]
https://lore.kernel.org/kvm/[email protected]/

2024-05-08 00:21:15

by Sean Christopherson

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Tue, May 07, 2024, Rick P Edgecombe wrote:
> On Mon, 2024-04-08 at 18:21 -0700, Sean Christopherson wrote:
> > > - Give other name for this check like zap_from_leafs (or better name?)
> > >    The implementation is same to kvm_gfn_shared_mask() with comment.
> > >    - Or we can add a boolean variable to struct kvm
> >
> > If we _don't_ hardcode the behavior, a per-memslot flag or a per-VM
> > capability (and thus boolean) is likely the way to go.  My off-the-cuff
> > vote is probably for a per-memslot flag.
>
> Hi Sean,
>
> Can you elaborate on the reason for a per-memslot flag? We are discussing this
> design point internally, and also the intersection with the previous attempts to
> do something similar with a per-vm flag[0].
>
> I'm wondering if the intention is to try to make a memslot flag, so it can be
> expanded for the normal VM usage.

Sure, I'll go with that answer. Like I said, off-the-cuff.

There's no concrete motivation, it's more that _if_ we're going to expose a knob
to userspace, then I'd prefer to make it as precise as possible to minimize the
chances of KVM ending up back in ABI hell again.

> Because the discussion on the original attempts, it seems safer to keep this
> behavior more limited (TDX only) for now. And for TDX's usage a struct kvm
> bool fits best because all memslots need to be set to zap_leafs_only = true,
> anyway.

No they don't. They might be set that way in practice for QEMU, but it's not
strictly required. E.g. nothing would prevent a VMM from exposing a shared-only
memslot to a guest. The memslots that burned KVM the first time around were
related to VFIO devices, and I wouldn't put it past someone to be crazy enough
to pass through an untrusted device to a TDX guest.

> It's simpler for userspace, and less possible situations to worry about for KVM.
>
> [0] https://lore.kernel.org/kvm/[email protected]/


2024-05-08 01:20:08

by Edgecombe, Rick P

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Tue, 2024-05-07 at 17:21 -0700, Sean Christopherson wrote:
> > Can you elaborate on the reason for a per-memslot flag? We are discussing
> > this design point internally, and also the intersection with the previous
> > attempts to do something similar with a per-vm flag[0].
> >
> > I'm wondering if the intention is to try to make a memslot flag, so it
> > can be expanded for the normal VM usage.
>
> Sure, I'll go with that answer.  Like I said, off-the-cuff.
>
> There's no concrete motivation, it's more that _if_ we're going to expose
> a knob to userspace, then I'd prefer to make it as precise as possible to
> minimize the chances of KVM ending up back in ABI hell again.
>
> > Because of the discussion on the original attempts, it seems safer to
> > keep this behavior more limited (TDX only) for now.  And for TDX's usage
> > a struct kvm bool fits best because all memslots need to be set to
> > zap_leafs_only = true, anyway.
>
> No they don't.  They might be set that way in practice for QEMU, but it's
> not strictly required.  E.g. nothing would prevent a VMM from exposing a
> shared-only memslot to a guest.  The memslots that burned KVM the first
> time around were related to VFIO devices, and I wouldn't put it past
> someone to be crazy enough to pass through an untrusted device to a TDX
> guest.

Ok, thanks for clarification. So it's more of a strategic thing to move more
zapping logic into userspace so the logic can change without introducing kernel
regressions.

2024-05-08 14:41:33

by Sean Christopherson

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Wed, May 08, 2024, Rick P Edgecombe wrote:
> On Tue, 2024-05-07 at 17:21 -0700, Sean Christopherson wrote:
> > > Can you elaborate on the reason for a per-memslot flag? We are
> > > discussing this design point internally, and also the intersection
> > > with the previous attempts to do something similar with a per-vm
> > > flag[0].
> > >
> > > I'm wondering if the intention is to try to make a memslot flag, so it
> > > can be expanded for the normal VM usage.
> >
> > Sure, I'll go with that answer.  Like I said, off-the-cuff.
> >
> > There's no concrete motivation, it's more that _if_ we're going to
> > expose a knob to userspace, then I'd prefer to make it as precise as
> > possible to minimize the chances of KVM ending up back in ABI hell
> > again.
> >
> > > Because of the discussion on the original attempts, it seems safer to
> > > keep this behavior more limited (TDX only) for now.  And for TDX's
> > > usage a struct kvm bool fits best because all memslots need to be set
> > > to zap_leafs_only = true, anyway.
> >
> > No they don't.  They might be set that way in practice for QEMU, but
> > it's not strictly required.  E.g. nothing would prevent a VMM from
> > exposing a shared-only memslot to a guest.  The memslots that burned KVM
> > the first time around were related to VFIO devices, and I wouldn't put
> > it past someone to be crazy enough to pass through an untrusted device
> > to a TDX guest.
>
> Ok, thanks for clarification. So it's more of a strategic thing to move more
> zapping logic into userspace so the logic can change without introducing kernel
> regressions.

You're _really_ reading too much into my suggestion. As above, my suggestion
was very spur of the moment. I haven't put much thought into the tradeoffs and
side effects.

2024-05-08 15:05:31

by Edgecombe, Rick P

Subject: Re: [PATCH v5 07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

On Wed, 2024-05-08 at 07:38 -0700, Sean Christopherson wrote:
> > Ok, thanks for clarification. So it's more of a strategic thing to move
> > more zapping logic into userspace so the logic can change without
> > introducing kernel regressions.
>
> You're _really_ reading too much into my suggestion.  As above, my
> suggestion was very spur of the moment.  I haven't put much thought into
> the tradeoffs and side effects.

I'm not taking it as a mandate. Just trying to glean your insights. That said,
I'm really on the fence and so leaning on your intuition as the tie breaker.

For TDX's usage a struct kvm bool seems simpler code-wise in KVM, and for
userspace. But the zapping-logic-as-ABI problem seems like a reasonable thing to
think about while we are designing new ABI. Of course, it also means KVM has to
be responsible now for safely zapping memory from a variety of userspace
algorithms. So it somewhat makes KVM's job easier, and somewhat makes it harder.

The real issue might be that that problem was never debugged. While there is no
evidence it will affect TDX, it remains a possibility. But we can't do the zap
roots thing for TDX, so in the end the ABI design will not affect TDX exposure
either way. But making it a normal feature will affect exposure for normal VMs.
So we are also balancing ABI flexibility with exposure to that specific bug.