2023-01-12 16:48:44

by Isaku Yamahata

[permalink] [raw]
Subject: [PATCH v11 024/113] KVM: TDX: Do TDX specific vcpu initialization

From: Isaku Yamahata <[email protected]>

A TD guest vcpu needs to be configured before it is ready to run, which requires
additional information from the device model (e.g. qemu); one 64-bit value is
passed to the vcpu's RCX as an initial value. Repurpose KVM_MEMORY_ENCRYPT_OP
to vcpu scope and add a new sub-command, KVM_TDX_INIT_VCPU, under it for such
additional vcpu configuration.

Add a callback for KVM vCPU-scoped operations of KVM_MEMORY_ENCRYPT_OP and
add a new subcommand, KVM_TDX_INIT_VCPU, for further vcpu initialization.

Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/uapi/asm/kvm.h | 1 +
arch/x86/kvm/vmx/main.c | 9 ++
arch/x86/kvm/vmx/tdx.c | 147 +++++++++++++++++++++++++-
arch/x86/kvm/vmx/tdx.h | 7 ++
arch/x86/kvm/vmx/x86_ops.h | 10 +-
arch/x86/kvm/x86.c | 6 ++
tools/arch/x86/include/uapi/asm/kvm.h | 1 +
9 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 1a27f3aee982..e3e9b1c2599b 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -123,6 +123,7 @@ KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP_OPTIONAL(dev_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 30f4ddb18548..35773f925cc5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1698,6 +1698,7 @@ struct kvm_x86_ops {

int (*dev_mem_enc_ioctl)(void __user *argp);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index b8f28d86d4fd..9236c1699c48 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -536,6 +536,7 @@ struct kvm_pmu_event_filter {
enum kvm_tdx_cmd_id {
KVM_TDX_CAPABILITIES = 0,
KVM_TDX_INIT_VM,
+ KVM_TDX_INIT_VCPU,

KVM_TDX_CMD_NR_MAX,
};
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 59813ca05f36..23b3ffc3fe23 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -103,6 +103,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
return tdx_vm_ioctl(kvm, argp);
}

+static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_ioctl(vcpu, argp);
+}
+
struct kvm_x86_ops vt_x86_ops __initdata = {
.name = KBUILD_MODNAME,

@@ -249,6 +257,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {

.dev_mem_enc_ioctl = tdx_dev_ioctl,
.mem_enc_ioctl = vt_mem_enc_ioctl,
+ .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
};

struct kvm_x86_init_ops vt_init_ops __initdata = {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 099f0737a5aa..e2f5a07ad4e5 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -49,6 +49,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
}

+static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
+{
+ return tdx->tdvpr_pa;
+}
+
static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
{
return kvm_tdx->tdr_pa;
@@ -65,6 +70,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
return kvm_tdx->hkid > 0;
}

+static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->finalized;
+}
+
static void tdx_clear_page(unsigned long page_pa)
{
const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
@@ -327,7 +337,21 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)

void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
- /* This is stub for now. More logic will come. */
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int i;
+
+ /* Can't reclaim or free pages if teardown failed. */
+ if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
+ return;
+
+ if (tdx->tdvpx_pa) {
+ for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
+ tdx_reclaim_td_page(tdx->tdvpx_pa[i]);
+ kfree(tdx->tdvpx_pa);
+ tdx->tdvpx_pa = NULL;
+ }
+ tdx_reclaim_td_page(tdx->tdvpr_pa);
+ tdx->tdvpr_pa = 0;
}

void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -337,6 +361,8 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
/* TDX doesn't support INIT event. */
if (WARN_ON_ONCE(init_event))
goto td_bugged;
+ if (WARN_ON_ONCE(is_td_vcpu_created(to_tdx(vcpu))))
+ goto td_bugged;

/* TDX rquires X2APIC. */
apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
@@ -791,6 +817,125 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
return r;
}

+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ unsigned long *tdvpx_pa = NULL;
+ unsigned long tdvpr_pa;
+ unsigned long va;
+ int ret, i;
+ u64 err;
+
+ if (is_td_vcpu_created(tdx))
+ return -EINVAL;
+
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ return -ENOMEM;
+ tdvpr_pa = __pa(va);
+
+ tdvpx_pa = kcalloc(tdx_caps.tdvpx_nr_pages, sizeof(*tdx->tdvpx_pa),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tdvpx_pa) {
+ ret = -ENOMEM;
+ goto free_tdvpr;
+ }
+ for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ goto free_tdvpx;
+ tdvpx_pa[i] = __pa(va);
+ }
+
+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
+ if (WARN_ON_ONCE(err)) {
+ ret = -EIO;
+ pr_tdx_error(TDH_VP_CREATE, err, NULL);
+ goto td_bugged_free_tdvpx;
+ }
+ tdx->tdvpr_pa = tdvpr_pa;
+
+ tdx->tdvpx_pa = tdvpx_pa;
+ for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
+ if (WARN_ON_ONCE(err)) {
+ ret = -EIO;
+ pr_tdx_error(TDH_VP_ADDCX, err, NULL);
+ for (; i < tdx_caps.tdvpx_nr_pages; i++) {
+ free_page((unsigned long)__va(tdvpx_pa[i]));
+ tdvpx_pa[i] = 0;
+ }
+ goto td_bugged;
+ }
+ }
+
+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
+ if (WARN_ON_ONCE(err)) {
+ ret = -EIO;
+ pr_tdx_error(TDH_VP_INIT, err, NULL);
+ goto td_bugged;
+ }
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+
+td_bugged_free_tdvpx:
+ for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
+ free_page((unsigned long)__va(tdvpx_pa[i]));
+ tdvpx_pa[i] = 0;
+ }
+ kfree(tdvpx_pa);
+td_bugged:
+ vcpu->kvm->vm_bugged = true;
+ return ret;
+
+free_tdvpx:
+ for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
+ if (tdvpx_pa[i])
+ free_page((unsigned long)__va(tdvpx_pa[i]));
+ kfree(tdvpx_pa);
+ tdx->tdvpx_pa = NULL;
+free_tdvpr:
+ if (tdvpr_pa)
+ free_page((unsigned long)__va(tdvpr_pa));
+ tdx->tdvpr_pa = 0;
+
+ return ret;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (tdx->vcpu_initialized)
+ return -EINVAL;
+
+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.error || cmd.unused)
+ return -EINVAL;
+
+ /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. */
+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
+ return -EINVAL;
+
+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
+ if (ret)
+ return ret;
+
+ tdx->vcpu_initialized = true;
+ return 0;
+}
+
static int __init tdx_module_setup(void)
{
const struct tdsysinfo_struct *tdsysinfo;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index af7fdc1516d5..e909883d60fa 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -17,12 +17,19 @@ struct kvm_tdx {
u64 xfam;
int hkid;

+ bool finalized;
+
u64 tsc_offset;
};

struct vcpu_tdx {
struct kvm_vcpu vcpu;

+ unsigned long tdvpr_pa;
+ unsigned long *tdvpx_pa;
+
+ bool vcpu_initialized;
+
/*
* Dummy to make pmu_intel not corrupt memory.
* TODO: Support PMU for TDX. Future work.
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 37ab2cfd35bc..fba8d0800597 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -148,11 +148,12 @@ int tdx_vm_init(struct kvm *kvm);
void tdx_mmu_release_hkid(struct kvm *kvm);
void tdx_vm_free(struct kvm *kvm);

-int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
-
int tdx_vcpu_create(struct kvm_vcpu *vcpu);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
#else
static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; }
static inline void tdx_hardware_unsetup(void) {}
@@ -165,11 +166,12 @@ static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {}
static inline void tdx_vm_free(struct kvm *kvm) {}

-static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
-
static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
+
+static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
+static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
#endif

#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8bc66031a1d..d548d3af6428 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5976,6 +5976,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_DEVICE_ATTR:
r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
break;
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+ goto out;
+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+ break;
default:
r = -EINVAL;
}
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index eb800965b589..6971f1288043 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -531,6 +531,7 @@ struct kvm_pmu_event_filter {
enum kvm_tdx_cmd_id {
KVM_TDX_CAPABILITIES = 0,
KVM_TDX_INIT_VM,
+ KVM_TDX_INIT_VCPU,

KVM_TDX_CMD_NR_MAX,
};
--
2.25.1


2023-01-16 18:14:18

by Zhi Wang

[permalink] [raw]
Subject: Re: [PATCH v11 024/113] KVM: TDX: Do TDX specific vcpu initialization

On Thu, 12 Jan 2023 08:31:32 -0800
[email protected] wrote:

> From: Isaku Yamahata <[email protected]>
>
> TD guest vcpu need to be configured before ready to run which requests
> addtional information from Device model (e.g. qemu), one 64bit value is
> passed to vcpu's RCX as an initial value. Repurpose KVM_MEMORY_ENCRYPT_OP
> to vcpu-scope and add new sub-commands KVM_TDX_INIT_VCPU under it for such
> additional vcpu configuration.
>

It would be better to add more details about this mysterious value to save review effort.

For example, refining the above part as:

----

TD hands-off block(HOB) is used to pass the information from VMM to
TD virtual firmware(TDVF). Before KVM calls Intel TDX module to launch
TDVF, the address of HOB must be placed in the guest RCX.

Extend KVM_MEMORY_ENCRYPT_OP to vcpu-scope and add new... so that
TDH.VP.INIT can take the address of HOB from QEMU and place it in the
guest RCX when initializing a TDX vCPU.

----

The paragraph below seems to repeat the end of the first paragraph. I guess
it can be refined or removed.


> Add callback for kvm vCPU-scoped operations of KVM_MEMORY_ENCRYPT_OP and
> add a new subcommand, KVM_TDX_INIT_VCPU, for further vcpu initialization.
>


PS: I am curious whether the value of guest RCX on each vCPU will be configured
differently. (It seems they are the same according to the code of tdx-qemu.)

If yes, then it is just an approach to configure the value (even it is
through TDH.VP.XXX). It should be configured in the domain level in KVM. The
TDX vCPU creation and initialization can be moved into tdx_vcpu_create()
and TDH.VP.INIT can take the value from a per-vm data structure.

> Signed-off-by: Sean Christopherson <[email protected]>
> Signed-off-by: Isaku Yamahata <[email protected]>
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 1 +
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/include/uapi/asm/kvm.h | 1 +
> arch/x86/kvm/vmx/main.c | 9 ++
> arch/x86/kvm/vmx/tdx.c | 147 +++++++++++++++++++++++++-
> arch/x86/kvm/vmx/tdx.h | 7 ++
> arch/x86/kvm/vmx/x86_ops.h | 10 +-
> arch/x86/kvm/x86.c | 6 ++
> tools/arch/x86/include/uapi/asm/kvm.h | 1 +
> 9 files changed, 178 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 1a27f3aee982..e3e9b1c2599b 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -123,6 +123,7 @@ KVM_X86_OP(enable_smi_window)
> #endif
> KVM_X86_OP_OPTIONAL(dev_mem_enc_ioctl)
> KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
> +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
> KVM_X86_OP_OPTIONAL(mem_enc_register_region)
> KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
> KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 30f4ddb18548..35773f925cc5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1698,6 +1698,7 @@ struct kvm_x86_ops {
>
> int (*dev_mem_enc_ioctl)(void __user *argp);
> int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
> + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
> int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index b8f28d86d4fd..9236c1699c48 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -536,6 +536,7 @@ struct kvm_pmu_event_filter {
> enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> KVM_TDX_INIT_VM,
> + KVM_TDX_INIT_VCPU,
>
> KVM_TDX_CMD_NR_MAX,
> };
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 59813ca05f36..23b3ffc3fe23 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -103,6 +103,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
> return tdx_vm_ioctl(kvm, argp);
> }
>
> +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> +{
> + if (!is_td_vcpu(vcpu))
> + return -EINVAL;
> +
> + return tdx_vcpu_ioctl(vcpu, argp);
> +}
> +
> struct kvm_x86_ops vt_x86_ops __initdata = {
> .name = KBUILD_MODNAME,
>
> @@ -249,6 +257,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>
> .dev_mem_enc_ioctl = tdx_dev_ioctl,
> .mem_enc_ioctl = vt_mem_enc_ioctl,
> + .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> };
>
> struct kvm_x86_init_ops vt_init_ops __initdata = {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 099f0737a5aa..e2f5a07ad4e5 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -49,6 +49,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> }
>
> +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> +{
> + return tdx->tdvpr_pa;
> +}
> +
> static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> {
> return kvm_tdx->tdr_pa;
> @@ -65,6 +70,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> return kvm_tdx->hkid > 0;
> }
>
> +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> +{
> + return kvm_tdx->finalized;
> +}
> +
> static void tdx_clear_page(unsigned long page_pa)
> {
> const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> @@ -327,7 +337,21 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>
> void tdx_vcpu_free(struct kvm_vcpu *vcpu)
> {
> - /* This is stub for now. More logic will come. */
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> + int i;
> +
> + /* Can't reclaim or free pages if teardown failed. */
> + if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
> + return;
> +

Should we have an WARN_ON_ONCE here?

> + if (tdx->tdvpx_pa) {
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> + tdx_reclaim_td_page(tdx->tdvpx_pa[i]);
> + kfree(tdx->tdvpx_pa);
> + tdx->tdvpx_pa = NULL;
> + }
> + tdx_reclaim_td_page(tdx->tdvpr_pa);
> + tdx->tdvpr_pa = 0;
> }
>
> void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> @@ -337,6 +361,8 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> /* TDX doesn't support INIT event. */
> if (WARN_ON_ONCE(init_event))
> goto td_bugged;
> + if (WARN_ON_ONCE(is_td_vcpu_created(to_tdx(vcpu))))
> + goto td_bugged;
>
> /* TDX rquires X2APIC. */
> apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
> @@ -791,6 +817,125 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> return r;
> }
>
> +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> + unsigned long *tdvpx_pa = NULL;
> + unsigned long tdvpr_pa;
> + unsigned long va;
> + int ret, i;
> + u64 err;
> +
> + if (is_td_vcpu_created(tdx))
> + return -EINVAL;
> +
> + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> + if (!va)
> + return -ENOMEM;
> + tdvpr_pa = __pa(va);
> +
> + tdvpx_pa = kcalloc(tdx_caps.tdvpx_nr_pages, sizeof(*tdx->tdvpx_pa),
> + GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> + if (!tdvpx_pa) {
> + ret = -ENOMEM;
> + goto free_tdvpr;
> + }
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> + if (!va)
> + goto free_tdvpx;
> + tdvpx_pa[i] = __pa(va);
> + }
> +
> + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> + if (WARN_ON_ONCE(err)) {
> + ret = -EIO;
> + pr_tdx_error(TDH_VP_CREATE, err, NULL);
> + goto td_bugged_free_tdvpx;
> + }
> + tdx->tdvpr_pa = tdvpr_pa;
> +
> + tdx->tdvpx_pa = tdvpx_pa;
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> + if (WARN_ON_ONCE(err)) {
> + ret = -EIO;
> + pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> + for (; i < tdx_caps.tdvpx_nr_pages; i++) {
> + free_page((unsigned long)__va(tdvpx_pa[i]));
> + tdvpx_pa[i] = 0;
> + }
> + goto td_bugged;
> + }
> + }
> +
> + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> + if (WARN_ON_ONCE(err)) {
> + ret = -EIO;
> + pr_tdx_error(TDH_VP_INIT, err, NULL);
> + goto td_bugged;
> + }
> +
> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +
> + return 0;
> +
> +td_bugged_free_tdvpx:
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> + free_page((unsigned long)__va(tdvpx_pa[i]));
> + tdvpx_pa[i] = 0;
> + }
> + kfree(tdvpx_pa);
> +td_bugged:
> + vcpu->kvm->vm_bugged = true;
> + return ret;
> +
> +free_tdvpx:
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> + if (tdvpx_pa[i])
> + free_page((unsigned long)__va(tdvpx_pa[i]));
> + kfree(tdvpx_pa);
> + tdx->tdvpx_pa = NULL;
> +free_tdvpr:
> + if (tdvpr_pa)
> + free_page((unsigned long)__va(tdvpr_pa));
> + tdx->tdvpr_pa = 0;
> +
> + return ret;
> +}

Same comments with using vm_bugged in the previous patch.

> +
> +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> + struct kvm_tdx_cmd cmd;
> + int ret;
> +
> + if (tdx->vcpu_initialized)
> + return -EINVAL;
> +
> + if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
> + return -EINVAL;
> +
> + if (copy_from_user(&cmd, argp, sizeof(cmd)))
> + return -EFAULT;
> +
> + if (cmd.error || cmd.unused)
> + return -EINVAL;
> +
> + /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. */
> + if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
> + return -EINVAL;
> +
> + ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
> + if (ret)
> + return ret;
> +
> + tdx->vcpu_initialized = true;
> + return 0;
> +}
> +
> static int __init tdx_module_setup(void)
> {
> const struct tdsysinfo_struct *tdsysinfo;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index af7fdc1516d5..e909883d60fa 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -17,12 +17,19 @@ struct kvm_tdx {
> u64 xfam;
> int hkid;
>
> + bool finalized;
> +
> u64 tsc_offset;
> };
>
> struct vcpu_tdx {
> struct kvm_vcpu vcpu;
>
> + unsigned long tdvpr_pa;
> + unsigned long *tdvpx_pa;
> +
> + bool vcpu_initialized;
> +
> /*
> * Dummy to make pmu_intel not corrupt memory.
> * TODO: Support PMU for TDX. Future work.
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 37ab2cfd35bc..fba8d0800597 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -148,11 +148,12 @@ int tdx_vm_init(struct kvm *kvm);
> void tdx_mmu_release_hkid(struct kvm *kvm);
> void tdx_vm_free(struct kvm *kvm);
>
> -int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
> -
> int tdx_vcpu_create(struct kvm_vcpu *vcpu);
> void tdx_vcpu_free(struct kvm_vcpu *vcpu);
> void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
> +
> +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
> +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> #else
> static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; }
> static inline void tdx_hardware_unsetup(void) {}
> @@ -165,11 +166,12 @@ static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
> static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {}
> static inline void tdx_vm_free(struct kvm *kvm) {}
>
> -static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
> -
> static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
> static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
> static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
> +
> +static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
> +static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
> #endif
>
> #endif /* __KVM_X86_VMX_X86_OPS_H */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e8bc66031a1d..d548d3af6428 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5976,6 +5976,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> case KVM_SET_DEVICE_ATTR:
> r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
> break;
> + case KVM_MEMORY_ENCRYPT_OP:
> + r = -ENOTTY;
> + if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
> + goto out;
> + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
> + break;
> default:
> r = -EINVAL;
> }
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index eb800965b589..6971f1288043 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -531,6 +531,7 @@ struct kvm_pmu_event_filter {
> enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> KVM_TDX_INIT_VM,
> + KVM_TDX_INIT_VCPU,
>
> KVM_TDX_CMD_NR_MAX,
> };

2023-01-19 11:59:29

by Kai Huang

[permalink] [raw]
Subject: Re: [PATCH v11 024/113] KVM: TDX: Do TDX specific vcpu initialization

On Thu, 2023-01-12 at 08:31 -0800, [email protected] wrote:
> From: Isaku Yamahata <[email protected]>
>
> TD guest vcpu need to be configured before ready to run which requests
> addtional information from Device model (e.g. qemu), one 64bit value is
> passed to vcpu's RCX as an initial value.  
>

The first half of the sentence doesn't parse for me. It also has a grammar issue.

Also, the second half only talks about TDH.VP.INIT, but there's more regarding
creating/initializing a TDX guest vcpu. IMHO it would be better if you could
briefly describe the whole sequence here so people can get some idea about your
code below.

Btw, I don't understand what's the point of pointing out "64bit value passed to
vcpu's RCX ...". You can add this to the comment instead. If it is important,
then please add more to explain it so people can understand more.

> Repurpose KVM_MEMORY_ENCRYPT_OP
> to vcpu-scope and add new sub-commands KVM_TDX_INIT_VCPU under it for such
> additional vcpu configuration.

I am not sure using the same command for both per-VM and per-vcpu ioctls is a
good idea. Is there any existing example that does this?

>
> Add callback for kvm vCPU-scoped operations of KVM_MEMORY_ENCRYPT_OP and
> add a new subcommand, KVM_TDX_INIT_VCPU, for further vcpu initialization.

Personally I prefer KVM_TDX_VCPU_CREATE (instead of INIT) but will leave to
maintainers.

>
> Signed-off-by: Sean Christopherson <[email protected]>
> Signed-off-by: Isaku Yamahata <[email protected]>
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 1 +
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/include/uapi/asm/kvm.h | 1 +
> arch/x86/kvm/vmx/main.c | 9 ++
> arch/x86/kvm/vmx/tdx.c | 147 +++++++++++++++++++++++++-
> arch/x86/kvm/vmx/tdx.h | 7 ++
> arch/x86/kvm/vmx/x86_ops.h | 10 +-
> arch/x86/kvm/x86.c | 6 ++
> tools/arch/x86/include/uapi/asm/kvm.h | 1 +
> 9 files changed, 178 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 1a27f3aee982..e3e9b1c2599b 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -123,6 +123,7 @@ KVM_X86_OP(enable_smi_window)
> #endif
> KVM_X86_OP_OPTIONAL(dev_mem_enc_ioctl)
> KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
> +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
> KVM_X86_OP_OPTIONAL(mem_enc_register_region)
> KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
> KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 30f4ddb18548..35773f925cc5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1698,6 +1698,7 @@ struct kvm_x86_ops {
>
> int (*dev_mem_enc_ioctl)(void __user *argp);
> int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
> + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
> int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index b8f28d86d4fd..9236c1699c48 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -536,6 +536,7 @@ struct kvm_pmu_event_filter {
> enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> KVM_TDX_INIT_VM,
> + KVM_TDX_INIT_VCPU,
>
> KVM_TDX_CMD_NR_MAX,
> };
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 59813ca05f36..23b3ffc3fe23 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -103,6 +103,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
> return tdx_vm_ioctl(kvm, argp);
> }
>
> +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> +{
> + if (!is_td_vcpu(vcpu))
> + return -EINVAL;
> +
> + return tdx_vcpu_ioctl(vcpu, argp);
> +}
> +
> struct kvm_x86_ops vt_x86_ops __initdata = {
> .name = KBUILD_MODNAME,
>
> @@ -249,6 +257,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>
> .dev_mem_enc_ioctl = tdx_dev_ioctl,
> .mem_enc_ioctl = vt_mem_enc_ioctl,
> + .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> };
>
> struct kvm_x86_init_ops vt_init_ops __initdata = {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 099f0737a5aa..e2f5a07ad4e5 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -49,6 +49,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> }
>
> +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> +{
> + return tdx->tdvpr_pa;
> +}
> +
> static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> {
> return kvm_tdx->tdr_pa;
> @@ -65,6 +70,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> return kvm_tdx->hkid > 0;
> }
>
> +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> +{
> + return kvm_tdx->finalized;
> +}
> +
> static void tdx_clear_page(unsigned long page_pa)
> {
> const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> @@ -327,7 +337,21 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>
> void tdx_vcpu_free(struct kvm_vcpu *vcpu)
> {
> - /* This is stub for now. More logic will come. */
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> + int i;
> +
> + /* Can't reclaim or free pages if teardown failed. */
> + if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
> + return;

You may want to WARN() if it's a kernel bug you want to catch.
> +
> + if (tdx->tdvpx_pa) {
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> + tdx_reclaim_td_page(tdx->tdvpx_pa[i]);
> + kfree(tdx->tdvpx_pa);
> + tdx->tdvpx_pa = NULL;
> + }
> + tdx_reclaim_td_page(tdx->tdvpr_pa);
> + tdx->tdvpr_pa = 0;
> }
>
> void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> @@ -337,6 +361,8 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> /* TDX doesn't support INIT event. */
> if (WARN_ON_ONCE(init_event))
> goto td_bugged;
> + if (WARN_ON_ONCE(is_td_vcpu_created(to_tdx(vcpu))))
> + goto td_bugged;

Again, not sure can we use KVM_BUG_ON()?

>
> /* TDX rquires X2APIC. */
> apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
> @@ -791,6 +817,125 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> return r;
> }
>
> +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> + unsigned long *tdvpx_pa = NULL;
> + unsigned long tdvpr_pa;
> + unsigned long va;
> + int ret, i;
> + u64 err;
> +
> + if (is_td_vcpu_created(tdx))
> + return -EINVAL;

Ditto. WARN()?

> +
> + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> + if (!va)
> + return -ENOMEM;
> + tdvpr_pa = __pa(va);
> +
> + tdvpx_pa = kcalloc(tdx_caps.tdvpx_nr_pages, sizeof(*tdx->tdvpx_pa),
> + GFP_KERNEL_ACCOUNT | __GFP_ZERO);

kcalloc() uses __GFP_ZERO internally.

> + if (!tdvpx_pa) {
> + ret = -ENOMEM;
> + goto free_tdvpr;
> + }
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> + if (!va)
> + goto free_tdvpx;
> + tdvpx_pa[i] = __pa(va);
> + }
> +
> + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> + if (WARN_ON_ONCE(err)) {
> + ret = -EIO;
> + pr_tdx_error(TDH_VP_CREATE, err, NULL);
> + goto td_bugged_free_tdvpx;
> + }
> + tdx->tdvpr_pa = tdvpr_pa;
> +
> + tdx->tdvpx_pa = tdvpx_pa;
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> + if (WARN_ON_ONCE(err)) {
> + ret = -EIO;
> + pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> + for (; i < tdx_caps.tdvpx_nr_pages; i++) {
> + free_page((unsigned long)__va(tdvpx_pa[i]));
> + tdvpx_pa[i] = 0;
> + }
> + goto td_bugged;
> + }
> + }
> +
> + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> + if (WARN_ON_ONCE(err)) {
> + ret = -EIO;
> + pr_tdx_error(TDH_VP_INIT, err, NULL);
> + goto td_bugged;
> + }
> +
> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +
> + return 0;
> +
> +td_bugged_free_tdvpx:
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> + free_page((unsigned long)__va(tdvpx_pa[i]));
> + tdvpx_pa[i] = 0;
> + }
> + kfree(tdvpx_pa);
> +td_bugged:
> + vcpu->kvm->vm_bugged = true;
> + return ret;
> +
> +free_tdvpx:
> + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> + if (tdvpx_pa[i])
> + free_page((unsigned long)__va(tdvpx_pa[i]));
> + kfree(tdvpx_pa);

This piece of code appears 3 times in this function (and there are 3 'return
ret;'). I am sure it can be done in one place instead. Can you reorganize?

> + tdx->tdvpx_pa = NULL;
> +free_tdvpr:
> + if (tdvpr_pa)
> + free_page((unsigned long)__va(tdvpr_pa));
> + tdx->tdvpr_pa = 0;
> +
> + return ret;
> +}
> +
> +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> + struct kvm_tdx_cmd cmd;
> + int ret;
> +
> + if (tdx->vcpu_initialized)
> + return -EINVAL;
> +
> + if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
> + return -EINVAL;
> +
> + if (copy_from_user(&cmd, argp, sizeof(cmd)))
> + return -EFAULT;
> +
> + if (cmd.error || cmd.unused)
> + return -EINVAL;
> +
> + /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. */
> + if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
> + return -EINVAL;
> +
> + ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
> + if (ret)
> + return ret;
> +
> + tdx->vcpu_initialized = true;
> + return 0;
> +}
> +
> static int __init tdx_module_setup(void)
> {
> const struct tdsysinfo_struct *tdsysinfo;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index af7fdc1516d5..e909883d60fa 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -17,12 +17,19 @@ struct kvm_tdx {
> u64 xfam;
> int hkid;
>
> + bool finalized;
> +
> u64 tsc_offset;
> };
>
> struct vcpu_tdx {
> struct kvm_vcpu vcpu;
>
> + unsigned long tdvpr_pa;
> + unsigned long *tdvpx_pa;
> +
> + bool vcpu_initialized;

The 'vcpu_' prefix is kinda redundant.

> +
> /*
> * Dummy to make pmu_intel not corrupt memory.
> * TODO: Support PMU for TDX. Future work.
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 37ab2cfd35bc..fba8d0800597 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -148,11 +148,12 @@ int tdx_vm_init(struct kvm *kvm);
> void tdx_mmu_release_hkid(struct kvm *kvm);
> void tdx_vm_free(struct kvm *kvm);
>
> -int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
> -
> int tdx_vcpu_create(struct kvm_vcpu *vcpu);
> void tdx_vcpu_free(struct kvm_vcpu *vcpu);
> void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
> +
> +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
> +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);

Why bother moving the tdx_vm_ioctl() declaration?

> #else
> static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; }
> static inline void tdx_hardware_unsetup(void) {}
> @@ -165,11 +166,12 @@ static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
> static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {}
> static inline void tdx_vm_free(struct kvm *kvm) {}
>
> -static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
> -
> static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
> static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
> static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
> +
> +static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
> +static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
> #endif
>
> #endif /* __KVM_X86_VMX_X86_OPS_H */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e8bc66031a1d..d548d3af6428 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5976,6 +5976,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> case KVM_SET_DEVICE_ATTR:
> r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
> break;
> + case KVM_MEMORY_ENCRYPT_OP:
> + r = -ENOTTY;
> + if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
> + goto out;
> + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
> + break;
> default:
> r = -EINVAL;
> }
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index eb800965b589..6971f1288043 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -531,6 +531,7 @@ struct kvm_pmu_event_filter {
> enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> KVM_TDX_INIT_VM,
> + KVM_TDX_INIT_VCPU,
>
> KVM_TDX_CMD_NR_MAX,
> };


2023-02-28 11:18:01

by Isaku Yamahata

[permalink] [raw]
Subject: Re: [PATCH v11 024/113] KVM: TDX: Do TDX specific vcpu initialization

On Mon, Jan 16, 2023 at 06:07:19PM +0200,
Zhi Wang <[email protected]> wrote:

> On Thu, 12 Jan 2023 08:31:32 -0800
> [email protected] wrote:
>
> > From: Isaku Yamahata <[email protected]>
> >
> > TD guest vcpu need to be configured before ready to run which requests
> > addtional information from Device model (e.g. qemu), one 64bit value is
> > passed to vcpu's RCX as an initial value. Repurpose KVM_MEMORY_ENCRYPT_OP
> > to vcpu-scope and add new sub-commands KVM_TDX_INIT_VCPU under it for such
> > additional vcpu configuration.
> >
>
> Better add more details for this mystic value to save the review efforts.
>
> For exmaple, refining the above part as:
>
> ----
>
> TD hands-off block(HOB) is used to pass the information from VMM to
> TD virtual firmware(TDVF). Before KVM calls Intel TDX module to launch
> TDVF, the address of HOB must be placed in the guest RCX.
>
> Extend KVM_MEMORY_ENCRYPT_OP to vcpu-scope and add new... so that
> TDH.VP.INIT can take the address of HOB from QEMU and place it in the
> guest RCX when initializing a TDX vCPU.
>
> ----
>
> The below paragraph seems repeating the end of the first paragraph. Guess
> it can be refined or removed.
>
>
> > Add callback for kvm vCPU-scoped operations of KVM_MEMORY_ENCRYPT_OP and
> > add a new subcommand, KVM_TDX_INIT_VCPU, for further vcpu initialization.
> >

I don't think it's a good idea to mention the new terminology HOB and TDVF.
We can simply say that the VMM can pass one parameter.
Here is the updated one.

TD guest vcpu needs TDX specific initialization before running. Repurpose
KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
KVM_TDX_INIT_VCPU, and implement the callback for it.


> PS: I am curious if the value of guest RCX on each VCPU will be configured
> differently? (It seems they are the same according to the code of tdx-qemu)
>
> If yes, then it is just an approach to configure the value (even it is
> through TDH.VP.XXX). It should be configured in the domain level in KVM. The
> TDX vCPU creation and initialization can be moved into tdx_vcpu_create()
> and TDH.VP.INIT can take the value from a per-vm data structure.

RCX can be set for each vCPU as part of the ABI (the TDX SEAMCALL API) between
the VMM and the vcpu's initial state. It's a convention between the user space
VMM (qemu) and the guest firmware (TDVF) to pass the same RCX value for all
vcpus. So KVM shouldn't enforce the same RCX value for all vcpus; KVM should
allow the user space VMM to set the value for each vcpu.


> > Signed-off-by: Sean Christopherson <[email protected]>
> > Signed-off-by: Isaku Yamahata <[email protected]>
> > ---
> > arch/x86/include/asm/kvm-x86-ops.h | 1 +
> > arch/x86/include/asm/kvm_host.h | 1 +
> > arch/x86/include/uapi/asm/kvm.h | 1 +
> > arch/x86/kvm/vmx/main.c | 9 ++
> > arch/x86/kvm/vmx/tdx.c | 147 +++++++++++++++++++++++++-
> > arch/x86/kvm/vmx/tdx.h | 7 ++
> > arch/x86/kvm/vmx/x86_ops.h | 10 +-
> > arch/x86/kvm/x86.c | 6 ++
> > tools/arch/x86/include/uapi/asm/kvm.h | 1 +
> > 9 files changed, 178 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 1a27f3aee982..e3e9b1c2599b 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -123,6 +123,7 @@ KVM_X86_OP(enable_smi_window)
> > #endif
> > KVM_X86_OP_OPTIONAL(dev_mem_enc_ioctl)
> > KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
> > +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
> > KVM_X86_OP_OPTIONAL(mem_enc_register_region)
> > KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
> > KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 30f4ddb18548..35773f925cc5 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1698,6 +1698,7 @@ struct kvm_x86_ops {
> >
> > int (*dev_mem_enc_ioctl)(void __user *argp);
> > int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
> > + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
> > int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> > int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> > int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
> > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> > index b8f28d86d4fd..9236c1699c48 100644
> > --- a/arch/x86/include/uapi/asm/kvm.h
> > +++ b/arch/x86/include/uapi/asm/kvm.h
> > @@ -536,6 +536,7 @@ struct kvm_pmu_event_filter {
> > enum kvm_tdx_cmd_id {
> > KVM_TDX_CAPABILITIES = 0,
> > KVM_TDX_INIT_VM,
> > + KVM_TDX_INIT_VCPU,
> >
> > KVM_TDX_CMD_NR_MAX,
> > };
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 59813ca05f36..23b3ffc3fe23 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -103,6 +103,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
> > return tdx_vm_ioctl(kvm, argp);
> > }
> >
> > +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > +{
> > + if (!is_td_vcpu(vcpu))
> > + return -EINVAL;
> > +
> > + return tdx_vcpu_ioctl(vcpu, argp);
> > +}
> > +
> > struct kvm_x86_ops vt_x86_ops __initdata = {
> > .name = KBUILD_MODNAME,
> >
> > @@ -249,6 +257,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> >
> > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > + .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > };
> >
> > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 099f0737a5aa..e2f5a07ad4e5 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -49,6 +49,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> > return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> > }
> >
> > +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> > +{
> > + return tdx->tdvpr_pa;
> > +}
> > +
> > static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> > {
> > return kvm_tdx->tdr_pa;
> > @@ -65,6 +70,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> > return kvm_tdx->hkid > 0;
> > }
> >
> > +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> > +{
> > + return kvm_tdx->finalized;
> > +}
> > +
> > static void tdx_clear_page(unsigned long page_pa)
> > {
> > const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > @@ -327,7 +337,21 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> >
> > void tdx_vcpu_free(struct kvm_vcpu *vcpu)
> > {
> > - /* This is stub for now. More logic will come. */
> > + struct vcpu_tdx *tdx = to_tdx(vcpu);
> > + int i;
> > +
> > + /* Can't reclaim or free pages if teardown failed. */
> > + if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
> > + return;
> > +
>
> Should we have an WARN_ON_ONCE here?

No. In the normal case, this path can be reached with the hkid already reclaimed.


> > + if (tdx->tdvpx_pa) {
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> > + tdx_reclaim_td_page(tdx->tdvpx_pa[i]);
> > + kfree(tdx->tdvpx_pa);
> > + tdx->tdvpx_pa = NULL;
> > + }
> > + tdx_reclaim_td_page(tdx->tdvpr_pa);
> > + tdx->tdvpr_pa = 0;
> > }
> >
> > void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > @@ -337,6 +361,8 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > /* TDX doesn't support INIT event. */
> > if (WARN_ON_ONCE(init_event))
> > goto td_bugged;
> > + if (WARN_ON_ONCE(is_td_vcpu_created(to_tdx(vcpu))))
> > + goto td_bugged;
> >
> > /* TDX rquires X2APIC. */
> > apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
> > @@ -791,6 +817,125 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> > return r;
> > }
> >
> > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> > +{
> > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > + struct vcpu_tdx *tdx = to_tdx(vcpu);
> > + unsigned long *tdvpx_pa = NULL;
> > + unsigned long tdvpr_pa;
> > + unsigned long va;
> > + int ret, i;
> > + u64 err;
> > +
> > + if (is_td_vcpu_created(tdx))
> > + return -EINVAL;
> > +
> > + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > + if (!va)
> > + return -ENOMEM;
> > + tdvpr_pa = __pa(va);
> > +
> > + tdvpx_pa = kcalloc(tdx_caps.tdvpx_nr_pages, sizeof(*tdx->tdvpx_pa),
> > + GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > + if (!tdvpx_pa) {
> > + ret = -ENOMEM;
> > + goto free_tdvpr;
> > + }
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > + if (!va)
> > + goto free_tdvpx;
> > + tdvpx_pa[i] = __pa(va);
> > + }
> > +
> > + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> > + if (WARN_ON_ONCE(err)) {
> > + ret = -EIO;
> > + pr_tdx_error(TDH_VP_CREATE, err, NULL);
> > + goto td_bugged_free_tdvpx;
> > + }
> > + tdx->tdvpr_pa = tdvpr_pa;
> > +
> > + tdx->tdvpx_pa = tdvpx_pa;
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> > + if (WARN_ON_ONCE(err)) {
> > + ret = -EIO;
> > + pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> > + for (; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > + tdvpx_pa[i] = 0;
> > + }
> > + goto td_bugged;
> > + }
> > + }
> > +
> > + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> > + if (WARN_ON_ONCE(err)) {
> > + ret = -EIO;
> > + pr_tdx_error(TDH_VP_INIT, err, NULL);
> > + goto td_bugged;
> > + }
> > +
> > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > +
> > + return 0;
> > +
> > +td_bugged_free_tdvpx:
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > + tdvpx_pa[i] = 0;
> > + }
> > + kfree(tdvpx_pa);
> > +td_bugged:
> > + vcpu->kvm->vm_bugged = true;
> > + return ret;
> > +
> > +free_tdvpx:
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> > + if (tdvpx_pa[i])
> > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > + kfree(tdvpx_pa);
> > + tdx->tdvpx_pa = NULL;
> > +free_tdvpr:
> > + if (tdvpr_pa)
> > + free_page((unsigned long)__va(tdvpr_pa));
> > + tdx->tdvpr_pa = 0;
> > +
> > + return ret;
> > +}
>
> Same comments with using vm_bugged in the previous patch.

I converted it to KVM_BUG_ON().
--
Isaku Yamahata <[email protected]>

2023-02-28 11:28:44

by Isaku Yamahata

[permalink] [raw]
Subject: Re: [PATCH v11 024/113] KVM: TDX: Do TDX specific vcpu initialization

On Thu, Jan 19, 2023 at 10:37:43AM +0000,
"Huang, Kai" <[email protected]> wrote:

> On Thu, 2023-01-12 at 08:31 -0800, [email protected] wrote:
> > From: Isaku Yamahata <[email protected]>
> >
> > TD guest vcpu need to be configured before ready to run which requests
> > addtional information from Device model (e.g. qemu), one 64bit value is
> > passed to vcpu's RCX as an initial value.  
> >
>
> The first half sentence doesn't parse to me. It also has grammar issue.
>
> Also, the second half only talks about TDH.VP.INIT, but there's more regarding
> to creating/initializing a TDX guest vcpu. IMHO It would be better if you can
> briefly describe the whole sequence here so people can get some idea about your
> code below.
>
> Btw, I don't understand what's the point of pointing out "64bit value passed to
> vcpu's RCX ...". You can add this to the comment instead. If it is important,
> then please add more to explain it so people can understand more.
>

RCX and the 64bit value don't make much sense in the commit message. I dropped
those sentences.

Here is the updated commit message.

KVM: TDX: Do TDX specific vcpu initialization

TD guest vcpu needs TDX specific initialization before running. Repurpose
KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
KVM_TDX_INIT_VCPU, and implement the callback for it.

> > Repurpose KVM_MEMORY_ENCRYPT_OP
> > to vcpu-scope and add new sub-commands KVM_TDX_INIT_VCPU under it for such
> > additional vcpu configuration.
>
> I am not sure using the same command for both per-VM and per-vcpu ioctls is a
> good idea. Is there any existing example does this?

There are some. Please search Documentation/virt/kvm/api.rst for "type:".
You can see that some ioctls support multiple types.

Type: system ioctl, vm ioctl
Type: vcpu ioctl / vm ioctl
Type: device ioctl, vm ioctl, vcpu ioctl


> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 099f0737a5aa..e2f5a07ad4e5 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -49,6 +49,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> > return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> > }
> >
> > +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> > +{
> > + return tdx->tdvpr_pa;
> > +}
> > +
> > static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> > {
> > return kvm_tdx->tdr_pa;
> > @@ -65,6 +70,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> > return kvm_tdx->hkid > 0;
> > }
> >
> > +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> > +{
> > + return kvm_tdx->finalized;
> > +}
> > +
> > static void tdx_clear_page(unsigned long page_pa)
> > {
> > const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > @@ -327,7 +337,21 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> >
> > void tdx_vcpu_free(struct kvm_vcpu *vcpu)
> > {
> > - /* This is stub for now. More logic will come. */
> > + struct vcpu_tdx *tdx = to_tdx(vcpu);
> > + int i;
> > +
> > + /* Can't reclaim or free pages if teardown failed. */
> > + if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
> > + return;
>
> You may want to WARN() if it's a kernel bug you want to catch.

No, it's not a bug to come here with the hkid freed, because the vcpu_free
method can be called after VM destruction.


> > +
> > + if (tdx->tdvpx_pa) {
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> > + tdx_reclaim_td_page(tdx->tdvpx_pa[i]);
> > + kfree(tdx->tdvpx_pa);
> > + tdx->tdvpx_pa = NULL;
> > + }
> > + tdx_reclaim_td_page(tdx->tdvpr_pa);
> > + tdx->tdvpr_pa = 0;
> > }
> >
> > void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > @@ -337,6 +361,8 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > /* TDX doesn't support INIT event. */
> > if (WARN_ON_ONCE(init_event))
> > goto td_bugged;
> > + if (WARN_ON_ONCE(is_td_vcpu_created(to_tdx(vcpu))))
> > + goto td_bugged;
>
> Again, not sure can we use KVM_BUG_ON()?

I converted it into KVM_BUG_ON()


> > /* TDX rquires X2APIC. */
> > apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
> > @@ -791,6 +817,125 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> > return r;
> > }
> >
> > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> > +{
> > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > + struct vcpu_tdx *tdx = to_tdx(vcpu);
> > + unsigned long *tdvpx_pa = NULL;
> > + unsigned long tdvpr_pa;
> > + unsigned long va;
> > + int ret, i;
> > + u64 err;
> > +
> > + if (is_td_vcpu_created(tdx))
> > + return -EINVAL;
>
> Ditto. WARN()?

No. KVM_TDX_INIT_VCPU can be called multiple times. It's not a kernel bug, but
a misuse of the ioctl.


> > +
> > + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > + if (!va)
> > + return -ENOMEM;
> > + tdvpr_pa = __pa(va);
> > +
> > + tdvpx_pa = kcalloc(tdx_caps.tdvpx_nr_pages, sizeof(*tdx->tdvpx_pa),
> > + GFP_KERNEL_ACCOUNT | __GFP_ZERO);
>
> kcalloc() uses __GFP_ZERO internally.
>
> > + if (!tdvpx_pa) {
> > + ret = -ENOMEM;
> > + goto free_tdvpr;
> > + }
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > + if (!va)
> > + goto free_tdvpx;
> > + tdvpx_pa[i] = __pa(va);
> > + }
> > +
> > + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> > + if (WARN_ON_ONCE(err)) {
> > + ret = -EIO;
> > + pr_tdx_error(TDH_VP_CREATE, err, NULL);
> > + goto td_bugged_free_tdvpx;
> > + }
> > + tdx->tdvpr_pa = tdvpr_pa;
> > +
> > + tdx->tdvpx_pa = tdvpx_pa;
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> > + if (WARN_ON_ONCE(err)) {
> > + ret = -EIO;
> > + pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> > + for (; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > + tdvpx_pa[i] = 0;
> > + }
> > + goto td_bugged;
> > + }
> > + }
> > +
> > + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> > + if (WARN_ON_ONCE(err)) {
> > + ret = -EIO;
> > + pr_tdx_error(TDH_VP_INIT, err, NULL);
> > + goto td_bugged;
> > + }
> > +
> > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > +
> > + return 0;
> > +
> > +td_bugged_free_tdvpx:
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > + tdvpx_pa[i] = 0;
> > + }
> > + kfree(tdvpx_pa);
> > +td_bugged:
> > + vcpu->kvm->vm_bugged = true;
> > + return ret;
> > +
> > +free_tdvpx:
> > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> > + if (tdvpx_pa[i])
> > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > + kfree(tdvpx_pa);
>
> This piece of code appears 3 times in this function (and there are 3 'return
> ret;'). I am sure it can be done in one place instead. Can you reorganize?

I improved the logic to delete the duplication.
--
Isaku Yamahata <[email protected]>

2023-02-28 18:21:26

by Zhi Wang

[permalink] [raw]
Subject: Re: [PATCH v11 024/113] KVM: TDX: Do TDX specific vcpu initialization

On Tue, 28 Feb 2023 03:17:52 -0800
Isaku Yamahata <[email protected]> wrote:

> On Mon, Jan 16, 2023 at 06:07:19PM +0200,
> Zhi Wang <[email protected]> wrote:
>
> > On Thu, 12 Jan 2023 08:31:32 -0800
> > [email protected] wrote:
> >
> > > From: Isaku Yamahata <[email protected]>
> > >
> > > TD guest vcpu need to be configured before ready to run which requests
> > > addtional information from Device model (e.g. qemu), one 64bit value is
> > > passed to vcpu's RCX as an initial value. Repurpose KVM_MEMORY_ENCRYPT_OP
> > > to vcpu-scope and add new sub-commands KVM_TDX_INIT_VCPU under it for such
> > > additional vcpu configuration.
> > >
> >
> > Better add more details for this mystic value to save the review efforts.
> >
> > For exmaple, refining the above part as:
> >
> > ----
> >
> > TD hands-off block(HOB) is used to pass the information from VMM to
> > TD virtual firmware(TDVF). Before KVM calls Intel TDX module to launch
> > TDVF, the address of HOB must be placed in the guest RCX.
> >
> > Extend KVM_MEMORY_ENCRYPT_OP to vcpu-scope and add new... so that
> > TDH.VP.INIT can take the address of HOB from QEMU and place it in the
> > guest RCX when initializing a TDX vCPU.
> >
> > ----
> >
> > The below paragraph seems repeating the end of the first paragraph. Guess
> > it can be refined or removed.
> >
> >
> > > Add callback for kvm vCPU-scoped operations of KVM_MEMORY_ENCRYPT_OP and
> > > add a new subcommand, KVM_TDX_INIT_VCPU, for further vcpu initialization.
> > >
>
> I don't think it's good idea to mention about new terminology HOB and TDVF.
> We can say, VMM can pass one parameter.
> Here is the updated one.
>
> TD guest vcpu needs TDX specific initialization before running. Repurpose
> KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
> KVM_TDX_INIT_VCPU, and implement the callback for it.
>

Based on the experience of reviewing this patch, I think it depends on:

1) Whether the reviewer needs to understand the meaning of this parameter to
review this patch. If yes, a brief description of what it is (even one
sentence) is much better than "one parameter". If no, it is better to mention a
pointer, like "Refer to the XXX spec for more details of xxx". (No need to
mention the chapter, according to Sean's maintainer book.)

Direct and informative comment is always helpful for reviewing.

2) Whether describing this new terminology helps with reviewing the following
patches, i.e. whether the following patches modify logic around it. If yes, we
should take this opportunity to educate the reviewer with brief descriptions.
If no, a pointer is good enough.

>
> > PS: I am curious if the value of guest RCX on each VCPU will be configured
> > differently? (It seems they are the same according to the code of tdx-qemu)
> >
> > If yes, then it is just an approach to configure the value (even it is
> > through TDH.VP.XXX). It should be configured in the domain level in KVM. The
> > TDX vCPU creation and initialization can be moved into tdx_vcpu_create()
> > and TDH.VP.INIT can take the value from a per-vm data structure.
>
> RCX can be set for each VCPUs as ABI (or TDX SEAMCALL API) between VMM and vcpu
> initial value. It's convention between user space VMM(qemu) and guest
> firmware(TDVF) to pass same RCX value for all vcpu. So KVM shouldn't enforce
> same RCX value for all vcpus. KVM should allow user space VMM to set the value
> for each vcpus.
>

I see. That makes sense then.

>
> > > Signed-off-by: Sean Christopherson <[email protected]>
> > > Signed-off-by: Isaku Yamahata <[email protected]>
> > > ---
> > > arch/x86/include/asm/kvm-x86-ops.h | 1 +
> > > arch/x86/include/asm/kvm_host.h | 1 +
> > > arch/x86/include/uapi/asm/kvm.h | 1 +
> > > arch/x86/kvm/vmx/main.c | 9 ++
> > > arch/x86/kvm/vmx/tdx.c | 147 +++++++++++++++++++++++++-
> > > arch/x86/kvm/vmx/tdx.h | 7 ++
> > > arch/x86/kvm/vmx/x86_ops.h | 10 +-
> > > arch/x86/kvm/x86.c | 6 ++
> > > tools/arch/x86/include/uapi/asm/kvm.h | 1 +
> > > 9 files changed, 178 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > > index 1a27f3aee982..e3e9b1c2599b 100644
> > > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > > @@ -123,6 +123,7 @@ KVM_X86_OP(enable_smi_window)
> > > #endif
> > > KVM_X86_OP_OPTIONAL(dev_mem_enc_ioctl)
> > > KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
> > > +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
> > > KVM_X86_OP_OPTIONAL(mem_enc_register_region)
> > > KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
> > > KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 30f4ddb18548..35773f925cc5 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1698,6 +1698,7 @@ struct kvm_x86_ops {
> > >
> > > int (*dev_mem_enc_ioctl)(void __user *argp);
> > > int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
> > > + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
> > > int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> > > int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
> > > int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
> > > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> > > index b8f28d86d4fd..9236c1699c48 100644
> > > --- a/arch/x86/include/uapi/asm/kvm.h
> > > +++ b/arch/x86/include/uapi/asm/kvm.h
> > > @@ -536,6 +536,7 @@ struct kvm_pmu_event_filter {
> > > enum kvm_tdx_cmd_id {
> > > KVM_TDX_CAPABILITIES = 0,
> > > KVM_TDX_INIT_VM,
> > > + KVM_TDX_INIT_VCPU,
> > >
> > > KVM_TDX_CMD_NR_MAX,
> > > };
> > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > > index 59813ca05f36..23b3ffc3fe23 100644
> > > --- a/arch/x86/kvm/vmx/main.c
> > > +++ b/arch/x86/kvm/vmx/main.c
> > > @@ -103,6 +103,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
> > > return tdx_vm_ioctl(kvm, argp);
> > > }
> > >
> > > +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > > +{
> > > + if (!is_td_vcpu(vcpu))
> > > + return -EINVAL;
> > > +
> > > + return tdx_vcpu_ioctl(vcpu, argp);
> > > +}
> > > +
> > > struct kvm_x86_ops vt_x86_ops __initdata = {
> > > .name = KBUILD_MODNAME,
> > >
> > > @@ -249,6 +257,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > >
> > > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > > + .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > > };
> > >
> > > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index 099f0737a5aa..e2f5a07ad4e5 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -49,6 +49,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> > > return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> > > }
> > >
> > > +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> > > +{
> > > + return tdx->tdvpr_pa;
> > > +}
> > > +
> > > static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> > > {
> > > return kvm_tdx->tdr_pa;
> > > @@ -65,6 +70,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> > > return kvm_tdx->hkid > 0;
> > > }
> > >
> > > +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> > > +{
> > > + return kvm_tdx->finalized;
> > > +}
> > > +
> > > static void tdx_clear_page(unsigned long page_pa)
> > > {
> > > const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > > @@ -327,7 +337,21 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> > >
> > > void tdx_vcpu_free(struct kvm_vcpu *vcpu)
> > > {
> > > - /* This is stub for now. More logic will come. */
> > > + struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > + int i;
> > > +
> > > + /* Can't reclaim or free pages if teardown failed. */
> > > + if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
> > > + return;
> > > +
> >
> > Should we have an WARN_ON_ONCE here?
>
> No. In normal case, it can come with hkid already reclaimed.
>
>
> > > + if (tdx->tdvpx_pa) {
> > > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> > > + tdx_reclaim_td_page(tdx->tdvpx_pa[i]);
> > > + kfree(tdx->tdvpx_pa);
> > > + tdx->tdvpx_pa = NULL;
> > > + }
> > > + tdx_reclaim_td_page(tdx->tdvpr_pa);
> > > + tdx->tdvpr_pa = 0;
> > > }
> > >
> > > void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > > @@ -337,6 +361,8 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > > /* TDX doesn't support INIT event. */
> > > if (WARN_ON_ONCE(init_event))
> > > goto td_bugged;
> > > + if (WARN_ON_ONCE(is_td_vcpu_created(to_tdx(vcpu))))
> > > + goto td_bugged;
> > >
> > > /* TDX rquires X2APIC. */
> > > apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
> > > @@ -791,6 +817,125 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> > > return r;
> > > }
> > >
> > > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> > > +{
> > > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > > + struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > + unsigned long *tdvpx_pa = NULL;
> > > + unsigned long tdvpr_pa;
> > > + unsigned long va;
> > > + int ret, i;
> > > + u64 err;
> > > +
> > > + if (is_td_vcpu_created(tdx))
> > > + return -EINVAL;
> > > +
> > > + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > + if (!va)
> > > + return -ENOMEM;
> > > + tdvpr_pa = __pa(va);
> > > +
> > > + tdvpx_pa = kcalloc(tdx_caps.tdvpx_nr_pages, sizeof(*tdx->tdvpx_pa),
> > > + GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > > + if (!tdvpx_pa) {
> > > + ret = -ENOMEM;
> > > + goto free_tdvpr;
> > > + }
> > > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > > + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > + if (!va)
> > > + goto free_tdvpx;
> > > + tdvpx_pa[i] = __pa(va);
> > > + }
> > > +
> > > + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> > > + if (WARN_ON_ONCE(err)) {
> > > + ret = -EIO;
> > > + pr_tdx_error(TDH_VP_CREATE, err, NULL);
> > > + goto td_bugged_free_tdvpx;
> > > + }
> > > + tdx->tdvpr_pa = tdvpr_pa;
> > > +
> > > + tdx->tdvpx_pa = tdvpx_pa;
> > > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > > + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> > > + if (WARN_ON_ONCE(err)) {
> > > + ret = -EIO;
> > > + pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> > > + for (; i < tdx_caps.tdvpx_nr_pages; i++) {
> > > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > > + tdvpx_pa[i] = 0;
> > > + }
> > > + goto td_bugged;
> > > + }
> > > + }
> > > +
> > > + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> > > + if (WARN_ON_ONCE(err)) {
> > > + ret = -EIO;
> > > + pr_tdx_error(TDH_VP_INIT, err, NULL);
> > > + goto td_bugged;
> > > + }
> > > +
> > > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > > +
> > > + return 0;
> > > +
> > > +td_bugged_free_tdvpx:
> > > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++) {
> > > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > > + tdvpx_pa[i] = 0;
> > > + }
> > > + kfree(tdvpx_pa);
> > > +td_bugged:
> > > + vcpu->kvm->vm_bugged = true;
> > > + return ret;
> > > +
> > > +free_tdvpx:
> > > + for (i = 0; i < tdx_caps.tdvpx_nr_pages; i++)
> > > + if (tdvpx_pa[i])
> > > + free_page((unsigned long)__va(tdvpx_pa[i]));
> > > + kfree(tdvpx_pa);
> > > + tdx->tdvpx_pa = NULL;
> > > +free_tdvpr:
> > > + if (tdvpr_pa)
> > > + free_page((unsigned long)__va(tdvpr_pa));
> > > + tdx->tdvpr_pa = 0;
> > > +
> > > + return ret;
> > > +}
> >
> > Same comments with using vm_bugged in the previous patch.
>
> I converted it to KVM_BUG_ON().