2023-04-07 20:20:26

by Sagi Shahar

Subject: [RFC PATCH 0/5] Add TDX intra host migration support

This patchset adds support for TDX intra-host migration using the same
API which was added for SEV intra-host migration here:
https://lore.kernel.org/all/[email protected]/

This patchset relies on the latest TDX patches from Intel:
- fd-based approach for supporting KVM, v10
https://lore.kernel.org/lkml/[email protected]/
- TDX host kernel support v10
https://lore.kernel.org/lkml/[email protected]/
- KVM TDX basic feature support v13
https://lore.kernel.org/[email protected]

The tree can be found at https://github.com/googleprodkernel/linux-cc/tree/copyless
and is based on Intel's tdx tree at https://github.com/intel/tdx/tree/kvm-upstream

In the TDX case, we need to transfer the VM state from multiple sources:

* HKID and encrypted VM state are transferred between the kvm_tdx
objects.
* Encrypted and runtime state is transferred between the vcpu_tdx
objects.
* The EPT table backing the TD's private memory is transferred at the
kvm-mmu level. This is needed since the secure EPT table managed by
the TDX module remains the same after the migration, so moving the
current private EPT table eliminates the need to rebuild the private
EPT table to match the secure EPT table on the destination.
* Information regarding the current shared/private memory state is
transferred using the mem_attr_array stored in the kvm object.
* Additional information derived from the shared/private memory state
is transferred at the memslot level.

Tested with selftests locally. I will attach the selftests in the next
version, after we send the new TDX selftest framework patches based on
KVM TDX basic feature support v13.
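
For reference, the userspace flow mirrors the existing SEV intra-host
migration API: the destination VM enables KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM
with the source VM's file descriptor in args[0]. A minimal sketch (error
handling and VM/vCPU setup omitted; the selftests are the authoritative
usage):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/*
	 * Issued on the destination VM fd; on success the source VM is
	 * marked dead and its encrypted state now belongs to the
	 * destination.
	 */
	static int move_enc_context(int dst_vm_fd, int src_vm_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM,
			.args[0] = src_vm_fd,	/* fd of the source VM */
		};

		return ioctl(dst_vm_fd, KVM_ENABLE_CAP, &cap);
	}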

Sagi Shahar (5):
KVM: Split tdp_mmu_pages to private and shared lists
KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from
KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from
KVM: TDX: Implement moving private pages between 2 TDs
KVM: TDX: Add core logic for TDX intra-host migration

arch/x86/include/asm/kvm_host.h | 5 +-
arch/x86/kvm/mmu.h | 2 +
arch/x86/kvm/mmu/mmu.c | 60 ++++++++
arch/x86/kvm/mmu/tdp_mmu.c | 88 +++++++++++-
arch/x86/kvm/mmu/tdp_mmu.h | 3 +
arch/x86/kvm/svm/sev.c | 175 +++--------------------
arch/x86/kvm/vmx/main.c | 10 ++
arch/x86/kvm/vmx/tdx.c | 245 ++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/tdx.h | 2 +
arch/x86/kvm/vmx/x86_ops.h | 5 +
arch/x86/kvm/x86.c | 166 ++++++++++++++++++++++
arch/x86/kvm/x86.h | 16 +++
12 files changed, 613 insertions(+), 164 deletions(-)

--
2.40.0.348.gf938b09366-goog


2023-04-07 20:20:47

by Sagi Shahar

Subject: [RFC PATCH 1/5] KVM: Split tdp_mmu_pages to private and shared lists

tdp_mmu_pages counts all the active pages used by the TDP MMU. When we
transfer the state during intra-host migration we need to transfer the
private pages but not the shared ones.

Keeping them in separate counters makes this transfer more efficient.

Signed-off-by: Sagi Shahar <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 ++++-
arch/x86/kvm/mmu/tdp_mmu.c | 11 +++++++++--
2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ae377eec81987..5ed70cd9d74bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1426,9 +1426,12 @@ struct kvm_arch {
struct task_struct *nx_huge_page_recovery_thread;

#ifdef CONFIG_X86_64
- /* The number of TDP MMU pages across all roots. */
+ /* The number of non-private TDP MMU pages across all roots. */
atomic64_t tdp_mmu_pages;

+ /* Same as tdp_mmu_pages but only for private pages. */
+ atomic64_t tdp_private_mmu_pages;
+
/*
* List of struct kvm_mmu_pages being used as roots.
* All struct kvm_mmu_pages in the list should have
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 58a236a69ec72..327dee4f6170e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -44,6 +44,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(atomic64_read(&kvm->arch.tdp_private_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

/*
@@ -373,13 +374,19 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
- atomic64_inc(&kvm->arch.tdp_mmu_pages);
+ if (is_private_sp(sp))
+ atomic64_inc(&kvm->arch.tdp_private_mmu_pages);
+ else
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
- atomic64_dec(&kvm->arch.tdp_mmu_pages);
+ if (is_private_sp(sp))
+ atomic64_dec(&kvm->arch.tdp_private_mmu_pages);
+ else
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
--
2.40.0.348.gf938b09366-goog

2023-04-07 20:21:08

by Sagi Shahar

Subject: [RFC PATCH 2/5] KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from

Both SEV and TDX are going to use similar flows for intra-host
migration. This change moves some of the code that will be used by both
architectures into shared code in x86.c and x86.h.

Signed-off-by: Sagi Shahar <[email protected]>
---
arch/x86/kvm/svm/sev.c | 175 +++++------------------------------------
arch/x86/kvm/x86.c | 166 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.h | 16 ++++
3 files changed, 201 insertions(+), 156 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c25aeb550cd97..18831a0b7734e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1553,116 +1553,6 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
return false;
}

-static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
-{
- struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
- struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
- int r = -EBUSY;
-
- if (dst_kvm == src_kvm)
- return -EINVAL;
-
- /*
- * Bail if these VMs are already involved in a migration to avoid
- * deadlock between two VMs trying to migrate to/from each other.
- */
- if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
- return -EBUSY;
-
- if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
- goto release_dst;
-
- r = -EINTR;
- if (mutex_lock_killable(&dst_kvm->lock))
- goto release_src;
- if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
- goto unlock_dst;
- return 0;
-
-unlock_dst:
- mutex_unlock(&dst_kvm->lock);
-release_src:
- atomic_set_release(&src_sev->migration_in_progress, 0);
-release_dst:
- atomic_set_release(&dst_sev->migration_in_progress, 0);
- return r;
-}
-
-static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
-{
- struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
- struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
-
- mutex_unlock(&dst_kvm->lock);
- mutex_unlock(&src_kvm->lock);
- atomic_set_release(&dst_sev->migration_in_progress, 0);
- atomic_set_release(&src_sev->migration_in_progress, 0);
-}
-
-/* vCPU mutex subclasses. */
-enum sev_migration_role {
- SEV_MIGRATION_SOURCE = 0,
- SEV_MIGRATION_TARGET,
- SEV_NR_MIGRATION_ROLES,
-};
-
-static int sev_lock_vcpus_for_migration(struct kvm *kvm,
- enum sev_migration_role role)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i, j;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (mutex_lock_killable_nested(&vcpu->mutex, role))
- goto out_unlock;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (!i)
- /*
- * Reset the role to one that avoids colliding with
- * the role used for the first vcpu mutex.
- */
- role = SEV_NR_MIGRATION_ROLES;
- else
- mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
-#endif
- }
-
- return 0;
-
-out_unlock:
-
- kvm_for_each_vcpu(j, vcpu, kvm) {
- if (i == j)
- break;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (j)
- mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
-#endif
-
- mutex_unlock(&vcpu->mutex);
- }
- return -EINTR;
-}
-
-static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
- bool first = true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (first)
- first = false;
- else
- mutex_acquire(&vcpu->mutex.dep_map,
- SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
-
- mutex_unlock(&vcpu->mutex);
- }
-}
-
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
@@ -1744,25 +1634,6 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
}
}

-static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
-{
- struct kvm_vcpu *src_vcpu;
- unsigned long i;
-
- if (!sev_es_guest(src))
- return 0;
-
- if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
- return -EINVAL;
-
- kvm_for_each_vcpu(i, src_vcpu, src) {
- if (!src_vcpu->arch.guest_state_protected)
- return -EINVAL;
- }
-
- return 0;
-}
-
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
@@ -1777,19 +1648,20 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
ret = -EBADF;
goto out_fput;
}
-
source_kvm = source_kvm_file->private_data;
- ret = sev_lock_two_vms(kvm, source_kvm);
+ src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ ret = pre_move_enc_context_from(kvm, source_kvm,
+ &dst_sev->migration_in_progress,
+ &src_sev->migration_in_progress);
if (ret)
goto out_fput;

- if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ if (sev_guest(kvm) || !sev_es_guest(source_kvm)) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_post;
}

- src_sev = &to_kvm_svm(source_kvm)->sev_info;
-
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1799,34 +1671,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true;
}

- ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
- if (ret)
- goto out_dst_cgroup;
- ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
- if (ret)
- goto out_dst_vcpu;
-
- ret = sev_check_source_vcpus(kvm, source_kvm);
- if (ret)
- goto out_source_vcpu;
-
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;

-out_source_vcpu:
- sev_unlock_vcpus_for_migration(source_kvm);
-out_dst_vcpu:
- sev_unlock_vcpus_for_migration(kvm);
out_dst_cgroup:
/* Operates on the source on success, on the destination on failure. */
if (charged)
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
-out_unlock:
- sev_unlock_two_vms(kvm, source_kvm);
+out_post:
+ post_move_enc_context_from(kvm, source_kvm,
+ &dst_sev->migration_in_progress,
+ &src_sev->migration_in_progress);
out_fput:
if (source_kvm_file)
fput(source_kvm_file);
@@ -2058,7 +1917,11 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
}

source_kvm = source_kvm_file->private_data;
- ret = sev_lock_two_vms(kvm, source_kvm);
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ ret = lock_two_vms_for_migration(kvm, source_kvm,
+ &mirror_sev->migration_in_progress,
+ &source_sev->migration_in_progress);
if (ret)
goto e_source_fput;

@@ -2078,9 +1941,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
- source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
- mirror_sev = &to_kvm_svm(kvm)->sev_info;
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);

/* Set enc_context_owner and copy its encryption context over */
@@ -2101,7 +1962,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
*/

e_unlock:
- sev_unlock_two_vms(kvm, source_kvm);
+ unlock_two_vms_for_migration(kvm, source_kvm,
+ &mirror_sev->migration_in_progress,
+ &source_sev->migration_in_progress);
e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 870041887ed91..865c434a94899 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13596,6 +13596,172 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);

+/* vCPU mutex subclasses. */
+enum migration_role {
+ MIGRATION_SOURCE = 0,
+ MIGRATION_TARGET,
+ NR_MIGRATION_ROLES,
+};
+
+static int lock_vcpus_for_migration(struct kvm *kvm, enum migration_role role)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mutex_lock_killable_nested(&vcpu->mutex, role))
+ goto out_unlock;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (!i)
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = NR_MIGRATION_ROLES;
+ else
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+#endif
+ }
+
+ return 0;
+
+out_unlock:
+
+ kvm_for_each_vcpu(j, vcpu, kvm) {
+ if (i == j)
+ break;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (j)
+ mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
+#endif
+
+ mutex_unlock(&vcpu->mutex);
+ }
+ return -EINTR;
+}
+
+static void unlock_vcpus_for_migration(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ bool first = true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map, NR_MIGRATION_ROLES,
+ 0, _THIS_IP_);
+
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
+int lock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress)
+{
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(dst_migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(src_migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(src_migration_in_progress, 0);
+release_dst:
+ atomic_set_release(dst_migration_in_progress, 0);
+ return r;
+}
+EXPORT_SYMBOL_GPL(lock_two_vms_for_migration);
+
+void unlock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress)
+{
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(dst_migration_in_progress, 0);
+ atomic_set_release(src_migration_in_progress, 0);
+}
+EXPORT_SYMBOL_GPL(unlock_two_vms_for_migration);
+
+int pre_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress)
+{
+ struct kvm_vcpu *src_vcpu;
+ unsigned long i;
+ int ret = -EINVAL;
+
+ ret = lock_two_vms_for_migration(dst_kvm, src_kvm,
+ dst_migration_in_progress,
+ src_migration_in_progress);
+ if (ret)
+ return ret;
+
+ ret = lock_vcpus_for_migration(dst_kvm, MIGRATION_TARGET);
+ if (ret)
+ goto unlock_vms;
+
+ ret = lock_vcpus_for_migration(src_kvm, MIGRATION_SOURCE);
+ if (ret)
+ goto unlock_dst_vcpu;
+
+ if (atomic_read(&dst_kvm->online_vcpus) !=
+ atomic_read(&src_kvm->online_vcpus))
+ goto unlock_dst_vcpu;
+
+ kvm_for_each_vcpu(i, src_vcpu, src_kvm) {
+ if (!src_vcpu->arch.guest_state_protected)
+ goto unlock_dst_vcpu;
+ }
+
+ return 0;
+
+unlock_dst_vcpu:
+ unlock_vcpus_for_migration(dst_kvm);
+unlock_vms:
+ unlock_two_vms_for_migration(dst_kvm, src_kvm,
+ dst_migration_in_progress,
+ src_migration_in_progress);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pre_move_enc_context_from);
+
+void post_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress)
+{
+ unlock_vcpus_for_migration(src_kvm);
+ unlock_vcpus_for_migration(dst_kvm);
+ unlock_two_vms_for_migration(dst_kvm, src_kvm,
+ dst_migration_in_progress,
+ src_migration_in_progress);
+}
+EXPORT_SYMBOL_GPL(post_move_enc_context_from);
+
bool kvm_arch_dirty_log_supported(struct kvm *kvm)
{
return kvm->arch.vm_type != KVM_X86_PROTECTED_VM;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 33a1a5341e788..554c797184994 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -502,4 +502,20 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port, void *data, unsigned int count,
int in);

+int lock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress);
+
+void unlock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress);
+
+int pre_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress);
+
+void post_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
+ atomic_t *dst_migration_in_progress,
+ atomic_t *src_migration_in_progress);
+
#endif
--
2.40.0.348.gf938b09366-goog

2023-04-07 20:21:09

by Sagi Shahar

Subject: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

This should mostly match the logic in sev_vm_move_enc_context_from.

Signed-off-by: Sagi Shahar <[email protected]>
---
arch/x86/kvm/vmx/main.c | 10 +++++++
arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/tdx.h | 2 ++
arch/x86/kvm/vmx/x86_ops.h | 5 ++++
4 files changed, 73 insertions(+)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 5b64fe5404958..9d5d0ac465bf6 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return tdx_vcpu_ioctl(vcpu, argp);
}

+static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ if (!is_td(kvm))
+ return -ENOTTY;
+
+ return tdx_vm_move_enc_context_from(kvm, source_fd);
+}
+
#define VMX_REQUIRED_APICV_INHIBITS \
( \
BIT(APICV_INHIBIT_REASON_DISABLE)| \
@@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.dev_mem_enc_ioctl = tdx_dev_ioctl,
.mem_enc_ioctl = vt_mem_enc_ioctl,
.vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
+
+ .vm_move_enc_context_from = vt_move_enc_context_from,
};

struct kvm_x86_init_ops vt_init_ops __initdata = {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 8af7e4e81c860..0999a6d827c99 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2826,3 +2826,59 @@ int __init tdx_init(void)
INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
return 0;
}
+
+static __always_inline bool tdx_guest(struct kvm *kvm)
+{
+ struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
+
+ return tdx_kvm->finalized;
+}
+
+static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
+{
+ return -EINVAL;
+}
+
+int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
+ struct file *src_kvm_file;
+ struct kvm_tdx *src_tdx;
+ struct kvm *src_kvm;
+ int ret;
+
+ src_kvm_file = fget(source_fd);
+ if (!file_is_kvm(src_kvm_file)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+ src_kvm = src_kvm_file->private_data;
+ src_tdx = to_kvm_tdx(src_kvm);
+
+ ret = pre_move_enc_context_from(kvm, src_kvm,
+ &dst_tdx->migration_in_progress,
+ &src_tdx->migration_in_progress);
+ if (ret)
+ goto out_fput;
+
+ if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
+ ret = -EINVAL;
+ goto out_post;
+ }
+
+ ret = tdx_migrate_from(kvm, src_kvm);
+ if (ret)
+ goto out_post;
+
+ kvm_vm_dead(src_kvm);
+ ret = 0;
+
+out_post:
+ post_move_enc_context_from(kvm, src_kvm,
+ &dst_tdx->migration_in_progress,
+ &src_tdx->migration_in_progress);
+out_fput:
+ if (src_kvm_file)
+ fput(src_kvm_file);
+ return ret;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 71818c5001862..21b7e710be1fd 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -24,6 +24,8 @@ struct kvm_tdx {
atomic_t tdh_mem_track;

u64 tsc_offset;
+
+ atomic_t migration_in_progress;
};

union tdx_exit_reason {
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index d049e0c72ed0c..275f5d75e9bf1 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
void tdx_flush_tlb(struct kvm_vcpu *vcpu);
int tdx_sept_tlb_remote_flush(struct kvm *kvm);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+
+int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
#else
static inline int tdx_init(void) { return 0; };
static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
@@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
+
+static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
+						unsigned int source_fd) { return -EOPNOTSUPP; }
#endif

#if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)
--
2.40.0.348.gf938b09366-goog

2023-04-07 20:21:42

by Sagi Shahar

Subject: [RFC PATCH 4/5] KVM: TDX: Implement moving private pages between 2 TDs

Add functionality for moving the private EPT table from one TD to a
new one.

This moves the root of the source's private EPT table and overwrites
the root of the destination.

Signed-off-by: Sagi Shahar <[email protected]>
---
arch/x86/kvm/mmu.h | 2 +
arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++++++++++
arch/x86/kvm/mmu/tdp_mmu.c | 77 +++++++++++++++++++++++++++++++++++---
arch/x86/kvm/mmu/tdp_mmu.h | 3 ++
4 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d10b08eeaefee..09bae7fe18a12 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -120,6 +120,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+int kvm_mmu_move_private_pages_from(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu *src_vcpu);

static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a35f2e7f9bc70..1acc9338323da 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3789,6 +3789,66 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
return r;
}

+int kvm_mmu_move_private_pages_from(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu *src_vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_mmu *src_mmu = src_vcpu->arch.mmu;
+ gfn_t gfn_shared = kvm_gfn_shared_mask(vcpu->kvm);
+ hpa_t private_root_hpa, shared_root_hpa;
+ int r = -EINVAL;
+
+ // Hold locks for both src and dst. Always take the src lock first.
+ write_lock(&src_vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (!gfn_shared)
+ goto out_unlock;
+
+ WARN_ON_ONCE(!is_tdp_mmu_active(vcpu));
+ WARN_ON_ONCE(!is_tdp_mmu_active(src_vcpu));
+
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
+ if (r)
+ goto out_unlock;
+
+ /*
+ * The private root is moved from the src to the dst and is marked as
+ * invalid in the src.
+ */
+ private_root_hpa = kvm_tdp_mmu_move_private_pages_from(vcpu, src_vcpu);
+ if (private_root_hpa == INVALID_PAGE) {
+ /*
+ * This likely means that the private root was already moved by
+ * another vCPU.
+ */
+ private_root_hpa = kvm_tdp_mmu_get_vcpu_root_hpa_no_alloc(vcpu, true);
+ if (private_root_hpa == INVALID_PAGE) {
+ r = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ mmu->private_root_hpa = private_root_hpa;
+ src_mmu->private_root_hpa = INVALID_PAGE;
+
+ /*
+ * The shared root is allocated normally and is not moved from the src.
+ */
+ shared_root_hpa = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, false);
+ mmu->root.hpa = shared_root_hpa;
+
+ kvm_mmu_load_pgd(vcpu);
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
+
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&src_vcpu->kvm->mmu_lock);
+
+ return r;
+}
+EXPORT_SYMBOL(kvm_mmu_move_private_pages_from);
+
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 327dee4f6170e..685528fdc0ad6 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -296,6 +296,23 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
trace_kvm_mmu_get_page(sp, true);
}

+static struct kvm_mmu_page *
+kvm_tdp_mmu_get_vcpu_root_no_alloc(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word &&
+ kvm_tdp_mmu_get_root(root))
+ return root;
+ }
+
+ return NULL;
+}
+
static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
bool private)
{
@@ -311,11 +328,9 @@ static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
*/
if (private)
kvm_mmu_page_role_set_private(&role);
- for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
- if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(root))
- goto out;
- }
+ root = kvm_tdp_mmu_get_vcpu_root_no_alloc(vcpu, role);
+ if (!!root)
+ goto out;

root = tdp_mmu_alloc_sp(vcpu, role);
tdp_mmu_init_sp(root, NULL, 0);
@@ -330,6 +345,58 @@ static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
return root;
}

+hpa_t kvm_tdp_mmu_move_private_pages_from(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu *src_vcpu)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm *src_kvm = src_vcpu->kvm;
+ struct kvm_mmu_page *private_root = NULL;
+ struct kvm_mmu_page *root;
+ s64 num_private_pages, old;
+
+ lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
+ lockdep_assert_held_write(&src_vcpu->kvm->mmu_lock);
+
+ /* Find the private root of the source. */
+ kvm_mmu_page_role_set_private(&role);
+ for_each_tdp_mmu_root(src_kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word) {
+ private_root = root;
+ break;
+ }
+ }
+ if (!private_root)
+ return INVALID_PAGE;
+
+ /* Remove the private root from the src kvm and add it to dst kvm. */
+ list_del_rcu(&private_root->link);
+ list_add_rcu(&private_root->link, &kvm->arch.tdp_mmu_roots);
+
+ num_private_pages = atomic64_read(&src_kvm->arch.tdp_private_mmu_pages);
+ old = atomic64_cmpxchg(&kvm->arch.tdp_private_mmu_pages, 0,
+ num_private_pages);
+ /* The destination VM should have no private pages at this point. */
+ WARN_ON(old);
+ atomic64_set(&src_kvm->arch.tdp_private_mmu_pages, 0);
+
+ return __pa(private_root->spt);
+}
+
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa_no_alloc(struct kvm_vcpu *vcpu, bool private)
+{
+ struct kvm_mmu_page *root;
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+
+ if (private)
+ kvm_mmu_page_role_set_private(&role);
+ root = kvm_tdp_mmu_get_vcpu_root_no_alloc(vcpu, role);
+ if (!root)
+ return INVALID_PAGE;
+
+ return __pa(root->spt);
+}
+
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu, bool private)
{
return __pa(kvm_tdp_mmu_get_vcpu_root(vcpu, private)->spt);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3ae3c3b8642ac..0e9d38432673d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -11,6 +11,9 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu, bool private);
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa_no_alloc(struct kvm_vcpu *vcpu, bool private);
+hpa_t kvm_tdp_mmu_move_private_pages_from(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu *src_vcpu);

__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
--
2.40.0.348.gf938b09366-goog

2023-04-07 20:24:05

by Sagi Shahar

Subject: [RFC PATCH 5/5] KVM: TDX: Add core logic for TDX intra-host migration

Adds the core logic for transferring state between source and
destination TDs during intra-host migration.

Signed-off-by: Sagi Shahar <[email protected]>
---
arch/x86/kvm/vmx/tdx.c | 191 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 190 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0999a6d827c99..05b164a91437b 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2834,9 +2834,198 @@ static __always_inline bool tdx_guest(struct kvm *kvm)
return tdx_kvm->finalized;
}

+#define for_each_memslot_pair(memslots_1, memslots_2, memslot_iter_1, \
+ memslot_iter_2) \
+ for (memslot_iter_1 = rb_first(&memslots_1->gfn_tree), \
+ memslot_iter_2 = rb_first(&memslots_2->gfn_tree); \
+ memslot_iter_1 && memslot_iter_2; \
+ memslot_iter_1 = rb_next(memslot_iter_1), \
+ memslot_iter_2 = rb_next(memslot_iter_2))
+
static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
{
- return -EINVAL;
+ struct rb_node *src_memslot_iter, *dst_memslot_iter;
+ struct vcpu_tdx *dst_tdx_vcpu, *src_tdx_vcpu;
+ struct kvm_memslots *src_slots, *dst_slots;
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct kvm_tdx *src_tdx, *dst_tdx;
+ unsigned long i, j;
+ int ret;
+
+ src_tdx = to_kvm_tdx(src);
+ dst_tdx = to_kvm_tdx(dst);
+
+ src_slots = __kvm_memslots(src, 0);
+ dst_slots = __kvm_memslots(dst, 0);
+
+ ret = -EINVAL;
+
+ if (!src_tdx->finalized) {
+ pr_warn("Cannot migrate from a non finalized VM\n");
+ goto abort;
+ }
+
+ // Traverse both memslots in gfn order and compare them
+ for_each_memslot_pair(src_slots, dst_slots, src_memslot_iter, dst_memslot_iter) {
+ struct kvm_memory_slot *src_slot, *dst_slot;
+
+ src_slot =
+ container_of(src_memslot_iter, struct kvm_memory_slot,
+ gfn_node[src_slots->node_idx]);
+ dst_slot =
+ container_of(src_memslot_iter, struct kvm_memory_slot,
+ gfn_node[dst_slots->node_idx]);
+
+ if (src_slot->base_gfn != dst_slot->base_gfn ||
+ src_slot->npages != dst_slot->npages) {
+ pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
+ goto abort;
+ }
+
+ if (src_slot->flags != dst_slot->flags) {
+ pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
+ goto abort;
+ }
+
+ if (src_slot->flags & KVM_MEM_PRIVATE) {
+ if (src_slot->restrictedmem.file->f_inode->i_ino !=
+ dst_slot->restrictedmem.file->f_inode->i_ino) {
+ pr_warn("Private memslots points to different restricted files\n");
+ goto abort;
+ }
+
+ if (src_slot->restrictedmem.index != dst_slot->restrictedmem.index) {
+ pr_warn("Private memslots points to the restricted file at different offsets\n");
+ goto abort;
+ }
+ }
+ }
+
+ if (src_memslot_iter || dst_memslot_iter) {
+ pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
+ goto abort;
+ }
+
+ dst_tdx->hkid = src_tdx->hkid;
+ dst_tdx->tdr_pa = src_tdx->tdr_pa;
+
+ dst_tdx->tdcs_pa = kcalloc(tdx_info.nr_tdcs_pages, sizeof(*dst_tdx->tdcs_pa),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!dst_tdx->tdcs_pa) {
+ ret = -ENOMEM;
+ goto late_abort;
+ }
+ memcpy(dst_tdx->tdcs_pa, src_tdx->tdcs_pa,
+ tdx_info.nr_tdcs_pages * sizeof(*dst_tdx->tdcs_pa));
+
+ dst_tdx->tsc_offset = src_tdx->tsc_offset;
+ dst_tdx->attributes = src_tdx->attributes;
+ dst_tdx->xfam = src_tdx->xfam;
+ dst_tdx->kvm.arch.gfn_shared_mask = src_tdx->kvm.arch.gfn_shared_mask;
+
+ kvm_for_each_vcpu(i, src_vcpu, src)
+ tdx_flush_vp_on_cpu(src_vcpu);
+
+ /* Copy per-vCPU state */
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ src_tdx_vcpu = to_tdx(src_vcpu);
+ dst_vcpu = kvm_get_vcpu(dst, i);
+ dst_tdx_vcpu = to_tdx(dst_vcpu);
+
+ vcpu_load(dst_vcpu);
+
+ memcpy(dst_vcpu->arch.regs, src_vcpu->arch.regs,
+ NR_VCPU_REGS * sizeof(src_vcpu->arch.regs[0]));
+ dst_vcpu->arch.regs_avail = src_vcpu->arch.regs_avail;
+ dst_vcpu->arch.regs_dirty = src_vcpu->arch.regs_dirty;
+
+ dst_vcpu->arch.tsc_offset = dst_tdx->tsc_offset;
+
+ dst_tdx_vcpu->interrupt_disabled_hlt = src_tdx_vcpu->interrupt_disabled_hlt;
+ dst_tdx_vcpu->buggy_hlt_workaround = src_tdx_vcpu->buggy_hlt_workaround;
+
+ dst_tdx_vcpu->tdvpr_pa = src_tdx_vcpu->tdvpr_pa;
+ dst_tdx_vcpu->tdvpx_pa = kcalloc(tdx_info.nr_tdvpx_pages,
+ sizeof(*dst_tdx_vcpu->tdvpx_pa),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst_tdx_vcpu->tdvpx_pa) {
+ ret = -ENOMEM;
+ vcpu_put(dst_vcpu);
+ goto late_abort;
+ }
+ memcpy(dst_tdx_vcpu->tdvpx_pa, src_tdx_vcpu->tdvpx_pa,
+ tdx_info.nr_tdvpx_pages * sizeof(*dst_tdx_vcpu->tdvpx_pa));
+
+ td_vmcs_write64(dst_tdx_vcpu, POSTED_INTR_DESC_ADDR, __pa(&dst_tdx_vcpu->pi_desc));
+
+ /* Copy private EPT tables */
+ if (kvm_mmu_move_private_pages_from(dst_vcpu, src_vcpu)) {
+ ret = -EINVAL;
+ vcpu_put(dst_vcpu);
+ goto late_abort;
+ }
+
+ for (j = 0; j < tdx_info.nr_tdvpx_pages; j++)
+ src_tdx_vcpu->tdvpx_pa[j] = 0;
+
+ src_tdx_vcpu->tdvpr_pa = 0;
+
+ vcpu_put(dst_vcpu);
+ }
+
+ for_each_memslot_pair(src_slots, dst_slots, src_memslot_iter,
+ dst_memslot_iter) {
+ struct kvm_memory_slot *src_slot, *dst_slot;
+
+ src_slot = container_of(src_memslot_iter,
+ struct kvm_memory_slot,
+ gfn_node[src_slots->node_idx]);
+ dst_slot = container_of(src_memslot_iter,
+ struct kvm_memory_slot,
+ gfn_node[dst_slots->node_idx]);
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long ugfn;
+ int level = i + 1;
+
+ /*
+ * If the gfn and userspace address are not aligned wrt each other, then
+ * large page support should already be disabled at this level.
+ */
+ ugfn = dst_slot->userspace_addr >> PAGE_SHIFT;
+ if ((dst_slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ continue;
+
+ dst_slot->arch.lpage_info[i - 1] =
+ src_slot->arch.lpage_info[i - 1];
+ src_slot->arch.lpage_info[i - 1] = NULL;
+ }
+ }
+
+ dst->mem_attr_array.xa_head = src->mem_attr_array.xa_head;
+ src->mem_attr_array.xa_head = NULL;
+
+ dst_tdx->finalized = true;
+
+ /* Clear source VM to avoid freeing the hkid and pages on VM put */
+ src_tdx->hkid = -1;
+ src_tdx->tdr_pa = 0;
+ for (i = 0; i < tdx_info.nr_tdcs_pages; i++)
+ src_tdx->tdcs_pa[i] = 0;
+
+ return 0;
+
+late_abort:
+ /* If we aborted after the state transfer already started, the src VM
+ * is no longer valid.
+ */
+ kvm_vm_dead(src);
+
+abort:
+ dst_tdx->hkid = -1;
+ dst_tdx->tdr_pa = 0;
+
+ return ret;
}

int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
--
2.40.0.348.gf938b09366-goog

2023-04-14 07:06:18

by Zhi Wang

Subject: Re: [RFC PATCH 0/5] Add TDX intra host migration support

On Fri, 7 Apr 2023 20:19:16 +0000
Sagi Shahar <[email protected]> wrote:

Hi:

Is there any userspace using these APIs? I can't find them in the AMD-QEMU
repo or the upstream QEMU repo. It would be nice to first take a look at how
userspace is going to use them.

> This patchset adds support for TDX intra host migration using the same
> API which was added for SEV intra host migration here:
> https://lore.kernel.org/all/[email protected]/
>
> This patchset relies on the latest TDX patches from Intel:
> - fd-based approach for supporing KVM v10 and
> https://lore.kernel.org/lkml/[email protected]/
> - TDX host kernel support v10
> https://lore.kernel.org/lkml/[email protected]/
> - KVM TDX basic feature support v13
> https://lore.kernel.org/[email protected]
>
> The tree can be found at https://github.com/googleprodkernel/linux-cc/tree/copyless
> and is based on Intel's tdx tree at https://github.com/intel/tdx/tree/kvm-upstream
>
> In the TDX case, we need to transfer the VM state from multiple sources:
>
> * HKID and encrypted VM state is transfered between the kvm_tdx
> objects.
> * Encrypted and runtime state is transfered between the vcpu_tdx
> objects.
> * The EPT table backing TD's private memory is transfered at the
> kvm-mmu level. This is needed since the secure EPT table managed by
> the TD module remains the same after the migration so moving the
> current private EPT table eliminates the need to rebuild the private
> EPT table to match the secure EPT table on the destination.
> * Information regarding the current shared/private memory is trasfered
> using the mem_attr_array stored at the kvm object.
> * Additional information derived from shared/private memory state is
> trasfered at the memslot level.
>
> Tested with selftests locally. I will attach the self test in the next
> version after we send the new TDX selftest framework patches based on
> KVM TDX basic feature support v13.
>
> Sagi Shahar (5):
> KVM: Split tdp_mmu_pages to private and shared lists
> KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from
> KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from
> KVM: TDX: Implement moving private pages between 2 TDs
> KVM: TDX: Add core logic for TDX intra-host migration
>
> arch/x86/include/asm/kvm_host.h | 5 +-
> arch/x86/kvm/mmu.h | 2 +
> arch/x86/kvm/mmu/mmu.c | 60 ++++++++
> arch/x86/kvm/mmu/tdp_mmu.c | 88 +++++++++++-
> arch/x86/kvm/mmu/tdp_mmu.h | 3 +
> arch/x86/kvm/svm/sev.c | 175 +++--------------------
> arch/x86/kvm/vmx/main.c | 10 ++
> arch/x86/kvm/vmx/tdx.c | 245 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/vmx/tdx.h | 2 +
> arch/x86/kvm/vmx/x86_ops.h | 5 +
> arch/x86/kvm/x86.c | 166 ++++++++++++++++++++++
> arch/x86/kvm/x86.h | 16 +++
> 12 files changed, 613 insertions(+), 164 deletions(-)
>

2023-04-14 19:13:04

by Sagi Shahar

Subject: Re: [RFC PATCH 0/5] Add TDX intra host migration support

On Fri, Apr 14, 2023 at 12:04 AM Zhi Wang <[email protected]> wrote:
>
> On Fri, 7 Apr 2023 20:19:16 +0000
> Sagi Shahar <[email protected]> wrote:
>
> Hi:
>
> Is there any userspace using these APIs? I cant find them in AMD-QEMU repo
> and upstream QEMU repo. It would nice to first take a look on how userspace
> is going to use it.
>
We are using a different userspace VMM internally, so we didn't make
changes to QEMU.
I've uploaded our selftests which exercise these APIs to our public
GitHub so you can take a look there:
https://github.com/googleprodkernel/linux-cc/commit/62c8dba4c3cf06e375018077a6d9f491c933dc6d

Note that these are a slightly older version based on the TDX V10 API.
They also use KVM_CAP_VM_COPY_ENC_CONTEXT_FROM instead of
KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM.
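
For this patchset's API, the flow in a selftest would look roughly like
the sketch below (the td_* helpers are placeholders for illustration, not
the actual selftest API; vm_enable_cap() is the usual selftest wrapper
around KVM_ENABLE_CAP):

	/*
	 * Illustrative sketch only: the td_* helpers are placeholders,
	 * not the real TDX selftest API.
	 */
	struct kvm_vm *src_vm = td_create_and_finalize();	/* running, finalized TD */
	struct kvm_vm *dst_vm = td_create_empty_like(src_vm);	/* same vCPU count and
								 * memslot layout */

	/* Move the source's encryption context (HKID, S-EPT, vCPU state). */
	vm_enable_cap(dst_vm, KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, src_vm->fd);

	/* src_vm is now marked dead; execution resumes on dst_vm's vCPUs. */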

> > This patchset adds support for TDX intra host migration using the same
> > API which was added for SEV intra host migration here:
> > https://lore.kernel.org/all/[email protected]/
> >
> > This patchset relies on the latest TDX patches from Intel:
> > - fd-based approach for supporing KVM v10 and
> > https://lore.kernel.org/lkml/[email protected]/
> > - TDX host kernel support v10
> > https://lore.kernel.org/lkml/[email protected]/
> > - KVM TDX basic feature support v13
> > https://lore.kernel.org/[email protected]
> >
> > The tree can be found at https://github.com/googleprodkernel/linux-cc/tree/copyless
> > and is based on Intel's tdx tree at https://github.com/intel/tdx/tree/kvm-upstream
> >
> > In the TDX case, we need to transfer the VM state from multiple sources:
> >
> > * HKID and encrypted VM state is transfered between the kvm_tdx
> > objects.
> > * Encrypted and runtime state is transfered between the vcpu_tdx
> > objects.
> > * The EPT table backing TD's private memory is transfered at the
> > kvm-mmu level. This is needed since the secure EPT table managed by
> > the TD module remains the same after the migration so moving the
> > current private EPT table eliminates the need to rebuild the private
> > EPT table to match the secure EPT table on the destination.
> > * Information regarding the current shared/private memory is trasfered
> > using the mem_attr_array stored at the kvm object.
> > * Additional information derived from shared/private memory state is
> > trasfered at the memslot level.
> >
> > Tested with selftests locally. I will attach the self test in the next
> > version after we send the new TDX selftest framework patches based on
> > KVM TDX basic feature support v13.
> >
> > Sagi Shahar (5):
> > KVM: Split tdp_mmu_pages to private and shared lists
> > KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from
> > KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from
> > KVM: TDX: Implement moving private pages between 2 TDs
> > KVM: TDX: Add core logic for TDX intra-host migration
> >
> > arch/x86/include/asm/kvm_host.h | 5 +-
> > arch/x86/kvm/mmu.h | 2 +
> > arch/x86/kvm/mmu/mmu.c | 60 ++++++++
> > arch/x86/kvm/mmu/tdp_mmu.c | 88 +++++++++++-
> > arch/x86/kvm/mmu/tdp_mmu.h | 3 +
> > arch/x86/kvm/svm/sev.c | 175 +++--------------------
> > arch/x86/kvm/vmx/main.c | 10 ++
> > arch/x86/kvm/vmx/tdx.c | 245 ++++++++++++++++++++++++++++++++
> > arch/x86/kvm/vmx/tdx.h | 2 +
> > arch/x86/kvm/vmx/x86_ops.h | 5 +
> > arch/x86/kvm/x86.c | 166 ++++++++++++++++++++++
> > arch/x86/kvm/x86.h | 16 +++
> > 12 files changed, 613 insertions(+), 164 deletions(-)
> >
>

2023-04-17 19:53:28

by Zhi Wang

Subject: Re: [RFC PATCH 2/5] KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from

On Fri, 7 Apr 2023 20:19:18 +0000
Sagi Shahar <[email protected]> wrote:

> Both SEV and TDX are going to use similar flows for intra-host
> migration. This change moves some of the code which will be used by both
> architecture into shared code in x86.h
>
> Signed-off-by: Sagi Shahar <[email protected]>
> ---
> arch/x86/kvm/svm/sev.c | 175 +++++------------------------------------
> arch/x86/kvm/x86.c | 166 ++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/x86.h | 16 ++++
> 3 files changed, 201 insertions(+), 156 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index c25aeb550cd97..18831a0b7734e 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -1553,116 +1553,6 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
> return false;
> }
>
> -static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
> -{
> - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
> - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
> - int r = -EBUSY;
> -
> - if (dst_kvm == src_kvm)
> - return -EINVAL;
> -
> - /*
> - * Bail if these VMs are already involved in a migration to avoid
> - * deadlock between two VMs trying to migrate to/from each other.
> - */
> - if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
> - return -EBUSY;
> -
> - if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
> - goto release_dst;
> -
> - r = -EINTR;
> - if (mutex_lock_killable(&dst_kvm->lock))
> - goto release_src;
> - if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
> - goto unlock_dst;
> - return 0;
> -
> -unlock_dst:
> - mutex_unlock(&dst_kvm->lock);
> -release_src:
> - atomic_set_release(&src_sev->migration_in_progress, 0);
> -release_dst:
> - atomic_set_release(&dst_sev->migration_in_progress, 0);
> - return r;
> -}
> -
> -static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
> -{
> - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
> - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
> -
> - mutex_unlock(&dst_kvm->lock);
> - mutex_unlock(&src_kvm->lock);
> - atomic_set_release(&dst_sev->migration_in_progress, 0);
> - atomic_set_release(&src_sev->migration_in_progress, 0);
> -}
> -
> -/* vCPU mutex subclasses. */
> -enum sev_migration_role {
> - SEV_MIGRATION_SOURCE = 0,
> - SEV_MIGRATION_TARGET,
> - SEV_NR_MIGRATION_ROLES,
> -};
> -
> -static int sev_lock_vcpus_for_migration(struct kvm *kvm,
> - enum sev_migration_role role)
> -{
> - struct kvm_vcpu *vcpu;
> - unsigned long i, j;
> -
> - kvm_for_each_vcpu(i, vcpu, kvm) {
> - if (mutex_lock_killable_nested(&vcpu->mutex, role))
> - goto out_unlock;
> -
> -#ifdef CONFIG_PROVE_LOCKING
> - if (!i)
> - /*
> - * Reset the role to one that avoids colliding with
> - * the role used for the first vcpu mutex.
> - */
> - role = SEV_NR_MIGRATION_ROLES;
> - else
> - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
> -#endif
> - }
> -
> - return 0;
> -
> -out_unlock:
> -
> - kvm_for_each_vcpu(j, vcpu, kvm) {
> - if (i == j)
> - break;
> -
> -#ifdef CONFIG_PROVE_LOCKING
> - if (j)
> - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
> -#endif
> -
> - mutex_unlock(&vcpu->mutex);
> - }
> - return -EINTR;
> -}
> -
> -static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
> -{
> - struct kvm_vcpu *vcpu;
> - unsigned long i;
> - bool first = true;
> -
> - kvm_for_each_vcpu(i, vcpu, kvm) {
> - if (first)
> - first = false;
> - else
> - mutex_acquire(&vcpu->mutex.dep_map,
> - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
> -
> - mutex_unlock(&vcpu->mutex);
> - }
> -}
> -
> static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
> {
> struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
> @@ -1744,25 +1634,6 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
> }
> }
>
> -static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
> -{
> - struct kvm_vcpu *src_vcpu;
> - unsigned long i;
> -
> - if (!sev_es_guest(src))
> - return 0;
> -
> - if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
> - return -EINVAL;
> -
> - kvm_for_each_vcpu(i, src_vcpu, src) {
> - if (!src_vcpu->arch.guest_state_protected)
> - return -EINVAL;
> - }
> -
> - return 0;
> -}
> -
> int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> {
> struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
> @@ -1777,19 +1648,20 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> ret = -EBADF;
> goto out_fput;
> }
> -
> source_kvm = source_kvm_file->private_data;
> - ret = sev_lock_two_vms(kvm, source_kvm);
> + src_sev = &to_kvm_svm(source_kvm)->sev_info;
> +
> + ret = pre_move_enc_context_from(kvm, source_kvm,
> + &dst_sev->migration_in_progress,
> + &src_sev->migration_in_progress);
> if (ret)
> goto out_fput;
>
> - if (sev_guest(kvm) || !sev_guest(source_kvm)) {
> + if (sev_guest(kvm) || !sev_es_guest(source_kvm)) {
> ret = -EINVAL;
> - goto out_unlock;
> + goto out_post;
> }
>
> - src_sev = &to_kvm_svm(source_kvm)->sev_info;
> -
> dst_sev->misc_cg = get_current_misc_cg();
> cg_cleanup_sev = dst_sev;
> if (dst_sev->misc_cg != src_sev->misc_cg) {
> @@ -1799,34 +1671,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> charged = true;
> }
>
> - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
> - if (ret)
> - goto out_dst_cgroup;
> - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
> - if (ret)
> - goto out_dst_vcpu;
> -
> - ret = sev_check_source_vcpus(kvm, source_kvm);
> - if (ret)
> - goto out_source_vcpu;
> -
> sev_migrate_from(kvm, source_kvm);
> kvm_vm_dead(source_kvm);
> cg_cleanup_sev = src_sev;
> ret = 0;
>
> -out_source_vcpu:
> - sev_unlock_vcpus_for_migration(source_kvm);
> -out_dst_vcpu:
> - sev_unlock_vcpus_for_migration(kvm);
> out_dst_cgroup:
> /* Operates on the source on success, on the destination on failure. */
> if (charged)
> sev_misc_cg_uncharge(cg_cleanup_sev);
> put_misc_cg(cg_cleanup_sev->misc_cg);
> cg_cleanup_sev->misc_cg = NULL;
> -out_unlock:
> - sev_unlock_two_vms(kvm, source_kvm);
> +out_post:
> + post_move_enc_context_from(kvm, source_kvm,
> + &dst_sev->migration_in_progress,
> + &src_sev->migration_in_progress);
> out_fput:
> if (source_kvm_file)
> fput(source_kvm_file);
> @@ -2058,7 +1917,11 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> }
>
> source_kvm = source_kvm_file->private_data;
> - ret = sev_lock_two_vms(kvm, source_kvm);
> + source_sev = &to_kvm_svm(source_kvm)->sev_info;
> + mirror_sev = &to_kvm_svm(kvm)->sev_info;
> + ret = lock_two_vms_for_migration(kvm, source_kvm,
> + &mirror_sev->migration_in_progress,
> + &source_sev->migration_in_progress);
> if (ret)
> goto e_source_fput;
>
> @@ -2078,9 +1941,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> * The mirror kvm holds an enc_context_owner ref so its asid can't
> * disappear until we're done with it
> */
> - source_sev = &to_kvm_svm(source_kvm)->sev_info;
> kvm_get_kvm(source_kvm);
> - mirror_sev = &to_kvm_svm(kvm)->sev_info;
> list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
>
> /* Set enc_context_owner and copy its encryption context over */
> @@ -2101,7 +1962,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> */
>
> e_unlock:
> - sev_unlock_two_vms(kvm, source_kvm);
> + unlock_two_vms_for_migration(kvm, source_kvm,
> + &mirror_sev->migration_in_progress,
> + &source_sev->migration_in_progress);
> e_source_fput:
> if (source_kvm_file)
> fput(source_kvm_file);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 870041887ed91..865c434a94899 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13596,6 +13596,172 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> }
> EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
>
> +/* vCPU mutex subclasses. */
> +enum migration_role {
> + MIGRATION_SOURCE = 0,
> + MIGRATION_TARGET,
> + NR_MIGRATION_ROLES,
> +};
> +

> +static int lock_vcpus_for_migration(struct kvm *kvm, enum migration_role role)
> +{
> + struct kvm_vcpu *vcpu;
> + unsigned long i, j;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (mutex_lock_killable_nested(&vcpu->mutex, role))
> + goto out_unlock;
> +
> +#ifdef CONFIG_PROVE_LOCKING
> + if (!i)
> + /*
> + * Reset the role to one that avoids colliding with
> + * the role used for the first vcpu mutex.
> + */
> + role = NR_MIGRATION_ROLES;
> + else
> + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
> +#endif
> + }
> +
> + return 0;
> +
> +out_unlock:
> +
> + kvm_for_each_vcpu(j, vcpu, kvm) {
> + if (i == j)
> + break;
> +
> +#ifdef CONFIG_PROVE_LOCKING
> + if (j)
> + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
> +#endif
> +
> + mutex_unlock(&vcpu->mutex);
> + }
> + return -EINTR;
> +}
> +
> +static void unlock_vcpus_for_migration(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu;
> + unsigned long i;
> + bool first = true;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (first)
> + first = false;
> + else
> + mutex_acquire(&vcpu->mutex.dep_map, NR_MIGRATION_ROLES,
> + 0, _THIS_IP_);
> +
> + mutex_unlock(&vcpu->mutex);
> + }
> +}
> +
> +int lock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress)
> +{
> + int r = -EBUSY;
> +
> + if (dst_kvm == src_kvm)
> + return -EINVAL;
> +
> + /*
> + * Bail if these VMs are already involved in a migration to avoid
> + * deadlock between two VMs trying to migrate to/from each other.
> + */
> + if (atomic_cmpxchg_acquire(dst_migration_in_progress, 0, 1))
> + return -EBUSY;
> +
> + if (atomic_cmpxchg_acquire(src_migration_in_progress, 0, 1))
> + goto release_dst;
> +
> + r = -EINTR;
> + if (mutex_lock_killable(&dst_kvm->lock))
> + goto release_src;
> + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
> + goto unlock_dst;
> + return 0;
> +
> +unlock_dst:
> + mutex_unlock(&dst_kvm->lock);
> +release_src:
> + atomic_set_release(src_migration_in_progress, 0);
> +release_dst:
> + atomic_set_release(dst_migration_in_progress, 0);
> + return r;
> +}
> +EXPORT_SYMBOL_GPL(lock_two_vms_for_migration);
> +
> +void unlock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress)
> +{
> + mutex_unlock(&dst_kvm->lock);
> + mutex_unlock(&src_kvm->lock);
> + atomic_set_release(dst_migration_in_progress, 0);
> + atomic_set_release(src_migration_in_progress, 0);
> +}
> +EXPORT_SYMBOL_GPL(unlock_two_vms_for_migration);
> +
> +int pre_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress)
> +{
> + struct kvm_vcpu *src_vcpu;
> + unsigned long i;
> + int ret = -EINVAL;
> +
> + ret = lock_two_vms_for_migration(dst_kvm, src_kvm,
> + dst_migration_in_progress,
> + src_migration_in_progress);
> + if (ret)
> + return ret;
> +
> + ret = lock_vcpus_for_migration(dst_kvm, MIGRATION_TARGET);
> + if (ret)
> + goto unlock_vms;
> +
> + ret = lock_vcpus_for_migration(src_kvm, MIGRATION_SOURCE);
> + if (ret)
> + goto unlock_dst_vcpu;
> +
> + if (atomic_read(&dst_kvm->online_vcpus) !=
> + atomic_read(&src_kvm->online_vcpus))
> + goto unlock_dst_vcpu;
> +
> + kvm_for_each_vcpu(i, src_vcpu, src_kvm) {
> + if (!src_vcpu->arch.guest_state_protected)
> + goto unlock_dst_vcpu;
> + }
> +
> + return 0;
> +
> +unlock_dst_vcpu:
> + unlock_vcpus_for_migration(dst_kvm);
> +unlock_vms:
> + unlock_two_vms_for_migration(dst_kvm, src_kvm,
> + dst_migration_in_progress,
> + src_migration_in_progress);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(pre_move_enc_context_from);
> +
> +void post_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress)
> +{
> + unlock_vcpus_for_migration(src_kvm);
> + unlock_vcpus_for_migration(dst_kvm);
> + unlock_two_vms_for_migration(dst_kvm, src_kvm,
> + dst_migration_in_progress,
> + src_migration_in_progress);
> +}
> +EXPORT_SYMBOL_GPL(post_move_enc_context_from);
> +

It would be nice to have a kvm_ prefix for the exported functions.

> bool kvm_arch_dirty_log_supported(struct kvm *kvm)
> {
> return kvm->arch.vm_type != KVM_X86_PROTECTED_VM;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 33a1a5341e788..554c797184994 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -502,4 +502,20 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> unsigned int port, void *data, unsigned int count,
> int in);
>
> +int lock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress);
> +
> +void unlock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress);
> +
> +int pre_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress);
> +
> +void post_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> + atomic_t *dst_migration_in_progress,
> + atomic_t *src_migration_in_progress);
> +
> #endif

2023-04-18 06:44:13

by Zhi Wang

Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Fri, 7 Apr 2023 20:19:19 +0000
Sagi Shahar <[email protected]> wrote:

Is there any reason that TDX doesn't need .vm_copy_enc_context_from? Or is it
going to be deprecated? The patch comments need to be refined according to
Sean's KVM x86 maintainer book.

> This should mostly match the logic in sev_vm_move_enc_context_from.
>
> Signed-off-by: Sagi Shahar <[email protected]>
> ---
> arch/x86/kvm/vmx/main.c | 10 +++++++
> arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/vmx/tdx.h | 2 ++
> arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> 4 files changed, 73 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 5b64fe5404958..9d5d0ac465bf6 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> return tdx_vcpu_ioctl(vcpu, argp);
> }
>
> +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> +{
> + if (!is_td(kvm))
> + return -ENOTTY;
> +
> + return tdx_vm_move_enc_context_from(kvm, source_fd);
> +}
> +
> #define VMX_REQUIRED_APICV_INHIBITS \
> ( \
> BIT(APICV_INHIBIT_REASON_DISABLE)| \
> @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> .dev_mem_enc_ioctl = tdx_dev_ioctl,
> .mem_enc_ioctl = vt_mem_enc_ioctl,
> .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> +
> + .vm_move_enc_context_from = vt_move_enc_context_from,
> };
>
> struct kvm_x86_init_ops vt_init_ops __initdata = {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 8af7e4e81c860..0999a6d827c99 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> return 0;
> }
> +
> +static __always_inline bool tdx_guest(struct kvm *kvm)
> +{
> + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> +
> + return tdx_kvm->finalized;
> +}
> +
> +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> +{
> + return -EINVAL;
> +}
> +
> +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> +{
> + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> + struct file *src_kvm_file;
> + struct kvm_tdx *src_tdx;
> + struct kvm *src_kvm;
> + int ret;
> +
> + src_kvm_file = fget(source_fd);
> + if (!file_is_kvm(src_kvm_file)) {
> + ret = -EBADF;
> + goto out_fput;
> + }
> + src_kvm = src_kvm_file->private_data;
> + src_tdx = to_kvm_tdx(src_kvm);
> +
> + ret = pre_move_enc_context_from(kvm, src_kvm,
> + &dst_tdx->migration_in_progress,
> + &src_tdx->migration_in_progress);
> + if (ret)
> + goto out_fput;
> +
> + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> + ret = -EINVAL;
> + goto out_post;
> + }
> +
> + ret = tdx_migrate_from(kvm, src_kvm);
> + if (ret)
> + goto out_post;
> +
> + kvm_vm_dead(src_kvm);
> + ret = 0;
> +
> +out_post:
> + post_move_enc_context_from(kvm, src_kvm,
> + &dst_tdx->migration_in_progress,
> + &src_tdx->migration_in_progress);
> +out_fput:
> + if (src_kvm_file)
> + fput(src_kvm_file);
> + return ret;
> +}
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 71818c5001862..21b7e710be1fd 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -24,6 +24,8 @@ struct kvm_tdx {
> atomic_t tdh_mem_track;
>
> u64 tsc_offset;
> +
> + atomic_t migration_in_progress;
> };
>
> union tdx_exit_reason {
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index d049e0c72ed0c..275f5d75e9bf1 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> +
> +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> #else
> static inline int tdx_init(void) { return 0; };
> static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> +
> +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> +	unsigned int source_fd) { return -EOPNOTSUPP; }
> #endif
>
> #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)

2023-04-18 12:14:38

by Zhi Wang

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Fri, 7 Apr 2023 20:19:19 +0000
Sagi Shahar <[email protected]> wrote:

What was the status of the src VM when calling vm_move_enc_context_from?
Is it still active, like in common live migration, or has it been paused?

> This should mostly match the logic in sev_vm_move_enc_context_from.
>
> Signed-off-by: Sagi Shahar <[email protected]>
> ---
> arch/x86/kvm/vmx/main.c | 10 +++++++
> arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/vmx/tdx.h | 2 ++
> arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> 4 files changed, 73 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 5b64fe5404958..9d5d0ac465bf6 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> return tdx_vcpu_ioctl(vcpu, argp);
> }
>
> +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> +{
> + if (!is_td(kvm))
> + return -ENOTTY;
> +
> + return tdx_vm_move_enc_context_from(kvm, source_fd);
> +}
> +
> #define VMX_REQUIRED_APICV_INHIBITS \
> ( \
> BIT(APICV_INHIBIT_REASON_DISABLE)| \
> @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> .dev_mem_enc_ioctl = tdx_dev_ioctl,
> .mem_enc_ioctl = vt_mem_enc_ioctl,
> .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> +
> + .vm_move_enc_context_from = vt_move_enc_context_from,
> };
>
> struct kvm_x86_init_ops vt_init_ops __initdata = {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 8af7e4e81c860..0999a6d827c99 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> return 0;
> }
> +
> +static __always_inline bool tdx_guest(struct kvm *kvm)
> +{
> + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> +
> + return tdx_kvm->finalized;
> +}
return is_td_finalized()?
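I.e. something like this (untested; assuming is_td_finalized() from the base
TDX series takes the struct kvm_tdx pointer):

static __always_inline bool tdx_guest(struct kvm *kvm)
{
	/* Reuse the existing helper instead of open coding ->finalized. */
	return is_td_finalized(to_kvm_tdx(kvm));
}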
> +
> +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> +{
> + return -EINVAL;
> +}
> +
> +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> +{
> + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> + struct file *src_kvm_file;
> + struct kvm_tdx *src_tdx;
> + struct kvm *src_kvm;
> + int ret;
> +
> + src_kvm_file = fget(source_fd);
> + if (!file_is_kvm(src_kvm_file)) {
> + ret = -EBADF;
> + goto out_fput;
> + }
> + src_kvm = src_kvm_file->private_data;
> + src_tdx = to_kvm_tdx(src_kvm);
> +
> + ret = pre_move_enc_context_from(kvm, src_kvm,
> + &dst_tdx->migration_in_progress,
> + &src_tdx->migration_in_progress);
> + if (ret)
> + goto out_fput;
> +
> + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> + ret = -EINVAL;
> + goto out_post;
> + }
> +
> + ret = tdx_migrate_from(kvm, src_kvm);
> + if (ret)
> + goto out_post;
> +
> + kvm_vm_dead(src_kvm);
> + ret = 0;
> +
> +out_post:
> + post_move_enc_context_from(kvm, src_kvm,
> + &dst_tdx->migration_in_progress,
> + &src_tdx->migration_in_progress);
> +out_fput:
> + if (src_kvm_file)
> + fput(src_kvm_file);
> + return ret;
> +}
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 71818c5001862..21b7e710be1fd 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -24,6 +24,8 @@ struct kvm_tdx {
> atomic_t tdh_mem_track;
>
> u64 tsc_offset;
> +
> + atomic_t migration_in_progress;
> };
>
> union tdx_exit_reason {
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index d049e0c72ed0c..275f5d75e9bf1 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> +
> +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> #else
> static inline int tdx_init(void) { return 0; };
> static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> +
> +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> +	unsigned int source_fd) { return -EOPNOTSUPP; }
> #endif
>
> #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)

2023-04-18 17:27:38

by Sagi Shahar

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from

On Mon, Apr 17, 2023 at 12:45 PM Zhi Wang <[email protected]> wrote:
>
> On Fri, 7 Apr 2023 20:19:18 +0000
> Sagi Shahar <[email protected]> wrote:
>
> > Both SEV and TDX are going to use similar flows for intra-host
> > migration. This change moves some of the code which will be used by both
> > architecture into shared code in x86.h
> >
> > Signed-off-by: Sagi Shahar <[email protected]>
> > ---
> > arch/x86/kvm/svm/sev.c | 175 +++++------------------------------------
> > arch/x86/kvm/x86.c | 166 ++++++++++++++++++++++++++++++++++++++
> > arch/x86/kvm/x86.h | 16 ++++
> > 3 files changed, 201 insertions(+), 156 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> > index c25aeb550cd97..18831a0b7734e 100644
> > --- a/arch/x86/kvm/svm/sev.c
> > +++ b/arch/x86/kvm/svm/sev.c
> > @@ -1553,116 +1553,6 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
> > return false;
> > }
> >
> > -static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
> > -{
> > - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
> > - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
> > - int r = -EBUSY;
> > -
> > - if (dst_kvm == src_kvm)
> > - return -EINVAL;
> > -
> > - /*
> > - * Bail if these VMs are already involved in a migration to avoid
> > - * deadlock between two VMs trying to migrate to/from each other.
> > - */
> > - if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
> > - return -EBUSY;
> > -
> > - if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
> > - goto release_dst;
> > -
> > - r = -EINTR;
> > - if (mutex_lock_killable(&dst_kvm->lock))
> > - goto release_src;
> > - if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
> > - goto unlock_dst;
> > - return 0;
> > -
> > -unlock_dst:
> > - mutex_unlock(&dst_kvm->lock);
> > -release_src:
> > - atomic_set_release(&src_sev->migration_in_progress, 0);
> > -release_dst:
> > - atomic_set_release(&dst_sev->migration_in_progress, 0);
> > - return r;
> > -}
> > -
> > -static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
> > -{
> > - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
> > - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
> > -
> > - mutex_unlock(&dst_kvm->lock);
> > - mutex_unlock(&src_kvm->lock);
> > - atomic_set_release(&dst_sev->migration_in_progress, 0);
> > - atomic_set_release(&src_sev->migration_in_progress, 0);
> > -}
> > -
> > -/* vCPU mutex subclasses. */
> > -enum sev_migration_role {
> > - SEV_MIGRATION_SOURCE = 0,
> > - SEV_MIGRATION_TARGET,
> > - SEV_NR_MIGRATION_ROLES,
> > -};
> > -
> > -static int sev_lock_vcpus_for_migration(struct kvm *kvm,
> > - enum sev_migration_role role)
> > -{
> > - struct kvm_vcpu *vcpu;
> > - unsigned long i, j;
> > -
> > - kvm_for_each_vcpu(i, vcpu, kvm) {
> > - if (mutex_lock_killable_nested(&vcpu->mutex, role))
> > - goto out_unlock;
> > -
> > -#ifdef CONFIG_PROVE_LOCKING
> > - if (!i)
> > - /*
> > - * Reset the role to one that avoids colliding with
> > - * the role used for the first vcpu mutex.
> > - */
> > - role = SEV_NR_MIGRATION_ROLES;
> > - else
> > - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
> > -#endif
> > - }
> > -
> > - return 0;
> > -
> > -out_unlock:
> > -
> > - kvm_for_each_vcpu(j, vcpu, kvm) {
> > - if (i == j)
> > - break;
> > -
> > -#ifdef CONFIG_PROVE_LOCKING
> > - if (j)
> > - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
> > -#endif
> > -
> > - mutex_unlock(&vcpu->mutex);
> > - }
> > - return -EINTR;
> > -}
> > -
> > -static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
> > -{
> > - struct kvm_vcpu *vcpu;
> > - unsigned long i;
> > - bool first = true;
> > -
> > - kvm_for_each_vcpu(i, vcpu, kvm) {
> > - if (first)
> > - first = false;
> > - else
> > - mutex_acquire(&vcpu->mutex.dep_map,
> > - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
> > -
> > - mutex_unlock(&vcpu->mutex);
> > - }
> > -}
> > -
> > static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
> > {
> > struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
> > @@ -1744,25 +1634,6 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
> > }
> > }
> >
> > -static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
> > -{
> > - struct kvm_vcpu *src_vcpu;
> > - unsigned long i;
> > -
> > - if (!sev_es_guest(src))
> > - return 0;
> > -
> > - if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
> > - return -EINVAL;
> > -
> > - kvm_for_each_vcpu(i, src_vcpu, src) {
> > - if (!src_vcpu->arch.guest_state_protected)
> > - return -EINVAL;
> > - }
> > -
> > - return 0;
> > -}
> > -
> > int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > {
> > struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
> > @@ -1777,19 +1648,20 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > ret = -EBADF;
> > goto out_fput;
> > }
> > -
> > source_kvm = source_kvm_file->private_data;
> > - ret = sev_lock_two_vms(kvm, source_kvm);
> > + src_sev = &to_kvm_svm(source_kvm)->sev_info;
> > +
> > + ret = pre_move_enc_context_from(kvm, source_kvm,
> > + &dst_sev->migration_in_progress,
> > + &src_sev->migration_in_progress);
> > if (ret)
> > goto out_fput;
> >
> > - if (sev_guest(kvm) || !sev_guest(source_kvm)) {
> > + if (sev_guest(kvm) || !sev_es_guest(source_kvm)) {
> > ret = -EINVAL;
> > - goto out_unlock;
> > + goto out_post;
> > }
> >
> > - src_sev = &to_kvm_svm(source_kvm)->sev_info;
> > -
> > dst_sev->misc_cg = get_current_misc_cg();
> > cg_cleanup_sev = dst_sev;
> > if (dst_sev->misc_cg != src_sev->misc_cg) {
> > @@ -1799,34 +1671,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > charged = true;
> > }
> >
> > - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
> > - if (ret)
> > - goto out_dst_cgroup;
> > - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
> > - if (ret)
> > - goto out_dst_vcpu;
> > -
> > - ret = sev_check_source_vcpus(kvm, source_kvm);
> > - if (ret)
> > - goto out_source_vcpu;
> > -
> > sev_migrate_from(kvm, source_kvm);
> > kvm_vm_dead(source_kvm);
> > cg_cleanup_sev = src_sev;
> > ret = 0;
> >
> > -out_source_vcpu:
> > - sev_unlock_vcpus_for_migration(source_kvm);
> > -out_dst_vcpu:
> > - sev_unlock_vcpus_for_migration(kvm);
> > out_dst_cgroup:
> > /* Operates on the source on success, on the destination on failure. */
> > if (charged)
> > sev_misc_cg_uncharge(cg_cleanup_sev);
> > put_misc_cg(cg_cleanup_sev->misc_cg);
> > cg_cleanup_sev->misc_cg = NULL;
> > -out_unlock:
> > - sev_unlock_two_vms(kvm, source_kvm);
> > +out_post:
> > + post_move_enc_context_from(kvm, source_kvm,
> > + &dst_sev->migration_in_progress,
> > + &src_sev->migration_in_progress);
> > out_fput:
> > if (source_kvm_file)
> > fput(source_kvm_file);
> > @@ -2058,7 +1917,11 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > }
> >
> > source_kvm = source_kvm_file->private_data;
> > - ret = sev_lock_two_vms(kvm, source_kvm);
> > + source_sev = &to_kvm_svm(source_kvm)->sev_info;
> > + mirror_sev = &to_kvm_svm(kvm)->sev_info;
> > + ret = lock_two_vms_for_migration(kvm, source_kvm,
> > + &mirror_sev->migration_in_progress,
> > + &source_sev->migration_in_progress);
> > if (ret)
> > goto e_source_fput;
> >
> > @@ -2078,9 +1941,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > * The mirror kvm holds an enc_context_owner ref so its asid can't
> > * disappear until we're done with it
> > */
> > - source_sev = &to_kvm_svm(source_kvm)->sev_info;
> > kvm_get_kvm(source_kvm);
> > - mirror_sev = &to_kvm_svm(kvm)->sev_info;
> > list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
> >
> > /* Set enc_context_owner and copy its encryption context over */
> > @@ -2101,7 +1962,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > */
> >
> > e_unlock:
> > - sev_unlock_two_vms(kvm, source_kvm);
> > + unlock_two_vms_for_migration(kvm, source_kvm,
> > + &mirror_sev->migration_in_progress,
> > + &source_sev->migration_in_progress);
> > e_source_fput:
> > if (source_kvm_file)
> > fput(source_kvm_file);
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 870041887ed91..865c434a94899 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -13596,6 +13596,172 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> > }
> > EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
> >
> > +/* vCPU mutex subclasses. */
> > +enum migration_role {
> > + MIGRATION_SOURCE = 0,
> > + MIGRATION_TARGET,
> > + NR_MIGRATION_ROLES,
> > +};
> > +
>
> > +static int lock_vcpus_for_migration(struct kvm *kvm, enum migration_role role)
> > +{
> > + struct kvm_vcpu *vcpu;
> > + unsigned long i, j;
> > +
> > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > + if (mutex_lock_killable_nested(&vcpu->mutex, role))
> > + goto out_unlock;
> > +
> > +#ifdef CONFIG_PROVE_LOCKING
> > + if (!i)
> > + /*
> > + * Reset the role to one that avoids colliding with
> > + * the role used for the first vcpu mutex.
> > + */
> > + role = NR_MIGRATION_ROLES;
> > + else
> > + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
> > +#endif
> > + }
> > +
> > + return 0;
> > +
> > +out_unlock:
> > +
> > + kvm_for_each_vcpu(j, vcpu, kvm) {
> > + if (i == j)
> > + break;
> > +
> > +#ifdef CONFIG_PROVE_LOCKING
> > + if (j)
> > + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
> > +#endif
> > +
> > + mutex_unlock(&vcpu->mutex);
> > + }
> > + return -EINTR;
> > +}
> > +
> > +static void unlock_vcpus_for_migration(struct kvm *kvm)
> > +{
> > + struct kvm_vcpu *vcpu;
> > + unsigned long i;
> > + bool first = true;
> > +
> > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > + if (first)
> > + first = false;
> > + else
> > + mutex_acquire(&vcpu->mutex.dep_map, NR_MIGRATION_ROLES,
> > + 0, _THIS_IP_);
> > +
> > + mutex_unlock(&vcpu->mutex);
> > + }
> > +}
> > +
> > +int lock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress)
> > +{
> > + int r = -EBUSY;
> > +
> > + if (dst_kvm == src_kvm)
> > + return -EINVAL;
> > +
> > + /*
> > + * Bail if these VMs are already involved in a migration to avoid
> > + * deadlock between two VMs trying to migrate to/from each other.
> > + */
> > + if (atomic_cmpxchg_acquire(dst_migration_in_progress, 0, 1))
> > + return -EBUSY;
> > +
> > + if (atomic_cmpxchg_acquire(src_migration_in_progress, 0, 1))
> > + goto release_dst;
> > +
> > + r = -EINTR;
> > + if (mutex_lock_killable(&dst_kvm->lock))
> > + goto release_src;
> > + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
> > + goto unlock_dst;
> > + return 0;
> > +
> > +unlock_dst:
> > + mutex_unlock(&dst_kvm->lock);
> > +release_src:
> > + atomic_set_release(src_migration_in_progress, 0);
> > +release_dst:
> > + atomic_set_release(dst_migration_in_progress, 0);
> > + return r;
> > +}
> > +EXPORT_SYMBOL_GPL(lock_two_vms_for_migration);
> > +
> > +void unlock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress)
> > +{
> > + mutex_unlock(&dst_kvm->lock);
> > + mutex_unlock(&src_kvm->lock);
> > + atomic_set_release(dst_migration_in_progress, 0);
> > + atomic_set_release(src_migration_in_progress, 0);
> > +}
> > +EXPORT_SYMBOL_GPL(unlock_two_vms_for_migration);
> > +
> > +int pre_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress)
> > +{
> > + struct kvm_vcpu *src_vcpu;
> > + unsigned long i;
> > + int ret = -EINVAL;
> > +
> > + ret = lock_two_vms_for_migration(dst_kvm, src_kvm,
> > + dst_migration_in_progress,
> > + src_migration_in_progress);
> > + if (ret)
> > + return ret;
> > +
> > + ret = lock_vcpus_for_migration(dst_kvm, MIGRATION_TARGET);
> > + if (ret)
> > + goto unlock_vms;
> > +
> > + ret = lock_vcpus_for_migration(src_kvm, MIGRATION_SOURCE);
> > + if (ret)
> > + goto unlock_dst_vcpu;
> > +
> > + if (atomic_read(&dst_kvm->online_vcpus) !=
> > + atomic_read(&src_kvm->online_vcpus))
> > + goto unlock_dst_vcpu;
> > +
> > + kvm_for_each_vcpu(i, src_vcpu, src_kvm) {
> > + if (!src_vcpu->arch.guest_state_protected)
> > + goto unlock_dst_vcpu;
> > + }
> > +
> > + return 0;
> > +
> > +unlock_dst_vcpu:
> > + unlock_vcpus_for_migration(dst_kvm);
> > +unlock_vms:
> > + unlock_two_vms_for_migration(dst_kvm, src_kvm,
> > + dst_migration_in_progress,
> > + src_migration_in_progress);
> > +
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(pre_move_enc_context_from);
> > +
> > +void post_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress)
> > +{
> > + unlock_vcpus_for_migration(src_kvm);
> > + unlock_vcpus_for_migration(dst_kvm);
> > + unlock_two_vms_for_migration(dst_kvm, src_kvm,
> > + dst_migration_in_progress,
> > + src_migration_in_progress);
> > +}
> > +EXPORT_SYMBOL_GPL(post_move_enc_context_from);
> > +
>
> It would be nice to have a kvm_ prefix for the exported functions.

Sure, I'll update it in the next version.
>
> > bool kvm_arch_dirty_log_supported(struct kvm *kvm)
> > {
> > return kvm->arch.vm_type != KVM_X86_PROTECTED_VM;
> > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> > index 33a1a5341e788..554c797184994 100644
> > --- a/arch/x86/kvm/x86.h
> > +++ b/arch/x86/kvm/x86.h
> > @@ -502,4 +502,20 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> > unsigned int port, void *data, unsigned int count,
> > int in);
> >
> > +int lock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress);
> > +
> > +void unlock_two_vms_for_migration(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress);
> > +
> > +int pre_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress);
> > +
> > +void post_move_enc_context_from(struct kvm *dst_kvm, struct kvm *src_kvm,
> > + atomic_t *dst_migration_in_progress,
> > + atomic_t *src_migration_in_progress);
> > +
> > #endif

2023-04-18 17:49:49

by Sagi Shahar

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Mon, Apr 17, 2023 at 11:28 PM Zhi Wang <[email protected]> wrote:
>
> On Fri, 7 Apr 2023 20:19:19 +0000
> Sagi Shahar <[email protected]> wrote:
>
> Is there any reason that TDX doesn't need .vm_copy_enc_context_from? Or is it
> going to be deprecated? The patch comments need to be refined according to
> Sean's KVM x86 maintainer book.

To clarify, there are 2 types of migrations: live migration (between
different hosts) and intra-host migration (between kvm instances on the
same host). This patchset deals with intra-host migration and doesn't
add support for live migration.

vm_copy_enc_context_from is currently used for setting up the
migration helper for SEV live migration and is therefore not needed
in this patchset.

>
> > This should mostly match the logic in sev_vm_move_enc_context_from.
> >
> > Signed-off-by: Sagi Shahar <[email protected]>
> > ---
> > arch/x86/kvm/vmx/main.c | 10 +++++++
> > arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> > arch/x86/kvm/vmx/tdx.h | 2 ++
> > arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> > 4 files changed, 73 insertions(+)
> >
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 5b64fe5404958..9d5d0ac465bf6 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > return tdx_vcpu_ioctl(vcpu, argp);
> > }
> >
> > +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > +{
> > + if (!is_td(kvm))
> > + return -ENOTTY;
> > +
> > + return tdx_vm_move_enc_context_from(kvm, source_fd);
> > +}
> > +
> > #define VMX_REQUIRED_APICV_INHIBITS \
> > ( \
> > BIT(APICV_INHIBIT_REASON_DISABLE)| \
> > @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > +
> > + .vm_move_enc_context_from = vt_move_enc_context_from,
> > };
> >
> > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 8af7e4e81c860..0999a6d827c99 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> > INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> > return 0;
> > }
> > +
> > +static __always_inline bool tdx_guest(struct kvm *kvm)
> > +{
> > + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> > +
> > + return tdx_kvm->finalized;
> > +}
> > +
> > +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> > +{
> > + return -EINVAL;
> > +}
> > +
> > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > +{
> > + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> > + struct file *src_kvm_file;
> > + struct kvm_tdx *src_tdx;
> > + struct kvm *src_kvm;
> > + int ret;
> > +
> > + src_kvm_file = fget(source_fd);
> > + if (!file_is_kvm(src_kvm_file)) {
> > + ret = -EBADF;
> > + goto out_fput;
> > + }
> > + src_kvm = src_kvm_file->private_data;
> > + src_tdx = to_kvm_tdx(src_kvm);
> > +
> > + ret = pre_move_enc_context_from(kvm, src_kvm,
> > + &dst_tdx->migration_in_progress,
> > + &src_tdx->migration_in_progress);
> > + if (ret)
> > + goto out_fput;
> > +
> > + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> > + ret = -EINVAL;
> > + goto out_post;
> > + }
> > +
> > + ret = tdx_migrate_from(kvm, src_kvm);
> > + if (ret)
> > + goto out_post;
> > +
> > + kvm_vm_dead(src_kvm);
> > + ret = 0;
> > +
> > +out_post:
> > + post_move_enc_context_from(kvm, src_kvm,
> > + &dst_tdx->migration_in_progress,
> > + &src_tdx->migration_in_progress);
> > +out_fput:
> > + if (src_kvm_file)
> > + fput(src_kvm_file);
> > + return ret;
> > +}
> > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > index 71818c5001862..21b7e710be1fd 100644
> > --- a/arch/x86/kvm/vmx/tdx.h
> > +++ b/arch/x86/kvm/vmx/tdx.h
> > @@ -24,6 +24,8 @@ struct kvm_tdx {
> > atomic_t tdh_mem_track;
> >
> > u64 tsc_offset;
> > +
> > + atomic_t migration_in_progress;
> > };
> >
> > union tdx_exit_reason {
> > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> > index d049e0c72ed0c..275f5d75e9bf1 100644
> > --- a/arch/x86/kvm/vmx/x86_ops.h
> > +++ b/arch/x86/kvm/vmx/x86_ops.h
> > @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> > void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> > int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> > void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> > +
> > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> > #else
> > static inline int tdx_init(void) { return 0; };
> > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> > @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> > static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> > static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> > static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> > +
> > +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> > +	unsigned int source_fd) { return -EOPNOTSUPP; }
> > #endif
> >
> > #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)
>

2023-04-18 18:05:40

by Sagi Shahar

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Tue, Apr 18, 2023 at 5:12 AM Zhi Wang <[email protected]> wrote:
>
> On Fri, 7 Apr 2023 20:19:19 +0000
> Sagi Shahar <[email protected]> wrote:
>
> What was the status of the src VM when calling vm_move_enc_context_from?
> Is it still active, like in common live migration, or has it been paused?
>

Yes, the source VM is still active, like in the live migration case.
You can also see that we check that the source VM is finalized when we
call tdx_guest() before migrating the state.

> > This should mostly match the logic in sev_vm_move_enc_context_from.
> >
> > Signed-off-by: Sagi Shahar <[email protected]>
> > ---
> > arch/x86/kvm/vmx/main.c | 10 +++++++
> > arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> > arch/x86/kvm/vmx/tdx.h | 2 ++
> > arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> > 4 files changed, 73 insertions(+)
> >
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 5b64fe5404958..9d5d0ac465bf6 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > return tdx_vcpu_ioctl(vcpu, argp);
> > }
> >
> > +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > +{
> > + if (!is_td(kvm))
> > + return -ENOTTY;
> > +
> > + return tdx_vm_move_enc_context_from(kvm, source_fd);
> > +}
> > +
> > #define VMX_REQUIRED_APICV_INHIBITS \
> > ( \
> > BIT(APICV_INHIBIT_REASON_DISABLE)| \
> > @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > +
> > + .vm_move_enc_context_from = vt_move_enc_context_from,
> > };
> >
> > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 8af7e4e81c860..0999a6d827c99 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> > INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> > return 0;
> > }
> > +
> > +static __always_inline bool tdx_guest(struct kvm *kvm)
> > +{
> > + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> > +
> > + return tdx_kvm->finalized;
> > +}
> return is_td_finalized()?
> > +
> > +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> > +{
> > + return -EINVAL;
> > +}
> > +
> > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > +{
> > + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> > + struct file *src_kvm_file;
> > + struct kvm_tdx *src_tdx;
> > + struct kvm *src_kvm;
> > + int ret;
> > +
> > + src_kvm_file = fget(source_fd);
> > + if (!file_is_kvm(src_kvm_file)) {
> > + ret = -EBADF;
> > + goto out_fput;
> > + }
> > + src_kvm = src_kvm_file->private_data;
> > + src_tdx = to_kvm_tdx(src_kvm);
> > +
> > + ret = pre_move_enc_context_from(kvm, src_kvm,
> > + &dst_tdx->migration_in_progress,
> > + &src_tdx->migration_in_progress);
> > + if (ret)
> > + goto out_fput;
> > +
> > + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> > + ret = -EINVAL;
> > + goto out_post;
> > + }
> > +
> > + ret = tdx_migrate_from(kvm, src_kvm);
> > + if (ret)
> > + goto out_post;
> > +
> > + kvm_vm_dead(src_kvm);
> > + ret = 0;
> > +
> > +out_post:
> > + post_move_enc_context_from(kvm, src_kvm,
> > + &dst_tdx->migration_in_progress,
> > + &src_tdx->migration_in_progress);
> > +out_fput:
> > + if (src_kvm_file)
> > + fput(src_kvm_file);
> > + return ret;
> > +}
> > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > index 71818c5001862..21b7e710be1fd 100644
> > --- a/arch/x86/kvm/vmx/tdx.h
> > +++ b/arch/x86/kvm/vmx/tdx.h
> > @@ -24,6 +24,8 @@ struct kvm_tdx {
> > atomic_t tdh_mem_track;
> >
> > u64 tsc_offset;
> > +
> > + atomic_t migration_in_progress;
> > };
> >
> > union tdx_exit_reason {
> > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> > index d049e0c72ed0c..275f5d75e9bf1 100644
> > --- a/arch/x86/kvm/vmx/x86_ops.h
> > +++ b/arch/x86/kvm/vmx/x86_ops.h
> > @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> > void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> > int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> > void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> > +
> > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> > #else
> > static inline int tdx_init(void) { return 0; };
> > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> > @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> > static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> > static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> > static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> > +
> > +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> > +	unsigned int source_fd) { return -EOPNOTSUPP; }
> > #endif
> >
> > #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)
>

2023-04-19 06:41:07

by Zhi Wang

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Tue, 18 Apr 2023 10:47:44 -0700
Sagi Shahar <[email protected]> wrote:

> On Mon, Apr 17, 2023 at 11:28 PM Zhi Wang <[email protected]> wrote:
> >
> > On Fri, 7 Apr 2023 20:19:19 +0000
> > Sagi Shahar <[email protected]> wrote:
> >
> > Is there any reason that TDX doesn't need .vm_copy_enc_context_from? Or is it
> > going to be deprecated? The patch comments need to be refined according to
> > Sean's KVM x86 maintainer book.
>
> To clarify, there are 2 types of migrations: live migration (between
> different hosts) and intra-host migration (between kvm instances on the
> same host). This patchset deals with intra-host migration and doesn't
> add support for live migration.
>
> vm_copy_enc_context_from is currently used for setting up the
> migration helper for SEV live migration and is therefore not needed
> in this patchset.

Out of curiosity, is the migration helper you mentioned here also
a SEV VM?
>
> >
> > > This should mostly match the logic in sev_vm_move_enc_context_from.
> > >
> > > Signed-off-by: Sagi Shahar <[email protected]>
> > > ---
> > > arch/x86/kvm/vmx/main.c | 10 +++++++
> > > arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> > > arch/x86/kvm/vmx/tdx.h | 2 ++
> > > arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> > > 4 files changed, 73 insertions(+)
> > >
> > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > > index 5b64fe5404958..9d5d0ac465bf6 100644
> > > --- a/arch/x86/kvm/vmx/main.c
> > > +++ b/arch/x86/kvm/vmx/main.c
> > > @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > > return tdx_vcpu_ioctl(vcpu, argp);
> > > }
> > >
> > > +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > > +{
> > > + if (!is_td(kvm))
> > > + return -ENOTTY;
> > > +
> > > + return tdx_vm_move_enc_context_from(kvm, source_fd);
> > > +}
> > > +
> > > #define VMX_REQUIRED_APICV_INHIBITS \
> > > ( \
> > > BIT(APICV_INHIBIT_REASON_DISABLE)| \
> > > @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > > .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > > +
> > > + .vm_move_enc_context_from = vt_move_enc_context_from,
> > > };
> > >
> > > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index 8af7e4e81c860..0999a6d827c99 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> > > INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> > > return 0;
> > > }
> > > +
> > > +static __always_inline bool tdx_guest(struct kvm *kvm)
> > > +{
> > > + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> > > +
> > > + return tdx_kvm->finalized;
> > > +}
> > > +
> > > +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> > > +{
> > > + return -EINVAL;
> > > +}
> > > +
> > > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > > +{
> > > + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> > > + struct file *src_kvm_file;
> > > + struct kvm_tdx *src_tdx;
> > > + struct kvm *src_kvm;
> > > + int ret;
> > > +
> > > + src_kvm_file = fget(source_fd);
> > > + if (!file_is_kvm(src_kvm_file)) {
> > > + ret = -EBADF;
> > > + goto out_fput;
> > > + }
> > > + src_kvm = src_kvm_file->private_data;
> > > + src_tdx = to_kvm_tdx(src_kvm);
> > > +
> > > + ret = pre_move_enc_context_from(kvm, src_kvm,
> > > + &dst_tdx->migration_in_progress,
> > > + &src_tdx->migration_in_progress);
> > > + if (ret)
> > > + goto out_fput;
> > > +
> > > + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> > > + ret = -EINVAL;
> > > + goto out_post;
> > > + }
> > > +
> > > + ret = tdx_migrate_from(kvm, src_kvm);
> > > + if (ret)
> > > + goto out_post;
> > > +
> > > + kvm_vm_dead(src_kvm);
> > > + ret = 0;
> > > +
> > > +out_post:
> > > + post_move_enc_context_from(kvm, src_kvm,
> > > + &dst_tdx->migration_in_progress,
> > > + &src_tdx->migration_in_progress);
> > > +out_fput:
> > > + if (src_kvm_file)
> > > + fput(src_kvm_file);
> > > + return ret;
> > > +}
> > > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > > index 71818c5001862..21b7e710be1fd 100644
> > > --- a/arch/x86/kvm/vmx/tdx.h
> > > +++ b/arch/x86/kvm/vmx/tdx.h
> > > @@ -24,6 +24,8 @@ struct kvm_tdx {
> > > atomic_t tdh_mem_track;
> > >
> > > u64 tsc_offset;
> > > +
> > > + atomic_t migration_in_progress;
> > > };
> > >
> > > union tdx_exit_reason {
> > > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> > > index d049e0c72ed0c..275f5d75e9bf1 100644
> > > --- a/arch/x86/kvm/vmx/x86_ops.h
> > > +++ b/arch/x86/kvm/vmx/x86_ops.h
> > > @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> > > void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> > > int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> > > void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> > > +
> > > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> > > #else
> > > static inline int tdx_init(void) { return 0; };
> > > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> > > @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> > > static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> > > static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> > > static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> > > +
> > > +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> > > +	unsigned int source_fd) { return -EOPNOTSUPP; }
> > > #endif
> > >
> > > #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)
> >

2023-04-19 07:14:02

by Zhi Wang

[permalink] [raw]
Subject: Re: [RFC PATCH 5/5] KVM: TDX: Add core logic for TDX intra-host migration

On Fri, 7 Apr 2023 20:19:21 +0000
Sagi Shahar <[email protected]> wrote:

> Adds the core logic for transferring state between source and
> destination TDs during intra-host migration.
>
> Signed-off-by: Sagi Shahar <[email protected]>
> ---
> arch/x86/kvm/vmx/tdx.c | 191 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 190 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 0999a6d827c99..05b164a91437b 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2834,9 +2834,198 @@ static __always_inline bool tdx_guest(struct kvm *kvm)
> return tdx_kvm->finalized;
> }
>
> +#define for_each_memslot_pair(memslots_1, memslots_2, memslot_iter_1, \
> + memslot_iter_2) \
> + for (memslot_iter_1 = rb_first(&memslots_1->gfn_tree), \
> + memslot_iter_2 = rb_first(&memslots_2->gfn_tree); \
> + memslot_iter_1 && memslot_iter_2; \
> + memslot_iter_1 = rb_next(memslot_iter_1), \
> + memslot_iter_2 = rb_next(memslot_iter_2))
> +

If it is a pair, using the suffixes *_a and *_b would be better.
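E.g. (same body as in the patch, only the parameter names change):

#define for_each_memslot_pair(memslots_a, memslots_b, iter_a, iter_b)	\
	for (iter_a = rb_first(&memslots_a->gfn_tree),			\
	     iter_b = rb_first(&memslots_b->gfn_tree);			\
	     iter_a && iter_b;						\
	     iter_a = rb_next(iter_a), iter_b = rb_next(iter_b))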

> static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> {
> - return -EINVAL;
> + struct rb_node *src_memslot_iter, *dst_memslot_iter;
> + struct vcpu_tdx *dst_tdx_vcpu, *src_tdx_vcpu;
> + struct kvm_memslots *src_slots, *dst_slots;
> + struct kvm_vcpu *dst_vcpu, *src_vcpu;
> + struct kvm_tdx *src_tdx, *dst_tdx;
> + unsigned long i, j;
> + int ret;
> +
> + src_tdx = to_kvm_tdx(src);
> + dst_tdx = to_kvm_tdx(dst);
> +
> + src_slots = __kvm_memslots(src, 0);
> + dst_slots = __kvm_memslots(dst, 0);
> +
> + ret = -EINVAL;
> +
> + if (!src_tdx->finalized) {
> + pr_warn("Cannot migrate from a non finalized VM\n");
> + goto abort;
> + }
> +

Let's use the existing inline function is_td_finalized().

> + // Traverse both memslots in gfn order and compare them
> + for_each_memslot_pair(src_slots, dst_slots, src_memslot_iter, dst_memslot_iter) {
> + struct kvm_memory_slot *src_slot, *dst_slot;
> +
> + src_slot =
> + container_of(src_memslot_iter, struct kvm_memory_slot,
> + gfn_node[src_slots->node_idx]);
> + dst_slot =
> + container_of(src_memslot_iter, struct kvm_memory_slot,
> + gfn_node[dst_slots->node_idx]);
> +
^ Should this be dst_memslot_iter? The same applies to the other one below.
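I.e. presumably the intent was:

	dst_slot = container_of(dst_memslot_iter, struct kvm_memory_slot,
				gfn_node[dst_slots->node_idx]);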
> + if (src_slot->base_gfn != dst_slot->base_gfn ||
> + src_slot->npages != dst_slot->npages) {
> + pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
> + goto abort;
> + }
> +
> + if (src_slot->flags != dst_slot->flags) {
> + pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
> + goto abort;
> + }
> +
> + if (src_slot->flags & KVM_MEM_PRIVATE) {
> + if (src_slot->restrictedmem.file->f_inode->i_ino !=
> + dst_slot->restrictedmem.file->f_inode->i_ino) {
> + pr_warn("Private memslots points to different restricted files\n");
> + goto abort;
> + }
> +
> + if (src_slot->restrictedmem.index != dst_slot->restrictedmem.index) {
> + pr_warn("Private memslots points to the restricted file at different offsets\n");
> + goto abort;
> + }
> + }
> + }
> +
> + if (src_memslot_iter || dst_memslot_iter) {
> + pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
> + goto abort;
> + }
> +
> + dst_tdx->hkid = src_tdx->hkid;
> + dst_tdx->tdr_pa = src_tdx->tdr_pa;
> +
> + dst_tdx->tdcs_pa = kcalloc(tdx_info.nr_tdcs_pages, sizeof(*dst_tdx->tdcs_pa),
> + GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> + if (!dst_tdx->tdcs_pa) {
> + ret = -ENOMEM;
> + goto late_abort;
> + }
> + memcpy(dst_tdx->tdcs_pa, src_tdx->tdcs_pa,
> + tdx_info.nr_tdcs_pages * sizeof(*dst_tdx->tdcs_pa));
> +
> + dst_tdx->tsc_offset = src_tdx->tsc_offset;
> + dst_tdx->attributes = src_tdx->attributes;
> + dst_tdx->xfam = src_tdx->xfam;
> + dst_tdx->kvm.arch.gfn_shared_mask = src_tdx->kvm.arch.gfn_shared_mask;
> +
> + kvm_for_each_vcpu(i, src_vcpu, src)
> + tdx_flush_vp_on_cpu(src_vcpu);
> +
> + /* Copy per-vCPU state */
> + kvm_for_each_vcpu(i, src_vcpu, src) {
> + src_tdx_vcpu = to_tdx(src_vcpu);
> + dst_vcpu = kvm_get_vcpu(dst, i);
> + dst_tdx_vcpu = to_tdx(dst_vcpu);
> +
> + vcpu_load(dst_vcpu);
> +
> + memcpy(dst_vcpu->arch.regs, src_vcpu->arch.regs,
> + NR_VCPU_REGS * sizeof(src_vcpu->arch.regs[0]));
> + dst_vcpu->arch.regs_avail = src_vcpu->arch.regs_avail;
> + dst_vcpu->arch.regs_dirty = src_vcpu->arch.regs_dirty;
> +
> + dst_vcpu->arch.tsc_offset = dst_tdx->tsc_offset;
> +
> + dst_tdx_vcpu->interrupt_disabled_hlt = src_tdx_vcpu->interrupt_disabled_hlt;
> + dst_tdx_vcpu->buggy_hlt_workaround = src_tdx_vcpu->buggy_hlt_workaround;
> +
> + dst_tdx_vcpu->tdvpr_pa = src_tdx_vcpu->tdvpr_pa;
> + dst_tdx_vcpu->tdvpx_pa = kcalloc(tdx_info.nr_tdvpx_pages,
> + sizeof(*dst_tdx_vcpu->tdvpx_pa),
> + GFP_KERNEL_ACCOUNT);
> + if (!dst_tdx_vcpu->tdvpx_pa) {
> + ret = -ENOMEM;
> + vcpu_put(dst_vcpu);
> + goto late_abort;
> + }
> + memcpy(dst_tdx_vcpu->tdvpx_pa, src_tdx_vcpu->tdvpx_pa,
> + tdx_info.nr_tdvpx_pages * sizeof(*dst_tdx_vcpu->tdvpx_pa));
> +
> + td_vmcs_write64(dst_tdx_vcpu, POSTED_INTR_DESC_ADDR, __pa(&dst_tdx_vcpu->pi_desc));
> +
> + /* Copy private EPT tables */
> + if (kvm_mmu_move_private_pages_from(dst_vcpu, src_vcpu)) {
> + ret = -EINVAL;
> + vcpu_put(dst_vcpu);
> + goto late_abort;
> + }
> +
> + for (j = 0; j < tdx_info.nr_tdvpx_pages; j++)
> + src_tdx_vcpu->tdvpx_pa[j] = 0;
> +
> + src_tdx_vcpu->tdvpr_pa = 0;
> +
> + vcpu_put(dst_vcpu);
> + }
> +
> + for_each_memslot_pair(src_slots, dst_slots, src_memslot_iter,
> + dst_memslot_iter) {
> + struct kvm_memory_slot *src_slot, *dst_slot;
> +
> + src_slot = container_of(src_memslot_iter,
> + struct kvm_memory_slot,
> + gfn_node[src_slots->node_idx]);
> + dst_slot = container_of(src_memslot_iter,
> + struct kvm_memory_slot,
> + gfn_node[dst_slots->node_idx]);
> +
> + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
> + unsigned long ugfn;
> + int level = i + 1;
> +
> + /*
> + * If the gfn and userspace address are not aligned wrt each other, then
> + * large page support should already be disabled at this level.
> + */
> + ugfn = dst_slot->userspace_addr >> PAGE_SHIFT;
> + if ((dst_slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1))
> + continue;
> +
> + dst_slot->arch.lpage_info[i - 1] =
> + src_slot->arch.lpage_info[i - 1];
> + src_slot->arch.lpage_info[i - 1] = NULL;
> + }
> + }
> +
> + dst->mem_attr_array.xa_head = src->mem_attr_array.xa_head;
> + src->mem_attr_array.xa_head = NULL;
> +
> + dst_tdx->finalized = true;
> +
> + /* Clear source VM to avoid freeing the hkid and pages on VM put */
> + src_tdx->hkid = -1;
> + src_tdx->tdr_pa = 0;
> + for (i = 0; i < tdx_info.nr_tdcs_pages; i++)
> + src_tdx->tdcs_pa[i] = 0;
> +
> + return 0;
> +
> +late_abort:
> + /* If we aborted after the state transfer already started, the src VM
> + * is no longer valid.
> + */
> + kvm_vm_dead(src);
> +
> +abort:
> + dst_tdx->hkid = -1;
> + dst_tdx->tdr_pa = 0;
> +
> + return ret;
> }
>
This function is quite long. It would be better to split some parts into
separate functions.
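For example, the memslot comparison, the per-vCPU state copy and the memslot
metadata hand-over could each become a helper, roughly (the names below are
only illustrative, not from the patch):

static int tdx_migrate_check_memslots(struct kvm *dst, struct kvm *src);
static int tdx_migrate_vcpu_state(struct kvm *dst, struct kvm *src);
static void tdx_migrate_memslot_metadata(struct kvm *dst, struct kvm *src);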
> int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)

2023-04-27 21:32:49

by Sagi Shahar

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Tue, Apr 18, 2023 at 11:34 PM Zhi Wang <[email protected]> wrote:
>
> On Tue, 18 Apr 2023 10:47:44 -0700
> Sagi Shahar <[email protected]> wrote:
>
> > On Mon, Apr 17, 2023 at 11:28 PM Zhi Wang <[email protected]> wrote:
> > >
> > > On Fri, 7 Apr 2023 20:19:19 +0000
> > > Sagi Shahar <[email protected]> wrote:
> > >
> > > Is there any reason that TDX doesn't need .vm_copy_enc_context_from? Or is it
> > > going to be deprecated? The patch comments need to be refined according to
> > > Sean's KVM x86 maintainer book.
> >
> > To clarify, there are 2 types of migrations: live migration (between
> > different hosts) and intra-host migration (between kvm instances on the
> > same host). This patchset deals with intra-host migration and doesn't
> > add support for live migration.
> >
> > vm_copy_enc_context_from is currently used for setting up the
> > migration helper for SEV live migration and is therefore not needed
> > in this patchset.
>
> Out of curiosity, is the migration helper you mentioned here also
> a SEV VM?

I'm not that familiar with SEV migration but from what I understand
the answer is "not exactly".
It's a guest process that runs as part of the SEV VM firmware.

There's some public information about it here:
https://lpc.events/event/11/contributions/958/attachments/769/1448/Live%20migration%20of%20confidential%20guests_LPC2021.pdf
> >
> > >
> > > > This should mostly match the logic in sev_vm_move_enc_context_from.
> > > >
> > > > Signed-off-by: Sagi Shahar <[email protected]>
> > > > ---
> > > > arch/x86/kvm/vmx/main.c | 10 +++++++
> > > > arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> > > > arch/x86/kvm/vmx/tdx.h | 2 ++
> > > > arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> > > > 4 files changed, 73 insertions(+)
> > > >
> > > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > > > index 5b64fe5404958..9d5d0ac465bf6 100644
> > > > --- a/arch/x86/kvm/vmx/main.c
> > > > +++ b/arch/x86/kvm/vmx/main.c
> > > > @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > > > return tdx_vcpu_ioctl(vcpu, argp);
> > > > }
> > > >
> > > > +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > > > +{
> > > > + if (!is_td(kvm))
> > > > + return -ENOTTY;
> > > > +
> > > > + return tdx_vm_move_enc_context_from(kvm, source_fd);
> > > > +}
> > > > +
> > > > #define VMX_REQUIRED_APICV_INHIBITS \
> > > > ( \
> > > > BIT(APICV_INHIBIT_REASON_DISABLE)| \
> > > > @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > > > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > > > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > > > .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > > > +
> > > > + .vm_move_enc_context_from = vt_move_enc_context_from,
> > > > };
> > > >
> > > > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > > index 8af7e4e81c860..0999a6d827c99 100644
> > > > --- a/arch/x86/kvm/vmx/tdx.c
> > > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > > @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> > > > INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> > > > return 0;
> > > > }
> > > > +
> > > > +static __always_inline bool tdx_guest(struct kvm *kvm)
> > > > +{
> > > > + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> > > > +
> > > > + return tdx_kvm->finalized;
> > > > +}
> > > > +
> > > > +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> > > > +{
> > > > + return -EINVAL;
> > > > +}
> > > > +
> > > > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > > > +{
> > > > + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> > > > + struct file *src_kvm_file;
> > > > + struct kvm_tdx *src_tdx;
> > > > + struct kvm *src_kvm;
> > > > + int ret;
> > > > +
> > > > + src_kvm_file = fget(source_fd);
> > > > + if (!file_is_kvm(src_kvm_file)) {
> > > > + ret = -EBADF;
> > > > + goto out_fput;
> > > > + }
> > > > + src_kvm = src_kvm_file->private_data;
> > > > + src_tdx = to_kvm_tdx(src_kvm);
> > > > +
> > > > + ret = pre_move_enc_context_from(kvm, src_kvm,
> > > > + &dst_tdx->migration_in_progress,
> > > > + &src_tdx->migration_in_progress);
> > > > + if (ret)
> > > > + goto out_fput;
> > > > +
> > > > + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> > > > + ret = -EINVAL;
> > > > + goto out_post;
> > > > + }
> > > > +
> > > > + ret = tdx_migrate_from(kvm, src_kvm);
> > > > + if (ret)
> > > > + goto out_post;
> > > > +
> > > > + kvm_vm_dead(src_kvm);
> > > > + ret = 0;
> > > > +
> > > > +out_post:
> > > > + post_move_enc_context_from(kvm, src_kvm,
> > > > + &dst_tdx->migration_in_progress,
> > > > + &src_tdx->migration_in_progress);
> > > > +out_fput:
> > > > + if (src_kvm_file)
> > > > + fput(src_kvm_file);
> > > > + return ret;
> > > > +}
> > > > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > > > index 71818c5001862..21b7e710be1fd 100644
> > > > --- a/arch/x86/kvm/vmx/tdx.h
> > > > +++ b/arch/x86/kvm/vmx/tdx.h
> > > > @@ -24,6 +24,8 @@ struct kvm_tdx {
> > > > atomic_t tdh_mem_track;
> > > >
> > > > u64 tsc_offset;
> > > > +
> > > > + atomic_t migration_in_progress;
> > > > };
> > > >
> > > > union tdx_exit_reason {
> > > > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> > > > index d049e0c72ed0c..275f5d75e9bf1 100644
> > > > --- a/arch/x86/kvm/vmx/x86_ops.h
> > > > +++ b/arch/x86/kvm/vmx/x86_ops.h
> > > > @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> > > > void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> > > > int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> > > > void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> > > > +
> > > > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> > > > #else
> > > > static inline int tdx_init(void) { return 0; };
> > > > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> > > > @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> > > > static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> > > > static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> > > > static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> > > > +
> > > > +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> > > > +	unsigned int source_fd) { return -EOPNOTSUPP; }
> > > > #endif
> > > >
> > > > #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)
> > >
>

2023-04-28 16:23:59

by Zhi Wang

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from

On Thu, 27 Apr 2023 14:25:41 -0700
Sagi Shahar <[email protected]> wrote:

> On Tue, Apr 18, 2023 at 11:34 PM Zhi Wang <[email protected]> wrote:
> >
> > On Tue, 18 Apr 2023 10:47:44 -0700
> > Sagi Shahar <[email protected]> wrote:
> >
> > > On Mon, Apr 17, 2023 at 11:28 PM Zhi Wang <[email protected]> wrote:
> > > >
> > > > On Fri, 7 Apr 2023 20:19:19 +0000
> > > > Sagi Shahar <[email protected]> wrote:
> > > >
> > > > Is there any reason that TDX doesn't need .vm_copy_enc_context_from? Or is it
> > > > going to be deprecated? The patch comments need to be refined according to
> > > > Sean's KVM x86 maintainer book.
> > >
> > > To clarify, there are 2 types of migrations: live migration (between
> > > different hosts) and intra-host migration (between kvm instances on the
> > > same host). This patchset deals with intra-host migration and doesn't
> > > add support for live migration.
> > >
> > > vm_copy_enc_context_from is currently used for setting up the
> > > migration helper for SEV live migration and is therefore not needed
> > > in this patchset.
> >
> > Out of curiosity, is the migration helper you mentioned here also
> > a SEV VM?
>
> I'm not that familiar with SEV migration but from what I understand
> the answer is "not exactly".
> It's a guest process that runs as part of the SEV VM firmware.
>
> There's some public information about it here:
> https://lpc.events/event/11/contributions/958/attachments/769/1448/Live%20migration%20of%20confidential%20guests_LPC2021.pdf

Thanks so much for the information. I spent some time reading and digging
around, but it didn't talk about how the callback will be used. It would be
nice to see the whole picture; then I guess I will have more comments.

> > >
> > > >
> > > > > This should mostly match the logic in sev_vm_move_enc_context_from.
> > > > >
> > > > > Signed-off-by: Sagi Shahar <[email protected]>
> > > > > ---
> > > > > arch/x86/kvm/vmx/main.c | 10 +++++++
> > > > > arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++++++++++++++++++++++++++
> > > > > arch/x86/kvm/vmx/tdx.h | 2 ++
> > > > > arch/x86/kvm/vmx/x86_ops.h | 5 ++++
> > > > > 4 files changed, 73 insertions(+)
> > > > >
> > > > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > > > > index 5b64fe5404958..9d5d0ac465bf6 100644
> > > > > --- a/arch/x86/kvm/vmx/main.c
> > > > > +++ b/arch/x86/kvm/vmx/main.c
> > > > > @@ -979,6 +979,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > > > > return tdx_vcpu_ioctl(vcpu, argp);
> > > > > }
> > > > >
> > > > > +static int vt_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > > > > +{
> > > > > + if (!is_td(kvm))
> > > > > + return -ENOTTY;
> > > > > +
> > > > > + return tdx_vm_move_enc_context_from(kvm, source_fd);
> > > > > +}
> > > > > +
> > > > > #define VMX_REQUIRED_APICV_INHIBITS \
> > > > > ( \
> > > > > BIT(APICV_INHIBIT_REASON_DISABLE)| \
> > > > > @@ -1141,6 +1149,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > > > > .dev_mem_enc_ioctl = tdx_dev_ioctl,
> > > > > .mem_enc_ioctl = vt_mem_enc_ioctl,
> > > > > .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
> > > > > +
> > > > > + .vm_move_enc_context_from = vt_move_enc_context_from,
> > > > > };
> > > > >
> > > > > struct kvm_x86_init_ops vt_init_ops __initdata = {
> > > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > > > index 8af7e4e81c860..0999a6d827c99 100644
> > > > > --- a/arch/x86/kvm/vmx/tdx.c
> > > > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > > > @@ -2826,3 +2826,59 @@ int __init tdx_init(void)
> > > > > INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, cpu));
> > > > > return 0;
> > > > > }
> > > > > +
> > > > > +static __always_inline bool tdx_guest(struct kvm *kvm)
> > > > > +{
> > > > > + struct kvm_tdx *tdx_kvm = to_kvm_tdx(kvm);
> > > > > +
> > > > > + return tdx_kvm->finalized;
> > > > > +}
> > > > > +
> > > > > +static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
> > > > > +{
> > > > > + return -EINVAL;
> > > > > +}
> > > > > +
> > > > > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
> > > > > +{
> > > > > + struct kvm_tdx *dst_tdx = to_kvm_tdx(kvm);
> > > > > + struct file *src_kvm_file;
> > > > > + struct kvm_tdx *src_tdx;
> > > > > + struct kvm *src_kvm;
> > > > > + int ret;
> > > > > +
> > > > > + src_kvm_file = fget(source_fd);
> > > > > + if (!file_is_kvm(src_kvm_file)) {
> > > > > + ret = -EBADF;
> > > > > + goto out_fput;
> > > > > + }
> > > > > + src_kvm = src_kvm_file->private_data;
> > > > > + src_tdx = to_kvm_tdx(src_kvm);
> > > > > +
> > > > > + ret = pre_move_enc_context_from(kvm, src_kvm,
> > > > > + &dst_tdx->migration_in_progress,
> > > > > + &src_tdx->migration_in_progress);
> > > > > + if (ret)
> > > > > + goto out_fput;
> > > > > +
> > > > > + if (tdx_guest(kvm) || !tdx_guest(src_kvm)) {
> > > > > + ret = -EINVAL;
> > > > > + goto out_post;
> > > > > + }
> > > > > +
> > > > > + ret = tdx_migrate_from(kvm, src_kvm);
> > > > > + if (ret)
> > > > > + goto out_post;
> > > > > +
> > > > > + kvm_vm_dead(src_kvm);
> > > > > + ret = 0;
> > > > > +
> > > > > +out_post:
> > > > > + post_move_enc_context_from(kvm, src_kvm,
> > > > > + &dst_tdx->migration_in_progress,
> > > > > + &src_tdx->migration_in_progress);
> > > > > +out_fput:
> > > > > + if (src_kvm_file)
> > > > > + fput(src_kvm_file);
> > > > > + return ret;
> > > > > +}
> > > > > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > > > > index 71818c5001862..21b7e710be1fd 100644
> > > > > --- a/arch/x86/kvm/vmx/tdx.h
> > > > > +++ b/arch/x86/kvm/vmx/tdx.h
> > > > > @@ -24,6 +24,8 @@ struct kvm_tdx {
> > > > > atomic_t tdh_mem_track;
> > > > >
> > > > > u64 tsc_offset;
> > > > > +
> > > > > + atomic_t migration_in_progress;
> > > > > };
> > > > >
> > > > > union tdx_exit_reason {
> > > > > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> > > > > index d049e0c72ed0c..275f5d75e9bf1 100644
> > > > > --- a/arch/x86/kvm/vmx/x86_ops.h
> > > > > +++ b/arch/x86/kvm/vmx/x86_ops.h
> > > > > @@ -187,6 +187,8 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> > > > > void tdx_flush_tlb(struct kvm_vcpu *vcpu);
> > > > > int tdx_sept_tlb_remote_flush(struct kvm *kvm);
> > > > > void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> > > > > +
> > > > > +int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
> > > > > #else
> > > > > static inline int tdx_init(void) { return 0; };
> > > > > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> > > > > @@ -241,6 +243,9 @@ static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { ret
> > > > > static inline void tdx_flush_tlb(struct kvm_vcpu *vcpu) {}
> > > > > static inline int tdx_sept_tlb_remote_flush(struct kvm *kvm) { return 0; }
> > > > > static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
> > > > > +
> > > > > +static inline int tdx_vm_move_enc_context_from(struct kvm *kvm,
> > > > > +						       unsigned int source_fd) { return -EOPNOTSUPP; }
> > > > > #endif
> > > > >
> > > > > #if defined(CONFIG_INTEL_TDX_HOST) && defined(CONFIG_KVM_SMM)
> > > >
> >

2023-06-02 07:02:56

by Isaku Yamahata

[permalink] [raw]
Subject: Re: [RFC PATCH 4/5] KVM: TDX: Implement moving private pages between 2 TDs

On Fri, Apr 07, 2023 at 08:19:20PM +0000,
Sagi Shahar <[email protected]> wrote:

> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 327dee4f6170e..685528fdc0ad6 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -296,6 +296,23 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
> trace_kvm_mmu_get_page(sp, true);
> }
>
> +static struct kvm_mmu_page *
> +kvm_tdp_mmu_get_vcpu_root_no_alloc(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + struct kvm_mmu_page *root;
> +
> + lockdep_assert_held_read(&kvm->mmu_lock);

Because kvm_tdp_mmu_get_vcpu_root() calls this with the write lock held,
this should be lockdep_assert_held(&kvm->mmu_lock).
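
Something like this (untested):

-	lockdep_assert_held_read(&kvm->mmu_lock);
+	lockdep_assert_held(&kvm->mmu_lock);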

Thanks,

> +
> + for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
> + if (root->role.word == role.word &&
> + kvm_tdp_mmu_get_root(root))
> + return root;
> + }
> +
> + return NULL;
> +}
> +
> static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
> bool private)
> {
> @@ -311,11 +328,9 @@ static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
> */
> if (private)
> kvm_mmu_page_role_set_private(&role);
> - for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
> - if (root->role.word == role.word &&
> - kvm_tdp_mmu_get_root(root))
> - goto out;
> - }
> + root = kvm_tdp_mmu_get_vcpu_root_no_alloc(vcpu, role);
> + if (!!root)
> + goto out;
>
> root = tdp_mmu_alloc_sp(vcpu, role);
> tdp_mmu_init_sp(root, NULL, 0);
> @@ -330,6 +345,58 @@ static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,
> return root;
> }
>
> +hpa_t kvm_tdp_mmu_move_private_pages_from(struct kvm_vcpu *vcpu,
> + struct kvm_vcpu *src_vcpu)
> +{
> + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
> + struct kvm *kvm = vcpu->kvm;
> + struct kvm *src_kvm = src_vcpu->kvm;
> + struct kvm_mmu_page *private_root = NULL;
> + struct kvm_mmu_page *root;
> + s64 num_private_pages, old;
> +
> + lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
> + lockdep_assert_held_write(&src_vcpu->kvm->mmu_lock);
> +
> + /* Find the private root of the source. */
> + kvm_mmu_page_role_set_private(&role);
> + for_each_tdp_mmu_root(src_kvm, root, kvm_mmu_role_as_id(role)) {
> + if (root->role.word == role.word) {
> + private_root = root;
> + break;
> + }
> + }
> + if (!private_root)
> + return INVALID_PAGE;
> +
> + /* Remove the private root from the src kvm and add it to dst kvm. */
> + list_del_rcu(&private_root->link);
> + list_add_rcu(&private_root->link, &kvm->arch.tdp_mmu_roots);
> +
> + num_private_pages = atomic64_read(&src_kvm->arch.tdp_private_mmu_pages);
> + old = atomic64_cmpxchg(&kvm->arch.tdp_private_mmu_pages, 0,
> + num_private_pages);
> + /* The destination VM should have no private pages at this point. */
> + WARN_ON(old);
> + atomic64_set(&src_kvm->arch.tdp_private_mmu_pages, 0);
> +
> + return __pa(private_root->spt);
> +}
> +
> +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa_no_alloc(struct kvm_vcpu *vcpu, bool private)
> +{
> + struct kvm_mmu_page *root;
> + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
> +
> + if (private)
> + kvm_mmu_page_role_set_private(&role);
> + root = kvm_tdp_mmu_get_vcpu_root_no_alloc(vcpu, role);
> + if (!root)
> + return INVALID_PAGE;
> +
> + return __pa(root->spt);
> +}
> +
> hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu, bool private)
> {
> return __pa(kvm_tdp_mmu_get_vcpu_root(vcpu, private)->spt);

--
Isaku Yamahata <[email protected]>