From: Sean Christopherson <[email protected]>
As the first step to create a TDX guest, create/destroy the VM struct. Assign
a TDX private Host Key ID (HKID) to the TDX guest for memory encryption and
allocate extra pages for the TDX guest. On destruction, free the allocated
pages and the HKID.
Before tearing down private page tables, TDX requires some resources of the
guest TD to be destroyed (i.e. the HKID must have been reclaimed, etc.). Add
a flush_shadow_all_private callback that is invoked before tearing down the
private page tables for this purpose.
Add a vm_free() kvm_x86_ops hook at the end of kvm_arch_destroy_vm()
because some per-VM TDX resources, e.g. the TDR, need to be freed after
other TDX resources, e.g. the HKID, have been freed.
Co-developed-by: Kai Huang <[email protected]>
Signed-off-by: Kai Huang <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v10 -> v11:
- Fix double free in tdx_vm_free() by setting the pointer to NULL.
- Replace struct tdx_td_page tdr and tdcs in struct kvm_tdx with
  unsigned long
Signed-off-by: Isaku Yamahata <[email protected]>
---
arch/x86/include/asm/kvm-x86-ops.h | 2 +
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/vmx/main.c | 34 ++-
arch/x86/kvm/vmx/tdx.c | 415 +++++++++++++++++++++++++++++
arch/x86/kvm/vmx/tdx.h | 6 +-
arch/x86/kvm/vmx/x86_ops.h | 9 +
arch/x86/kvm/x86.c | 8 +
7 files changed, 472 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index e6708bb3f4f6..552de893af75 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -22,7 +22,9 @@ KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(is_vm_type_supported)
KVM_X86_OP(vm_init)
+KVM_X86_OP_OPTIONAL(flush_shadow_all_private)
KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_free)
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 234c28c8e6ee..e199ddf0bb00 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1540,7 +1540,9 @@ struct kvm_x86_ops {
bool (*is_vm_type_supported)(unsigned long vm_type);
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
+ void (*flush_shadow_all_private)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
+ void (*vm_free)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
int (*vcpu_precreate)(struct kvm *kvm);
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 781fbc896120..c5f2515026e9 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -29,14 +29,40 @@ static __init int vt_hardware_setup(void)
return 0;
}
+static void vt_hardware_unsetup(void)
+{
+ tdx_hardware_unsetup();
+ vmx_hardware_unsetup();
+}
+
static int vt_vm_init(struct kvm *kvm)
{
if (is_td(kvm))
- return -EOPNOTSUPP; /* Not ready to create guest TD yet. */
+ return tdx_vm_init(kvm);
return vmx_vm_init(kvm);
}
+static void vt_flush_shadow_all_private(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ tdx_mmu_release_hkid(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return;
+
+ vmx_vm_destroy(kvm);
+}
+
+static void vt_vm_free(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ tdx_vm_free(kvm);
+}
+
static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
if (!is_td(kvm))
@@ -50,7 +76,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.check_processor_compatibility = vmx_check_processor_compat,
- .hardware_unsetup = vmx_hardware_unsetup,
+ .hardware_unsetup = vt_hardware_unsetup,
.hardware_enable = vmx_hardware_enable,
.hardware_disable = vmx_hardware_disable,
@@ -59,7 +85,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.is_vm_type_supported = vt_is_vm_type_supported,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vt_vm_init,
- .vm_destroy = vmx_vm_destroy,
+ .flush_shadow_all_private = vt_flush_shadow_all_private,
+ .vm_destroy = vt_vm_destroy,
+ .vm_free = vt_vm_free,
.vcpu_precreate = vmx_vcpu_precreate,
.vcpu_create = vmx_vcpu_create,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 2bd1cc37abab..d11950d18226 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -6,6 +6,7 @@
#include "capabilities.h"
#include "x86_ops.h"
#include "tdx.h"
+#include "tdx_ops.h"
#include "x86.h"
#undef pr_fmt
@@ -32,6 +33,250 @@ struct tdx_capabilities {
/* Capabilities of KVM + the TDX module. */
static struct tdx_capabilities tdx_caps;
+/*
+ * Some TDX SEAMCALLs (TDH.MNG.CREATE, TDH.PHYMEM.CACHE.WB,
+ * TDH.MNG.KEY.RECLAIMID, TDH.MNG.KEY.FREEID etc) try to acquire a global lock
+ * internally in the TDX module. If that fails, TDX_OPERAND_BUSY is returned
+ * without spinning or waiting due to a constraint on execution time. It's the
+ * caller's responsibility to avoid races (or retry on TDX_OPERAND_BUSY). Use
+ * this mutex to avoid races in the TDX module; the kernel knows better about scheduling.
+ */
+static DEFINE_MUTEX(tdx_lock);
+static struct mutex *tdx_mng_key_config_lock;
+
+static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
+{
+ return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
+}
+
+static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->tdr_pa;
+}
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+ tdx_keyid_free(kvm_tdx->hkid);
+ kvm_tdx->hkid = 0;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->hkid > 0;
+}
+
+static void tdx_clear_page(unsigned long page_pa)
+{
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ void *page = __va(page_pa);
+ unsigned long i;
+
+ if (!static_cpu_has(X86_FEATURE_MOVDIR64B)) {
+ clear_page(page);
+ return;
+ }
+
+ /*
+ * Zeroing the page is only necessary for systems with MKTME-i:
+ * when re-assign one page from old keyid to a new keyid, MOVDIR64B is
+ * required to clear/write the page with new keyid to prevent integrity
+ * error when read on the page with new keyid.
+ *
+ * clflush doesn't flush cache with HKID set.
+ * The cache line could be poisoned (even without MKTME-i), clear the
+ * poison bit.
+ */
+ for (i = 0; i < PAGE_SIZE; i += 64)
+ movdir64b(page + i, zero_page);
+ /*
+ * MOVDIR64B store uses WC buffer. Prevent following memory reads
+ * from seeing potentially poisoned cache.
+ */
+ __mb();
+}
+
+static int tdx_reclaim_page(hpa_t pa, bool do_wb, u16 hkid)
+{
+ struct tdx_module_output out;
+ u64 err;
+
+ do {
+ err = tdh_phymem_page_reclaim(pa, &out);
+ /*
+ * TDH.PHYMEM.PAGE.RECLAIM is allowed only when the TD is in
+ * shutdown state, i.e. the TD is being destroyed.
+ * TDH.PHYMEM.PAGE.RECLAIM requires the TDR and the target page.
+ * Because we're destroying the TD, contention on the TDR is rare.
+ */
+ } while (err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX));
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_RECLAIM, err, &out);
+ return -EIO;
+ }
+
+ if (do_wb) {
+ /*
+ * Only the TDR page gets into this path. No contention is
+ * expected because this is the last page of the TD.
+ */
+ err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(pa, hkid));
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
+ return -EIO;
+ }
+ }
+
+ tdx_clear_page(pa);
+ return 0;
+}
+
+static void tdx_reclaim_td_page(unsigned long td_page_pa)
+{
+ if (!td_page_pa)
+ return;
+ /*
+ * TDCX are being reclaimed. TDX module maps TDCX with HKID
+ * assigned to the TD. Here the cache associated to the TD
+ * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
+ * cache doesn't need to be flushed again.
+ */
+ if (WARN_ON(tdx_reclaim_page(td_page_pa, false, 0)))
+ /* If reclaim failed, leak the page. */
+ return;
+ free_page((unsigned long)__va(td_page_pa));
+}
+
+static int tdx_do_tdh_phymem_cache_wb(void *param)
+{
+ u64 err = 0;
+
+ do {
+ err = tdh_phymem_cache_wb(!!err);
+ } while (err == TDX_INTERRUPTED_RESUMABLE);
+
+ /* Other thread may have done for us. */
+ if (err == TDX_NO_HKID_READY_TO_WBCACHE)
+ err = TDX_SUCCESS;
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ bool cpumask_allocated;
+ u64 err;
+ int ret;
+ int i;
+
+ if (!is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (!is_td_created(kvm_tdx))
+ goto free_hkid;
+
+ cpumask_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+ cpus_read_lock();
+ for_each_online_cpu(i) {
+ if (cpumask_allocated &&
+ cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+
+ /*
+ * Multiple guest TDs can be destroyed simultaneously.
+ * Prevent tdh_phymem_cache_wb from returning TDX_BUSY by
+ * serialization.
+ */
+ mutex_lock(&tdx_lock);
+ ret = smp_call_on_cpu(i, tdx_do_tdh_phymem_cache_wb, NULL, 1);
+ mutex_unlock(&tdx_lock);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
+ mutex_unlock(&tdx_lock);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
+ pr_err("tdh_mng_key_freeid failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ return;
+ }
+
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+}
+
+void tdx_vm_free(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ int i;
+
+ /* Can't reclaim or free TD pages if teardown failed. */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (kvm_tdx->tdcs_pa) {
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++)
+ tdx_reclaim_td_page(kvm_tdx->tdcs_pa[i]);
+ kfree(kvm_tdx->tdcs_pa);
+ kvm_tdx->tdcs_pa = NULL;
+ }
+
+ if (!kvm_tdx->tdr_pa)
+ return;
+ /*
+ * TDX module maps TDR with TDX global HKID. TDX module may access TDR
+ * while operating on TD (Especially reclaiming TDCS). Cache flush with
+ * TDX global HKID is needed.
+ */
+ if (tdx_reclaim_page(kvm_tdx->tdr_pa, true, tdx_global_keyid))
+ return;
+
+ free_page((unsigned long)__va(kvm_tdx->tdr_pa));
+ kvm_tdx->tdr_pa = 0;
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+ hpa_t *tdr_p = param;
+ u64 err;
+
+ do {
+ err = tdh_mng_key_config(*tdr_p);
+
+ /*
+ * If it failed to generate a random key, retry it because this
+ * is typically caused by an entropy error of the CPU's random
+ * number generator.
+ */
+ } while (err == TDX_KEY_GENERATION_FAILED);
+
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm);
+
+int tdx_vm_init(struct kvm *kvm)
+{
+ /* Place holder for now. */
+ return __tdx_td_init(kvm);
+}
+
int tdx_dev_ioctl(void __user *argp)
{
struct kvm_tdx_capabilities __user *user_caps;
@@ -78,6 +323,160 @@ int tdx_dev_ioctl(void __user *argp)
return 0;
}
+static int __tdx_td_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ unsigned long *tdcs_pa = NULL;
+ unsigned long tdr_pa = 0;
+ unsigned long va;
+ int ret, i;
+ u64 err;
+
+ ret = tdx_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ goto free_hkid;
+ tdr_pa = __pa(va);
+
+ tdcs_pa = kcalloc(tdx_caps.tdcs_nr_pages, sizeof(*kvm_tdx->tdcs_pa),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tdcs_pa)
+ goto free_tdr;
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ goto free_tdcs;
+ tdcs_pa[i] = __pa(va);
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto free_tdcs;
+ }
+ cpus_read_lock();
+ /*
+ * Need at least one CPU of the package to be online in order to
+ * program all packages for host key id. Check it.
+ */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+ /*
+ * Because it's hard for human operator to figure out the
+ * reason, warn it.
+ */
+ pr_warn("All packages need to have online CPU to create TD. Online CPU and retry.\n");
+ goto free_packages;
+ }
+
+ /*
+ * Acquire global lock to avoid TDX_OPERAND_BUSY:
+ * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
+ * Table (KOT) to track the assigned TDX private HKID. It doesn't spin
+ * to acquire the lock; it returns TDX_OPERAND_BUSY instead and lets the
+ * caller handle the contention. This is because of the limited time
+ * usable inside the TDX module, and the OS/VMM knows better about process
+ * scheduling.
+ *
+ * APIs to acquire the lock of KOT:
+ * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
+ * TDH.PHYMEM.CACHE.WB.
+ */
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+ ret = -EIO;
+ goto free_packages;
+ }
+ kvm_tdx->tdr_pa = tdr_pa;
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+ /*
+ * Program the memory controller in the package with an
+ * encryption key associated with the TDX private host key id
+ * assigned to this TDR. Concurrent operations on the same memory
+ * controller result in TDX_OPERAND_BUSY. Avoid this race with a
+ * mutex.
+ */
+ mutex_lock(&tdx_mng_key_config_lock[pkg]);
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ &kvm_tdx->tdr_pa, true);
+ mutex_unlock(&tdx_mng_key_config_lock[pkg]);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret)
+ goto teardown;
+
+ kvm_tdx->tdcs_pa = tdcs_pa;
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+ for (i++; i < tdx_caps.tdcs_nr_pages; i++) {
+ free_page((unsigned long)__va(tdcs_pa[i]));
+ tdcs_pa[i] = 0;
+ }
+ ret = -EIO;
+ goto teardown;
+ }
+ }
+
+ /*
+ * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated
+ * ioctl() to define the CPUID values used to configure the TD.
+ */
+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ tdx_mmu_release_hkid(kvm);
+ tdx_vm_free(kvm);
+ return ret;
+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+free_tdcs:
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ if (tdcs_pa[i])
+ free_page((unsigned long)__va(tdcs_pa[i]));
+ }
+ kfree(tdcs_pa);
+ kvm_tdx->tdcs_pa = NULL;
+
+free_tdr:
+ if (tdr_pa)
+ free_page((unsigned long)__va(tdr_pa));
+ kvm_tdx->tdr_pa = 0;
+free_hkid:
+ if (is_hkid_assigned(kvm_tdx))
+ tdx_hkid_free(kvm_tdx);
+ return ret;
+}
+
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_tdx_cmd tdx_cmd;
@@ -152,6 +551,8 @@ bool tdx_is_vm_type_supported(unsigned long type)
int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
{
+ int max_pkgs;
+ int i;
int r;
if (!enable_ept) {
@@ -159,6 +560,14 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
return -EINVAL;
}
+ max_pkgs = topology_max_packages();
+ tdx_mng_key_config_lock = kcalloc(max_pkgs, sizeof(*tdx_mng_key_config_lock),
+ GFP_KERNEL);
+ if (!tdx_mng_key_config_lock)
+ return -ENOMEM;
+ for (i = 0; i < max_pkgs; i++)
+ mutex_init(&tdx_mng_key_config_lock[i]);
+
/* TDX requires VMX. */
r = vmxon_all();
if (!r)
@@ -167,3 +576,9 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
return r;
}
+
+void tdx_hardware_unsetup(void)
+{
+ /* kfree accepts NULL. */
+ kfree(tdx_mng_key_config_lock);
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 473013265bd8..e78d72cf4c3a 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -5,7 +5,11 @@
#ifdef CONFIG_INTEL_TDX_HOST
struct kvm_tdx {
struct kvm kvm;
- /* TDX specific members follow. */
+
+ unsigned long tdr_pa;
+ unsigned long *tdcs_pa;
+
+ int hkid;
};
struct vcpu_tdx {
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index bb4090dbae37..3d0f519727c6 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -139,15 +139,24 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu);
#ifdef CONFIG_INTEL_TDX_HOST
int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
+void tdx_hardware_unsetup(void);
bool tdx_is_vm_type_supported(unsigned long type);
int tdx_dev_ioctl(void __user *argp);
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_free(struct kvm *kvm);
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
#else
static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; }
+static inline void tdx_hardware_unsetup(void) {}
static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
static inline int tdx_dev_ioctl(void __user *argp) { return -EOPNOTSUPP; };
+static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
+static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
+static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {}
+static inline void tdx_vm_free(struct kvm *kvm) {}
static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1152a9dc6d84..0fa91a9708aa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12337,6 +12337,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_page_track_cleanup(kvm);
kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
+ static_call_cond(kvm_x86_vm_free)(kvm);
}
static void memslot_rmap_free(struct kvm_memory_slot *slot)
@@ -12647,6 +12648,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
+ /*
+ * kvm_mmu_zap_all() zaps both private and shared page tables. Before
+ * tearing down private page tables, TDX requires some TD resources to
+ * be destroyed (i.e. keyID must have been reclaimed, etc). Invoke
+ * kvm_x86_flush_shadow_all_private() for this.
+ */
+ static_call_cond(kvm_x86_flush_shadow_all_private)(kvm);
kvm_mmu_zap_all(kvm);
}
--
2.25.1
On Thu, 12 Jan 2023 08:31:26 -0800
[email protected] wrote:
> From: Sean Christopherson <[email protected]>
>
> [...]
>
> +static void tdx_reclaim_td_page(unsigned long td_page_pa)
> +{
> +	if (!td_page_pa)
> +		return;
> +	/*
> +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> +	 * assigned to the TD.  Here the cache associated to the TD
> +	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
> +	 * cache doesn't need to be flushed again.
> +	 */
> +	if (WARN_ON(tdx_reclaim_page(td_page_pa, false, 0)))
> +		/* If reclaim failed, leak the page. */

Better add a FIXME: here as this has to be fixed later.

> +		return;
> +	free_page((unsigned long)__va(td_page_pa));
> +}
>
> [...]
On Fri, Jan 13, 2023, Zhi Wang wrote:
> On Thu, 12 Jan 2023 08:31:26 -0800 [email protected] wrote:
> > +static void tdx_reclaim_td_page(unsigned long td_page_pa)
> > +{
> > + if (!td_page_pa)
> > + return;
> > + /*
> > + * TDCX are being reclaimed. TDX module maps TDCX with HKID
> > + * assigned to the TD. Here the cache associated to the TD
> > + * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
> > + * cache doesn't need to be flushed again.
> > + */
> > + if (WARN_ON(tdx_reclaim_page(td_page_pa, false, 0)))
The WARN_ON() can go, tdx_reclaim_page() has WARN_ON_ONCE() + pr_tdx_error() in
all error paths.
> > + /* If reclaim failed, leak the page. */
>
> Better add a FIXME: here as this has to be fixed later.
No, leaking the page is all KVM can reasonably do here. An improved comment would
be helpful, but no code change is required. tdx_reclaim_page() returns an error
if and only if there's an unexpected, fatal error, e.g. a SEAMCALL with bad params,
incorrect concurrency in KVM, a TDX Module bug, etc. Retrying at a later point is
highly unlikely to be successful.
On Fri, 13 Jan 2023 15:16:08 +0000
Sean Christopherson <[email protected]> wrote:
> On Fri, Jan 13, 2023, Zhi Wang wrote:
> > On Thu, 12 Jan 2023 08:31:26 -0800 [email protected] wrote:
> > > +static void tdx_reclaim_td_page(unsigned long td_page_pa)
> > > +{
> > > + if (!td_page_pa)
> > > + return;
> > > + /*
> > > + * TDCX are being reclaimed. TDX module maps TDCX with HKID
> > > + * assigned to the TD. Here the cache associated to the TD
> > > + * was already flushed by TDH.PHYMEM.CACHE.WB before here,
> > > So
> > > + * cache doesn't need to be flushed again.
> > > + */
> > > + if (WARN_ON(tdx_reclaim_page(td_page_pa, false, 0)))
>
> The WARN_ON() can go, tdx_reclaim_page() has WARN_ON_ONCE() +
> pr_tdx_error() in all error paths.
>
> > > + /* If reclaim failed, leak the page. */
> >
> > Better add a FIXME: here as this has to be fixed later.
>
> No, leaking the page is all KVM can reasonably do here. An improved
> comment would be helpful, but no code change is required.
> tdx_reclaim_page() returns an error if and only if there's an
> unexpected, fatal error, e.g. a SEAMCALL with bad params, incorrect
> concurrency in KVM, a TDX Module bug, etc. Retrying at a later point is
> highly unlikely to be successful.
Hi:
The word "leaking" sounds like a situation left unhandled temporarily.
I checked the source code of the TDX module[1] for the possible reasons for
failure when reviewing this patch:

tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_reclaim.c
tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_wbinvd.c

a. Invalid parameters. For example, the page is not aligned, the PA HKID is not zero...

For invalid parameters, a WARN_ON_ONCE() + return value is good enough, as that
is how the kernel handles similar situations. The caller takes the
responsibility.

b. Locks have been taken in the TDX module. The TDR page has been locked due to
another SEAMCALL, another SEAMCALL is doing a PAMT walk and holding the PAMT lock...

This needs to be improved later, either by retrying or by taking tdx_lock to
avoid the TDX module failing on this.
[1] https://www.intel.com/content/www/us/en/download/738875/738876/intel-trust-domain-extension-intel-tdx-module.html
On Sat, Jan 14, 2023, Zhi Wang wrote:
> On Fri, 13 Jan 2023 15:16:08 +0000
> Sean Christopherson <[email protected]> wrote:
>
> > On Fri, Jan 13, 2023, Zhi Wang wrote:
> > > Better add a FIXME: here as this has to be fixed later.
> >
> > No, leaking the page is all KVM can reasonably do here. An improved
> > comment would be helpful, but no code change is required.
> > tdx_reclaim_page() returns an error if and only if there's an
> > unexpected, fatal error, e.g. a SEAMCALL with bad params, incorrect
> > concurrency in KVM, a TDX Module bug, etc. Retrying at a later point is
> > highly unlikely to be successful.
>
> Hi:
>
> The word "leaking" sounds like a situation left unhandled temporarily.
>
> I checked the source code of the TDX module[1] for the possible reason to
> fail when reviewing this patch:
>
> tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_reclaim.c
> tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_wbinvd.c
>
> a. Invalid parameters. For example, page is not aligned, PA HKID is not zero...
>
> For invalid parameters, a WARN_ON_ONCE() + return value is good enough as
> that is how kernel handles similar situations. The caller takes the
> responsibility.
>
> b. Locks has been taken in TDX module. TDR page has been locked due to another
> SEAMCALL, another SEAMCALL is doing PAMT walk and holding PAMT lock...
>
> This needs to be improved later either by retry or taking tdx_lock to avoid
> TDX module fails on this.
No, tdx_reclaim_page() already retries TDH.PHYMEM.PAGE.RECLAIM if the target page
is contended (though I'd question the validity of even that), and TDH.PHYMEM.PAGE.WBINVD
is performed only when reclaiming the TDR. If there's contention when reclaiming
the TDR, then KVM effectively has a use-after-free bug, i.e. leaking the page is
the least of our worries.
On Tue, 17 Jan 2023 15:55:53 +0000
Sean Christopherson <[email protected]> wrote:
> On Sat, Jan 14, 2023, Zhi Wang wrote:
> > On Fri, 13 Jan 2023 15:16:08 +0000
> > Sean Christopherson <[email protected]> wrote:
> >
> > > On Fri, Jan 13, 2023, Zhi Wang wrote:
> > > > Better add a FIXME: here as this has to be fixed later.
> > >
> > > No, leaking the page is all KVM can reasonably do here. An improved
> > > comment would be helpful, but no code change is required.
> > > tdx_reclaim_page() returns an error if and only if there's an
> > > unexpected, fatal error, e.g. a SEAMCALL with bad params, incorrect
> > > concurrency in KVM, a TDX Module bug, etc. Retrying at a later point is
> > > highly unlikely to be successful.
> >
> > Hi:
> >
> > The word "leaking" sounds like a situation left unhandled temporarily.
> >
> > I checked the source code of the TDX module[1] for the possible reason to
> > fail when reviewing this patch:
> >
> > tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_reclaim.c
> > tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_wbinvd.c
> >
> > a. Invalid parameters. For example, page is not aligned, PA HKID is not zero...
> >
> > For invalid parameters, a WARN_ON_ONCE() + return value is good enough as
> > that is how kernel handles similar situations. The caller takes the
> > responsibility.
> >
> > b. Locks has been taken in TDX module. TDR page has been locked due to another
> > SEAMCALL, another SEAMCALL is doing PAMT walk and holding PAMT lock...
> >
> > This needs to be improved later either by retry or taking tdx_lock to avoid
> > TDX module fails on this.
>
> No, tdx_reclaim_page() already retries TDH.PHYMEM.PAGE.RECLAIM if the target page
> is contended (though I'd question the validity of even that), and TDH.PHYMEM.PAGE.WBINVD
> is performed only when reclaiming the TDR. If there's contention when reclaiming
> the TDR, then KVM effectively has a use-after-free bug, i.e. leaking the page is
> the least of our worries.
>
Hi:
Thanks for the reply. "Leaking" is the consequence of even the retry failing; I
agree with this. But I was questioning whether "retry" is really the correct and
only solution when encountering lock contention in the TDX module, as quite a
few magic numbers are going to be introduced because of "retry", and there were
discussions about whether the retry count should be 3 or 1000 in the TDX guest
on Hyper-V patches. It doesn't sound right.

Compared to a typical *kernel lock* case, an execution path can wait on a
waitqueue and later be woken up. We usually do contention-wait-and-retry; we
rarely just hit contention and retry X times. In the TDX case, I understand that
it is hard for the TDX module to provide a similar solution, as an execution
path can't stay long in the TDX module.

1) We can always take tdx_lock (a Linux kernel lock) when calling a SEAMCALL
that touches the TDX internal locks. But the downside is that we might lose some
concurrency.

2) As the TDX module doesn't provide contention-and-wait, I guess the following
approach might have been discussed when designing this "retry".
 KERNEL                           TDX MODULE

 SEAMCALL A ->                    PATH A: Taking locks

 SEAMCALL B ->                    PATH B: Contention on a lock

            <-                    Return "operand busy"

 SEAMCALL B -|
             | <- Wait on a kernel waitqueue
 SEAMCALL B <-|

 SEAMCALL A <-                    PATH A: Return

 SEAMCALL A -|
             | <- Wake up the waitqueue
 SEAMCALL A <-|

 SEAMCALL B ->                    PATH B: Taking the locks
 ...

Why wasn't this scheme chosen?
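
For the sake of discussion, a rough sketch of the idea above (purely
illustrative; __seamcall_raw() and seamcall_busy() are hypothetical names, not
existing KVM/TDX interfaces):

	static DECLARE_WAIT_QUEUE_HEAD(tdx_seamcall_wq);
	static atomic_t tdx_seamcalls_in_flight = ATOMIC_INIT(0);

	static u64 tdx_seamcall_wait(u64 op, struct tdx_module_output *out)
	{
		u64 err;

		for (;;) {
			atomic_inc(&tdx_seamcalls_in_flight);
			err = __seamcall_raw(op, out);	/* hypothetical raw SEAMCALL wrapper */
			atomic_dec(&tdx_seamcalls_in_flight);

			/* Wake up anyone who backed off while this SEAMCALL ran. */
			wake_up_all(&tdx_seamcall_wq);

			if (!seamcall_busy(err))	/* hypothetical TDX_OPERAND_BUSY check */
				return err;

			/* Back off until the in-flight SEAMCALLs have completed. */
			wait_event(tdx_seamcall_wq,
				   !atomic_read(&tdx_seamcalls_in_flight));
		}
	}

This obviously only works in contexts that are allowed to sleep.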
On Tue, Jan 17, 2023, Zhi Wang wrote:
> On Tue, 17 Jan 2023 15:55:53 +0000
> Sean Christopherson <[email protected]> wrote:
>
> > On Sat, Jan 14, 2023, Zhi Wang wrote:
> > On Fri, 13 Jan 2023 15:16:08 +0000
> > Sean Christopherson <[email protected]> wrote:
> > >
> > > > On Fri, Jan 13, 2023, Zhi Wang wrote:
> > > > > Better add a FIXME: here as this has to be fixed later.
> > > >
> > > > No, leaking the page is all KVM can reasonably do here. An improved
> > > > comment would be helpful, but no code change is required.
> > > > tdx_reclaim_page() returns an error if and only if there's an
> > > > unexpected, fatal error, e.g. a SEAMCALL with bad params, incorrect
> > > > concurrency in KVM, a TDX Module bug, etc. Retrying at a later point is
> > > > highly unlikely to be successful.
> > >
> > > Hi:
> > >
> > > The word "leaking" sounds like a situation left unhandled temporarily.
> > >
> > > I checked the source code of the TDX module[1] for the possible reason to
> > > fail when reviewing this patch:
> > >
> > > tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_reclaim.c
> > > tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_wbinvd.c
> > >
> > > a. Invalid parameters. For example, page is not aligned, PA HKID is not zero...
> > >
> > > For invalid parameters, a WARN_ON_ONCE() + return value is good enough as
> > > that is how kernel handles similar situations. The caller takes the
> > > responsibility.
> > >
> > > b. Locks has been taken in TDX module. TDR page has been locked due to another
> > > SEAMCALL, another SEAMCALL is doing PAMT walk and holding PAMT lock...
> > >
> > > This needs to be improved later either by retry or taking tdx_lock to avoid
> > > TDX module fails on this.
> >
> > No, tdx_reclaim_page() already retries TDH.PHYMEM.PAGE.RECLAIM if the target page
> > is contended (though I'd question the validity of even that), and TDH.PHYMEM.PAGE.WBINVD
> > is performed only when reclaiming the TDR. If there's contention when reclaiming
> > the TDR, then KVM effectively has a use-after-free bug, i.e. leaking the page is
> > the least of our worries.
> >
>
> Hi:
>
> Thanks for the reply. "Leaking" is the consquence of even failing in retry. I
> agree with this. But I was questioning if "retry" is really a correct and only
> solution when encountering lock contention in the TDX module as I saw that there
> are quite some magic numbers are going to be introduced because of "retry" and
> there were discussions about times of retry should be 3 or 1000 in TDX guest
> on hyper-V patches. It doesn't sound right.
Ah, yeah, I'm speaking only with respect to leaking pages on failure in this
specific scenario.
> Compare to an typical *kernel lock* case, an execution path can wait on a
> waitqueue and later will be woken up. We usually do contention-wait-and-retry
> and we rarely just do contention and retry X times. In TDX case, I understand
> that it is hard for the TDX module to provide similar solutions as an execution
> path can't stay long in the TDX module.
Let me preface the below comments by saying that this is the first time that I've
seen the "Single-Step and Zero-Step Attacks Mitigation Mechanisms" behavior, i.e.
the first time I've been made aware that the TDX Module can apparently decide
to take S-EPT locks in the VM-Enter path.
>
> 1) We can always take tdx_lock (linux kernel lock) when calling a SEAMCALL
> that touch the TDX internal locks. But the downside is we might lose some
> concurrency.
This isn't really feasible in practice. Even if tdx_lock were made a spinlock
(it's currently a mutex) so that it could be taken inside kvm->mmu_lock,
acquiring a per-VM lock, let alone a global lock, in KVM's page fault handling
path is not an option. KVM has a hard requirement these days of being able to
handle multiple page faults in parallel.
> 2) As TDX module doesn't provide contention-and-wait, I guess the following
> approach might have been discussed when designing this "retry".
>
> KERNEL TDX MODULE
>
> SEAMCALL A -> PATH A: Taking locks
>
> SEAMCALL B -> PATH B: Contention on a lock
>
> <- Return "operand busy"
>
> SEAMCALL B -|
> | <- Wait on a kernel waitqueue
> SEAMCALL B <-|
>
> SEAMCALL A <- PATH A: Return
>
> SEAMCALL A -|
> | <- Wake up the waitqueue
> SEMACALL A <-|
>
> SEAMCALL B -> PATH B: Taking the locks
> ...
>
> Why not this scheme wasn't chosen?
AFAIK, I don't think a waitqueue approach has ever been discussed publicly. Intel
may have considered the idea internally, but I don't recall anything being proposed
publicly (though it's entirely possible I just missed the discussion).
Anyway, I don't think a waitqueue would be a good fit, at least not for S-EPT
management, which AFAICT is the only scenario where KVM does the arbitrary "retry
X times and hope things work". If the contention occurs due to the TDX Module
taking an S-EPT lock in VM-Enter, then KVM won't get a chance to do the "Wake up
the waitqueue" action until the next VM-Exit, which IIUC is well after the TDX
Module drops the S-EPT lock. In other words, immediately retrying and then punting
the problem further up the stack in KVM does seem to be the least awful "solution"
if there's contention.
That said, I 100% agree that the arbitrary retry stuff is awful. The zero-step
interaction in particular isn't acceptable.
Intel folks, encountering "TDX_OPERAND_BUSY | TDX_OPERAND_ID_SEPT" on VM-Enter
needs to be treated as a KVM bug, even if it means teaching KVM to kill the VM
if a vCPU is on the cusp of triggering the "pre-determined number" of EPT faults
mentioned in this snippet:
After a pre-determined number of such EPT violations occur on the same instruction,
the TDX module starts tracking the GPAs that caused Secure EPT faults and fails
further host VMM attempts to enter the TD VCPU unless previously faulting private
GPAs are properly mapped in the Secure EPT.
If the "pre-determined number" is too low to avoid false positives, e.g. if it can
be tripped by a vCPU spinning while a different vCPU finishes handling a fault,
then either the behavior of the TDX Module needs to be revisited, or KVM needs to
stall vCPUs that are approaching the threshold until all outstanding S-EPT faults
have been serviced.
KVM shouldn't be spuriously zapping private S-EPT entries since the backing memory
is pinned, which means the only way for a vCPU to truly get stuck faulting on a
single instruction is if userspace is broken/malicious, in which case userspace
gets to keep the pieces.
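
For illustration only, a very rough sketch of the "stall vCPUs approaching the
threshold" idea (all of the names and the slack value below are made up; the
actual zero-step threshold is internal to the TDX Module, so KVM would have to
track its own conservative count):

	#define TDX_ZERO_STEP_SLACK	8	/* arbitrary, purely for illustration */

	static void tdx_stall_if_near_zero_step(struct kvm_vcpu *vcpu)
	{
		struct vcpu_tdx *tdx = to_tdx(vcpu);
		struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

		if (tdx->same_instr_fault_count < TDX_ZERO_STEP_SLACK)
			return;

		/*
		 * Don't re-enter the guest until all in-flight S-EPT faults
		 * have been serviced, so that the next fault on this
		 * instruction can't push the vCPU over the threshold.
		 */
		wait_event(kvm_tdx->sept_fault_wq,
			   !atomic_read(&kvm_tdx->nr_pending_sept_faults));
		tdx->same_instr_fault_count = 0;
	}

called from the vCPU run loop before VM-Enter, where sleeping is fine.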
On Tue, Jan 17, 2023, Sean Christopherson wrote:
> On Tue, Jan 17, 2023, Zhi Wang wrote:
> > 2) As TDX module doesn't provide contention-and-wait, I guess the following
> > approach might have been discussed when designing this "retry".
> >
> > KERNEL TDX MODULE
> >
> > SEAMCALL A -> PATH A: Taking locks
> >
> > SEAMCALL B -> PATH B: Contention on a lock
> >
> > <- Return "operand busy"
> >
> > SEAMCALL B -|
> > | <- Wait on a kernel waitqueue
> > SEAMCALL B <-|
> >
> > SEAMCALL A <- PATH A: Return
> >
> > SEAMCALL A -|
> > | <- Wake up the waitqueue
> > SEMACALL A <-|
> >
> > SEAMCALL B -> PATH B: Taking the locks
> > ...
> >
> > Why not this scheme wasn't chosen?
>
> AFAIK, I don't think a waitqueue approach as ever been discussed publicly. Intel
> may have considered the idea internally, but I don't recall anything being proposed
> publically (though it's entirely possible I just missed the discussion).
>
> Anways, I don't think a waitqueue would be a good fit, at least not for S-EPT
> management, which AFAICT is the only scenario where KVM does the arbitrary "retry
> X times and hope things work". If the contention occurs due to the TDX Module
> taking an S-EPT lock in VM-Enter, then KVM won't get a chance to do the "Wake up
> the waitqueue" action until the next VM-Exit, which IIUC is well after the TDX
> Module drops the S-EPT lock. In other words, immediately retrying and then punting
> the problem further up the stack in KVM does seem to be the least awful "solution"
> if there's contention.
Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
in KVM's MMU in order to wait isn't always an option. Most flows would play nice
with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
(conditionally) disallow sleeping.
On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > 2) As TDX module doesn't provide contention-and-wait, I guess the following
> > > approach might have been discussed when designing this "retry".
> > >
> > > KERNEL TDX MODULE
> > >
> > > SEAMCALL A -> PATH A: Taking locks
> > >
> > > SEAMCALL B -> PATH B: Contention on a lock
> > >
> > > <- Return "operand busy"
> > >
> > > SEAMCALL B -|
> > > | <- Wait on a kernel waitqueue
> > > SEAMCALL B <-|
> > >
> > > SEAMCALL A <- PATH A: Return
> > >
> > > SEAMCALL A -|
> > > | <- Wake up the waitqueue
> > > SEAMCALL A <-|
> > >
> > > SEAMCALL B -> PATH B: Taking the locks
> > > ...
> > >
> > > Why wasn't this scheme chosen?
> >
> > AFAIK, I don't think a waitqueue approach has ever been discussed publicly. Intel
> > may have considered the idea internally, but I don't recall anything being proposed
> > publicly (though it's entirely possible I just missed the discussion).
> >
> > Anyway, I don't think a waitqueue would be a good fit, at least not for S-EPT
> > management, which AFAICT is the only scenario where KVM does the arbitrary "retry
> > X times and hope things work". If the contention occurs due to the TDX Module
> > taking an S-EPT lock in VM-Enter, then KVM won't get a chance to do the "Wake up
> > the waitqueue" action until the next VM-Exit, which IIUC is well after the TDX
> > Module drops the S-EPT lock. In other words, immediately retrying and then punting
> > the problem further up the stack in KVM does seem to be the least awful "solution"
> > if there's contention.
>
> Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> (conditionally) disallow sleeping.
Could we do something similar to tdp_mmu_iter_cond_resched(), rather than simply busy
retrying "X times", at least on those paths that can release mmu_lock()?
Basically, we would treat TDX_OPERAND_BUSY as a seamcall_needbreak(), similar to
rwlock_needbreak(). I haven't thought about the details though.
On Thu, Jan 19, 2023, Huang, Kai wrote:
> On Thu, 2023-01-12 at 08:31 -0800, [email protected] wrote:
> > +static void tdx_clear_page(unsigned long page_pa)
> > +{
> > + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > + void *page = __va(page_pa);
> > + unsigned long i;
> > +
> > + if (!static_cpu_has(X86_FEATURE_MOVDIR64B)) {
> > + clear_page(page);
> > + return;
> > + }
>
> There might be below issues here:
>
> 1) The kernel says static_cpu_has() should only be used in fast paths where each
> cycle is counted, otherwise use boot_cpu_has(). I don't know whether here you
> should use static_cpu_has().
That documentation is stale[*], go ahead and use cpu_feature_enabled().
https://lore.kernel.org/all/[email protected]
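For illustration, a minimal sketch of what the check could look like with
cpu_feature_enabled(); the MOVDIR64B loop from the original patch is elided and
otherwise unchanged:

	static void tdx_clear_page(unsigned long page_pa)
	{
		void *page = __va(page_pa);

		/* Fall back to a regular clear if MOVDIR64B isn't available. */
		if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
			clear_page(page);
			return;
		}

		/* ... MOVDIR64B loop from the original patch ... */
	}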
> 2) IIUC a CPU feature bit can be cleared by 'clearcpuid=xxx' kernel command
As you note below, using clearcpuid taints the kernel, i.e. any breakage due to
clearcpuid is user error.
> line, so looks you should use CPUID directly otherwise the MOVDIR64B below can
> be unintentionally skipped. In practice w/o doing MOVDIR64B is fine since KeyID
> 0 doesn't have integrity enabled, but for the purpose you want to achieve
> checking real CPUID should be better.
>
> But maybe you don't want to do CPUID check here each time when reclaiming a
> page. In that case you can do CPUID during module initialization and cache
> whether MOVDIR64B is truly present. static_key is a good fit for this purpose
> too I think.
>
> But I am also seeing below in the kernel documentation:
>
> clearcpuid=X[,X...] [X86]
> ......
> Note that using this option will taint your kernel.
> Also note that user programs calling CPUID directly
> or using the feature without checking anything
> will still see it. This just prevents it from
> being used by the kernel or shown in /proc/cpuinfo.
> Also note the kernel might malfunction if you disable
> some critical bits.
>
> So the kernel is claiming using this will taint the kernel and it can even
> malfunction. So maybe it's OK to use static_cpu_has()/boot_cpu_has().
On Thu, Jan 19, 2023, Huang, Kai wrote:
> On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > 2) As TDX module doesn't provide contention-and-wait, I guess the following
> > > > approach might have been discussed when designing this "retry".
> > > >
> > > > KERNEL TDX MODULE
> > > >
> > > > SEAMCALL A -> PATH A: Taking locks
> > > >
> > > > SEAMCALL B -> PATH B: Contention on a lock
> > > >
> > > > <- Return "operand busy"
> > > >
> > > > SEAMCALL B -|
> > > > | <- Wait on a kernel waitqueue
> > > > SEAMCALL B <-|
> > > >
> > > > SEAMCALL A <- PATH A: Return
> > > >
> > > > SEAMCALL A -|
> > > > | <- Wake up the waitqueue
> > > > SEAMCALL A <-|
> > > >
> > > > SEAMCALL B -> PATH B: Taking the locks
> > > > ...
> > > >
> > > > Why wasn't this scheme chosen?
> > >
> > > AFAIK, I don't think a waitqueue approach has ever been discussed publicly. Intel
> > > may have considered the idea internally, but I don't recall anything being proposed
> > > publicly (though it's entirely possible I just missed the discussion).
> > >
> > > Anyway, I don't think a waitqueue would be a good fit, at least not for S-EPT
> > > management, which AFAICT is the only scenario where KVM does the arbitrary "retry
> > > X times and hope things work". If the contention occurs due to the TDX Module
> > > taking an S-EPT lock in VM-Enter, then KVM won't get a chance to do the "Wake up
> > > the waitqueue" action until the next VM-Exit, which IIUC is well after the TDX
> > > Module drops the S-EPT lock. In other words, immediately retrying and then punting
> > > the problem further up the stack in KVM does seem to be the least awful "solution"
> > > if there's contention.
> >
> > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > (conditionally) disallow sleeping.
>
> Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> retrying "X times", at least at those paths that can release mmu_lock()?
That's effectively what happens by unwinding up the stack with an error code.
Eventually the page fault handler will get the error and retry the guest.
> Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> rwlock_needbreak(). I haven't thought about details though.
I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
with a bunch of retry logic and error handling just because the TDX module is
ultra paranoid and hostile to hypervisors.
The problematic scenario of faulting indefinitely on a single instruction should
never happen under normal circumstances, and so KVM should treat such scenarios
as attacks/breakage and pass the buck to userspace.
On Thu, 2023-01-19 at 15:29 +0000, Sean Christopherson wrote:
> On Thu, Jan 19, 2023, Huang, Kai wrote:
> > On Thu, 2023-01-12 at 08:31 -0800, [email protected] wrote:
> > > +static void tdx_clear_page(unsigned long page_pa)
> > > +{
> > > + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > > + void *page = __va(page_pa);
> > > + unsigned long i;
> > > +
> > > + if (!static_cpu_has(X86_FEATURE_MOVDIR64B)) {
> > > + clear_page(page);
> > > + return;
> > > + }
> >
> > There might be below issues here:
> >
> > 1) The kernel says static_cpu_has() should only be used in fast paths where each
> > cycle is counted, otherwise use boot_cpu_has(). I don't know whether here you
> > should use static_cpu_has().
>
> That documentation is stale[*], go ahead and use cpu_feature_enabled().
>
> https://lore.kernel.org/all/[email protected]
Thanks for the info :)
>
> > 2) IIUC a CPU feature bit can be cleared by 'clearcpuid=xxx' kernel command
>
> As you note below, using clearcpuid taints the kernel, i.e. any breakage due to
> clearcpuid is user error.
Agreed.
On Thu, 2023-01-19 at 15:37 +0000, Sean Christopherson wrote:
> On Thu, Jan 19, 2023, Huang, Kai wrote:
> > On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > > 2) As TDX module doesn't provide contention-and-wait, I guess the following
> > > > > approach might have been discussed when designing this "retry".
> > > > >
> > > > > KERNEL TDX MODULE
> > > > >
> > > > > SEAMCALL A -> PATH A: Taking locks
> > > > >
> > > > > SEAMCALL B -> PATH B: Contention on a lock
> > > > >
> > > > > <- Return "operand busy"
> > > > >
> > > > > SEAMCALL B -|
> > > > > | <- Wait on a kernel waitqueue
> > > > > SEAMCALL B <-|
> > > > >
> > > > > SEAMCALL A <- PATH A: Return
> > > > >
> > > > > SEAMCALL A -|
> > > > > | <- Wake up the waitqueue
> > > > > SEAMCALL A <-|
> > > > >
> > > > > SEAMCALL B -> PATH B: Taking the locks
> > > > > ...
> > > > >
> > > > > Why wasn't this scheme chosen?
> > > >
> > > > AFAIK, I don't think a waitqueue approach has ever been discussed publicly. Intel
> > > > may have considered the idea internally, but I don't recall anything being proposed
> > > > publicly (though it's entirely possible I just missed the discussion).
> > > >
> > > > Anyway, I don't think a waitqueue would be a good fit, at least not for S-EPT
> > > > management, which AFAICT is the only scenario where KVM does the arbitrary "retry
> > > > X times and hope things work". If the contention occurs due to the TDX Module
> > > > taking an S-EPT lock in VM-Enter, then KVM won't get a chance to do the "Wake up
> > > > the waitqueue" action until the next VM-Exit, which IIUC is well after the TDX
> > > > Module drops the S-EPT lock. In other words, immediately retrying and then punting
> > > > the problem further up the stack in KVM does seem to be the least awful "solution"
> > > > if there's contention.
> > >
> > > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > > (conditionally) disallow sleeping.
> >
> > Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> > retrying "X times", at least at those paths that can release mmu_lock()?
>
> That's effectively what happens by unwinding up the stack with an error code.
> Eventually the page fault handler will get the error and retry the guest.
>
> > Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> > rwlock_needbreak(). I haven't thought about details though.
>
> I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
> with a bunch of retry logic and error handling just because the TDX module is
> ultra paranoid and hostile to hypervisors.
Right. But IIUC there are legitimate cases where a SEPT SEAMCALL can return BUSY due to
multiple threads trying to read/modify the SEPT simultaneously in the case of the TDP
MMU, for instance parallel page faults on different vCPUs on private pages. I
believe this is the main reason to retry. We previously used a spinlock around
the SEAMCALLs to avoid this, but it looks like that is not preferred.
>
> The problematic scenario of faulting indefinitely on a single instruction should
> never happen under normal circumstances, and so KVM should treat such scenarios
> as attacks/breakage and pass the buck to userspace.
Totally agree that the zero-step attack scenario can be treated as a KVM bug.
On Thu, Jan 19, 2023, Huang, Kai wrote:
> On Thu, 2023-01-19 at 15:37 +0000, Sean Christopherson wrote:
> > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > > > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > > > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > > > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > > > (conditionally) disallow sleeping.
> > >
> > > Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> > > retrying "X times", at least at those paths that can release mmu_lock()?
> >
> > That's effectively what happens by unwinding up the stack with an error code.
> > Eventually the page fault handler will get the error and retry the guest.
> >
> > > Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> > > rwlock_needbreak(). I haven't thought about details though.
> >
> > I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
> > with a bunch of retry logic and error handling just because the TDX module is
> > ultra paranoid and hostile to hypervisors.
>
> Right. But IIUC there's legal cases that SEPT SEAMCALL can return BUSY due to
> multiple threads trying to read/modify SEPT simultaneously in case of TDP MMU.
> For instance, parallel page faults on different vcpus on private pages. I
> believe this is the main reason to retry.
Um, crud. I think there's a bigger issue. KVM always operates on its copy of the
S-EPT tables and assumes that the real S-EPT tables will always be synchronized with
KVM's mirror. That assumption doesn't hold true without serializing SEAMCALLs in
some way. E.g. if a SPTE is zapped and mapped at the same time, we can end up with:
vCPU0 vCPU1
===== =====
mirror[x] = xyz
old_spte = mirror[x]
mirror[x] = REMOVED_SPTE
sept[x] = REMOVED_SPTE
sept[x] = xyz
In other words, when mmu_lock is held for read, KVM relies on atomic SPTE updates.
With the mirror=>s-ept scheme, updates are no longer atomic and everything falls
apart.
Gracefully retrying only papers over the visible failures, the really problematic
scenarios are where multiple updates race and _don't_ trigger conflicts in the TDX
module.
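To make the atomicity point concrete, here's a rough sketch (hypothetical helper,
not the actual patch) of the update sequence the TDP MMU relies on when mmu_lock is
held for read; the SEAMCALL that propagates the change to the real S-EPT is exactly
the part that is NOT covered by the CMPXCHG:

	/*
	 * Sketch only: the mirror SPTE is installed with a compare-and-exchange,
	 * so a racing writer loses cleanly at the CMPXCHG.  The S-EPT SEAMCALL
	 * runs after this point and is not part of the atomic update.
	 */
	static int mirror_set_spte_atomic(u64 *sptep, u64 old_spte, u64 new_spte)
	{
		if (!try_cmpxchg64(sptep, &old_spte, new_spte))
			return -EBUSY;	/* lost the race on KVM's mirror */

		/* SEAMCALL to update the real S-EPT would go here (not shown). */
		return 0;
	}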
> We previously used spinlock around the SEAMCALLs to avoid, but looks that is
> not preferred.
That doesn't address the race above either. And even if it did, serializing all
S-EPT SEAMCALLs for a VM is not an option, at least not in the long term.
The least invasive idea I have is expand the TDP MMU's concept of "frozen" SPTEs
and freeze (a.k.a. lock) the SPTE (KVM's mirror) until the corresponding S-EPT
update completes.
The other idea is to scrap the mirror concept entirely, though I gotta imagine
that would provide pretty awful performance.
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 0d8deefee66c..bcb398e71475 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -198,9 +198,9 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
-static inline bool is_removed_spte(u64 spte)
+static inline bool is_frozen_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return spte == REMOVED_SPTE || spte & FROZEN_SPTE;
}
/* Get an SPTE's index into its parent's page table (and the spt array). */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bba33aea0fb0..7f34eccadf98 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -651,6 +651,9 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
+ if (<is TDX> && new_spte != REMOVED_SPTE)
+ new_spte |= FROZEN_SPTE;
+
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
* does not hold the mmu_lock.
@@ -662,6 +665,9 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
new_spte, iter->level, true);
handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+ if (<is TDX> && new_spte != REMOVED_SPTE)
+ __kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
+
return 0;
}
On Tue, 17 Jan 2023 20:56:17 +0000
Sean Christopherson <[email protected]> wrote:
> On Tue, Jan 17, 2023, Zhi Wang wrote:
> > On Tue, 17 Jan 2023 15:55:53 +0000
> > Sean Christopherson <[email protected]> wrote:
> >
> > > On Sat, Jan 14, 2023, Zhi Wang wrote:
> > > > On Fri, 13 Jan 2023 15:16:08 +0000, Sean Christopherson <[email protected]> wrote:
> > > >
> > > > > On Fri, Jan 13, 2023, Zhi Wang wrote:
> > > > > > Better add a FIXME: here as this has to be fixed later.
> > > > >
> > > > > No, leaking the page is all KVM can reasonably do here. An improved
> > > > > comment would be helpful, but no code change is required.
> > > > > tdx_reclaim_page() returns an error if and only if there's an
> > > > > unexpected, fatal error, e.g. a SEAMCALL with bad params, incorrect
> > > > > concurrency in KVM, a TDX Module bug, etc. Retrying at a later point is
> > > > > highly unlikely to be successful.
> > > >
> > > > Hi:
> > > >
> > > > The word "leaking" sounds like a situation left unhandled temporarily.
> > > >
> > > > I checked the source code of the TDX module[1] for the possible reason to
> > > > fail when reviewing this patch:
> > > >
> > > > tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_reclaim.c
> > > > tdx-module-v1.0.01.01.zip\src\vmm_dispatcher\api_calls\tdh_phymem_page_wbinvd.c
> > > >
> > > > a. Invalid parameters. For example, page is not aligned, PA HKID is not zero...
> > > >
> > > > For invalid parameters, a WARN_ON_ONCE() + return value is good enough as
> > > > that is how kernel handles similar situations. The caller takes the
> > > > responsibility.
> > > >
> > > > b. Locks has been taken in TDX module. TDR page has been locked due to another
> > > > SEAMCALL, another SEAMCALL is doing PAMT walk and holding PAMT lock...
> > > >
> > > > This needs to be improved later either by retry or taking tdx_lock to avoid
> > > > TDX module fails on this.
> > >
> > > No, tdx_reclaim_page() already retries TDH.PHYMEM.PAGE.RECLAIM if the target page
> > > is contended (though I'd question the validity of even that), and TDH.PHYMEM.PAGE.WBINVD
> > > is performed only when reclaiming the TDR. If there's contention when reclaiming
> > > the TDR, then KVM effectively has a use-after-free bug, i.e. leaking the page is
> > > the least of our worries.
> > >
> >
> > Hi:
> >
> > Thanks for the reply. "Leaking" is the consequence of even failing in retry. I
> > agree with this. But I was questioning if "retry" is really a correct and only
> > solution when encountering lock contention in the TDX module as I saw that there
> > are quite some magic numbers are going to be introduced because of "retry" and
> > there were discussions about times of retry should be 3 or 1000 in TDX guest
> > on hyper-V patches. It doesn't sound right.
>
> Ah, yeah, I'm speaking only with respect to leaking pages on failure in this
> specific scenario.
>
> > Compare to a typical *kernel lock* case, an execution path can wait on a
> > waitqueue and later will be woken up. We usually do contention-wait-and-retry
> > and we rarely just do contention and retry X times. In TDX case, I understand
> > that it is hard for the TDX module to provide similar solutions as an execution
> > path can't stay long in the TDX module.
>
> Let me preface the below comments by saying that this is the first time that I've
> seen the "Single-Step and Zero-Step Attacks Mitigation Mechanisms" behavior, i.e.
> the first time I've been made aware that the TDX Module can apparently decide
> to take S-EPT locks in the VM-Enter path.
>
> >
> > 1) We can always take tdx_lock (linux kernel lock) when calling a SEAMCALL
> > that touch the TDX internal locks. But the downside is we might lose some
> > concurrency.
>
> This isn't really feasible in practice. Even if tdx_lock were made a spinlock
> (it's currently a mutex) so that it could be taken inside kvm->mmu_lock,
> acquiring a per-VM lock, let alone a global lock, in KVM's page fault handling
> path is not an option. KVM has a hard requirement these days of being able to
> handle multiple page faults in parallel.
>
> > 2) As TDX module doesn't provide contention-and-wait, I guess the following
> > approach might have been discussed when designing this "retry".
> >
> > KERNEL TDX MODULE
> >
> > SEAMCALL A -> PATH A: Taking locks
> >
> > SEAMCALL B -> PATH B: Contention on a lock
> >
> > <- Return "operand busy"
> >
> > SEAMCALL B -|
> > | <- Wait on a kernel waitqueue
> > SEAMCALL B <-|
> >
> > SEAMCALL A <- PATH A: Return
> >
> > SEAMCALL A -|
> > | <- Wake up the waitqueue
> > SEAMCALL A <-|
> >
> > SEAMCALL B -> PATH B: Taking the locks
> > ...
> >
> > Why wasn't this scheme chosen?
>
> AFAIK, I don't think a waitqueue approach has ever been discussed publicly. Intel
> may have considered the idea internally, but I don't recall anything being proposed
> publicly (though it's entirely possible I just missed the discussion).
>
> Anyway, I don't think a waitqueue would be a good fit, at least not for S-EPT
> management, which AFAICT is the only scenario where KVM does the arbitrary "retry
> X times and hope things work". If the contention occurs due to the TDX Module
> taking an S-EPT lock in VM-Enter, then KVM won't get a chance to do the "Wake up
> the waitqueue" action until the next VM-Exit, which IIUC is well after the TDX
> Module drops the S-EPT lock. In other words, immediately retrying and then punting
> the problem further up the stack in KVM does seem to be the least awful "solution"
> if there's contention.
>
I should have put "waitqueue or spin" in the chart above to prevent misunderstanding.
I agree that S-EPT should be thought of and treated differently from the others, as
any locking change on that path can affect parallelization and performance. There are
many internal locks protecting private data structures inside the security
firmware, like locks for TDR, TDCS, TDVPS and PAMT (mostly S-EPT related). Any
contention on these might result in a BUSY.
What I would like to clarify is: it would be better to have a mechanism to make sure a
SEAMCALL can succeed at a certain point (or at least keeps trying to succeed), as a
magic retry count would work like a lottery in reality. E.g. the CPU doing the retries
might be running at a higher frequency than the CPU on the execution path that took
the TDX-internal locks, due to different P-states, so the retry loop might end much
earlier than expected. That's before even considering the same kernel binary, with the
same pre-determined retry count, running on different CPU SKUs with different
frequencies.
We sometimes retry in a driver to wait for HW state to become ready because the clock
of a device might be fixed or predictable, but I doubt a retry loop works in the case
mentioned above.
It still looks like a temporary workaround to me and I think we should be quite
careful with it.
> That said, I 100% agree that the arbitrary retry stuff is awful. The zero-step
> interaction in particular isn't acceptable.
>
> Intel folks, encountering "TDX_OPERAND_BUSY | TDX_OPERAND_ID_SEPT" on VM-Enter
> needs to be treated as a KVM bug, even if it means teaching KVM to kill the VM
> if a vCPU is on the cusp of triggering the "pre-determined number" of EPT faults
> mentioned in this snippet:
>
> After a pre-determined number of such EPT violations occur on the same instruction,
> the TDX module starts tracking the GPAs that caused Secure EPT faults and fails
> further host VMM attempts to enter the TD VCPU unless previously faulting private
> GPAs are properly mapped in the Secure EPT.
>
> If the "pre-determined number" is too low to avoid false positives, e.g. if it can
> be tripped by a vCPU spinning while a different vCPU finishes handling a fault,
> then either the behavior of the TDX Module needs to be revisited, or KVM needs to
> stall vCPUs that are approaching the threshold until all outstanding S-EPT faults
> have been serviced.
>
That hits the point exactly. Let's keep this in mind and start from here.
I also saw that the AMD SEV-SNP host patches use a retry count like (2 * cpu cores),
but I haven't seen any explanation in the code or comments yet of how that is
supposed to work in reality. I will ask this question in that patch review. I'm
starting to feel this is a common problem that needs to be sorted out.
> KVM shouldn't be spuriously zapping private S-EPT entries since the backing memory
> is pinned, which means the only way for a vCPU to truly get stuck faulting on a
> single instruction is if userspace is broken/malicious, in which case userspace
> gets to keep the pieces.
On Fri, Jan 20, 2023, Zhi Wang wrote:
> On Tue, 17 Jan 2023 20:56:17 +0000 Sean Christopherson <[email protected]> wrote:
> What I would like to clarify is: it would be better to have a mechanism to make sure a
> SEAMCALL can succeed at a certain point (or at least keeps trying to succeed), as a
> magic retry count would work like a lottery in reality.
Yep, you and I are in agreement on this point. Ideally, KVM would be able to
guarantee success and treat TDX_OPERAND_BUSY errors as KVM bugs.
On Thu, Jan 19, 2023, Huang, Kai wrote:
> On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > On Thu, 2023-01-19 at 15:37 +0000, Sean Christopherson wrote:
> > > > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > > > On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > > > > > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > > > > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > > > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > > > > > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > > > > > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > > > > > (conditionally) disallow sleeping.
> > > > >
> > > > > Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> > > > > retrying "X times", at least at those paths that can release mmu_lock()?
> > > >
> > > > That's effectively what happens by unwinding up the stack with an error code.
> > > > Eventually the page fault handler will get the error and retry the guest.
> > > >
> > > > > Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> > > > > rwlock_needbreak(). I haven't thought about details though.
> > > >
> > > > I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
> > > > with a bunch of retry logic and error handling just because the TDX module is
> > > > ultra paranoid and hostile to hypervisors.
> > >
> > > Right. But IIUC there's legal cases that SEPT SEAMCALL can return BUSY due to
> > > multiple threads trying to read/modify SEPT simultaneously in case of TDP MMU.
> > > For instance, parallel page faults on different vcpus on private pages. I
> > > believe this is the main reason to retry.
> >
> > Um, crud. I think there's a bigger issue. KVM always operates on its copy of the
> > S-EPT tables and assumes that the real S-EPT tables will always be synchronized with
> > KVM's mirror. That assumption doesn't hold true without serializing SEAMCALLs in
> > some way. E.g. if a SPTE is zapped and mapped at the same time, we can end up with:
> >
> > vCPU0 vCPU1
> > ===== =====
> > mirror[x] = xyz
> > old_spte = mirror[x]
> > mirror[x] = REMOVED_SPTE
> > sept[x] = REMOVED_SPTE
> > sept[x] = xyz
>
> IIUC this case cannot happen, as the two steps in the vcpu0 are within read
> lock, which prevents from vcpu1, which holds the write lock during zapping SPTE.
Zapping SPTEs can happen while holding mmu_lock for read, e.g. see the bug fixed
by commit 21a36ac6b6c7 ("KVM: x86/mmu: Re-check under lock that TDP MMU SP hugepage
is disallowed").
On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> On Thu, Jan 19, 2023, Huang, Kai wrote:
> > On Thu, 2023-01-19 at 15:37 +0000, Sean Christopherson wrote:
> > > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > > On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > > > > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > > > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > > > > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > > > > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > > > > (conditionally) disallow sleeping.
> > > >
> > > > Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> > > > retrying "X times", at least at those paths that can release mmu_lock()?
> > >
> > > That's effectively what happens by unwinding up the stack with an error code.
> > > Eventually the page fault handler will get the error and retry the guest.
> > >
> > > > Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> > > > rwlock_needbreak(). I haven't thought about details though.
> > >
> > > I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
> > > with a bunch of retry logic and error handling just because the TDX module is
> > > ultra paranoid and hostile to hypervisors.
> >
> > Right. But IIUC there's legal cases that SEPT SEAMCALL can return BUSY due to
> > multiple threads trying to read/modify SEPT simultaneously in case of TDP MMU.
> > For instance, parallel page faults on different vcpus on private pages. I
> > believe this is the main reason to retry.
>
> Um, crud. I think there's a bigger issue. KVM always operates on its copy of the
> S-EPT tables and assumes that the real S-EPT tables will always be synchronized with
> KVM's mirror. That assumption doesn't hold true without serializing SEAMCALLs in
> some way. E.g. if a SPTE is zapped and mapped at the same time, we can end up with:
>
> vCPU0 vCPU1
> ===== =====
> mirror[x] = xyz
> old_spte = mirror[x]
> mirror[x] = REMOVED_SPTE
> sept[x] = REMOVED_SPTE
> sept[x] = xyz
IIUC this case cannot happen, as the two steps on vCPU0 are done under the read
lock, which excludes vCPU1, which holds the write lock while zapping the SPTE.
On Thu, 2023-01-19 at 23:11 +0000, Sean Christopherson wrote:
> On Thu, Jan 19, 2023, Huang, Kai wrote:
> > On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> > > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > > On Thu, 2023-01-19 at 15:37 +0000, Sean Christopherson wrote:
> > > > > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > > > > On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > > > > > > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > > > > > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > > > > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > > > > > > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > > > > > > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > > > > > > (conditionally) disallow sleeping.
> > > > > >
> > > > > > Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> > > > > > retrying "X times", at least at those paths that can release mmu_lock()?
> > > > >
> > > > > That's effectively what happens by unwinding up the stack with an error code.
> > > > > Eventually the page fault handler will get the error and retry the guest.
> > > > >
> > > > > > Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> > > > > > rwlock_needbreak(). I haven't thought about details though.
> > > > >
> > > > > I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
> > > > > with a bunch of retry logic and error handling just because the TDX module is
> > > > > ultra paranoid and hostile to hypervisors.
> > > >
> > > > Right. But IIUC there's legal cases that SEPT SEAMCALL can return BUSY due to
> > > > multiple threads trying to read/modify SEPT simultaneously in case of TDP MMU.
> > > > For instance, parallel page faults on different vcpus on private pages. I
> > > > believe this is the main reason to retry.
> > >
> > > Um, crud. I think there's a bigger issue. KVM always operates on its copy of the
> > > S-EPT tables and assumes that the real S-EPT tables will always be synchronized with
> > > KVM's mirror. That assumption doesn't hold true without serializing SEAMCALLs in
> > > some way. E.g. if a SPTE is zapped and mapped at the same time, we can end up with:
> > >
> > > vCPU0 vCPU1
> > > ===== =====
> > > mirror[x] = xyz
> > > old_spte = mirror[x]
> > > mirror[x] = REMOVED_SPTE
> > > sept[x] = REMOVED_SPTE
> > > sept[x] = xyz
> >
> > IIUC this case cannot happen, as the two steps in the vcpu0 are within read
> > lock, which prevents from vcpu1, which holds the write lock during zapping SPTE.
>
> Zapping SPTEs can happen while holding mmu_lock for read, e.g. see the bug fixed
> by commit 21a36ac6b6c7 ("KVM: x86/mmu: Re-check under lock that TDP MMU SP hugepage
> is disallowed").
Sorry, I now recall that I once had a patch to change the write path to hold
write_lock() for TDX guests to avoid such a problem. I thought it was the
upstream behaviour :)
On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> The least invasive idea I have is expand the TDP MMU's concept of "frozen" SPTEs
> and freeze (a.k.a. lock) the SPTE (KVM's mirror) until the corresponding S-EPT
> update completes.
This will introduce another "having-to-wait while SPTE is frozen" problem I
think, which IIUC means (one way is) you have to do some loop and retry, perhaps
similar to yield_safe.
>
> The other idea is to scrap the mirror concept entirely, though I gotta imagine
> that would provide pretty awful performance.
Right. I don't think this is a good option.
On Thu, Jan 19, 2023, Huang, Kai wrote:
> On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> > The least invasive idea I have is expand the TDP MMU's concept of "frozen" SPTEs
> > and freeze (a.k.a. lock) the SPTE (KVM's mirror) until the corresponding S-EPT
> > update completes.
>
> This will introduce another "having-to-wait while SPTE is frozen" problem I
> think, which IIUC means (one way is) you have to do some loop and retry, perhaps
> similar to yield_safe.
Yes, but because the TDP MMU already freezes SPTEs (just for a shorter duration),
I'm 99% sure all of the affected flows already know how to yield/bail when necessary.
The problem with the zero-step mitigation is that it could (theoretically) cause
a "busy" error on literally any accesses, which makes it infeasible for KVM to have
sane behavior. E.g. freezing SPTEs to avoid the ordering issues isn't necessary
when holding mmu_lock for write, whereas the zero-step madness brings everything
into play.
On Thu, 2023-01-19 at 23:24 +0000, Huang, Kai wrote:
> On Thu, 2023-01-19 at 23:11 +0000, Sean Christopherson wrote:
> > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> > > > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > > > On Thu, 2023-01-19 at 15:37 +0000, Sean Christopherson wrote:
> > > > > > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > > > > > On Tue, 2023-01-17 at 21:01 +0000, Sean Christopherson wrote:
> > > > > > > > On Tue, Jan 17, 2023, Sean Christopherson wrote:
> > > > > > > > > On Tue, Jan 17, 2023, Zhi Wang wrote:
> > > > > > > > Oh, the other important piece I forgot to mention is that dropping mmu_lock deep
> > > > > > > > in KVM's MMU in order to wait isn't always an option. Most flows would play nice
> > > > > > > > with dropping mmu_lock and sleeping, but some paths, e.g. from the mmu_notifier,
> > > > > > > > (conditionally) disallow sleeping.
> > > > > > >
> > > > > > > Could we do something similar to tdp_mmu_iter_cond_resched() but not simple busy
> > > > > > > retrying "X times", at least at those paths that can release mmu_lock()?
> > > > > >
> > > > > > That's effectively what happens by unwinding up the stack with an error code.
> > > > > > Eventually the page fault handler will get the error and retry the guest.
> > > > > >
> > > > > > > Basically we treat TDX_OPERAND_BUSY as seamcall_needbreak(), similar to
> > > > > > > rwlock_needbreak(). I haven't thought about details though.
> > > > > >
> > > > > > I am strongly opposed to that approach. I do not want to pollute KVM's MMU code
> > > > > > with a bunch of retry logic and error handling just because the TDX module is
> > > > > > ultra paranoid and hostile to hypervisors.
> > > > >
> > > > > Right. But IIUC there's legal cases that SEPT SEAMCALL can return BUSY due to
> > > > > multiple threads trying to read/modify SEPT simultaneously in case of TDP MMU.
> > > > > For instance, parallel page faults on different vcpus on private pages. I
> > > > > believe this is the main reason to retry.
> > > >
> > > > Um, crud. I think there's a bigger issue. KVM always operates on its copy of the
> > > > S-EPT tables and assumes that the real S-EPT tables will always be synchronized with
> > > > KVM's mirror. That assumption doesn't hold true without serializing SEAMCALLs in
> > > > some way. E.g. if a SPTE is zapped and mapped at the same time, we can end up with:
> > > >
> > > > vCPU0 vCPU1
> > > > ===== =====
> > > > mirror[x] = xyz
> > > > old_spte = mirror[x]
> > > > mirror[x] = REMOVED_SPTE
> > > > sept[x] = REMOVED_SPTE
> > > > sept[x] = xyz
> > >
> > > IIUC this case cannot happen, as the two steps in the vcpu0 are within read
> > > lock, which prevents from vcpu1, which holds the write lock during zapping SPTE.
> >
> > Zapping SPTEs can happen while holding mmu_lock for read, e.g. see the bug fixed
> > by commit 21a36ac6b6c7 ("KVM: x86/mmu: Re-check under lock that TDP MMU SP hugepage
> > is disallowed").
>
> Sorry, I now recall that I once had patch to change write path to hold
> write_lock() for TDX guest to avoid such problem. I thought it was upstream
> behaviour :)
I mean change the zap PTE path to hold write lock. :)
On Thu, Jan 19, 2023 at 4:16 PM Sean Christopherson <[email protected]> wrote:
>
> On Thu, Jan 19, 2023, Huang, Kai wrote:
> > On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> > > The least invasive idea I have is expand the TDP MMU's concept of "frozen" SPTEs
> > > and freeze (a.k.a. lock) the SPTE (KVM's mirror) until the corresponding S-EPT
> > > update completes.
> >
> > This will introduce another "having-to-wait while SPTE is frozen" problem I
> > think, which IIUC means (one way is) you have to do some loop and retry, perhaps
> > similar to yield_safe.
>
> Yes, but because the TDP MMU already freezes SPTEs (just for a shorter duration),
> I'm 99% sure all of the affected flows already know how to yield/bail when necessary.
>
> The problem with the zero-step mitigation is that it could (theoretically) cause
> a "busy" error on literally any accesses, which makes it infeasible for KVM to have
> sane behavior. E.g. freezing SPTEs to avoid the ordering issues isn't necessary
> when holding mmu_lock for write, whereas the zero-step madness brings everything
> into play.
(I'm still ramping up on TDX so apologies in advance if the following
is totally off base.)
The complexity, and to a lesser extent the memory overhead, of
mirroring Secure EPT tables with the TDP MMU makes me wonder if it is
really worth it. Especially since the initial TDX support has so many
constraints that would seem to allow a simpler implementation: all
private memory is pinned, no live migration support, no test/clear
young notifiers, etc.
For the initial version of KVM TDX support, what if we implemented the
Secure EPT management entirely off to the side? i.e. Not on top of the
TDP MMU. For example, write TDX-specific routines for:
- Fully populating the Secure EPT tree some time during VM creation.
- Tearing down the Secure EPT tree during VM destruction.
- Support for unmapping/mapping specific regions of the Secure EPT
tree for private<->shared conversions.
With that in place, KVM never would need to handle a fault on a Secure
EPT mapping. Any fault (e.g. due to an in-progress private<->shared
conversion) can just return back to the guest to retry the memory
access until the operation is complete.
If we start with only supporting 4K pages in the Secure EPT, the
Secure EPT routines described above would be almost trivial to
implement. Huge Pages would add some complexity, but I don't think it
would be terrible. Concurrency can be handled with a single lock since
we don't have to worry about concurrent faulting.
This would avoid having TDX add a bunch of complexity to the TDP MMU
(which would only be used for shared mappings). If and when we want to
have more complicated memory management for TDX private mappings, we
could revisit TDP MMU integration. But I think this design could even
get us to the point of supporting Dirty Logging (where the only fault
KVM would have to handle for TDX private mappings would be
write-protection faults). I'm not sure it would work for Demand-Paging
(at least the performance would not be great behind a single lock),
but we can cross that bridge when we get there.
On Fri, Jan 20, 2023, David Matlack wrote:
> On Thu, Jan 19, 2023 at 4:16 PM Sean Christopherson <[email protected]> wrote:
> >
> > On Thu, Jan 19, 2023, Huang, Kai wrote:
> > > On Thu, 2023-01-19 at 21:36 +0000, Sean Christopherson wrote:
> > > > The least invasive idea I have is expand the TDP MMU's concept of "frozen" SPTEs
> > > > and freeze (a.k.a. lock) the SPTE (KVM's mirror) until the corresponding S-EPT
> > > > update completes.
Doh, the proposed patches already do a freeze+unfreeze. Sorry, I obviously didn't
read the most recent changelog and haven't looked at the code in a few versions.
> > > This will introduce another "having-to-wait while SPTE is frozen" problem I
> > > think, which IIUC means (one way is) you have to do some loop and retry, perhaps
> > > similar to yield_safe.
> >
> > Yes, but because the TDP MMU already freezes SPTEs (just for a shorter duration),
> > I'm 99% sure all of the affected flows already know how to yield/bail when necessary.
> >
> > The problem with the zero-step mitigation is that it could (theoretically) cause
> > a "busy" error on literally any accesses, which makes it infeasible for KVM to have
> > sane behavior. E.g. freezing SPTEs to avoid the ordering issues isn't necessary
> > when holding mmu_lock for write, whereas the zero-step madness brings everything
> > into play.
>
> (I'm still ramping up on TDX so apologies in advance if the following
> is totally off base.)
>
> The complexity, and to a lesser extent the memory overhead, of
> mirroring Secure EPT tables with the TDP MMU makes me wonder if it is
> really worth it. Especially since the initial TDX support has so many
> constraints that would seem to allow a simpler implementation: all
> private memory is pinned, no live migration support, no test/clear
> young notifiers, etc.
>
> For the initial version of KVM TDX support, what if we implemented the
> Secure EPT management entirely off to the side? i.e. Not on top of the
> TDP MMU. For example, write TDX-specific routines for:
>
> - Fully populating the Secure EPT tree some time during VM creation.
Fully populating S-EPT is likely not a viable option for performance reasons. The
number of SEAMCALLs needed for a large VM would be quite enormous, and to avoid
faults entirely the backing memory would also need to be pre-allocated.
> - Tearing down the Secure EPT tree during VM destruction.
> - Support for unmapping/mapping specific regions of the Secure EPT
> tree for private<->shared conversions.
>
> With that in place, KVM never would need to handle a fault on a Secure
> EPT mapping. Any fault (e.g. due to an in-progress private<->shared
> conversion) can just return back to the guest to retry the memory
> access until the operation is complete.
I don't think this will work. Or at least, it's not that simple. TDX and SNP
both require the host to support implicit conversions from shared => private,
i.e. KVM needs to be able to handle faults on private memory at any time. KVM
could rely on the attributes xarray to know if KVM should resume the guest or
exit to userspace, but at some point KVM needs to know whether or not an entry has
been installed. We could work around that by adding a hook to "set attributes"
to populate S-EPT entries, but I'm not sure that would yield a simpler implementation
without sacrificing performance.
Thinking more about this, I do believe there is a simpler approach than freezing
S-EPT entries. If we tweak the TDP MMU rules for TDX to require mmu_lock be held
for write when overwriting a present SPTE (with a live VM), then the problematic
cases go away. As you point out, all of the restrictions on TDX private memory
means we're most of the way there.
Explicit unmap flows already take mmu_lock for write, zapping/splitting for dirty
logging isn't a thing (yet), and the S-EPT root is fixed, i.e. kvm_tdp_mmu_put_root()
should never free S-EPT tables until the VM is dead.
In other words, the only problematic flow is kvm_tdp_mmu_map(). Specifically,
KVM will overwrite a present SPTE only to demote/promote an existing entry, i.e.
to split a hugepage or collapse into a hugepage. And I believe it's not too
difficult to ensure KVM never does in-place promotion/demotion:
- Private memory doesn't rely on host userspace page tables, i.e. KVM won't
try to promote to a hugepage because a hugepage was created in the host.
- Private memory doesn't (yet) support page migration, so again KVM won't try
to promote because a hugepage is suddenly possible.
- Shadow paging isn't compatible, i.e. disallow_lpage won't change dynamically
except for when there are mixed attributes, and toggling private/shared in
the attributes requires zapping with mmu_lock held for write.
- The idiotic non-coherent DMA + guest MTRR interaction can be disallowed (which
I assume is already the case for TDX).
- Disallow toggling the NX hugepage mitigation when TDX is allowed.
Races to install leaf SPTEs will yield a "loser" either on the spurious check in
tdp_mmu_map_handle_target_level():
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
or on the CMPXCHG in tdp_mmu_set_spte_atomic().
So I _think_ the race I described can't actually happen in practice, and if it
does happen, then we have a bug somewhere else. Though that raises the question
of why the proposed patches do the freeze+unfreeze. If it's just the VM destroy
case, then that should be easy enough to special case.
Intel folks,
Do you happen to know exactly what scenario prompted adding the freeze+unfreeze
code? Is there something I'm forgetting/missing, or is it possible we can go
with a simpler implementation?
>
>
> Intel folks,
>
> Do you happen to know exactly what scenario prompted adding the freeze+unfreeze
> code? Is there something I'm forgetting/missing, or is it possible we can go
> with a simpler implementation?
It's documented in the "TDX TDP MMU design doc" patch:
+TDX concurrent populating
+-------------------------
......
+
+Without freezing the entry, the following race can happen. Suppose two vcpus
+are faulting on the same GPA and the 2M and 4K level entries aren't populated
+yet.
+
+* vcpu 1: update 2M level EPT entry
+* vcpu 2: update 4K level EPT entry
+* vcpu 2: TDX SEAMCALL to update 4K secure EPT entry => error
+* vcpu 1: TDX SEAMCALL to update 2M secure EPT entry
+
( I guess such material will be more useful in the comment. And perhaps we can
get rid of the "TDX TDP MMU design doc" patch in this series at least for now as
probably nobody will look at it :-) )
On Mon, Jan 23, 2023, Huang, Kai wrote:
> >
> >
> > Intel folks,
> >
> > Do you happen to know exactly what scenario prompted adding the freeze+unfreeze
> > code? Is there something I'm forgetting/missing, or is it possible we can go
> > with a simpler implementation?
>
> It's documented in the "TDX TDP MMU design doc" patch:
>
> +TDX concurrent populating
> +-------------------------
> ......
> +
> +Without freezing the entry, the following race can happen. Suppose two vcpus
> +are faulting on the same GPA and the 2M and 4K level entries aren't populated
> +yet.
> +
> +* vcpu 1: update 2M level EPT entry
> +* vcpu 2: update 4K level EPT entry
> +* vcpu 2: TDX SEAMCALL to update 4K secure EPT entry => error
> +* vcpu 1: TDX SEAMCALL to update 2M secure EPT entry
Ooh, the problem isn't that two SEAMCALLs to the same entry get out of order, it's
that SEAMCALLs operating on child entries can race ahead of the parent. Hrm.
TDX also has the annoying-but-understandable restriction that leafs need to be
removed before parents. A not-yet-relevant complication on that front is that the
TDP MMU's behavior of recursively removing children means we also have to worry
about PRESENT => !PRESENT transitions, e.g. zapping a child because the parent is
being removed can race with a different vCPU trying to populate the child (because
the vCPU handling a page fault could have seen the PRESENT parent).
I think there's an opportunity and motivation to improve the TDP MMU as a whole on
this front though. Rather than recursively zap children in handle_removed_pt(),
we can use the RCU callback to queue the page table for removal. Setting the parent
(target page table) !PRESENT and flushing the TLBs ensures that all children are
unreachable, i.e. KVM doesn't need to immediately set children !PRESENT. Unlike
the shadow MMU, which maintains a hash table of shadow pages, once a parent page
table is removed from the TDP MMU, its children are unreachable.
The RCU callback must run in near-constant time, but that's easy to solve as we
already have a workqueue for zapping page tables, i.e. the RCU callback can simply
add the target page to the zap workqueue. That would also allow for a (very minor)
simplification of other TDP MMU code: tdp_mmu_zap_root() wouldn't need to zap in
two passes since zapping children of the top-level SPTEs would be deferred to the
workqueue.
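A rough sketch of that idea (helper names are hypothetical): the RCU callback does
nothing but hand the now-unreachable page table to the existing zap workqueue:

	static void tdp_mmu_prune_pt_rcu_cb(struct rcu_head *head)
	{
		struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
						       rcu_head);

		/* Near-constant time: just queue, the actual zap runs in the workqueue. */
		tdp_mmu_queue_sp_for_zap(sp);	/* hypothetical helper */
	}

	static void tdp_mmu_prune_pt(struct kvm_mmu_page *sp)
	{
		/* The parent SPTE is already !PRESENT and TLBs have been flushed. */
		call_rcu(&sp->rcu_head, tdp_mmu_prune_pt_rcu_cb);
	}

The heavy lifting, i.e. actually walking and zapping the children, then happens in
the workqueue with what amounts to exclusive access to the pruned branch.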
Back to TDX, to play nice with the restriction that parents are removed only after
children are removed, I believe KVM can use TDH.MEM.RANGE.BLOCK to make the parent
!PRESENT. That will effectively prune the S-EPT entry and all its children, and
the RCU callback will again ensure all in-flight SEAMCALLs for the children complete
before KVM actually tries to zap the children.
And if we rework zapping page tables, I suspect we can also address David's concern
(and my not-yet-voiced concern) about polluting the TDP MMU code with logic that is
necessary only for S-EPT (freezing SPTEs before populating them). Rather than update
S-EPT _after_ the TDP MMU SPTE, do the S-EPT update first, i.e. invoke the KVM TDX
hook before try_cmpxchg64() (or maybe instead of?). That way KVM TDX can freeze the
to-be-installed SPTE without common TDP MMU needing to be aware of the change.
> ( I guess such material will be more useful in the comment. And perhaps we can
> get rid of the "TDX TDP MMU design doc" patch in this series at least for now as
> probably nobody will look at it :-) )
Please keep the design doc, I'll definitely read it. I'm just chasing too many
things at the moment and haven't given the TDX series a proper review, i.e. haven't
even glanced through all of the patches or even the shortlog.
On Mon, 2023-01-23 at 17:41 +0000, Sean Christopherson wrote:
> On Mon, Jan 23, 2023, Huang, Kai wrote:
> > >
> > >
> > > Intel folks,
> > >
> > > Do you happen to know exactly what scenario prompted adding the freeze+unfreeze
> > > code? Is there something I'm forgetting/missing, or is it possible we can go
> > > with a simpler implementation?
> >
> > It's documented in the "TDX TDP MMU design doc" patch:
> >
> > +TDX concurrent populating
> > +-------------------------
> > ......
> > +
> > +Without freezing the entry, the following race can happen. Suppose two vcpus
> > +are faulting on the same GPA and the 2M and 4K level entries aren't populated
> > +yet.
> > +
> > +* vcpu 1: update 2M level EPT entry
> > +* vcpu 2: update 4K level EPT entry
> > +* vcpu 2: TDX SEAMCALL to update 4K secure EPT entry => error
> > +* vcpu 1: TDX SEAMCALL to update 2M secure EPT entry
>
> Ooh, the problem isn't that two SEAMCALLs to the same entry get out of order, it's
> that SEAMCALLs operating on child entries can race ahead of the parent. Hrm.
>
> TDX also has the annoying-but-understandable restriction that leafs need to be
> removed before parents. A not-yet-relevant complication on that front is that the
> TDP MMU's behavior of recursively removing children means we also have to worry
> about PRESENT => !PRESENT transitions, e.g. zapping a child because the parent is
> being removed can race with a different vCPU try to populate the child (because
> the vCPU handling a page fault could have seen the PRESENT parent).
Yes.
>
> I think there's an opportunity and motivation to improve the TDP MMU as a whole on
> this front though. Rather than recursively zap children in handle_removed_pt(),
> we can use the RCU callback to queue the page table for removal. Setting the parent
> (target page table) !PRESENT and flushing the TLBs ensures that all children are
> unreachable, i.e. KVM doesn't need to immediately set children !PRESENT. Unlike
> the shadow MMU, which maintains a hash table of shadow pages, once a parent page
> table is removed from the TDP MMU, its children are unreachable.
Do you mean something like (pseudo):
rcu_callback(&removed_sp->rcu_head, handle_removed_pt);
>
> The RCU callback must run in near-constant time, but that's easy to solve as we
> already have a workqueue for zapping page tables, i.e. the RCU callback can simply
> add the target page to the zap workqueue. That would also allow for a (very minor)
> simplification of other TDP MMU code: tdp_mmu_zap_root() wouldn't need to zap in
> two passes since zapping children of the top-level SPTEs would be deferred to the
> workqueue.
Do you mean zapping the entire page table (from the root) doesn't need to be in an RCU
read-side critical section, but can/should be done after a grace period? I think this
makes sense, since zapping an entire root must happen when the root is already invalid,
and an invalid root cannot be used anymore when new faults come in?
>
> Back to TDX, to play nice with the restriction that parents are removed only after
> children are removed, I believe KVM can use TDH.MEM.RANGE.BLOCK to make the parent
> !PRESENT. That will effectively prune the S-EPT entry and all its children, and
> the RCU callback will again ensure all in-flight SEAMCALLs for the children complete
> before KVM actually tries to zap the children.
Reading the spec, it seems TDH.MEM.RANGE.BLOCK only marks the Secure EPT entry
that covers the entire range as "blocked", but won't walk down to the leaves to
mark all EPT entries as "blocked", which makes sense anyway.
But it seems TDH.MEM.PAGE.REMOVE and TDH.MEM.SEPT.REMOVE both only check
whether the target EPT entry is "blocked", and don't check whether any parent
has been marked as "blocked". Not sure whether this will be a problem, but if
it is, we can perhaps get the TDX module to fix it.
>
> And if we rework zapping page tables, I suspect we can also address David's concern
> (and my not-yet-voiced concern) about polluting the TDP MMU code with logic that is
> necessary only for S-EPT (freezing SPTEs before populating them). Rather than update
> S-EPT _after_ the TDP MMU SPTE, do the S-EPT update first, i.e. invoke the KVM TDX
> hook before try_cmpxchg64() (or maybe instead of?). That way KVM TDX can freeze the
> to-be-installed SPTE without common TDP MMU needing to be aware of the change.
I don't quite understand how putting the SEAMCALL before the try_cmpxchg64() can
work. Let's say one thread is populating a mapping and another is zapping it.
The populating thread makes the SEAMCALL successfully but then its try_cmpxchg64()
fails; in this case, how do we proceed?
>
> > ( I guess such material will be more useful in the comment. And perhaps we can
> > get rid of the "TDX TDP MMU design doc" patch in this series at least for now as
> > probably nobody will look at it :-) )
>
> Please keep the design doc, I'll definitely read it. I'm just chasing too many
> things at the moment and haven't given the TDX series a proper review, i.e. haven't
> even glanced through all of the patches or even the shortlog.
On Thu, Jan 26, 2023, Huang, Kai wrote:
> On Mon, 2023-01-23 at 17:41 +0000, Sean Christopherson wrote:
> > I think there's an opportunity and motivation to improve the TDP MMU as a whole on
> > this front though. Rather than recursively zap children in handle_removed_pt(),
> > we can use the RCU callback to queue the page table for removal. Setting the parent
> > (target page table) !PRESENT and flushing the TLBs ensures that all children are
> > unreachable, i.e. KVM doesn't need to immediately set children !PRESENT. Unlike
> > the shadow MMU, which maintains a hash table of shadow pages, once a parent page
> > table is removed from the TDP MMU, its children are unreachable.
>
> Do you mean something like (pseudo):
>
> call_rcu(&removed_sp->rcu_head, handle_removed_pt);
Yep.
> > The RCU callback must run in near-constant time, but that's easy to solve as we
> > already have a workqueue for zapping page tables, i.e. the RCU callback can simply
> > add the target page to the zap workqueue. That would also allow for a (very minor)
> > simplification of other TDP MMU code: tdp_mmu_zap_root() wouldn't need to zap in
> > two passes since zapping children of the top-level SPTEs would be deferred to the
> > workqueue.
>
> Do you mean zapping the entire page table (from the root) doesn't need to be done
> in an RCU read-side critical section, but can/should be done after a grace period?
> I think this makes sense, since zapping an entire root must happen when the root
> is already invalid, and an invalid root cannot be used once new faults come in.
Yes, minus the "from root" restriction. When a page table (call it "branch" to
continue the analogy) PTE has been zapped/blocked ("pruned"), KVM just needs to
wait for all potential readers to go away. That guarantee is provided by RCU;
software walkers, i.e. KVM itself, are required to hold RCU, and hardware walkers,
i.e. vCPUs running in the guest, are protected by proxy as the zapper ("arborist"?)
is required to hold RCU until all running vCPUs have been kicked.
In other words, once the PTE is zapped/blocked (branch is pruned), it's completely
removed from the paging tree and no other tasks can access the branch (page table
and its children). I.e. the only remaining reference to the branch is the pointer
handed to the RCU callback. That means the RCU callback has exclusive access to the
branch, i.e. can operate as if it were holding mmu_lock for write. Furthermore, the
RCU callback also doesn't need to flush TLBs because that was again done when
pruning the branch.
It's the same idea that KVM already uses for root SPs, the only difference is how
KVM determines that there is exactly one entity that holds a reference to the SP.
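A rough sketch of that "prune, flush, then hand the branch to RCU" sequence,
building on the hypothetical pruned_sp/pruned_sp_rcu_cb from the earlier sketch;
prune_branch() and its parameters are likewise made up, with try_cmpxchg64(),
REMOVED_SPTE, kvm_flush_remote_tlbs() and call_rcu() assumed to be the existing
primitives:

/*
 * Illustrative only: prune a branch by atomically zapping the parent SPTE,
 * flush TLBs so hardware walkers lose any cached translations, then let RCU
 * guarantee that software walkers are done before the branch is torn down.
 */
static bool prune_branch(struct kvm *kvm, u64 *parent_sptep, u64 old_spte,
			 struct pruned_sp *sp)
{
	/* Make the whole branch unreachable with a single atomic update. */
	if (!try_cmpxchg64(parent_sptep, &old_spte, REMOVED_SPTE))
		return false;	/* lost a race; the caller decides how to retry */

	/* No new translations can be created; drop any existing ones. */
	kvm_flush_remote_tlbs(kvm);

	/*
	 * After a grace period the callback has exclusive access to the
	 * branch and doesn't need to flush TLBs again.
	 */
	call_rcu(&sp->rcu_head, pruned_sp_rcu_cb);
	return true;
}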
> > Back to TDX, to play nice with the restriction that parents are removed only after
> > children are removed, I believe KVM can use TDH.MEM.RANGE.BLOCK to make the parent
> > !PRESENT. That will effectively prune the S-EPT entry and all its children, and
> > the RCU callback will again ensure all in-flight SEAMCALLs for the children complete
> > before KVM actually tries to zap the children.
>
> Reading the spec, it seems TDH.MEM.RANGE.BLOCK only marks the Secure EPT entry
> that points to the entire range as "blocked"; it doesn't walk down to the leaves
> to mark every EPT entry underneath as "blocked", which makes sense anyway.
>
> But it seems TDH.MEM.PAGE.REMOVE and TDH.MEM.SEPT.REMOVE both only check
> whether the target EPT entry is "blocked", and don't check whether any parent
> has been marked as "blocked". Not sure whether this will be a problem, but if
> it is, we can perhaps get the TDX module to fix it.
Oh, I didn't mean to suggest KVM skip TDH.MEM.RANGE.BLOCK for children, I simply
forgot that all S-EPT entries need to be blocked before they can be removed.
> > And if we rework zapping page tables, I suspect we can also address David's concern
> > (and my not-yet-voiced concern) about polluting the TDP MMU code with logic that is
> > necessary only for S-EPT (freezing SPTEs before populating them). Rather than update
> > S-EPT _after_ the TDP MMU SPTE, do the S-EPT update first, i.e. invoke the KVM TDX
> > hook before try_cmpxchg64() (or maybe instead of?). That way KVM TDX can freeze the
> > to-be-installed SPTE without common TDP MMU needing to be aware of the change.
>
> I don't quite understand how putting the SEAMCALL before the try_cmpxchg64() can
> work. Let's say one thread is populating a mapping and another is zapping it.
> If the populating thread's SEAMCALL succeeds but its try_cmpxchg64() then fails,
> how do we proceed?
Ah, sorry, that was unclear. By "invoke the KVM TDX hook" I didn't mean "do the
SEAMCALL", I meant KVM TDX could do its own manipulation of the KVM-managed SPTEs
before the common/standard flow. E.g. something like:
if (kvm_x86_ops.set_private_spte && private)
	r = static_call(kvm_x86_set_private_spte)(...);
else
	r = try_cmpxchg64(...) ? 0 : -EBUSY;
so that the common code doesn't need to do, or even be aware of, the freezing.
Then I think we just need another hook in handle_removed_pt(), or maybe in what
is currently __kvm_tdp_mmu_write_spte()?
I.e. fully replace the "write" operations in the TDP MMU instead of trying to
smush S-EPT's requirements into the common path.
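Fleshed out a little, that dispatch might look something like the sketch below.
To be clear, set_private_spte as a kvm_x86_ops member is the hypothetical hook
being discussed, and the function name and is_private_sptep() are made up here;
only try_cmpxchg64(), static_call() and the tdp_iter fields are existing code:

/*
 * Simplified sketch of the SPTE-install path with a private-SPTE hook; the
 * hook, not the common TDP MMU code, would then be responsible for freezing
 * the SPTE and issuing the SEAMCALL that updates the S-EPT.
 */
static int tdp_mmu_install_spte(struct kvm *kvm, struct tdp_iter *iter,
				u64 new_spte)
{
	if (kvm_x86_ops.set_private_spte && is_private_sptep(iter->sptep))
		return static_call(kvm_x86_set_private_spte)(kvm, iter, new_spte);

	/* The common path stays a plain compare-and-exchange. */
	if (!try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte,
			   new_spte))
		return -EBUSY;

	return 0;
}

A similar hook on the teardown side, in handle_removed_pt() or
__kvm_tdp_mmu_write_spte(), would then cover the removal case mentioned above.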
On Thu, 2023-01-26 at 17:28 +0000, Sean Christopherson wrote:
> On Thu, Jan 26, 2023, Huang, Kai wrote:
> > On Mon, 2023-01-23 at 17:41 +0000, Sean Christopherson wrote:
> > > I think there's an opportunity and motivation to improve the TDP MMU as a whole on
> > > this front though. Rather than recursively zap children in handle_removed_pt(),
> > > we can use the RCU callback to queue the page table for removal. Setting the parent
> > > (target page table) !PRESENT and flushing the TLBs ensures that all children are
> > > unreachable, i.e. KVM doesn't need to immediately set children !PRESENT. Unlike
> > > the shadow MMU, which maintains a hash table of shadow pages, once a parent page
> > > table is removed from the TDP MMU, its children are unreachable.
> >
> > Do you mean something like (pseudo):
> >
> > call_rcu(&removed_sp->rcu_head, handle_removed_pt);
>
> Yep.
>
> > > The RCU callback must run in near-constant time, but that's easy to solve as we
> > > already have a workqueue for zapping page tables, i.e. the RCU callback can simply
> > > add the target page to the zap workqueue. That would also allow for a (very minor)
> > > simplification of other TDP MMU code: tdp_mmu_zap_root() wouldn't need to zap in
> > > two passes since zapping children of the top-level SPTEs would be deferred to the
> > > workqueue.
> >
> > Do you mean zapping the entire page table (from the root) doesn't need to be done
> > in an RCU read-side critical section, but can/should be done after a grace period?
> > I think this makes sense, since zapping an entire root must happen when the root
> > is already invalid, and an invalid root cannot be used once new faults come in.
>
> Yes, minus the "from root" restriction. When a page table (call it "branch" to
> continue the analogy) PTE has been zapped/blocked ("pruned"), KVM just needs to
> wait for all potential readers to go away. That guarantee is provided by RCU;
> software walkers, i.e. KVM itself, are required to hold RCU, and hardware walkers,
> i.e. vCPUs running in the guest, are protected by proxy as the zapper ("arborist"?)
> is required to hold RCU until all running vCPUs have been kicked.
Yes.
>
> In other words, once the PTE is zapped/blocked (branch is pruned), it's completely
> removed from the paging tree and no other tasks can access the branch (page table
> and its children). I.e. the only remaining reference to the branch is the pointer
> handed to the RCU callback. That means the RCU callback has exclusive access to the
> branch, i.e. can operate as if it were holding mmu_lock for write. Furthermore, the
> RCU callback also doesn't need to flush TLBs because that was again done when
> pruning the branch.
>
> It's the same idea that KVM already uses for root SPs, the only difference is how
> KVM determines that there is exactly one entity that holds a reference to the SP.
Right. This works fine for the normal non-TDX case. For TDX, unfortunately,
access to the removed branch (the removed sub-page-table) isn't truly
"exclusive": the SEAMCALL that actually zaps the branch still needs to hold
the write lock on the entire Secure EPT tree, so it can still conflict with
other threads handling new faults.
This is a problem we need to deal with anyway.
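For completeness, a sketch of how callers might have to cope with that
contention, assuming the TDX module reports the conflict as TDX_OPERAND_BUSY
(that status-code name comes from the TDX module spec; sept_seamcall() and
SEPT_BUSY_MAX_RETRIES are purely illustrative):

/* Hypothetical wrapper around whichever S-EPT SEAMCALL is being issued. */
struct kvm_tdx;
u64 sept_seamcall(struct kvm_tdx *kvm_tdx, gpa_t gpa, int level);

#define SEPT_BUSY_MAX_RETRIES	16

static int sept_seamcall_retry(struct kvm_tdx *kvm_tdx, gpa_t gpa, int level)
{
	int retries = 0;
	u64 err;

	do {
		err = sept_seamcall(kvm_tdx, gpa, level);
		if (err != TDX_OPERAND_BUSY)	/* real code would mask status details */
			break;
		/* Another thread holds the Secure EPT lock; back off and retry. */
		cpu_relax();
	} while (++retries < SEPT_BUSY_MAX_RETRIES);

	return err ? -EIO : 0;	/* collapse any remaining error for the sketch */
}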
>
> > > Back to TDX, to play nice with the restriction that parents are removed only after
> > > children are removed, I believe KVM can use TDH.MEM.RANGE.BLOCK to make the parent
> > > !PRESENT. That will effectively prune the S-EPT entry and all its children, and
> > > the RCU callback will again ensure all in-flight SEAMCALLs for the children complete
> > > before KVM actually tries to zap the children.
> >
> > Reading the spec, it seems TDH.MEM.RANGE.BLOCK only marks the Secure EPT entry
> > that points to the entire range as "blocked"; it doesn't walk down to the leaves
> > to mark every EPT entry underneath as "blocked", which makes sense anyway.
> >
> > But it seems TDH.MEM.PAGE.REMOVE and TDH.MEM.SEPT.REMOVE both only check
> > whether the target EPT entry is "blocked", and don't check whether any parent
> > has been marked as "blocked". Not sure whether this will be a problem, but if
> > it is, we can perhaps get the TDX module to fix it.
>
> Oh, I didn't mean to suggest KVM skip TDH.MEM.RANGE.BLOCK for children, I simply
> forgot that all S-EPT entries need to be blocked before they can be removed.
>
> > > And if we rework zapping page tables, I suspect we can also address David's concern
> > > (and my not-yet-voiced concern) about polluting the TDP MMU code with logic that is
> > > necessary only for S-EPT (freezing SPTEs before populating them). Rather than update
> > > S-EPT _after_ the TDP MMU SPTE, do the S-EPT update first, i.e. invoke the KVM TDX
> > > hook before try_cmpxchg64() (or maybe instead of?). That way KVM TDX can freeze the
> > > to-be-installed SPTE without common TDP MMU needing to be aware of the change.
> >
> > I don't quite understand how putting the SEAMCALL before the try_cmpxchg64() can
> > work. Let's say one thread is populating a mapping and another is zapping it.
> > If the populating thread's SEAMCALL succeeds but its try_cmpxchg64() then fails,
> > how do we proceed?
>
> Ah, sorry, that was unclear. By "invoke the KVM TDX hook" I didn't mean "do the
> SEAMCALL", I meant KVM TDX could do its own manipulation of the KVM-managed SPTEs
> before the common/standard flow. E.g. something like:
>
> if (kvm_x86_ops.set_private_spte && private)
> 	r = static_call(kvm_x86_set_private_spte)(...);
> else
> 	r = try_cmpxchg64(...) ? 0 : -EBUSY;
>
> so that the common code doesn't need to do, or even be aware of, the freezing.
> Then I think we just need another hook in handle_removed_pt(), or maybe in what
> is currently __kvm_tdp_mmu_write_spte()?
>
> I.e. fully replace the "write" operations in the TDP MMU instead of trying to
> smush S-EPT's requirements into the common path.
I have no preference, but this looks better to me given the benefits you
mentioned above. :)
On Thu, Jan 26, 2023, Huang, Kai wrote:
> On Thu, 2023-01-26 at 17:28 +0000, Sean Christopherson wrote:
> > In other words, once the PTE is zapped/blocked (branch is pruned), it's completely
> > removed from the paging tree and no other tasks can access the branch (page table
> > and its children). I.e. the only remaining reference to the branch is the pointer
> > handed to the RCU callback. That means the RCU callback has exclusive access to the
> > branch, i.e. can operate as if it were holding mmu_lock for write. Furthermore, the
> > RCU callback also doesn't need to flush TLBs because that was again done when
> > pruning the branch.
> >
> > It's the same idea that KVM already uses for root SPs, the only difference is how
> > KVM determines that there is exactly one entity that holds a reference to the SP.
>
> Right. This works fine for the normal non-TDX case. For TDX, unfortunately,
> access to the removed branch (the removed sub-page-table) isn't truly
> "exclusive": the SEAMCALL that actually zaps the branch still needs to hold
> the write lock on the entire Secure EPT tree, so it can still conflict with
> other threads handling new faults.
I thought TDX was smart enough to read-lock only the part of the tree that it's
actually consuming, and write-lock only the part of the tree that it's actually
modifying?
Hrm, but even if TDX takes a read-lock, there's still the problem of it needing
to walk the upper levels, i.e. KVM needs to keep mid-level page tables reachable
until they're fully removed. Blech. That should be a non-issue at this time
though, as I don't think KVM will ever REMOVE a page table of a live guest. I
need to look at the PROMOTE/DEMOTE flows...
On Thu, 2023-01-26 at 21:59 +0000, Sean Christopherson wrote:
> On Thu, Jan 26, 2023, Huang, Kai wrote:
> > On Thu, 2023-01-26 at 17:28 +0000, Sean Christopherson wrote:
> > > In other words, once the PTE is zapped/blocked (branch is pruned), it's completely
> > > removed from the paging tree and no other tasks can access the branch (page table
> > > and its children). I.e. the only remaining reference to the branch is the pointer
> > > handed to the RCU callback. That means the RCU callback has exclusive access to the
> > > branch, i.e. can operate as if it were holding mmu_lock for write. Furthermore, the
> > > RCU callback also doesn't need to flush TLBs because that was again done when
> > > pruning the branch.
> > >
> > > It's the same idea that KVM already uses for root SPs, the only difference is how
> > > KVM determines that there is exactly one entity that holds a reference to the SP.
> >
> > Right. This works fine for the normal non-TDX case. For TDX, unfortunately,
> > access to the removed branch (the removed sub-page-table) isn't truly
> > "exclusive": the SEAMCALL that actually zaps the branch still needs to hold
> > the write lock on the entire Secure EPT tree, so it can still conflict with
> > other threads handling new faults.
>
> I thought TDX was smart enough to read-lock only the part of the tree that it's
> actually consuming, and write-lock only the part of the tree that it's actually
> modifying?
The spec says there's only exclusive/shared access to the "whole Secure EPT
tree":
8.6. Secure EPT Concurrency
Secure EPT concurrency rules are designed to support the expected usage and yet
be as simple as possible.
Host-Side (SEAMCALL) Functions:
• Functions that manage Secure EPT acquire exclusive access to the whole Secure
EPT tree of the target TD.
• In specific cases where a Secure EPT entry update may collide with a
concurrent update done by the guest TD, such host-side functions update the
Secure EPT entry as a transaction, using atomic compare and exchange operations.
• TDH.MEM.SEPT.RD acquire shared access to the whole Secure EPT tree of the
target TD to help prevent changes to the tree while they execute.
• Other functions that only read Secure EPT for GPA-to-HPA translation (e.g.,
TDH.MR.EXTEND) acquire shared access to the whole Secure EPT tree of the target
TD to help prevent changes to the tree while they execute.
>
> Hrm, but even if TDX takes a read-lock, there's still the problem of it needing
> to walk the upper levels, i.e. KVM needs to keep mid-level page tables reachable
> until they're fully removed. Blech. That should be a non-issue at this time
> though, as I don't think KVM will ever REMOVE a page table of a live guest. I
> need to look at the PROMOTE/DEMOTE flows...
In this series, if I read correctly, when a slot is removed/moved, private
mappings are zapped too. It's kinda buried in:
[PATCH v11 043/113] KVM: x86/tdp_mmu: Don't zap private pages for unsupported
cases
(it's not easy to find -- I had to use 'git blame' in the actual repo to find
the commit, sigh.)
On Thu, Jan 26, 2023, Huang, Kai wrote:
> On Thu, 2023-01-26 at 21:59 +0000, Sean Christopherson wrote:
> > Hrm, but even if TDX takes a read-lock, there's still the problem of it needing
> > to walk the upper levels, i.e. KVM needs to keep mid-level page tables reachable
> > until they're fully removed. Blech. That should be a non-issue at this time
> > though, as I don't think KVM will ever REMOVE a page table of a live guest. I
> > need to look at the PROMOTE/DEMOTE flows...
>
> In this series, if I read correctly, when a slot is removed/moved, private
> mappings are zapped too. It's kinda buried in:
>
> [PATCH v11 043/113] KVM: x86/tdp_mmu: Don't zap private pages for unsupported
> cases
That should be ok, only leaf SPTEs will be zapped in that case.