Received-SPF: pass (google.com: domain of linux-kernel+bounces-108440-linux.lists.archive=gmail.com@vger.kernel.org designates 147.75.80.249 as permitted sender) client-ip=147.75.80.249;
Date: Wed, 20 Mar 2024 13:12:01 +0800
From: Chao Gao <chao.gao@intel.com>
To: <isaku.yamahata@intel.com>
CC: <kvm@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<isaku.yamahata@gmail.com>, Paolo Bonzini <pbonzini@redhat.com>,
	<erdemaktas@google.com>, Sean Christopherson <seanjc@google.com>, Sagi Shahar
	<sagis@google.com>, Kai Huang <kai.huang@intel.com>, <chen.bo@intel.com>,
	<hang.yuan@intel.com>, <tina.zhang@intel.com>, Sean Christopherson
	<sean.j.christopherson@intel.com>
Subject: Re: [PATCH v19 038/130] KVM: TDX: create/destroy VM structure
Message-ID: <ZfpwIespKy8qxWWE@chao-email>
References: <cover.1708933498.git.isaku.yamahata@intel.com>
 <7a508f88e8c8b5199da85b7a9959882ddf390796.1708933498.git.isaku.yamahata@intel.com>
Content-Type: text/plain; charset="us-ascii"
Content-Disposition: inline
In-Reply-To: <7a508f88e8c8b5199da85b7a9959882ddf390796.1708933498.git.isaku.yamahata@intel.com>
Precedence: bulk
MIME-Version: 1.0
X-MS-Exchange-AntiSpam-MessageData-ChunkCount: 1
X-MS-Exchange-AntiSpam-MessageData-0: =?us-ascii?Q?eWU+k/1MN1OVqSeGmzfhs5ucIRvnG9vqay1X21Dp57y0/m78AD/Koj7oOkbi?=
 =?us-ascii?Q?aRAJYNYFewOFpL47BCS0j/31huT4ziU1/OrJjLKUUs+me/YzURpxoYt1gubd?=
 =?us-ascii?Q?gGoCerHFmLnH1tFEA3pVnxkTEkzfr5GsfrOajeZSwTxbmGyPzpemwwEU2cG3?=
 =?us-ascii?Q?hvKF+b1DcaIt3htZWgvGfGQTlhy+geThaaJRxP6AqcP4aRltpwHnEz3zDEn7?=
 =?us-ascii?Q?rdF5fV9zMlhTSE93s47KNdOKWB2bZP6tSfQOz02HmAMc6bfgnmSWPkyV3IEF?=
 =?us-ascii?Q?z05pMaop7YLPdu5QR/FsvkZ+w4nArNdP63CTEZCbUiRuN0uwFf6BnF/aggv5?=
 =?us-ascii?Q?sPCARN9ihkJdFnEjyFaQ9QtWkDlfdjcmNurOTYbirg35Ywp3clc4grrSI23G?=
 =?us-ascii?Q?cmZmCXGu2lQj2DjHyJP7VryxFRSTYYwNZsshUMJtzxQ6/iVzkCFxr1EYOil0?=
 =?us-ascii?Q?wDI63CkZdLxOvgqtexhVay8UhMmARBOOMDuqMwhTl5ONtbyfqzm7H+Mvb6E4?=
 =?us-ascii?Q?66d8kJce+N/T2GEK9y86Sds0CivBwFmOy9likY3SaMzoCSO/hO5ug2YgAshd?=
 =?us-ascii?Q?qFKSxgaHi+tSO+WuiDfByO+PpYcWG6cMHXCEtoPP1EVSPj/+L3OviRtx/lzL?=
 =?us-ascii?Q?Gkusj8BYBEKYvgjuT4BJxEiQGNmGXimPtJr4om2bfbG19BCPBQITjnazpbdx?=
 =?us-ascii?Q?jT8/qfC0ahy1xa0vh/pB8ZaJRQsr6ZlNXu5LRWaH9bg3hdilar/+W2pS1zhK?=
 =?us-ascii?Q?i6CUyU/xVtR9pZ3k4ynaFmsvIfaywe49jic/QtfxCh4sFGLWEgJ2qlfjAAjW?=
 =?us-ascii?Q?S7Zu3XNPEoNKVzZ6samha+VDqFUs6eqVWS7PGqlxDacLmOl87NkT49qK9/WR?=
 =?us-ascii?Q?HFlfxxeww2fsFUkge5QU1lWYt+yeFVlaSyJhmoieLAutl2/WDAToi9xxmkdy?=
 =?us-ascii?Q?k2UmLKVGvu4dfB7sm5hmVRCljRJ4ILx05vMUduZga1tpQiLHPU6QzW04+8Wc?=
 =?us-ascii?Q?82XvQCCxxM/jnpRiuRLmMB2ANvmpODCt6nTFUyUFEKqsUdypuajg5bRDkggt?=
 =?us-ascii?Q?Hv3yZlsipETtOC/xD7alrQD5T5CaFL2R2vgHGYRZMO1+BRFZ7rpTEtjQaPBi?=
 =?us-ascii?Q?otY8+/yaeuQ5xZ1gqTnwofNj+bxEr9IdSrzwriMvL1bbpnWCZTLZvnhh/bsj?=
 =?us-ascii?Q?se+cZniw2D7+5BQwS0Hgr7pxtt7CYIGLBL3SPotRPsH+9+4bcgZx4UaBKSFY?=
 =?us-ascii?Q?6rOK8nnwdDxOMSpEaDCHv4fntnr/mytH8qpZh/VuTpSjXuSygxrJ81vcjJJ8?=
 =?us-ascii?Q?o1x2eNvqdl3ILsX+Fq0rwmr7nb8EIfzowm6OWUexfkFDfPx/69hI4nN6sAVc?=
 =?us-ascii?Q?d7C2GiecZPrPril1pSiPRGQKIyqaqkzZLAijgrwPE4XCPC+KFlVp8TSEWelP?=
 =?us-ascii?Q?3w3JfAk1eKqtLff1okYWJrq+ctesMESROanWTcDya3XxMFj9LqrgDIwnbL0k?=
 =?us-ascii?Q?BRZsvdL1OJKQ2knLqqiAEHuFaygUUnGBsDjQzQVdEEt/WyiNFWONr3tWMjZx?=
 =?us-ascii?Q?/693QRl9COp4l+UOOlMOAIbQQ4Mmwau29o7KDlcN?=
X-MS-Exchange-CrossTenant-Network-Message-Id: fa9ee92e-63d8-44e6-8516-08dc489c4c6b
X-MS-Exchange-CrossTenant-AuthSource: CH3PR11MB8660.namprd11.prod.outlook.com
X-MS-Exchange-CrossTenant-AuthAs: Internal
X-MS-Exchange-CrossTenant-OriginalArrivalTime: 20 Mar 2024 05:12:11.8663
 (UTC)
X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted
X-MS-Exchange-CrossTenant-Id: 46c98d88-e344-4ed4-8496-4ed7712e255d
X-MS-Exchange-CrossTenant-MailboxType: HOSTED
X-MS-Exchange-CrossTenant-UserPrincipalName: gLmR255f87aCu5+AXrwDnq5TpUJzZTWKiziJreyTe01hZeRLxd1WRNhHQ7/5mR3x4uwrXvVAb69aj3KFVQiDTQ==
X-MS-Exchange-Transport-CrossTenantHeadersStamped: IA1PR11MB6172
X-OriginatorOrg: intel.com

> config KVM_SW_PROTECTED_VM
> 	bool "Enable support for KVM software-protected VMs"
>-	depends on EXPERT
> 	depends on KVM && X86_64
> 	select KVM_GENERIC_PRIVATE_MEM
> 	help
>@@ -89,6 +88,8 @@ config KVM_SW_PROTECTED_VM
> config KVM_INTEL
> 	tristate "KVM for Intel (and compatible) processors support"
> 	depends on KVM && IA32_FEAT_CTL
>+	select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST

Why does KVM_INTEL select KVM_SW_PROTECTED_VM when INTEL_TDX_HOST is enabled?

>+	select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
> 	help
> 	.vcpu_precreate = vmx_vcpu_precreate,
> 	.vcpu_create = vmx_vcpu_create,

>--- a/arch/x86/kvm/vmx/tdx.c
>+++ b/arch/x86/kvm/vmx/tdx.c
>@@ -5,10 +5,11 @@
> 
> #include "capabilities.h"
> #include "x86_ops.h"
>-#include "x86.h"
> #include "mmu.h"
> #include "tdx_arch.h"
> #include "tdx.h"
>+#include "tdx_ops.h"
>+#include "x86.h"

any reason to reorder x86.h?

>+static void tdx_do_tdh_phymem_cache_wb(void *unused)
>+{
>+	u64 err = 0;
>+
>+	do {
>+		err = tdh_phymem_cache_wb(!!err);
>+	} while (err == TDX_INTERRUPTED_RESUMABLE);
>+
>+	/* Other thread may have done for us. */
>+	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
>+		err = TDX_SUCCESS;
>+	if (WARN_ON_ONCE(err))
>+		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
>+}
>+
>+void tdx_mmu_release_hkid(struct kvm *kvm)
>+{
>+	bool packages_allocated, targets_allocated;
>+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>+	cpumask_var_t packages, targets;
>+	u64 err;
>+	int i;
>+
>+	if (!is_hkid_assigned(kvm_tdx))
>+		return;
>+
>+	if (!is_td_created(kvm_tdx)) {
>+		tdx_hkid_free(kvm_tdx);
>+		return;
>+	}
>+
>+	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>+	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>+	cpus_read_lock();
>+
>+	/*
>+	 * We can destroy multiple guest TDs simultaneously.  Prevent
>+	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
>+	 */
>+	mutex_lock(&tdx_lock);
>+
>+	/*
>+	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
>+	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable.  Make the transition atomic
>+	 * to other functions to operate private pages and Secure-EPT pages.
>+	 *
>+	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
>+	 * This function is called via mmu notifier, mmu_release().
>+	 * kvm_gmem_release() is called via fput() on process exit.
>+	 */
>+	write_lock(&kvm->mmu_lock);
>+
>+	for_each_online_cpu(i) {
>+		if (packages_allocated &&
>+		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
>+					     packages))
>+			continue;
>+		if (targets_allocated)
>+			cpumask_set_cpu(i, targets);
>+	}
>+	if (targets_allocated)
>+		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
>+	else
>+		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);

This tries to flush the cache on all CPUs when we run out of memory. I am not
sure if it is the best solution. A simple solution is to just use two global
bitmaps.

Also, the current logic isn't optimal. E.g., if packages_allocated is true while
targets_allocated is false, then we fill in the packages bitmap but never use
it at all.

That said, I prefer to optimize the rare case in a separate patch. We can just use
two global bitmaps or let the flush fail here just as you are doing below on
seamcall failure.

>+	/*
>+	 * In the case of error in tdx_do_tdh_phymem_cache_wb(), the following
>+	 * tdh_mng_key_freeid() will fail.
>+	 */
>+	err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
>+	if (WARN_ON_ONCE(err)) {
>+		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>+		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>+		       kvm_tdx->hkid);
>+	} else
>+		tdx_hkid_free(kvm_tdx);

Curly braces are missing on the else branch; kernel coding style uses braces on
both branches when either branch needs them.

>+
>+	write_unlock(&kvm->mmu_lock);
>+	mutex_unlock(&tdx_lock);
>+	cpus_read_unlock();
>+	free_cpumask_var(targets);
>+	free_cpumask_var(packages);
>+}
>+

>+static int __tdx_td_init(struct kvm *kvm)
>+{
>+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>+	cpumask_var_t packages;
>+	unsigned long *tdcs_pa = NULL;
>+	unsigned long tdr_pa = 0;
>+	unsigned long va;
>+	int ret, i;
>+	u64 err;
>+
>+	ret = tdx_guest_keyid_alloc();
>+	if (ret < 0)
>+		return ret;
>+	kvm_tdx->hkid = ret;
>+
>+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+	if (!va)
>+		goto free_hkid;
>+	tdr_pa = __pa(va);
>+
>+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
>+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
>+	if (!tdcs_pa)
>+		goto free_tdr;
>+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+		if (!va)
>+			goto free_tdcs;
>+		tdcs_pa[i] = __pa(va);
>+	}
>+
>+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
>+		ret = -ENOMEM;
>+		goto free_tdcs;
>+	}
>+	cpus_read_lock();
>+	/*
>+	 * Need at least one CPU of the package to be online in order to
>+	 * program all packages for host key id.  Check it.
>+	 */
>+	for_each_present_cpu(i)
>+		cpumask_set_cpu(topology_physical_package_id(i), packages);
>+	for_each_online_cpu(i)
>+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
>+	if (!cpumask_empty(packages)) {
>+		ret = -EIO;
>+		/*
>+		 * Because it's hard for human operator to figure out the
>+		 * reason, warn it.
>+		 */
>+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
>+		pr_warn_ratelimited(MSG_ALLPKG);
>+		goto free_packages;
>+	}
>+
>+	/*
>+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
>+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
>+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
>+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
>+	 * caller to handle the contention.  This is because of time limitation
>+	 * usable inside the TDX module and OS/VMM knows better about process
>+	 * scheduling.
>+	 *
>+	 * APIs to acquire the lock of KOT:
>+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
>+	 * TDH.PHYMEM.CACHE.WB.
>+	 */
>+	mutex_lock(&tdx_lock);
>+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
>+	mutex_unlock(&tdx_lock);
>+	if (err == TDX_RND_NO_ENTROPY) {
>+		ret = -EAGAIN;
>+		goto free_packages;
>+	}
>+	if (WARN_ON_ONCE(err)) {
>+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
>+		ret = -EIO;
>+		goto free_packages;
>+	}
>+	kvm_tdx->tdr_pa = tdr_pa;
>+
>+	for_each_online_cpu(i) {
>+		int pkg = topology_physical_package_id(i);
>+
>+		if (cpumask_test_and_set_cpu(pkg, packages))
>+			continue;
>+
>+		/*
>+		 * Program the memory controller in the package with an
>+		 * encryption key associated to a TDX private host key id
>+		 * assigned to this TDR.  Concurrent operations on same memory
>+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
>+		 * mutex.
>+		 */
>+		mutex_lock(&tdx_mng_key_config_lock[pkg]);

The lock is superfluous to me. With the cpu lock held, even if multiple CPUs
try to create TDs, the same set of CPUs (the first online CPU of each package)
will be selected to configure the key because of the
cpumask_test_and_set_cpu() above. That means we never have two CPUs in the same
socket trying to program the key, i.e., no concurrent calls.

>+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
>+				      &kvm_tdx->tdr_pa, true);
>+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
>+		if (ret)
>+			break;
>+	}
>+	cpus_read_unlock();
>+	free_cpumask_var(packages);
>+	if (ret) {
>+		i = 0;
>+		goto teardown;
>+	}
>+
>+	kvm_tdx->tdcs_pa = tdcs_pa;
>+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
>+		if (err == TDX_RND_NO_ENTROPY) {
>+			/* Here it's hard to allow userspace to retry. */
>+			ret = -EBUSY;
>+			goto teardown;
>+		}
>+		if (WARN_ON_ONCE(err)) {
>+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
>+			ret = -EIO;
>+			goto teardown;
>+		}
>+	}
>+
>+	/*
>+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
>+	 * ioctl() to define the configure CPUID values for the TD.
>+	 */
>+	return 0;
>+
>+	/*
>+	 * The sequence for freeing resources from a partially initialized TD
>+	 * varies based on where in the initialization flow failure occurred.
>+	 * Simply use the full teardown and destroy, which naturally play nice
>+	 * with partial initialization.
>+	 */
>+teardown:
>+	for (; i < tdx_info->nr_tdcs_pages; i++) {
>+		if (tdcs_pa[i]) {
>+			free_page((unsigned long)__va(tdcs_pa[i]));
>+			tdcs_pa[i] = 0;
>+		}
>+	}
>+	if (!kvm_tdx->tdcs_pa)
>+		kfree(tdcs_pa);
>+	tdx_mmu_release_hkid(kvm);
>+	tdx_vm_free(kvm);
>+	return ret;
>+
>+free_packages:
>+	cpus_read_unlock();
>+	free_cpumask_var(packages);
>+free_tdcs:
>+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>+		if (tdcs_pa[i])
>+			free_page((unsigned long)__va(tdcs_pa[i]));
>+	}
>+	kfree(tdcs_pa);
>+	kvm_tdx->tdcs_pa = NULL;
>+
>+free_tdr:
>+	if (tdr_pa)
>+		free_page((unsigned long)__va(tdr_pa));
>+	kvm_tdx->tdr_pa = 0;
>+free_hkid:
>+	if (is_hkid_assigned(kvm_tdx))

IIUC, this is always true because you just return if keyid
allocation fails.

	>+	ret = tdx_guest_keyid_alloc();
	>+	if (ret < 0)
	>+		return ret;
	>+	kvm_tdx->hkid = ret;
	>+
	>+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
	>+	if (!va)
	>+		goto free_hkid;

>+		tdx_hkid_free(kvm_tdx);
>+	return ret;
>+}