Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67;
From:   KarimAllah Ahmed <karahmed@amazon.de>
To:     rkrcmar@redhat.com, pbonzini@redhat.com,
        linux-kernel@vger.kernel.org, kvm@vger.kernel.org,
        jmattson@google.com
Cc:     KarimAllah Ahmed <karahmed@amazon.de>
Subject: [PATCH] KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel
Date:   Mon,  3 Dec 2018 14:59:11 +0100
Message-Id: <1543845551-4403-1-git-send-email-karahmed@amazon.de>
Sender: linux-kernel-owner@vger.kernel.org
Precedence: bulk

The "APIC-access address" is simply a token that the hypervisor puts into
the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
virtualization whenever a page walk terminates with that PFN. This address
has to be a legal address (i.e. within the physical address width supported
by the CPU), but it need not have WB memory behind it. In fact, it need not
have anything at all behind it. When bit 31 ("activate secondary controls")
of the primary processor-based VM-execution controls is set and bit 0
("virtualize APIC accesses") of the secondary processor-based VM-execution
controls is set, the PFN recorded in the VMCS "APIC-access address" field
will never be touched. (Instead, the access triggers APIC virtualization,
which may access the PFN recorded in the "Virtual-APIC address" field of
the VMCS.)

So stop mapping the "APIC-access address" page into the kernel and even
drop the requirement to have a valid page backing it. Instead, just use
some token that is:

1) Not one of the valid guest pages.
2) Within the physical address width supported by the CPU.

Suggested-by: Jim Mattson <jmattson@google.com>
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
---

Thanks Jim for the commit message :)
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 10 ++++++
 arch/x86/kvm/vmx.c              | 71 ++++++++++++++++++-----------------------
 3 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fbda5a9..7e50196 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+	bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t *hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7c03c0f..ae46a8d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
+	hpa_t hpa;
 	struct kvm_memory_slot *slot;
 	bool async;
 
+	if (is_guest_mode(vcpu) &&
+	    kvm_x86_ops->nested_apic_access_addr &&
+	    kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
+		*pfn = hpa >> PAGE_SHIFT;
+		if (writable)
+			*writable = true;
+		return false;
+	}
+
 	/*
 	 * Don't expose private memslots to L2.
 	 */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 83a614f..340cf56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -864,7 +864,6 @@ struct nested_vmx {
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
-	struct page *apic_access_page;
 	struct kvm_host_map virtual_apic_map;
 	struct kvm_host_map pi_desc_map;
 	struct kvm_host_map msr_bitmap_map;
@@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	kfree(vmx->nested.cached_vmcs12);
 	kfree(vmx->nested.cached_shadow_vmcs12);
 	/* Unpin physical memory we referred to in the vmcs02 */
-	if (vmx->nested.apic_access_page) {
-		kvm_release_page_dirty(vmx->nested.apic_access_page);
-		vmx->nested.apic_access_page = NULL;
-	}
 	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
 	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
 	vmx->nested.pi_desc = NULL;
@@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
 
+static hpa_t vmx_apic_access_addr(void)
+{
+	/*
+	 * The physical address chosen here has to:
+	 * 1) Never be an address that could be assigned to a guest.
+	 * 2) Be within the maximum physical limits of the CPU.
+	 *
+	 * So our choice below is arbitrary, but at least it follows
+	 * these two rules.
+	 */
+	return __pa_symbol(_text) & PAGE_MASK;
+}
+
 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct kvm_host_map *map;
-	struct page *page;
-	u64 hpa;
 
-	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		/*
-		 * Translate L1 physical address to host physical
-		 * address for vmcs02. Keep the page pinned, so this
-		 * physical address remains valid. We keep a reference
-		 * to it so we can release it later.
-		 */
-		if (vmx->nested.apic_access_page) { /* shouldn't happen */
-			kvm_release_page_dirty(vmx->nested.apic_access_page);
-			vmx->nested.apic_access_page = NULL;
-		}
-		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
-		/*
-		 * If translation failed, no matter: This feature asks
-		 * to exit when accessing the given address, and if it
-		 * can never be accessed, this feature won't do
-		 * anything anyway.
-		 */
-		if (!is_error_page(page)) {
-			vmx->nested.apic_access_page = page;
-			hpa = page_to_phys(vmx->nested.apic_access_page);
-			vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		} else {
-			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
-		}
-	}
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		vmcs_write64(APIC_ACCESS_ADDR, vmx_apic_access_addr());
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
 		map = &vmx->nested.virtual_apic_map;
@@ -14196,12 +14177,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
 	vmx->host_rsp = 0;
 
-	/* Unpin physical memory we referred to in vmcs02 */
-	if (vmx->nested.apic_access_page) {
-		kvm_release_page_dirty(vmx->nested.apic_access_page);
-		vmx->nested.apic_access_page = NULL;
-	}
-
 	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
 	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
 	vmx->nested.pi_desc = NULL;
@@ -14949,6 +14924,21 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static bool vmx_nested_apic_access_addr(struct kvm_vcpu *vcpu,
+					gpa_t gpa, hpa_t *hpa)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+	    page_address_valid(vcpu, vmcs12->apic_access_addr) &&
+	    (gpa_to_gfn(gpa) == gpa_to_gfn(vmcs12->apic_access_addr))) {
+		*hpa = vmx_apic_access_addr();
+		return true;
+	}
+
+	return false;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -15022,6 +15012,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+	.nested_apic_access_addr = vmx_nested_apic_access_addr,
 	.get_enable_apicv = vmx_get_enable_apicv,
 	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
-- 
2.7.4