2022-04-27 16:25:52

by Sasha Levin

Subject: [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs

From: Paolo Bonzini <[email protected]>

[ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]

Red Hat's QE team reported a test failure in access_tracking_perf_test:

Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages
guest physical test memory offset: 0x3fffbffff000

Populating memory : 0.684014577s
Writing to populated memory : 0.006230175s
Reading from populated memory : 0.004557805s
==== Test Assertion Failure ====
lib/kvm_util.c:1411: false
pid=125806 tid=125809 errno=4 - Interrupted system call
1 0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
2 (inlined by) addr_gpa2hva at kvm_util.c:1405
3 0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
4 (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
5 (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
6 0x00007fefe9ff81ce: ?? ??:0
7 0x00007fefe9c64d82: ?? ??:0
No vm physical memory at 0xffbffff000

I can easily reproduce it with an Intel(R) Xeon(R) CPU E5-2630 with 46
physical address bits.

It turns out that the address translation for clearing idle page tracking
returned a wrong result: addr_gva2gpa()'s last step, which is based on
"pte[index[0]].pfn", did the calculation with 40-bit width, so the high
12 bits were truncated. In the above case the GPA to be returned should
have been 0x3fffbffff000 for GVA 0xc0000000, but it was truncated to
0xffbffff000 and the subsequent gpa2hva lookup failed.

The width of operations on bit fields wider than 32 bits is
implementation-defined, and differs between GCC (which uses the bit-field
precision) and clang (which uses 64-bit arithmetic), so this is a
potential minefield. Remove the bit fields and use manual masking
instead.
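
As a standalone illustration (not part of this patch), this user-space
sketch reproduces the truncation described above; the struct layout and
mask mirror the old pageTableEntry and PHYSICAL_PAGE_MASK, and the
bit-field result is implementation-defined (0xffbffff000 under GCC per
the report, 0x3fffbffff000 under clang):

#include <stdint.h>
#include <stdio.h>

/* 40-bit pfn field, as in the old pageTableEntry. */
struct pte_bits {
	uint64_t present:1;
	uint64_t ignored_11_01:11;
	uint64_t pfn:40;
	uint64_t ignored_63_52:12;
};

#define PHYSICAL_PAGE_MASK 0xffffffffff000ULL	/* GENMASK_ULL(51, 12) */

int main(void)
{
	uint64_t raw = 0x3fffbffff000ULL | 1;	/* GPA from the report + present bit */
	struct pte_bits pte = { .present = 1, .pfn = raw >> 12 };

	/* GCC does this in the field's 40-bit precision: 0xffbffff000. */
	printf("bitfield: 0x%llx\n", (unsigned long long)(pte.pfn * 4096));

	/* Masking a plain uint64_t keeps all 64 bits: 0x3fffbffff000. */
	printf("masked:   0x%llx\n", (unsigned long long)(raw & PHYSICAL_PAGE_MASK));
	return 0;
}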

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
Reported-by: Nana Liu <[email protected]>
Reviewed-by: Peter Xu <[email protected]>
Tested-by: Peter Xu <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
.../selftests/kvm/include/x86_64/processor.h | 15 ++
.../selftests/kvm/lib/x86_64/processor.c | 192 +++++++-----------
2 files changed, 92 insertions(+), 115 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 8a470da7b71a..15a2875698b5 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -60,6 +60,21 @@
/* CPUID.0x8000_0001.EDX */
#define CPUID_GBPAGES (1ul << 26)

+/* Page table bitfield declarations */
+#define PTE_PRESENT_MASK BIT_ULL(0)
+#define PTE_WRITABLE_MASK BIT_ULL(1)
+#define PTE_USER_MASK BIT_ULL(2)
+#define PTE_ACCESSED_MASK BIT_ULL(5)
+#define PTE_DIRTY_MASK BIT_ULL(6)
+#define PTE_LARGE_MASK BIT_ULL(7)
+#define PTE_GLOBAL_MASK BIT_ULL(8)
+#define PTE_NX_MASK BIT_ULL(63)
+
+#define PAGE_SHIFT 12
+
+#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12)
+#define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+
/* General Registers in 64-Bit Mode */
struct gpr64_regs {
u64 rax;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 9f000dfb5594..0dd442c26015 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -19,38 +19,6 @@

vm_vaddr_t exception_handlers;

-/* Virtual translation table structure declarations */
-struct pageUpperEntry {
- uint64_t present:1;
- uint64_t writable:1;
- uint64_t user:1;
- uint64_t write_through:1;
- uint64_t cache_disable:1;
- uint64_t accessed:1;
- uint64_t ignored_06:1;
- uint64_t page_size:1;
- uint64_t ignored_11_08:4;
- uint64_t pfn:40;
- uint64_t ignored_62_52:11;
- uint64_t execute_disable:1;
-};
-
-struct pageTableEntry {
- uint64_t present:1;
- uint64_t writable:1;
- uint64_t user:1;
- uint64_t write_through:1;
- uint64_t cache_disable:1;
- uint64_t accessed:1;
- uint64_t dirty:1;
- uint64_t reserved_07:1;
- uint64_t global:1;
- uint64_t ignored_11_09:3;
- uint64_t pfn:40;
- uint64_t ignored_62_52:11;
- uint64_t execute_disable:1;
-};
-
void regs_dump(FILE *stream, struct kvm_regs *regs,
uint8_t indent)
{
@@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
return &page_table[index];
}

-static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
- uint64_t pt_pfn,
- uint64_t vaddr,
- uint64_t paddr,
- int level,
- enum x86_page_size page_size)
+static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
+ uint64_t pt_pfn,
+ uint64_t vaddr,
+ uint64_t paddr,
+ int level,
+ enum x86_page_size page_size)
{
- struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
-
- if (!pte->present) {
- pte->writable = true;
- pte->present = true;
- pte->page_size = (level == page_size);
- if (pte->page_size)
- pte->pfn = paddr >> vm->page_shift;
+ uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+
+ if (!(*pte & PTE_PRESENT_MASK)) {
+ *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
+ if (level == page_size)
+ *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
else
- pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
+ *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
} else {
/*
* Entry already present. Assert that the caller doesn't want
@@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
TEST_ASSERT(level != page_size,
"Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
page_size, vaddr);
- TEST_ASSERT(!pte->page_size,
+ TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
"Cannot create page table at level: %u, vaddr: 0x%lx\n",
level, vaddr);
}
@@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
enum x86_page_size page_size)
{
const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
- struct pageUpperEntry *pml4e, *pdpe, *pde;
- struct pageTableEntry *pte;
+ uint64_t *pml4e, *pdpe, *pde;
+ uint64_t *pte;

TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
"Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
*/
pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
vaddr, paddr, 3, page_size);
- if (pml4e->page_size)
+ if (*pml4e & PTE_LARGE_MASK)
return;

- pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
- if (pdpe->page_size)
+ pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
+ if (*pdpe & PTE_LARGE_MASK)
return;

- pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
- if (pde->page_size)
+ pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
+ if (*pde & PTE_LARGE_MASK)
return;

/* Fill in page table entry. */
- pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
- TEST_ASSERT(!pte->present,
+ pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
+ TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
"PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
- pte->pfn = paddr >> vm->page_shift;
- pte->writable = true;
- pte->present = 1;
+ *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
}

void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
}

-static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
+static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
uint64_t vaddr)
{
uint16_t index[4];
- struct pageUpperEntry *pml4e, *pdpe, *pde;
- struct pageTableEntry *pte;
+ uint64_t *pml4e, *pdpe, *pde;
+ uint64_t *pte;
struct kvm_cpuid_entry2 *entry;
struct kvm_sregs sregs;
int max_phy_addr;
@@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
index[3] = (vaddr >> 39) & 0x1ffu;

pml4e = addr_gpa2hva(vm, vm->pgd);
- TEST_ASSERT(pml4e[index[3]].present,
+ TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
"Expected pml4e to be present for gva: 0x%08lx", vaddr);
- TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
- (rsvd_mask | (1ull << 7))) == 0,
+ TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
"Unexpected reserved bits set.");

- pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
- TEST_ASSERT(pdpe[index[2]].present,
+ pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+ TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
"Expected pdpe to be present for gva: 0x%08lx", vaddr);
- TEST_ASSERT(pdpe[index[2]].page_size == 0,
+ TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
"Expected pdpe to map a pde not a 1-GByte page.");
- TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
+ TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
"Unexpected reserved bits set.");

- pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
- TEST_ASSERT(pde[index[1]].present,
+ pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+ TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
"Expected pde to be present for gva: 0x%08lx", vaddr);
- TEST_ASSERT(pde[index[1]].page_size == 0,
+ TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
"Expected pde to map a pte not a 2-MByte page.");
- TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
+ TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
"Unexpected reserved bits set.");

- pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
- TEST_ASSERT(pte[index[0]].present,
+ pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+ TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
"Expected pte to be present for gva: 0x%08lx", vaddr);

return &pte[index[0]];
@@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc

uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
{
- struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
+ uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);

return *(uint64_t *)pte;
}
@@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
uint64_t pte)
{
- struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
- vaddr);
+ uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);

*(uint64_t *)new_pte = pte;
}

void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
- struct pageUpperEntry *pml4e, *pml4e_start;
- struct pageUpperEntry *pdpe, *pdpe_start;
- struct pageUpperEntry *pde, *pde_start;
- struct pageTableEntry *pte, *pte_start;
+ uint64_t *pml4e, *pml4e_start;
+ uint64_t *pdpe, *pdpe_start;
+ uint64_t *pde, *pde_start;
+ uint64_t *pte, *pte_start;

if (!vm->pgd_created)
return;
@@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
fprintf(stream, "%*s index hvaddr gpaddr "
"addr w exec dirty\n",
indent, "");
- pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
+ pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
pml4e = &pml4e_start[n1];
- if (!pml4e->present)
+ if (!(*pml4e & PTE_PRESENT_MASK))
continue;
- fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+ fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
" %u\n",
indent, "",
pml4e - pml4e_start, pml4e,
- addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
- pml4e->writable, pml4e->execute_disable);
+ addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
+ !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));

- pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
+ pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
pdpe = &pdpe_start[n2];
- if (!pdpe->present)
+ if (!(*pdpe & PTE_PRESENT_MASK))
continue;
- fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx "
+ fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx "
"%u %u\n",
indent, "",
pdpe - pdpe_start, pdpe,
addr_hva2gpa(vm, pdpe),
- (uint64_t) pdpe->pfn, pdpe->writable,
- pdpe->execute_disable);
+ PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
+ !!(*pdpe & PTE_NX_MASK));

- pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
+ pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
pde = &pde_start[n3];
- if (!pde->present)
+ if (!(*pde & PTE_PRESENT_MASK))
continue;
fprintf(stream, "%*spde 0x%-3zx %p "
- "0x%-12lx 0x%-10lx %u %u\n",
+ "0x%-12lx 0x%-10llx %u %u\n",
indent, "", pde - pde_start, pde,
addr_hva2gpa(vm, pde),
- (uint64_t) pde->pfn, pde->writable,
- pde->execute_disable);
+ PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
+ !!(*pde & PTE_NX_MASK));

- pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
+ pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
pte = &pte_start[n4];
- if (!pte->present)
+ if (!(*pte & PTE_PRESENT_MASK))
continue;
fprintf(stream, "%*spte 0x%-3zx %p "
- "0x%-12lx 0x%-10lx %u %u "
+ "0x%-12lx 0x%-10llx %u %u "
" %u 0x%-10lx\n",
indent, "",
pte - pte_start, pte,
addr_hva2gpa(vm, pte),
- (uint64_t) pte->pfn,
- pte->writable,
- pte->execute_disable,
- pte->dirty,
+ PTE_GET_PFN(*pte),
+ !!(*pte & PTE_WRITABLE_MASK),
+ !!(*pte & PTE_NX_MASK),
+ !!(*pte & PTE_DIRTY_MASK),
((uint64_t) n1 << 27)
| ((uint64_t) n2 << 18)
| ((uint64_t) n3 << 9)
@@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
{
uint16_t index[4];
- struct pageUpperEntry *pml4e, *pdpe, *pde;
- struct pageTableEntry *pte;
+ uint64_t *pml4e, *pdpe, *pde;
+ uint64_t *pte;

TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
if (!vm->pgd_created)
goto unmapped_gva;
pml4e = addr_gpa2hva(vm, vm->pgd);
- if (!pml4e[index[3]].present)
+ if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
goto unmapped_gva;

- pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
- if (!pdpe[index[2]].present)
+ pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+ if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
goto unmapped_gva;

- pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
- if (!pde[index[1]].present)
+ pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+ if (!(pde[index[1]] & PTE_PRESENT_MASK))
goto unmapped_gva;

- pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
- if (!pte[index[0]].present)
+ pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+ if (!(pte[index[0]] & PTE_PRESENT_MASK))
goto unmapped_gva;

- return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
+ return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);

unmapped_gva:
TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
--
2.35.1


2022-04-27 16:26:35

by Sasha Levin

Subject: [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised

From: Wanpeng Li <[email protected]>

[ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]

As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted
interrupt") mentioned, the host admin should tune the guest setup well,
so that vCPUs are placed on isolated pCPUs with several surplus pCPUs
for *busy* housekeeping. In this setup, it is preferable to disable
mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.

However, if only some guests are isolated and others are not, the
non-isolated ones get no benefit from posted timer interrupts, and at
the same time lose the VMX preemption timer fast paths, because
kvm_can_post_timer_interrupt() returns true and therefore forces
kvm_can_use_hv_timer() to false.

By guaranteeing that the posted-interrupt timer is used only if MWAIT or
HLT is executed without a vmexit, KVM can make a better choice and use
the VMX preemption timer and the corresponding fast paths.
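
A minimal sketch of the resulting policy, with assumed stand-in types
rather than the kernel's actual structures:

#include <stdbool.h>

struct vcpu_cfg {
	bool pi_inject_timer;		/* module parameter */
	bool apicv_active;
	bool mwait_in_guest;		/* mwait vmexits disabled */
	bool hlt_in_guest;		/* hlt vmexits disabled */
	bool has_preemption_timer;	/* VMX preemption timer available */
};

static bool can_post_timer_interrupt(const struct vcpu_cfg *c)
{
	/* Post the timer interrupt only when MWAIT/HLT stay in the
	 * guest, i.e. when posting actually avoids a vmexit. */
	return c->pi_inject_timer && c->apicv_active &&
	       (c->mwait_in_guest || c->hlt_in_guest);
}

static bool can_use_hv_timer(const struct vcpu_cfg *c)
{
	/* Per the message above, posting forces this to false. */
	return c->has_preemption_timer && !can_post_timer_interrupt(c);
}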

Reported-by: Aili Yao <[email protected]>
Reviewed-by: Sean Christopherson <[email protected]>
Cc: Aili Yao <[email protected]>
Cc: Sean Christopherson <[email protected]>
Signed-off-by: Wanpeng Li <[email protected]>
Message-Id: <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
arch/x86/kvm/lapic.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6b6f9359d29e..970d5c740b00 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)

static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}

bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
--
2.35.1

2022-04-27 16:26:37

by Sasha Levin

Subject: [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors

From: Peter Gonda <[email protected]>

[ Upstream commit b2125513dfc0dd0ec5a9605138a3c356592cfb73 ]

For SEV-ES VMs with mirrors to be intra-host migrated, they need to be
able to migrate with the mirror. This is due to the fact that all VMSAs
need to be added into the VM with LAUNCH_UPDATE_VMSA before
LAUNCH_FINISH. Allowing migration with mirrors lets users of SEV-ES
keep the mirror VMs' VMSAs during migration.

Add a list of mirror VMs for the original VM to iterate through during
its migration. During the iteration, the owner pointers can be updated
from the source to the destination. This fixes the ASID leaking issue
which caused migration of VMs with mirrors to be blocked.
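
A standalone sketch of that handoff, with a plain singly linked list and
integer refcounts standing in for list_head and kvm_get_kvm()/kvm_put_kvm()
(the real logic is in sev_migrate_from() below):

struct vm;

struct mirror {
	struct vm *owner;	/* enc_context_owner */
	struct mirror *next;
};

struct vm {
	struct mirror *mirrors;	/* mirror_vms */
	int refcount;
};

static void migrate_mirrors(struct vm *dst, struct vm *src)
{
	struct mirror *m;

	/* Splice the whole list over, like list_cut_before(). */
	dst->mirrors = src->mirrors;
	src->mirrors = NULL;

	/* Transfer each mirror's refcount and repoint its owner so the
	 * ASID owner stays alive after the source VM is torn down. */
	for (m = dst->mirrors; m; m = m->next) {
		dst->refcount++;	/* kvm_get_kvm(dst_kvm) */
		src->refcount--;	/* kvm_put_kvm(src_kvm) */
		m->owner = dst;
	}
}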

Signed-off-by: Peter Gonda <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: Marc Orr <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: [email protected]
Cc: [email protected]
Message-Id: <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
arch/x86/kvm/svm/sev.c | 57 ++++++++++++-------
arch/x86/kvm/svm/svm.h | 3 +-
.../selftests/kvm/x86_64/sev_migrate_tests.c | 47 ++++++++++-----
3 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index fef975852582..553d6dbf19a2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free;

INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);

return 0;

@@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
}
}

-static void sev_migrate_from(struct kvm_sev_info *dst,
- struct kvm_sev_info *src)
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *mirror;
+
dst->active = true;
dst->asid = src->asid;
dst->handle = src->handle;
@@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
src->enc_context_owner = NULL;

list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
}

static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)

src_sev = &to_kvm_svm(source_kvm)->sev_info;

- /*
- * VMs mirroring src's encryption context rely on it to keep the
- * ASID allocated, but below we are clearing src_sev->asid.
- */
- if (src_sev->num_mirrored_vms) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
}
- sev_migrate_from(dst_sev, src_sev);
+
+ sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
@@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
*/
source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
- source_sev->num_mirrored_vms++;
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);

/* Set enc_context_owner and copy its encryption context over */
- mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev->asid;
@@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->es_active = source_sev->es_active;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
ret = 0;

/*
@@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;

- WARN_ON(sev->num_mirrored_vms);
-
if (!sev_guest(kvm))
return;

+ WARN_ON(!list_empty(&sev->mirror_vms));
+
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
- struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;

mutex_lock(&owner_kvm->lock);
- if (!WARN_ON(!owner_sev->num_mirrored_vms))
- owner_sev->num_mirrored_vms--;
+ list_del(&sev->mirror_entry);
mutex_unlock(&owner_kvm->lock);
kvm_put_kvm(owner_kvm);
return;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 86bcfed6599e..831bca1fdc29 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -81,7 +81,8 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
- unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 80056bbbb003..2e5a42cb470b 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -341,35 +341,54 @@ static void test_sev_mirror_parameters(void)

static void test_sev_move_copy(void)
{
- struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
- int ret;
+ struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm,
+ *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm;

sev_vm = sev_vm_create(/* es= */ false);
dst_vm = aux_vm_create(true);
+ dst2_vm = aux_vm_create(true);
+ dst3_vm = aux_vm_create(true);
mirror_vm = aux_vm_create(false);
dst_mirror_vm = aux_vm_create(false);
+ dst2_mirror_vm = aux_vm_create(false);
+ dst3_mirror_vm = aux_vm_create(false);

sev_mirror_create(mirror_vm->fd, sev_vm->fd);
- ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
- TEST_ASSERT(ret == -1 && errno == EBUSY,
- "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
- errno);

- /* The mirror itself can be migrated. */
sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
- ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
- TEST_ASSERT(ret == -1 && errno == EBUSY,
- "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
- errno);
+ sev_migrate_from(dst_vm->fd, sev_vm->fd);
+
+ sev_migrate_from(dst2_vm->fd, dst_vm->fd);
+ sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd);
+
+ sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd);
+ sev_migrate_from(dst3_vm->fd, dst2_vm->fd);
+
+ kvm_vm_free(dst_vm);
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(dst2_vm);
+ kvm_vm_free(dst3_vm);
+ kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_mirror_vm);
+ kvm_vm_free(dst2_mirror_vm);
+ kvm_vm_free(dst3_mirror_vm);

/*
- * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus,
- * the owner can be copied as soon as dst_mirror_vm is gone.
+ * Run a similar test but destroy mirrors before mirrored VMs to ensure
+ * destruction is done safely.
*/
- kvm_vm_free(dst_mirror_vm);
+ sev_vm = sev_vm_create(/* es= */ false);
+ dst_vm = aux_vm_create(true);
+ mirror_vm = aux_vm_create(false);
+ dst_mirror_vm = aux_vm_create(false);
+
+ sev_mirror_create(mirror_vm->fd, sev_vm->fd);
+
+ sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
sev_migrate_from(dst_vm->fd, sev_vm->fd);

kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_mirror_vm);
kvm_vm_free(dst_vm);
kvm_vm_free(sev_vm);
}
--
2.35.1

2022-04-27 16:26:40

by Sasha Levin

Subject: [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs

From: Paolo Bonzini <[email protected]>

[ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]

WARN and bail if KVM attempts to free a root that isn't backed by a shadow
page. KVM allocates a bare page for "special" roots, e.g. when using PAE
paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa
will be valid but won't be backed by a shadow page. It's all too easy to
blindly call mmu_free_root_page() on root_hpa; be nice and WARN instead
of crashing KVM and possibly the kernel.
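
A user-space sketch of the added guard, with WARN_ON as a stand-in for
the kernel macro (which likewise evaluates to its condition):

#include <stdio.h>

#define WARN_ON(cond) ({					\
	int __ret = !!(cond);					\
	if (__ret)						\
		fprintf(stderr, "WARN at %s:%d\n",		\
			__FILE__, __LINE__);			\
	__ret; })

struct shadow_page { int root_count; };

/* Returns NULL for a "special" root backed by a bare page. */
static struct shadow_page *to_shadow_page(unsigned long hpa)
{
	(void)hpa;
	return NULL;	/* simulate the PAE/bare-page case */
}

static void free_root_page(unsigned long root_hpa)
{
	struct shadow_page *sp = to_shadow_page(root_hpa);

	if (WARN_ON(!sp))	/* log the bug and bail instead of oopsing */
		return;

	sp->root_count--;	/* never reached in this sketch */
}

int main(void)
{
	free_root_page(0x1000);
	return 0;
}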

Reviewed-by: Sean Christopherson <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7f009ebb319a..e7cd16e1e0a0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3239,6 +3239,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;

sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+ if (WARN_ON(!sp))
+ return;

if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_put_root(kvm, sp, false);
--
2.35.1

2022-04-27 16:27:27

by Sasha Levin

Subject: [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test

From: Thomas Huth <[email protected]>

[ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]

When compiling kvm_page_table_test.c, I get this compiler warning
with gcc 11.2:

kvm_page_table_test.c: In function 'pre_init_before_test':
../../../../tools/include/linux/kernel.h:44:24: warning: comparison of
distinct pointer types lacks a cast
44 | (void) (&_max1 == &_max2); \
| ^~
kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
281 | alignment = max(0x100000, alignment);
| ^~~

Fix it by adjusting the type of the numeric constant.
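
A standalone reproduction, using the same macro shape as the kernel's
max() from tools/include/linux/kernel.h (trimmed here): gcc flags the
mixed-type call because the macro compares the addresses of two
temporaries, and the UL suffix silences it by making both operands
unsigned long:

#include <stdio.h>

#define max(x, y) ({			\
	typeof(x) _max1 = (x);		\
	typeof(y) _max2 = (y);		\
	(void) (&_max1 == &_max2);	\
	_max1 > _max2 ? _max1 : _max2; })

int main(void)
{
	unsigned long alignment = 4096;

	/* alignment = max(0x100000, alignment);  <- int vs unsigned long */
	alignment = max(0x100000UL, alignment);	/* both unsigned long */
	printf("alignment = 0x%lx\n", alignment);
	return 0;
}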

Signed-off-by: Thomas Huth <[email protected]>
Reviewed-by: Claudio Imbrenda <[email protected]>
Message-Id: <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index ba1fdc3dcf4a..2c4a7563a4f8 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
else
guest_test_phys_mem = p->phys_offset;
#ifdef __s390x__
- alignment = max(0x100000, alignment);
+ alignment = max(0x100000UL, alignment);
#endif
guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);

--
2.35.1

2022-04-27 16:40:23

by Paolo Bonzini

Subject: Re: [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors

On 4/27/22 17:54, Sasha Levin wrote:
> From: Peter Gonda <[email protected]>
>
> [ Upstream commit b2125513dfc0dd0ec5a9605138a3c356592cfb73 ]
>
> [...]

Too scary,

NACK

Paolo


2022-04-27 16:49:57

by Paolo Bonzini

Subject: Re: [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs

On 4/27/22 17:53, Sasha Levin wrote:
> From: Paolo Bonzini <[email protected]>
>
> [ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]
>
> [...]

Acked-by: Paolo Bonzini <[email protected]>

2022-04-27 16:49:58

by Paolo Bonzini

Subject: Re: [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test

On 4/27/22 17:53, Sasha Levin wrote:
> From: Thomas Huth <[email protected]>
>
> [ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]
>
> [...]

Acked-by: Paolo Bonzini <[email protected]>

2022-04-27 16:50:23

by Paolo Bonzini

Subject: Re: [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <[email protected]>
>
> [ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]
>
> [...]

Acked-by: Paolo Bonzini <[email protected]>

2022-04-27 16:51:23

by Paolo Bonzini

Subject: Re: [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised

On 4/27/22 17:54, Sasha Levin wrote:
> From: Wanpeng Li <[email protected]>
>
> [ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]
>
> [...]

Acked-by: Paolo Bonzini <[email protected]>