LinuxLists.cc - [PATCHV2 0/3] kvm: x86: make PTE_PREFETCH_NUM tunable

2021-10-19 15:33:48

Subject: [PATCHV2 0/3] kvm: x86: make PTE_PREFETCH_NUM tunable

This series adds a new ioctl which makes it possible to tune the
PTE_PREFETCH_NUM value on a per-VM basis.

v2:
- added ioctl (previously was sysfs param) [David]
- preallocate prefetch buffers [David]
- converted arch/x86/kvm/mmu/paging_tmpl.h [David]

Sergey Senozhatsky (3):
KVM: x86: introduce kvm_mmu_pte_prefetch structure
KVM: x86: use mmu_pte_prefetch for guest_walker
KVM: x86: add KVM_SET_MMU_PREFETCH ioctl

Documentation/virt/kvm/api.rst | 21 ++++++++++++
arch/x86/include/asm/kvm_host.h | 12 +++++++
arch/x86/kvm/mmu.h | 4 +++
arch/x86/kvm/mmu/mmu.c | 57 ++++++++++++++++++++++++++++++---
arch/x86/kvm/mmu/paging_tmpl.h | 39 +++++++++++++++-------
arch/x86/kvm/x86.c | 38 +++++++++++++++++++++-
include/uapi/linux/kvm.h | 4 +++
7 files changed, 158 insertions(+), 17 deletions(-)

--
2.33.0.1079.g6e70778dc9-goog

2021-10-19 15:33:54

by Sergey Senozhatsky

[permalink] [raw]

Subject: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
prefetch pages array, a lock and the number of PTEs to prefetch.

This is needed to turn PTE_PREFETCH_NUM into a tunable VM
parameter.

Signed-off-by: Sergey Senozhatsky <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 12 +++++++
arch/x86/kvm/mmu.h | 4 +++
arch/x86/kvm/mmu/mmu.c | 57 ++++++++++++++++++++++++++++++---
arch/x86/kvm/x86.c | 9 +++++-
4 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5271fce6cd65..11400bc3c70d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
u64 runstate_times[4];
};

+struct kvm_mmu_pte_prefetch {
+ /*
+ * This will be cast either to array of pointers to struct page,
+ * or array of u64, or array of u32
+ */
+ void *ents;
+ unsigned int num_ents;
+ spinlock_t lock;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -682,6 +692,8 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_gfn_array_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;

+ struct kvm_mmu_pte_prefetch mmu_pte_prefetch;
+
/*
* QEMU userspace and the guest each have their own FPU state.
* In vcpu_run, we switch between the user and guest FPU contexts.
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 75367af1a6d3..b953a3a4083a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -68,6 +68,10 @@ static __always_inline u64 rsvd_bits(int s, int e)
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);

+int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents);
+int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu);
+void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu);
+
void kvm_init_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
unsigned long cr4, u64 efer, gpa_t nested_cr3);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 24a9f4c3f5e7..fed3a498a729 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -115,6 +115,7 @@ module_param(dbg, bool, 0644);
#endif

#define PTE_PREFETCH_NUM 8
+#define MAX_PTE_PREFETCH_NUM 128

#define PT32_LEVEL_BITS 10

@@ -732,7 +733,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)

/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
+ 1 + PT64_ROOT_MAX_LEVEL + MAX_PTE_PREFETCH_NUM);
if (r)
return r;
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
@@ -2753,12 +2754,13 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
{
- struct page *pages[PTE_PREFETCH_NUM];
+ struct page **pages;
struct kvm_memory_slot *slot;
unsigned int access = sp->role.access;
int i, ret;
gfn_t gfn;

+ pages = (struct page **)vcpu->arch.mmu_pte_prefetch.ents;
gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
if (!slot)
@@ -2781,14 +2783,17 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *sptep)
{
u64 *spte, *start = NULL;
+ unsigned int pte_prefetch_num;
int i;

WARN_ON(!sp->role.direct);

- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
+ pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
+ i = (sptep - sp->spt) & ~(pte_prefetch_num - 1);
spte = sp->spt + i;

- for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ for (i = 0; i < pte_prefetch_num; i++, spte++) {
if (is_shadow_present_pte(*spte) || spte == sptep) {
if (!start)
continue;
@@ -2800,6 +2805,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
}
if (start)
direct_pte_prefetch_many(vcpu, sp, start, spte);
+ spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
}

static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -4914,6 +4920,49 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);

+int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents)
+{
+ u64 *ents;
+
+ if (!num_ents)
+ return -EINVAL;
+
+ if (!is_power_of_2(num_ents))
+ return -EINVAL;
+
+ if (num_ents > MAX_PTE_PREFETCH_NUM)
+ return -EINVAL;
+
+ ents = kmalloc_array(num_ents, sizeof(u64), GFP_KERNEL);
+ if (!ents)
+ return -ENOMEM;
+
+ spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
+ kfree(vcpu->arch.mmu_pte_prefetch.ents);
+ vcpu->arch.mmu_pte_prefetch.ents = ents;
+ vcpu->arch.mmu_pte_prefetch.num_ents = num_ents;
+ spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_pte_prefetch);
+
+int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu)
+{
+ spin_lock_init(&vcpu->arch.mmu_pte_prefetch.lock);
+
+ return kvm_set_pte_prefetch(vcpu, PTE_PREFETCH_NUM);
+}
+EXPORT_SYMBOL_GPL(kvm_init_pte_prefetch);
+
+void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu_pte_prefetch.num_ents = 0;
+ kfree(vcpu->arch.mmu_pte_prefetch.ents);
+ vcpu->arch.mmu_pte_prefetch.ents = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_pte_prefetch_destroy);
+
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b0f99132d7d1..4805960a89e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10707,10 +10707,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.hv_root_tdp = INVALID_PAGE;
#endif

- r = static_call(kvm_x86_vcpu_create)(vcpu);
+ r = kvm_init_pte_prefetch(vcpu);
if (r)
goto free_guest_fpu;

+ r = static_call(kvm_x86_vcpu_create)(vcpu);
+ if (r)
+ goto free_pte_prefetch;
+
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
@@ -10721,6 +10725,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
return 0;

+free_pte_prefetch:
+ kvm_pte_prefetch_destroy(vcpu);
free_guest_fpu:
kvm_free_guest_fpu(vcpu);
free_user_fpu:
@@ -10782,6 +10788,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_free_lapic(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
+ kvm_pte_prefetch_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
--
2.33.0.1079.g6e70778dc9-goog

2021-10-19 15:35:11

by Sergey Senozhatsky

[permalink] [raw]

Subject: [PATCHV2 2/3] KVM: x86: use mmu_pte_prefetch for guest_walker

Do not use fixed size PTE array for prefetch, but switch to
mmu_pte_prefetch cached entries array for PTEs prefetch.

Signed-off-by: Sergey Senozhatsky <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 4 ++--
arch/x86/kvm/mmu/paging_tmpl.h | 39 +++++++++++++++++++++++-----------
2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fed3a498a729..3eb034ffbe58 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2788,7 +2788,6 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,

WARN_ON(!sp->role.direct);

- spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
i = (sptep - sp->spt) & ~(pte_prefetch_num - 1);
spte = sp->spt + i;
@@ -2805,7 +2804,6 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
}
if (start)
direct_pte_prefetch_many(vcpu, sp, start, spte);
- spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
}

static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -2832,7 +2830,9 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
if (unlikely(vcpu->kvm->mmu_notifier_count))
return;

+ spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
__direct_pte_prefetch(vcpu, sp, sptep);
+ spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
}

static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index d8889e02c4b7..6a0924261d81 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -86,7 +86,6 @@ struct guest_walker {
unsigned max_level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
- pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
@@ -592,23 +591,30 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
struct guest_walker *gw, int level)
{
+ pt_element_t *prefetch_ptes;
pt_element_t curr_pte;
gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
- u64 mask;
+ u32 pte_prefetch_num;
+ u64 len;
int r, index;

+ spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
+ prefetch_ptes = (pt_element_t *)vcpu->arch.mmu_pte_prefetch.ents;
+ pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
+
if (level == PG_LEVEL_4K) {
- mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
- base_gpa = pte_gpa & ~mask;
+ len = pte_prefetch_num * sizeof(pt_element_t);
+ base_gpa = pte_gpa & ~(len - 1);
index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

- r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
- gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
- curr_pte = gw->prefetch_ptes[index];
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, prefetch_ptes,
+ len);
+ curr_pte = prefetch_ptes[index];
} else
r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
&curr_pte, sizeof(curr_pte));

+ spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
return r || curr_pte != gw->ptes[level - 1];
}

@@ -616,7 +622,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- pt_element_t *gptep = gw->prefetch_ptes;
+ u32 pte_prefetch_num;
+ pt_element_t *gptep;
u64 *spte;
int i;

@@ -632,13 +639,19 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (unlikely(vcpu->kvm->mmu_notifier_count))
return;

- if (sp->role.direct)
- return __direct_pte_prefetch(vcpu, sp, sptep);
+ spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
+ gptep = (pt_element_t *)vcpu->arch.mmu_pte_prefetch.ents;
+ pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
+
+ if (sp->role.direct) {
+ __direct_pte_prefetch(vcpu, sp, sptep);
+ goto out;
+ }

- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ i = (sptep - sp->spt) & ~(pte_prefetch_num - 1);
spte = sp->spt + i;

- for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ for (i = 0; i < pte_prefetch_num; i++, spte++) {
if (spte == sptep)
continue;

@@ -648,6 +661,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
break;
}
+out:
+ spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
}

/*
--
2.33.0.1079.g6e70778dc9-goog

2021-10-19 15:35:23

by Sergey Senozhatsky

[permalink] [raw]

Subject: [PATCHV2 3/3] KVM: x86: add KVM_SET_MMU_PREFETCH ioctl

This ioctl lets user-space set the number of PTEs KVM will
prefetch:

- pte_prefetch 8

VM-EXIT Samples Samples% Time% Min Time Max Time Avg time

EPT_VIOLATION 760998 54.85% 7.23% 0.92us 31765.89us 7.78us ( +- 1.46% )
MSR_WRITE 170599 12.30% 0.53% 0.60us 3334.13us 2.52us ( +- 0.86% )
EXTERNAL_INTERRUPT 159510 11.50% 1.65% 0.49us 43705.81us 8.45us ( +- 7.54% )
[..]

Total Samples:1387305, Total events handled time:81900258.99us.

- pte_prefetch 16

VM-EXIT Samples Samples% Time% Min Time Max Time Avg time

EPT_VIOLATION 658064 52.58% 7.04% 0.91us 17022.84us 8.34us ( +- 1.52% )
MSR_WRITE 163776 13.09% 0.54% 0.56us 5192.10us 2.57us ( +- 1.25% )
EXTERNAL_INTERRUPT 144588 11.55% 1.62% 0.48us 97410.16us 8.75us ( +- 11.44% )
[..]

Total Samples:1251546, Total events handled time:77956187.56us.

Signed-off-by: Sergey Senozhatsky <[email protected]>
---
Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++++
arch/x86/kvm/x86.c | 29 +++++++++++++++++++++++++++++
include/uapi/linux/kvm.h | 4 ++++
3 files changed, 54 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 0c0bf26426b3..b06b7c11a430 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5473,6 +5473,17 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
The Stats Data block contains an array of 64-bit values in the same order
as the descriptors in Descriptors block.

+4.134 KVM_SET_MMU_PREFETCH
+--------------------------
+
+:Capability: KVM_CAP_MMU_PTE_PREFETCH
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: __u64 value (in)
+:Returns: 0 on success, error code otherwise
+
+Sets the maximum number of PTEs KVM will try to prefetch.
+
5. The kvm_run structure
========================

@@ -7440,3 +7451,13 @@ The argument to KVM_ENABLE_CAP is also a bitmask, and must be a subset
of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace
the hypercalls whose corresponding bit is in the argument, and return
ENOSYS for the others.
+
+8.35 KVM_CAP_MMU_PTE_PREFETCH
+-----------------------------
+
+:Capability: KVM_CAP_MMU_PTE_PREFETCH
+:Architectures: x86
+:Parameters: args[0] - the number of PTEs to prefetch
+
+Sets the maximum number of PTEs KVM will prefetch. The value must be a power
+of two within the (0, 128] range.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4805960a89e6..e1b2224c4176 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4030,6 +4030,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_MMU_PTE_PREFETCH:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -5831,6 +5832,25 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

+static int kvm_arch_mmu_pte_prefetch(struct kvm *kvm, unsigned int num_pages)
+{
+ struct kvm_vcpu *vcpu;
+ int i, ret;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = kvm_set_pte_prefetch(vcpu, num_pages);
+ if (ret) {
+ kvm_err("Failed to set PTE prefetch on VCPU%d: %d\n",
+ vcpu->vcpu_id, ret);
+ break;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+}
+
static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
{
struct kvm_clock_data data;
@@ -6169,6 +6189,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_X86_SET_MSR_FILTER:
r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
break;
+ case KVM_SET_MMU_PREFETCH: {
+ u64 val;
+
+ r = -EFAULT;
+ if (copy_from_user(&val, argp, sizeof(val)))
+ goto out;
+ r = kvm_arch_mmu_pte_prefetch(kvm, val);
+ break;
+ }
default:
r = -ENOTTY;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 322b4b588d75..0782eb4c424d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1120,6 +1120,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_BINARY_STATS_FD 203
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
#define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_MMU_PTE_PREFETCH 206

#ifdef KVM_CAP_IRQ_ROUTING

@@ -2015,4 +2016,7 @@ struct kvm_stats_desc {

#define KVM_GET_STATS_FD _IO(KVMIO, 0xce)

+/* Set number of PTEs to prefetch */
+#define KVM_SET_MMU_PREFETCH _IOW(KVMIO, 0xcf, __u64)
+
#endif /* __LINUX_KVM_H */
--
2.33.0.1079.g6e70778dc9-goog

2021-10-19 15:40:19

by Sergey Senozhatsky

[permalink] [raw]

Subject: Re: [PATCHV2 3/3] KVM: x86: add KVM_SET_MMU_PREFETCH ioctl

On (21/10/20 00:32), Sergey Senozhatsky wrote:
> static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
> {
> struct kvm_clock_data data;
> @@ -6169,6 +6189,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
> case KVM_X86_SET_MSR_FILTER:
> r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
> break;
> + case KVM_SET_MMU_PREFETCH: {
> + u64 val;
> +
> + r = -EFAULT;
> + if (copy_from_user(&val, argp, sizeof(val)))
> + goto out;
> + r = kvm_arch_mmu_pte_prefetch(kvm, val);
> + break;
> + }

A side question: is there any value in turning this into a per-VCPU ioctl?
So that, say, on heterogeneous systems big cores can prefetch more than
little cores, for instance.

2021-10-19 22:47:34

by David Matlack

[permalink] [raw]

Subject: Re: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

On Tue, Oct 19, 2021 at 8:32 AM Sergey Senozhatsky
<[email protected]> wrote:
>
> kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
> prefetch pages array, lock and the number of PTE to prefetch.
>
> This is needed to turn PTE_PREFETCH_NUM into a tunable VM
> parameter.
>
> Signed-off-by: Sergey Senozhatsky <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 12 +++++++
> arch/x86/kvm/mmu.h | 4 +++
> arch/x86/kvm/mmu/mmu.c | 57 ++++++++++++++++++++++++++++++---
> arch/x86/kvm/x86.c | 9 +++++-
> 4 files changed, 77 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5271fce6cd65..11400bc3c70d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
> u64 runstate_times[4];
> };
>
> +struct kvm_mmu_pte_prefetch {
> + /*
> + * This will be cast either to array of pointers to struct page,
> + * or array of u64, or array of u32
> + */
> + void *ents;
> + unsigned int num_ents;
> + spinlock_t lock;

The spinlock is overkill. I'd suggest something like this:
- When VM-ioctl is invoked to update prefetch count, store it in
kvm_arch. No synchronization with vCPUs needed.
- When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
different than count at last fault, re-allocate vCPU prefetch array.
(So you'll need to add prefetch array and count to kvm_vcpu_arch as
well.)

No extra locks are needed. vCPUs that fault after the VM-ioctl will
get the new prefetch count. We don't really care if a prefetch count
update races with a vCPU fault as long as vCPUs are careful to only
read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
use that. Assuming prefetch count ioctls are rare, the re-allocation
on the fault path will be rare as well.

Note: You could apply this same approach to a module param, except
vCPUs would be reading the module param rather than vcpu->kvm during
each fault.

And the other alternative, like you suggested in the other patch, is
to use a vCPU ioctl. That would side-step the synchronization issue
because vCPU ioctls require the vCPU mutex. So the reallocation could
be done in the ioctl and not at fault time.

Taking a step back, can you say a bit more about your usecase? The
module param approach would be simplest because you would not have to
add userspace support, but in v1 you did mention you wanted per-VM
control.

> +};
> +
> struct kvm_vcpu_arch {
> /*
> * rip and regs accesses must go through
> @@ -682,6 +692,8 @@ struct kvm_vcpu_arch {
> struct kvm_mmu_memory_cache mmu_gfn_array_cache;
> struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> + struct kvm_mmu_pte_prefetch mmu_pte_prefetch;
> +
> /*
> * QEMU userspace and the guest each have their own FPU state.
> * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 75367af1a6d3..b953a3a4083a 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -68,6 +68,10 @@ static __always_inline u64 rsvd_bits(int s, int e)
> void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
> void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
>
> +int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents);
> +int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu);
> +void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu);
> +
> void kvm_init_mmu(struct kvm_vcpu *vcpu);
> void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
> unsigned long cr4, u64 efer, gpa_t nested_cr3);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 24a9f4c3f5e7..fed3a498a729 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -115,6 +115,7 @@ module_param(dbg, bool, 0644);
> #endif
>
> #define PTE_PREFETCH_NUM 8
> +#define MAX_PTE_PREFETCH_NUM 128
>
> #define PT32_LEVEL_BITS 10
>
> @@ -732,7 +733,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>
> /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
> r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
> - 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
> + 1 + PT64_ROOT_MAX_LEVEL + MAX_PTE_PREFETCH_NUM);
> if (r)
> return r;
> r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> @@ -2753,12 +2754,13 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp,
> u64 *start, u64 *end)
> {
> - struct page *pages[PTE_PREFETCH_NUM];
> + struct page **pages;
> struct kvm_memory_slot *slot;
> unsigned int access = sp->role.access;
> int i, ret;
> gfn_t gfn;
>
> + pages = (struct page **)vcpu->arch.mmu_pte_prefetch.ents;
> gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
> slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
> if (!slot)
> @@ -2781,14 +2783,17 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp, u64 *sptep)
> {
> u64 *spte, *start = NULL;
> + unsigned int pte_prefetch_num;
> int i;
>
> WARN_ON(!sp->role.direct);
>
> - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
> + spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
> + pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
> + i = (sptep - sp->spt) & ~(pte_prefetch_num - 1);
> spte = sp->spt + i;
>
> - for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
> + for (i = 0; i < pte_prefetch_num; i++, spte++) {
> if (is_shadow_present_pte(*spte) || spte == sptep) {
> if (!start)
> continue;
> @@ -2800,6 +2805,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
> }
> if (start)
> direct_pte_prefetch_many(vcpu, sp, start, spte);
> + spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
> }
>
> static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> @@ -4914,6 +4920,49 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
> }
> EXPORT_SYMBOL_GPL(kvm_init_mmu);
>
> +int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents)
> +{
> + u64 *ents;
> +
> + if (!num_ents)
> + return -EINVAL;
> +
> + if (!is_power_of_2(num_ents))
> + return -EINVAL;
> +
> + if (num_ents > MAX_PTE_PREFETCH_NUM)
> + return -EINVAL;
> +
> + ents = kmalloc_array(num_ents, sizeof(u64), GFP_KERNEL);
> + if (!ents)
> + return -ENOMEM;
> +
> + spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
> + kfree(vcpu->arch.mmu_pte_prefetch.ents);
> + vcpu->arch.mmu_pte_prefetch.ents = ents;
> + vcpu->arch.mmu_pte_prefetch.num_ents = num_ents;
> + spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_set_pte_prefetch);
> +
> +int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu)
> +{
> + spin_lock_init(&vcpu->arch.mmu_pte_prefetch.lock);
> +
> + return kvm_set_pte_prefetch(vcpu, PTE_PREFETCH_NUM);
> +}
> +EXPORT_SYMBOL_GPL(kvm_init_pte_prefetch);
> +
> +void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu)
> +{
> + vcpu->arch.mmu_pte_prefetch.num_ents = 0;
> + kfree(vcpu->arch.mmu_pte_prefetch.ents);
> + vcpu->arch.mmu_pte_prefetch.ents = NULL;
> +}
> +EXPORT_SYMBOL_GPL(kvm_pte_prefetch_destroy);
> +
> static union kvm_mmu_page_role
> kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
> {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b0f99132d7d1..4805960a89e6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10707,10 +10707,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> vcpu->arch.hv_root_tdp = INVALID_PAGE;
> #endif
>
> - r = static_call(kvm_x86_vcpu_create)(vcpu);
> + r = kvm_init_pte_prefetch(vcpu);
> if (r)
> goto free_guest_fpu;
>
> + r = static_call(kvm_x86_vcpu_create)(vcpu);
> + if (r)
> + goto free_pte_prefetch;
> +
> vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
> vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
> kvm_vcpu_mtrr_init(vcpu);
> @@ -10721,6 +10725,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> vcpu_put(vcpu);
> return 0;
>
> +free_pte_prefetch:
> + kvm_pte_prefetch_destroy(vcpu);
> free_guest_fpu:
> kvm_free_guest_fpu(vcpu);
> free_user_fpu:
> @@ -10782,6 +10788,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
> kvm_free_lapic(vcpu);
> idx = srcu_read_lock(&vcpu->kvm->srcu);
> kvm_mmu_destroy(vcpu);
> + kvm_pte_prefetch_destroy(vcpu);
> srcu_read_unlock(&vcpu->kvm->srcu, idx);
> free_page((unsigned long)vcpu->arch.pio_data);
> kvfree(vcpu->arch.cpuid_entries);
> --
> 2.33.0.1079.g6e70778dc9-goog
>

2021-10-20 01:26:07

by Sergey Senozhatsky

[permalink] [raw]

Subject: Re: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

On (21/10/19 15:44), David Matlack wrote:
> On Tue, Oct 19, 2021 at 8:32 AM Sergey Senozhatsky
> <[email protected]> wrote:
> >
> > kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
> > prefetch pages array, lock and the number of PTE to prefetch.
> >
> > This is needed to turn PTE_PREFETCH_NUM into a tunable VM
> > parameter.
> >
> > Signed-off-by: Sergey Senozhatsky <[email protected]>
> > ---
> > arch/x86/include/asm/kvm_host.h | 12 +++++++
> > arch/x86/kvm/mmu.h | 4 +++
> > arch/x86/kvm/mmu/mmu.c | 57 ++++++++++++++++++++++++++++++---
> > arch/x86/kvm/x86.c | 9 +++++-
> > 4 files changed, 77 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 5271fce6cd65..11400bc3c70d 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
> > u64 runstate_times[4];
> > };
> >
> > +struct kvm_mmu_pte_prefetch {
> > + /*
> > + * This will be cast either to array of pointers to struct page,
> > + * or array of u64, or array of u32
> > + */
> > + void *ents;
> > + unsigned int num_ents;
> > + spinlock_t lock;
>
> The spinlock is overkill. I'd suggest something like this:
> - When VM-ioctl is invoked to update prefetch count, store it in
> kvm_arch. No synchronization with vCPUs needed.
> - When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
> different than count at last fault, re-allocate vCPU prefetch array.
> (So you'll need to add prefetch array and count to kvm_vcpu_arch as
> well.)
>
> No extra locks are needed. vCPUs that fault after the VM-ioctl will
> get the new prefetch count. We don't really care if a prefetch count
> update races with a vCPU fault as long as vCPUs are careful to only
> read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
> use that. Assuming prefetch count ioctls are rare, the re-allocation
> on the fault path will be rare as well.

So reallocation from the fault path should happen before the vCPU takes the
mmu_lock? And READ_ONCE(prefetch_count) should also happen before vCPU
takes mmu_lock, I assume, so we need to pass it as a parameter to all
the functions that will access prefetch array.

> Note: You could apply this same approach to a module param, except
> vCPUs would be reading the module param rather than vcpu->kvm during
> each fault.
>
> And the other alternative, like you suggested in the other patch, is
> to use a vCPU ioctl. That would side-step the synchronization issue
> because vCPU ioctls require the vCPU mutex. So the reallocation could
> be done in the ioctl and not at fault time.

One more idea, wonder what do you think:

There is an upper limit on the number of PTEs we prefault, which is 128 as of
now, but I think 64 will be good enough, or maybe even 32. So we can always
allocate MAX_PTE_PREFETCH_NUM arrays in vcpu->arch and ioctl() will change
->num_ents only, which is always in (0, MAX_PTE_PREFETCH_NUM - 1] range. This
way we never have to reallocate anything, we just adjust the "maximum index"
value.

> Taking a step back, can you say a bit more about your usecase?

We are looking at various ways of reducing the number of vm-exits. There
is only one VM running on the device (a pretty low-end laptop).

2021-10-20 15:58:59

by David Matlack

[permalink] [raw]

Subject: Re: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

On Tue, Oct 19, 2021 at 6:24 PM Sergey Senozhatsky
<[email protected]> wrote:
>
> On (21/10/19 15:44), David Matlack wrote:
> > On Tue, Oct 19, 2021 at 8:32 AM Sergey Senozhatsky
> > <[email protected]> wrote:
> > >
> > > kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
> > > prefetch pages array, lock and the number of PTE to prefetch.
> > >
> > > This is needed to turn PTE_PREFETCH_NUM into a tunable VM
> > > parameter.
> > >
> > > Signed-off-by: Sergey Senozhatsky <[email protected]>
> > > ---
> > > arch/x86/include/asm/kvm_host.h | 12 +++++++
> > > arch/x86/kvm/mmu.h | 4 +++
> > > arch/x86/kvm/mmu/mmu.c | 57 ++++++++++++++++++++++++++++++---
> > > arch/x86/kvm/x86.c | 9 +++++-
> > > 4 files changed, 77 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 5271fce6cd65..11400bc3c70d 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
> > > u64 runstate_times[4];
> > > };
> > >
> > > +struct kvm_mmu_pte_prefetch {
> > > + /*
> > > + * This will be cast either to array of pointers to struct page,
> > > + * or array of u64, or array of u32
> > > + */
> > > + void *ents;
> > > + unsigned int num_ents;
> > > + spinlock_t lock;
> >
> > The spinlock is overkill. I'd suggest something like this:
> > - When VM-ioctl is invoked to update prefetch count, store it in
> > kvm_arch. No synchronization with vCPUs needed.
> > - When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
> > different than count at last fault, re-allocate vCPU prefetch array.
> > (So you'll need to add prefetch array and count to kvm_vcpu_arch as
> > well.)
> >
> > No extra locks are needed. vCPUs that fault after the VM-ioctl will
> > get the new prefetch count. We don't really care if a prefetch count
> > update races with a vCPU fault as long as vCPUs are careful to only
> > read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
> > use that. Assuming prefetch count ioctls are rare, the re-allocation
> > on the fault path will be rare as well.
>
> So reallocation from the fault path should happen before the vCPU takes the
> mmu_lock?

Yes. Take a look at mmu_topup_memory_caches for an example of
allocating in the fault path prior to taking the mmu lock.

> And READ_ONCE(prefetch_count) should also happen before vCPU
> takes mmu_lock, I assume, so we need to pass it as a parameter to all
> the functions that will access prefetch array.

Store the value of READ_ONCE(prefetch_count) in struct kvm_vcpu_arch
because you also need to know if it changes on the next fault. Then
you also don't have to add a parameter to a bunch of functions in the
fault path.

>
> > Note: You could apply this same approach to a module param, except
> > vCPUs would be reading the module param rather than vcpu->kvm during
> > each fault.
> >
> > And the other alternative, like you suggested in the other patch, is
> > to use a vCPU ioctl. That would side-step the synchronization issue
> > because vCPU ioctls require the vCPU mutex. So the reallocation could
> > be done in the ioctl and not at fault time.
>
> One more idea, wonder what do you think:
>
> There is an upper limit on the number of PTEs we prefault, which is 128 as of
> now, but I think 64 will be good enough, or maybe even 32. So we can always
> allocate MAX_PTE_PREFETCH_NUM arrays in vcpu->arch and ioctl() will change
> ->num_ents only, which is always in (0, MAX_PTE_PREFETCH_NUM - 1] range. This
> way we never have to reallocate anything, we just adjust the "maximum index"
> value.

128 * 8 would be 1KB per vCPU. That is probably reasonable, but I
don't think the re-allocation would be that complex.

>
> > Taking a step back, can you say a bit more about your usecase?
>
> We are looking at various ways of reducing the number of vm-exits. There
> is only one VM running on the device (a pretty low-end laptop).

When you say reduce the number of vm-exits, can you be more specific?
Are you trying to reduce the time it takes for vCPUs to fault in
memory during VM startup? I just mention because there are likely
other techniques you can apply that would not require modifying KVM
code (e.g. prefaulting the host memory before running the VM, using
the TDP MMU instead of the legacy MMU to allow parallel faults, using
hugepages to map in more memory per fault, etc.)

2021-10-21 02:51:51

by Sergey Senozhatsky

[permalink] [raw]

Subject: Re: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

On (21/10/20 08:56), David Matlack wrote:
> > > The spinlock is overkill. I'd suggest something like this:
> > > - When VM-ioctl is invoked to update prefetch count, store it in
> > > kvm_arch. No synchronization with vCPUs needed.
> > > - When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
> > > different than count at last fault, re-allocate vCPU prefetch array.
> > > (So you'll need to add prefetch array and count to kvm_vcpu_arch as
> > > well.)
> > >
> > > No extra locks are needed. vCPUs that fault after the VM-ioctl will
> > > get the new prefetch count. We don't really care if a prefetch count
> > > update races with a vCPU fault as long as vCPUs are careful to only
> > > read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
> > > use that. Assuming prefetch count ioctls are rare, the re-allocation
> > > on the fault path will be rare as well.
> >
> > So reallocation from the fault-path should happen before vCPU takes the
> > mmu_lock?
>
> Yes. Take a look at mmu_topup_memory_caches for an example of
> allocating in the fault path prior to taking the mmu lock.
>
> > And READ_ONCE(prefetch_count) should also happen before vCPU
> > takes mmu_lock, I assume, so we need to pass it as a parameter to all
> > the functions that will access prefetch array.
>
> Store the value of READ_ONCE(prefetch_count) in struct kvm_vcpu_arch
> because you also need to know if it changes on the next fault. Then
> you also don't have to add a parameter to a bunch of functions in the
> fault path.
>
> >
> > > Note: You could apply this same approach to a module param, except
> > > vCPUs would be reading the module param rather than vcpu->kvm during
> > > each fault.
> > >
> > > And the other alternative, like you suggested in the other patch, is
> > > to use a vCPU ioctl. That would side-step the synchronization issue
> > > because vCPU ioctls require the vCPU mutex. So the reallocation could
> > > be done in the ioctl and not at fault time.
> >
> > One more idea, wonder what do you think:
> >
> > There is an upper limit on the number of PTEs we prefault, which is 128 as of
> > now, but I think 64 will be good enough, or maybe even 32. So we can always
> > allocate MAX_PTE_PREFETCH_NUM arrays in vcpu->arch and ioctl() will change
> > ->num_ents only, which is always in (0, MAX_PTE_PREFETCH_NUM - 1] range. This
> > way we never have to reallocate anything, we just adjust the "maximum index"
> > value.
>
> 128 * 8 would be 1KB per vCPU. That is probably reasonable, but I
> don't think the re-allocation would be that complex.

128 is probably too large. What I'm thinking is "32 * 8" per-VCPU.
Then it can even be something like:

---

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5271fce6cd65..b3a436f8fdf5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -607,6 +607,11 @@ struct kvm_vcpu_xen {
u64 runstate_times[4];
};

+struct kvm_mmu_pte_prefetch {
+ u64 ents[32];
+ unsigned int num_ents; /* max prefetch value for this vCPU */
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -682,6 +687,8 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_gfn_array_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;

+ struct kvm_mmu_pte_prefetch mmu_pte_prefetch;
+
/*
* QEMU userspace and the guest each have their own FPU state.
* In vcpu_run, we switch between the user and guest FPU contexts.

---

> >
> > > Taking a step back, can you say a bit more about your usecase?
> >
> > We are looking at various ways of reducing the number of vm-exits. There
> > is only one VM running on the device (a pretty low-end laptop).
>
> When you say reduce the number of vm-exits, can you be more specific?
> Are you trying to reduce the time it takes for vCPUs to fault in
> memory during VM startup?

VM Boot is the test I'm running, yes, and I see some improvements with
pte-prefault == 16.

> I just mention because there are likely other techniques you can apply
> that would not require modifying KVM code (e.g. prefaulting the host
> memory before running the VM, using the TDP MMU instead of the legacy
> MMU to allow parallel faults, using hugepages to map in more memory
> per fault, etc.)

THP would be awesome. We have THP enabled on devices that have 8-plus
gigabytes of RAM; but we don't have THP enabled on low end devices, that
only have 4 gigabytes of RAM. On low end devices KVM with THP causes host
memory regression, that we cannot accept, hence for 4 gig devices we try
various "other solutions".

We are using TDP. And somehow I never see (literally never) async PFs.
It's always either hva_to_pfn_fast() or hva_to_pfn_slow() or
__direct_map() from tdp_page_fault().

2021-10-21 03:35:33

by Sergey Senozhatsky

[permalink] [raw]

Subject: Re: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

On (21/10/21 11:48), Sergey Senozhatsky wrote:
>
> We are using TDP. And somehow I never see (literally never) async PFs.
> It's always either hva_to_pfn_fast() or hva_to_pfn_slow() or
> __direct_map() from tdp_page_fault().

Hmm, and tdp_page_fault()->fast_page_fault() always fails on
!is_access_allowed(error_code, new_spte), it never handles the faults.
And I see some ->mmu_lock contention:

spin_lock(&vcpu->kvm->mmu_lock);
__direct_map();
spin_unlock(&vcpu->kvm->mmu_lock);

So it might be that we set up guest memory wrongly and never get
the advantages of TDP and fast page faults?

2021-10-21 08:11:27

by Sergey Senozhatsky

[permalink] [raw]

Subject: Re: [PATCHV2 1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

On (21/10/21 12:28), Sergey Senozhatsky wrote:
> >
> > We are using TDP. And somehow I never see (literally never) async PFs.
> > It's always either hva_to_pfn_fast() or hva_to_pfn_slow() or
> > __direct_map() from tdp_page_fault().
>
> Hmm, and tdp_page_fault()->fast_page_fault() always fails on
> !is_access_allowed(error_code, new_spte), it never handles the faults.
> And I see some ->mmu_lock contention:
>
> spin_lock(&vcpu->kvm->mmu_lock);
> __direct_map();
> spin_unlock(&vcpu->kvm->mmu_lock);
>
> So it might be that we set up guest memory wrongly and never get
> the advantages of TDP and fast page faults?

No, never mind, that's probably expected and ->mmu_lock contention is not
severe.

2021-11-09 05:27:40

by Sean Christopherson

[permalink] [raw]

Subject: Re: [PATCHV2 3/3] KVM: x86: add KVM_SET_MMU_PREFETCH ioctl

On Wed, Oct 20, 2021, Sergey Senozhatsky wrote:
> On (21/10/20 00:32), Sergey Senozhatsky wrote:
> > static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
> > {
> > struct kvm_clock_data data;
> > @@ -6169,6 +6189,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
> > case KVM_X86_SET_MSR_FILTER:
> > r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
> > break;
> > + case KVM_SET_MMU_PREFETCH: {
> > + u64 val;
> > +
> > + r = -EFAULT;
> > + if (copy_from_user(&val, argp, sizeof(val)))
> > + goto out;
> > + r = kvm_arch_mmu_pte_prefetch(kvm, val);
> > + break;
> > + }
>
> A side question: is there any value in turning this into a per-VCPU ioctl?
> So that, say, on heterogeneous systems big cores can prefetch more than
> little cores, for instance.

I don't think so? If anything, such behavior should probably be tied to the pCPU,
not vCPU. Though I'm guessing the difference in optimal prefetch size between big
and little cores is in the noise.

I suspect the optimal prefetch size is more dependent on the guest workload than
the core it's running on. There's likely a correlation between the core size and
the workload, but for that to have any meaning the vCPU would have to be affined to
a core (or set of cores), i.e. having the behavior tied to the pCPU as opposed to
the vCPU would work just as well.

If the optimal setting is based on the speed of the core, not the workload, then
per-pCPU is again preferable as it "works" regardless of vCPU affinity.