From: Isaku Yamahata <[email protected]>

Wire KVM_PRE_FAULT_MEMORY ioctl to __kvm_mmu_do_page_fault() to populate guest
memory. It can be called right after KVM_CREATE_VCPU creates a vCPU,
since at that point kvm_mmu_create() and kvm_init_mmu() are called and
the vCPU is ready to invoke the KVM page fault handler.

The helper function kvm_tdp_map_page() takes care of the logic to
process RET_PF_* return values and convert them to success or errno.

Signed-off-by: Isaku Yamahata <[email protected]>
Message-ID: <9b866a0ae7147f96571c439e75429a03dcb659b6.1712785629.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini <[email protected]>
---
 arch/x86/kvm/Kconfig   |  1 +
 arch/x86/kvm/mmu/mmu.c | 72 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c     |  3 ++
 3 files changed, 76 insertions(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7632fe6e4db9..54c155432793 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -44,6 +44,7 @@ config KVM
 	select KVM_VFIO
 	select HAVE_KVM_PM_NOTIFIER if PM
 	select KVM_GENERIC_HARDWARE_ENABLING
+	select KVM_GENERIC_PRE_FAULT_MEMORY
 	help
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 10e90788b263..a045b23964c0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4647,6 +4647,78 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	return direct_page_fault(vcpu, fault);
 }
 
+static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+			u8 *level)
+{
+	int r;
+
+	/* Restrict to TDP page fault. */
+	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+		return -EOPNOTSUPP;
+
+retry:
+	r = __kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+	if (r < 0)
+		return r;
+
+	switch (r) {
+	case RET_PF_RETRY:
+		if (signal_pending(current))
+			return -EINTR;
+		cond_resched();
+		goto retry;
+
+	case RET_PF_FIXED:
+	case RET_PF_SPURIOUS:
+		break;
+
+	case RET_PF_EMULATE:
+		return -ENOENT;
+
+	case RET_PF_CONTINUE:
+	case RET_PF_INVALID:
+	default:
+		WARN_ON_ONCE(r);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				    struct kvm_pre_fault_memory *range)
+{
+	u64 error_code = PFERR_GUEST_FINAL_MASK;
+	u8 level = PG_LEVEL_4K;
+	u64 end;
+	int r;
+
+	/*
+	 * reload is efficient when called repeatedly, so we can do it on
+	 * every iteration.
+	 */
+	kvm_mmu_reload(vcpu);
+
+	if (kvm_arch_has_private_mem(vcpu->kvm) &&
+	    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+		error_code |= PFERR_PRIVATE_ACCESS;
+
+	/*
+	 * Shadow paging uses GVA for kvm page fault, so restrict to
+	 * two-dimensional paging.
+	 */
+	r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+	if (r < 0)
+		return r;
+
+	/*
+	 * If the mapping that covers range->gpa can use a huge page, it
+	 * may start below it or end after range->gpa + range->size.
+	 */
+	end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+	return min(range->size, end - range->gpa);
+}
+
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83b8260443a3..619ad713254e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4715,6 +4715,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MEMORY_FAULT_INFO:
 		r = 1;
 		break;
+	case KVM_CAP_PRE_FAULT_MEMORY:
+		r = tdp_enabled;
+		break;
 	case KVM_CAP_EXIT_HYPERCALL:
 		r = KVM_EXIT_HYPERCALL_VALID_MASK;
 		break;
--
2.43.0
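
For reference, a minimal userspace sketch of driving the new ioctl. This
is not part of the patch; it assumes the uapi added by the generic
KVM_GENERIC_PRE_FAULT_MEMORY patch earlier in the series (struct
kvm_pre_fault_memory and the KVM_PRE_FAULT_MEMORY vCPU ioctl) plus a
vcpu_fd already obtained from KVM_CREATE_VCPU:

	#include <errno.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Pre-fault guest memory in [gpa, gpa + size); returns 0 or -errno. */
	static int pre_fault_range(int vcpu_fd, uint64_t gpa, uint64_t size)
	{
		struct kvm_pre_fault_memory range;

		memset(&range, 0, sizeof(range));	/* flags must be zero */
		range.gpa = gpa;
		range.size = size;

		/*
		 * The kernel advances range.gpa and shrinks range.size as
		 * pages are mapped, so simply retry on EINTR until the
		 * whole range has been consumed.
		 */
		while (range.size) {
			if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) < 0) {
				if (errno == EINTR)
					continue;
				return -errno;
			}
		}
		return 0;
	}

Note that a single call into kvm_arch_vcpu_pre_fault_memory() can account
for more than one 4K page: it returns min(range->size, distance from
range->gpa to the end of the mapping that covers it). For example, with a
2M mapping and range->gpa = 0x201000, end is 0x400000 and the per-call
result covers up to 0x400000 - 0x201000 = 0x1ff000 bytes.
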
On 4/19/2024 4:59 PM, Paolo Bonzini wrote:
> [...]
> +	case RET_PF_CONTINUE:
> +	case RET_PF_INVALID:
> +	default:
> +		WARN_ON_ONCE(r);
> +		return -EIO;
Need to update patch 1 for -EIO
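
For context on the -EIO remark: the generic wrapper added by patch 1
consumes the arch return value in a loop along these lines (a sketch
reconstructed from that patch, so treat names and details as approximate):

	u64 full_size = range->size;
	long r = 0;

	while (range->size) {
		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		/* On x86 this lands in kvm_arch_vcpu_pre_fault_memory(). */
		r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
		if (r < 0)
			break;

		range->size -= r;
		range->gpa += r;
		cond_resched();
	}

	/* Report success if at least one page was mapped. */
	return full_size == range->size ? r : 0;

Any errno the arch hook returns before making progress, including the
-EIO above, is therefore what userspace sees from the ioctl.
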
On Mon, Apr 22, 2024, Xiaoyao Li wrote:
> On 4/19/2024 4:59 PM, Paolo Bonzini wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 10e90788b263..a045b23964c0 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -4647,6 +4647,78 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >  	return direct_page_fault(vcpu, fault);
> >  }
> >
> > +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
> > +			u8 *level)
Align parameters:
static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
			    u8 *level)
> > +{
> > +	int r;
> > +
> > +	/* Restrict to TDP page fault. */
This is fairly obvious from the code; what might not be obvious is _why_. I'm
also ok dropping the comment entirely, but it's easy enough to provide a hint
to the reader.
> > +	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
> > +		return -EOPNOTSUPP;
> > +
> > +retry:
> > +	r = __kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
> > +	if (r < 0)
> > +		return r;
> > +
> > +	switch (r) {
> > +	case RET_PF_RETRY:
> > +		if (signal_pending(current))
> > +			return -EINTR;
> > +		cond_resched();
> > +		goto retry;
Rather than a goto+retry from inside a switch statement, what about:
	int r;

	/*
	 * Pre-faulting a GPA is supported only for non-nested TDP, as
	 * indirect MMUs map either GVAs or L2 GPAs, not L1 GPAs.
	 */
	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
		return -EOPNOTSUPP;

	do {
		if (signal_pending(current))
			return -EINTR;

		cond_resched();

		r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
	} while (r == RET_PF_RETRY);

	switch (r) {
	case RET_PF_FIXED:
	case RET_PF_SPURIOUS:
		break;

	case RET_PF_EMULATE:
		return -ENOENT;

	case RET_PF_CONTINUE:
	case RET_PF_INVALID:
	case RET_PF_RETRY:
	default:
		WARN_ON_ONCE(r >= 0);
		return -EIO;
	}

	return 0;