2021-01-25 06:49:36

by David Stevens

[permalink] [raw]
Subject: [PATCH] KVM: x86/mmu: consider the hva in mmu_notifer retry

From: David Stevens <[email protected]>

Use the range passed to mmu_notifer's invalidate_range_start to prevent
spurious page fault retries due to changes in unrelated host virtual
addresses. This has the secondary effect of greatly reducing the
likelihood of extreme latency when handling a page fault due to another
thread having been preempted while modifying host virtual addresses.

Signed-off-by: David Stevens <[email protected]>
---
arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +-
arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +-
arch/x86/kvm/mmu/mmu.c | 16 ++++++++++------
arch/x86/kvm/mmu/paging_tmpl.h | 7 ++++---
include/linux/kvm_host.h | 22 +++++++++++++++++++++-
virt/kvm/kvm_main.c | 22 ++++++++++++++++++----
6 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 38ea396a23d6..8e06cd3f759c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
} else {
/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
- writing, &write_ok);
+ writing, &write_ok, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index bb35490400e9..e603de7ade52 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,

/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
- writing, upgrade_p);
+ writing, upgrade_p, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..79166288ed03 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3658,8 +3658,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}

static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
- bool *writable)
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+ bool write, bool *writable)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
@@ -3672,7 +3672,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}

async = false;
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+ write, writable, hva);
if (!async)
return false; /* *pfn has correct page already */

@@ -3686,7 +3687,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}

- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+ write, writable, hva);
return false;
}

@@ -3699,6 +3701,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
kvm_pfn_t pfn;
+ hva_t hva;
int r;

if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3717,7 +3720,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();

- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+ write, &map_writable))
return RET_PF_RETRY;

if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
@@ -3725,7 +3729,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,

r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..3171784139a4 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -790,6 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
+ hva_t hva;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
int max_level;
@@ -840,8 +841,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();

- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
- &map_writable))
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+ write_fault, &map_writable))
return RET_PF_RETRY;

if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
@@ -869,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,

r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;

kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3b1013fb22c..b70097685249 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -502,6 +502,8 @@ struct kvm {
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
+ unsigned long mmu_notifier_range_start;
+ unsigned long mmu_notifier_range_end;
#endif
long tlbs_dirty;
struct list_head devices;
@@ -729,7 +731,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
- bool *writable);
+ bool *writable, hva_t *hva);

void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -1203,6 +1205,24 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
return 1;
return 0;
}
+
+static inline int mmu_notifier_retry_hva(struct kvm *kvm,
+ unsigned long mmu_seq,
+ unsigned long hva)
+{
+ /*
+ * Unlike mmu_notifier_retry, this function relies on
+ * kvm->mmu_lock for consistency.
+ */
+ if (unlikely(kvm->mmu_notifier_count)) {
+ if (kvm->mmu_notifier_range_start <= hva &&
+ hva < kvm->mmu_notifier_range_end)
+ return 1;
+ }
+ if (kvm->mmu_notifier_seq != mmu_seq)
+ return 1;
+ return 0;
+}
#endif

#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fa9e3614d30e..d6e1ef5cb184 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -483,6 +483,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
+ if (likely(kvm->mmu_notifier_count == 1)) {
+ kvm->mmu_notifier_range_start = range->start;
+ kvm->mmu_notifier_range_end = range->end;
+ } else {
+ /**
+ * Tracking multiple concurrent ranges has diminishing returns,
+ * so just use the maximum range. This persists until after all
+ * outstanding invalidation operations complete.
+ */
+ kvm->mmu_notifier_range_start = 0;
+ kvm->mmu_notifier_range_end = ULONG_MAX;
+ }
need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
range->flags);
/* we've to flush the tlb before the pages can be freed */
@@ -2010,9 +2022,11 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
- bool *writable)
+ bool *writable, hva_t *hva)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+ if (hva)
+ *hva = addr;

if (addr == KVM_HVA_ERR_RO_BAD) {
if (writable)
@@ -2041,19 +2055,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
- write_fault, writable);
+ write_fault, writable, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);

kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);

--
2.30.0.280.ga3ce27912f-goog


2021-01-25 18:37:56

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH] KVM: x86/mmu: consider the hva in mmu_notifer retry

+Cc the other architectures, I'm guessing this would be a helpful optimization
for all archs.

Quite a few comments, but they're all little more than nits. Nice!

On Mon, Jan 25, 2021, David Stevens wrote:
> From: David Stevens <[email protected]>
>
> Use the range passed to mmu_notifer's invalidate_range_start to prevent

s/mmu_notifer/mmu_notifier.

And maybe avoid calling out invalidate_range_start() by name? It took me a few
reads to understand it's referring to the function, i.e. the start of the
invalidation, not the start of the range.

> spurious page fault retries due to changes in unrelated host virtual
> addresses.

This needs to elaborate on the exact scenario this is handling; as is, it sounds
like KVM is tracking the history of invalidations or something. Understanding
this patch requires a priori knowledge of mmu_notifier_count. Something like:

Track the range being invalidated by mmu_notifier and skip page fault
retries if the fault address is not affected by the in-progress
invalidation. Disable the optimization if multiple invalidations are
in-progress to keep things simple, as tracking multiple ranges has
diminishing returns.

> This has the secondary effect of greatly reducing the likelihood of extreme

Out of curiosity, is this really the _secondary_ effect? I would expect this
change to primarily benefit scenarios where the invalidation has gotten
waylaid for whatever reason.

> latency when handling a page fault due to another thread having been preempted
> while modifying host virtual addresses.
>
> Signed-off-by: David Stevens <[email protected]>
> ---

...

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..79166288ed03 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3658,8 +3658,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> }
>
> static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
> - bool *writable)
> + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> + bool write, bool *writable)

Side topic, I'm all for creating a 'struct kvm_page_fault' or whatever to hold
all these variables. The helper functions stacks are getting unwieldy.
Definitely doesn't need to be addressed here, this just reminded me of how ugly
these stacks are.

> {
> struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> bool async;
> @@ -3672,7 +3672,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> }
>
> async = false;
> - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
> + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
> + write, writable, hva);
> if (!async)
> return false; /* *pfn has correct page already */
>
> @@ -3686,7 +3687,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> return true;
> }
>
> - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
> + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
> + write, writable, hva);
> return false;
> }
>
> @@ -3699,6 +3701,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> gfn_t gfn = gpa >> PAGE_SHIFT;
> unsigned long mmu_seq;
> kvm_pfn_t pfn;
> + hva_t hva;
> int r;
>
> if (page_fault_handle_page_track(vcpu, error_code, gfn))
> @@ -3717,7 +3720,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
>
> - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
> + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
> + write, &map_writable))
> return RET_PF_RETRY;
>
> if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
> @@ -3725,7 +3729,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>
> r = RET_PF_RETRY;
> spin_lock(&vcpu->kvm->mmu_lock);
> - if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> + if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
> goto out_unlock;
> r = make_mmu_pages_available(vcpu);
> if (r)
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50e268eb8e1a..3171784139a4 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -790,6 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
> struct guest_walker walker;
> int r;
> kvm_pfn_t pfn;
> + hva_t hva;
> unsigned long mmu_seq;
> bool map_writable, is_self_change_mapping;
> int max_level;
> @@ -840,8 +841,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
>
> - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
> - &map_writable))
> + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> + write_fault, &map_writable))
> return RET_PF_RETRY;
>
> if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
> @@ -869,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>
> r = RET_PF_RETRY;
> spin_lock(&vcpu->kvm->mmu_lock);
> - if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> + if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
> goto out_unlock;
>
> kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f3b1013fb22c..b70097685249 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -502,6 +502,8 @@ struct kvm {
> struct mmu_notifier mmu_notifier;
> unsigned long mmu_notifier_seq;
> long mmu_notifier_count;
> + unsigned long mmu_notifier_range_start;
> + unsigned long mmu_notifier_range_end;
> #endif
> long tlbs_dirty;
> struct list_head devices;
> @@ -729,7 +731,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
> kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
> kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
> bool atomic, bool *async, bool write_fault,
> - bool *writable);
> + bool *writable, hva_t *hva);
>
> void kvm_release_pfn_clean(kvm_pfn_t pfn);
> void kvm_release_pfn_dirty(kvm_pfn_t pfn);
> @@ -1203,6 +1205,24 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> return 1;
> return 0;
> }
> +
> +static inline int mmu_notifier_retry_hva(struct kvm *kvm,
> + unsigned long mmu_seq,
> + unsigned long hva)
> +{
> + /*
> + * Unlike mmu_notifier_retry, this function relies on
> + * kvm->mmu_lock for consistency.

mmu_notifier_retry is the outlier due to PPC behavior. Maybe just add a lockdep
annotation and call it good?

> + */

This needs a comment to explicitly state that 'count > 1' cannot be done at
this time. My initial thought is that it would be more intuitive to check for
'count > 1' here, but that would potentially check the wrong range when count
goes from 2->1. The comment about persistence in invalidate_range_start() is a
good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
future.

> + if (unlikely(kvm->mmu_notifier_count)) {
> + if (kvm->mmu_notifier_range_start <= hva &&
> + hva < kvm->mmu_notifier_range_end)

Combine these into a single statement? I think the result is easier to read?

if (unlikely(kvm->mmu_notifier_count) &&
kvm->mmu_notifier_range_start <= hva &&
hva < kvm->mmu_notifier_range_end)

> + return 1;
> + }
> + if (kvm->mmu_notifier_seq != mmu_seq)
> + return 1;
> + return 0;
> +}
> #endif
>
> #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fa9e3614d30e..d6e1ef5cb184 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -483,6 +483,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> * count is also read inside the mmu_lock critical section.
> */
> kvm->mmu_notifier_count++;
> + if (likely(kvm->mmu_notifier_count == 1)) {
> + kvm->mmu_notifier_range_start = range->start;
> + kvm->mmu_notifier_range_end = range->end;
> + } else {
> + /**
> + * Tracking multiple concurrent ranges has diminishing returns,
> + * so just use the maximum range. This persists until after all
> + * outstanding invalidation operations complete.
> + */
> + kvm->mmu_notifier_range_start = 0;
> + kvm->mmu_notifier_range_end = ULONG_MAX;

Hrm, I don't think there's a corner case in practice, but ULONG_MAX is a legal
virtual address and range_end is exclusive. E.g. passing hva=-1ul would get a
false negative in mmu_notifier_retry_hva(). It's not an issue as written
because hva is generated from the gfn, and hva can't be a kernel address. I'm
guessing mmu_notifier also doesn't fire on kernel addresses. I assume that all
holds true for other architectures, and adding checks in mmu_notifier_retry_hva()
feels like a waste of cycles, but it still bugs me. :-)

Maybe zero out range_end and explicitly check for that, just to be paranoid?

if (unlikely(kvm->mmu_notifier_count) &&
(!kvm->mmu_notifier_range_end ||
(kvm->mmu_notifier_range_start <= hva &&
hva < kvm->mmu_notifier_range_end))

> + }
> need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
> range->flags);
> /* we've to flush the tlb before the pages can be freed */
> @@ -2010,9 +2022,11 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>
> kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
> bool atomic, bool *async, bool write_fault,
> - bool *writable)
> + bool *writable, hva_t *hva)

Hrm, it feels like we should really split gfn->hva and hva->pfn into separate
operations, but pretty much every arch needs the hva error handling. Splitting
it would probably do more harm than good, at least not without a lot of
additional refactoring. Bummer.

> {
> unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

Newline here.

> + if (hva)
> + *hva = addr;
>
> if (addr == KVM_HVA_ERR_RO_BAD) {
> if (writable)
> @@ -2041,19 +2055,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
> bool *writable)
> {
> return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
> - write_fault, writable);
> + write_fault, writable, NULL);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
>
> kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
> {
> - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
> + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
>
> kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
> {
> - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
> + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>
> --
> 2.30.0.280.ga3ce27912f-goog
>

2021-01-26 20:07:06

by David Stevens

[permalink] [raw]
Subject: Re: [PATCH] KVM: x86/mmu: consider the hva in mmu_notifer retry

> > This has the secondary effect of greatly reducing the likelihood of extreme
>
> Out of curiosity, is this really the _secondary_ effect? I would expect this
> change to primarily benefit scenarios where the invalidation has gotten
> waylaid for whatever reason.

Yeah, this is the primary benefit. I was thinking about it as the
reduction in page fault retries is the direct effect, and that in turn
leads to a secondary effect of a reduction in the chance of extreme
latency. But I guess that's not a particularly important distinction
to make. I'll reword this.

>
> This needs a comment to explicitly state that 'count > 1' cannot be done at
> this time. My initial thought is that it would be more intuitive to check for
> 'count > 1' here, but that would potentially check the wrong range when count
> goes from 2->1. The comment about persistence in invalidate_range_start() is a
> good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
> future.
>
> > + if (unlikely(kvm->mmu_notifier_count)) {
> > + if (kvm->mmu_notifier_range_start <= hva &&
> > + hva < kvm->mmu_notifier_range_end)

I'm not sure I understand what you're suggesting here. How exactly
would 'count > 1' be used incorrectly here? I'm fine with adding a
comment, but I'm not sure what the comment needs to clarify.

-David

2021-01-27 20:03:39

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH] KVM: x86/mmu: consider the hva in mmu_notifer retry

On Tue, Jan 26, 2021, David Stevens wrote:
> > This needs a comment to explicitly state that 'count > 1' cannot be done at
> > this time. My initial thought is that it would be more intuitive to check for
> > 'count > 1' here, but that would potentially check the wrong range when count
> > goes from 2->1. The comment about persistence in invalidate_range_start() is a
> > good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
> > future.
> >
> > > + if (unlikely(kvm->mmu_notifier_count)) {
> > > + if (kvm->mmu_notifier_range_start <= hva &&
> > > + hva < kvm->mmu_notifier_range_end)
>
> I'm not sure I understand what you're suggesting here. How exactly
> would 'count > 1' be used incorrectly here? I'm fine with adding a
> comment, but I'm not sure what the comment needs to clarify.

There's no guarantee that the remaining in-progress invalidation when the count
goes from 2->1 is the same invalidation call that set range_start/range_end.

E.g. given two invalidations, A and B, the order of calls could be:

kvm_mmu_notifier_invalidate_range_start(A)
kvm_mmu_notifier_invalidate_range_start(B)
kvm_mmu_notifier_invalidate_range_end(A)
kvm_mmu_notifier_invalidate_range_end(B) <-- ???

or

kvm_mmu_notifier_invalidate_range_start(A)
kvm_mmu_notifier_invalidate_range_start(B)
kvm_mmu_notifier_invalidate_range_end(B)
kvm_mmu_notifier_invalidate_range_end(A) <-- ???

In the first case, "A" is in-progress when the count goes 2->1, in the second
case "B" is still in-progress. Checking for "count > 1" in the consumer instead
of handling it in the producer (as you did) would lead to the consumer checking
against the wrong range. I don't see a way to solve that without adding some
amount of history, which I agree is unnecessary.