2022-10-13 21:15:29

by Sean Christopherson

Subject: [PATCH v2 00/16] KVM: x86: gfn_to_pfn_cache fixes and cleanups

The highlights are two fixes for bugs where "destroying" and "initializing"
a gfn=>pfn cache while it is being accessed results in various forms of
badness, e.g. re-initialization of an in-use lock, consuming a NULL pointer,
potential memory corruption, etc...

Everything else is cleanup to make the gpc APIs easier to use and harder
to use incorrectly.
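
For reference, a rough sketch of what a gpc user looks like once the
renames and the immutable-property patches have landed (simplified from
the Xen/pvclock code touched in this series; later patches also drop
@gpa from the exported check() and refresh() helpers, and error handling
is trimmed):

  kvm_gpc_init(gpc, kvm, vcpu, KVM_HOST_USES_PFN, sizeof(struct vcpu_info));
  kvm_gpc_activate(gpc, gpa);

  read_lock_irqsave(&gpc->lock, flags);
  while (!kvm_gpc_check(gpc, gpc->gpa)) {
          read_unlock_irqrestore(&gpc->lock, flags);
          if (kvm_gpc_refresh(gpc, gpc->gpa))
                  return;
          read_lock_irqsave(&gpc->lock, flags);
  }
  /* ... access the mapping through gpc->khva ... */
  read_unlock_irqrestore(&gpc->lock, flags);

  kvm_gpc_deactivate(gpc);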

David, patch 3 (KVM: x86: Always use non-compat vcpu_runstate_info size...)
in particular needs your eyeballs. I'm pretty sure it's ok, but
confirmation from someone who actually uses KVM Xen would be nice.

v2:
- Fix active vs. valid race by rejecting refresh() if the cache is
currently invalid.
- Tweak shortlogs to be "KVM" only. Even though x86 is the only user
of the caches, I think it makes sense to tag the changes as "full"
KVM for future readers.
- Add back the selftest (from the RFC) that triggers the race conditions.
- Always use non-compat size for checking+refreshing runstate_info and
make @len truly immutable.
- Drop unmap() for the moment. I started adding code to prevent "bad"
usage, but without any user I couldn't figure out exactly what
restrictions need to be in place.
- Do more cleanup.

v1: https://lore.kernel.org/all/[email protected]

Michal Luczaj (9):
KVM: Initialize gfn_to_pfn_cache locks in dedicated helper
KVM: Shorten gfn_to_pfn_cache function names
KVM: x86: Remove unused argument in gpc_unmap_khva()
KVM: Store immutable gfn_to_pfn_cache properties
KVM: Store gfn_to_pfn_cache length as an immutable property
KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_check()
KVM: Clean up hva_to_pfn_retry()
KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_refresh()
KVM: selftests: Add tests in xen_shinfo_test to detect lock races

Sean Christopherson (7):
KVM: Reject attempts to consume or refresh inactive gfn_to_pfn_cache
KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn
cache
KVM: Drop KVM's API to allow temporarily unmapping gfn=>pfn cache
KVM: Do not partially reinitialize gfn=>pfn cache during activation
KVM: Drop @gpa from exported gfn=>pfn cache check() and refresh()
helpers
KVM: Skip unnecessary "unmap" if gpc is already valid during refresh
KVM: selftests: Mark "guest_saw_irq" as volatile in xen_shinfo_test

arch/x86/kvm/x86.c | 24 +--
arch/x86/kvm/xen.c | 84 ++++-----
include/linux/kvm_host.h | 73 ++++----
include/linux/kvm_types.h | 2 +
.../selftests/kvm/x86_64/xen_shinfo_test.c | 142 ++++++++++++++-
virt/kvm/pfncache.c | 170 ++++++++++--------
6 files changed, 320 insertions(+), 175 deletions(-)


base-commit: e18d6152ff0f41b7f01f9817372022df04e0d354
--
2.38.0.413.g74048e4d9e-goog


2022-10-13 21:30:44

by Sean Christopherson

Subject: [PATCH v2 05/16] KVM: x86: Remove unused argument in gpc_unmap_khva()

From: Michal Luczaj <[email protected]>

Remove the unused @kvm argument from gpc_unmap_khva().

Signed-off-by: Michal Luczaj <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
virt/kvm/pfncache.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 23180f1d9c1c..32ccf168361b 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -98,7 +98,7 @@ bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
}
EXPORT_SYMBOL_GPL(kvm_gpc_check);

-static void gpc_unmap_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva)
+static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva)
{
/* Unmap the old pfn/page if it was mapped before. */
if (!is_error_noslot_pfn(pfn) && khva) {
@@ -177,7 +177,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
* the existing mapping and didn't create a new one.
*/
if (new_khva != old_khva)
- gpc_unmap_khva(kvm, new_pfn, new_khva);
+ gpc_unmap_khva(new_pfn, new_khva);

kvm_release_pfn_clean(new_pfn);

@@ -324,7 +324,7 @@ int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
mutex_unlock(&gpc->refresh_lock);

if (unmap_old)
- gpc_unmap_khva(kvm, old_pfn, old_khva);
+ gpc_unmap_khva(old_pfn, old_khva);

return ret;
}
@@ -353,7 +353,7 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
write_unlock_irq(&gpc->lock);
mutex_unlock(&gpc->refresh_lock);

- gpc_unmap_khva(kvm, old_pfn, old_khva);
+ gpc_unmap_khva(old_pfn, old_khva);
}
EXPORT_SYMBOL_GPL(kvm_gpc_unmap);

--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:31:31

by Sean Christopherson

Subject: [PATCH v2 12/16] KVM: Do not partially reinitialize gfn=>pfn cache during activation

Don't partially reinitialize a gfn=>pfn cache when activating the cache,
and instead assert that the cache is not valid during activation. Bug
the VM if the assertion fails, as use-after-free and/or data corruption
is all but guaranteed if KVM ends up with a valid-but-inactive cache.

Signed-off-by: Sean Christopherson <[email protected]>
---
virt/kvm/pfncache.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 62b47feed36c..2d5b417e50ac 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -342,6 +342,9 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
gpc->vcpu = vcpu;
gpc->usage = usage;
gpc->len = len;
+ gpc->pfn = KVM_PFN_ERR_FAULT;
+ gpc->uhva = KVM_HVA_ERR_BAD;
+
}
EXPORT_SYMBOL_GPL(kvm_gpc_init);

@@ -350,10 +353,8 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
struct kvm *kvm = gpc->kvm;

if (!gpc->active) {
- gpc->khva = NULL;
- gpc->pfn = KVM_PFN_ERR_FAULT;
- gpc->uhva = KVM_HVA_ERR_BAD;
- gpc->valid = false;
+ if (KVM_BUG_ON(gpc->valid, kvm))
+ return -EIO;

spin_lock(&kvm->gpc_lock);
list_add(&gpc->list, &kvm->gpc_list);
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:31:39

by Sean Christopherson

Subject: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

Always use the size of Xen's non-compat vcpu_runstate_info struct when
checking that the GPA+size doesn't cross a page boundary. Conceptually,
using the current mode is more correct, but KVM isn't consistent with
itself as kvm_xen_vcpu_set_attr() unconditionally uses the "full" size
when activating the cache. More importantly, prior to the introduction
of the gfn_to_pfn_cache, KVM _always_ used the full size, i.e. allowing
the guest (userspace?) to use a poorly aligned GPA in 32-bit mode but not
64-bit mode is more of a bug than a feature, and fixing the bug doesn't
break KVM's historical ABI.

Always using the non-compat size will allow for future gfn_to_pfn_cache
cleanups as this is (was) the only case where KVM uses a different size
at check()+refresh() than at activate(). E.g. the length/size of the
cache can be made immutable and dropped from check()+refresh(), which
yields a cleaner set of APIs and avoids potential bugs that could occur
if check() were invoked with a different size than refresh().
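
To illustrate the hazard this avoids, here is a deliberately broken
sketch (modeled on the runstate retry loop; "big_len" and "small_len"
are made-up names, this is not real KVM code): if check() is handed a
larger size than refresh(), refresh() can succeed while check() keeps
failing the page-boundary test, and the loop never terminates.

  read_lock_irqsave(&gpc->lock, flags);
  while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, big_len)) {
          read_unlock_irqrestore(&gpc->lock, flags);
          /* refresh() with the smaller size can succeed... */
          if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, small_len))
                  return;
          /* ...yet the next check() with big_len fails again, forever. */
          read_lock_irqsave(&gpc->lock, flags);
  }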

Fixes: a795cd43c5b5 ("KVM: x86/xen: Use gfn_to_pfn_cache for runstate area")
Cc: David Woodhouse <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/xen.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index b2be60c6efa4..9e79ef2cca99 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -212,10 +212,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
if (!vx->runstate_cache.active)
return;

- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_len = sizeof(struct vcpu_runstate_info);
- else
- user_len = sizeof(struct compat_vcpu_runstate_info);
+ user_len = sizeof(struct vcpu_runstate_info);

read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:32:51

by Sean Christopherson

Subject: [PATCH v2 16/16] KVM: selftests: Mark "guest_saw_irq" as volatile in xen_shinfo_test

Tag "guest_saw_irq" as "volatile" to ensure that the compiler will never
optimize away lookups. Relying on the compiler thinking that the flag
is global and thus might change also works, but it's subtle, less robust,
and looks like a bug at first glance, e.g. risks being "fixed" and
breaking the test.

Make the flag "static" as well since convincing the compiler it's global
is no longer necessary.

Alternatively, the flag could be accessed with {READ,WRITE}_ONCE(), but
literally every access would need the wrappers, and eking out performance
isn't exactly top priority for selftests.
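
For comparison, a rough sketch of what the wrapper-based alternative
would look like (illustrative only, not the actual test code):

  static bool guest_saw_irq;

  /* Every read... */
  while (!READ_ONCE(guest_saw_irq))
          ;
  /* ...and every write would need a wrapper. */
  WRITE_ONCE(guest_saw_irq, true);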

Signed-off-by: Sean Christopherson <[email protected]>
---
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index caa3f5ab9e10..2a5727188c8d 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -132,7 +132,7 @@ struct {
struct kvm_irq_routing_entry entries[2];
} irq_routes;

-bool guest_saw_irq;
+static volatile bool guest_saw_irq;

static void evtchn_handler(struct ex_regs *regs)
{
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:33:19

by Sean Christopherson

Subject: [PATCH v2 06/16] KVM: Store immutable gfn_to_pfn_cache properties

From: Michal Luczaj <[email protected]>

Move the assignment of the immutable properties @kvm, @vcpu, and @usage
to the initializer, and make _activate() and _deactivate() use the
stored values.

Note, @len is also effectively immutable, but less obviously so. Leave
@len as is for now; it will be addressed in a future patch.

Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Michal Luczaj <[email protected]>
[sean: handle @len in a separate patch]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 14 +++++-------
arch/x86/kvm/xen.c | 47 ++++++++++++++++++---------------------
include/linux/kvm_host.h | 37 +++++++++++++++---------------
include/linux/kvm_types.h | 1 +
virt/kvm/pfncache.c | 22 +++++++++++-------
5 files changed, 61 insertions(+), 60 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd00e6a33203..9c68050672de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2314,13 +2314,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

/* we verify if the enable bit is set... */
- if (system_time & 1) {
- kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
- KVM_HOST_USES_PFN, system_time & ~1ULL,
+ if (system_time & 1)
+ kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
sizeof(struct pvclock_vcpu_time_info));
- } else {
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
- }
+ else
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);

return;
}
@@ -3388,7 +3386,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)

static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
vcpu->arch.time = 0;
}

@@ -11757,7 +11755,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;

- kvm_gpc_init(&vcpu->arch.pv_time);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);

if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 74d9f4985f93..55b7195d69d6 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,13 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
int idx = srcu_read_lock(&kvm->srcu);

if (gfn == GPA_INVALID) {
- kvm_gpc_deactivate(kvm, gpc);
+ kvm_gpc_deactivate(gpc);
goto out;
}

do {
- ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
- PAGE_SIZE);
+ ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
if (ret)
goto out;

@@ -550,15 +549,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));

if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}

- r = kvm_gpc_activate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache, NULL,
- KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_info));
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

@@ -566,15 +563,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)

case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}

- r = kvm_gpc_activate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -586,14 +581,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
r = 0;
break;
}

- r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
+ data->u.gpa,
sizeof(struct vcpu_runstate_info));
break;

@@ -1814,9 +1808,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)

timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);

- kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
}

void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -1824,9 +1821,9 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_stop_timer(vcpu);

- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);

del_timer_sync(&vcpu->arch.xen.poll_timer);
}
@@ -1834,7 +1831,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
void kvm_xen_init_vm(struct kvm *kvm)
{
idr_init(&kvm->arch.xen.evtchn_ports);
- kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
}

void kvm_xen_destroy_vm(struct kvm *kvm)
@@ -1842,7 +1839,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
struct evtchnfd *evtchnfd;
int i;

- kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);

idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
if (!evtchnfd->deliver.port.port)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bb020ee3b2fe..e5e70607a5ef 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1243,18 +1243,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
* kvm_gpc_init - initialize gfn_to_pfn_cache.
*
* @gpc: struct gfn_to_pfn_cache object.
- *
- * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must
- * be zero-allocated (or zeroed by the caller before init).
- */
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
-
-/**
- * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
- * physical address.
- *
* @kvm: pointer to kvm instance.
- * @gpc: struct gfn_to_pfn_cache object.
* @vcpu: vCPU to be used for marking pages dirty and to be woken on
* invalidation.
* @usage: indicates if the resulting host physical PFN is used while
@@ -1263,20 +1252,31 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
* changes!---will also force @vcpu to exit the guest and
* refresh the cache); and/or if the PFN used directly
* by KVM (and thus needs a kernel virtual mapping).
+ *
+ * This sets up a gfn_to_pfn_cache by initializing locks and assigning the
+ * immutable attributes. Note, the cache must be zero-allocated (or zeroed by
+ * the caller before init).
+ */
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage);
+
+/**
+ * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
+ * physical address.
+ *
+ * @gpc: struct gfn_to_pfn_cache object.
* @gpa: guest physical address to map.
* @len: sanity check; the range being access must fit a single page.
*
* @return: 0 for success.
* -EINVAL for a mapping which would cross a page boundary.
- * -EFAULT for an untranslatable guest physical address.
+ * -EFAULT for an untranslatable guest physical address.
*
- * This primes a gfn_to_pfn_cache and links it into the @kvm's list for
+ * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for
* invalidations to be processed. Callers are required to use kvm_gpc_check()
* to ensure that the cache is valid before accessing the target page.
*/
-int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len);
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len);

/**
* kvm_gpc_check - check validity of a gfn_to_pfn_cache.
@@ -1335,13 +1335,12 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
/**
* kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
*
- * @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
*
- * This removes a cache from the @kvm's list to be processed on MMU notifier
+ * This removes a cache from the VM's list to be processed on MMU notifier
* invocation.
*/
-void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);

void kvm_sigset_activate(struct kvm_vcpu *vcpu);
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 3ca3db020e0e..76de36e56cdf 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -67,6 +67,7 @@ struct gfn_to_pfn_cache {
gpa_t gpa;
unsigned long uhva;
struct kvm_memory_slot *memslot;
+ struct kvm *kvm;
struct kvm_vcpu *vcpu;
struct list_head list;
rwlock_t lock;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 32ccf168361b..6756dfa60d5a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -357,25 +357,29 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
}
EXPORT_SYMBOL_GPL(kvm_gpc_unmap);

-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc)
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
{
+ WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
+ WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);
+
rwlock_init(&gpc->lock);
mutex_init(&gpc->refresh_lock);
+
+ gpc->kvm = kvm;
+ gpc->vcpu = vcpu;
+ gpc->usage = usage;
}
EXPORT_SYMBOL_GPL(kvm_gpc_init);

-int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len)
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
- WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
+ struct kvm *kvm = gpc->kvm;

if (!gpc->active) {
gpc->khva = NULL;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->uhva = KVM_HVA_ERR_BAD;
- gpc->vcpu = vcpu;
- gpc->usage = usage;
gpc->valid = false;

spin_lock(&kvm->gpc_lock);
@@ -395,8 +399,10 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
}
EXPORT_SYMBOL_GPL(kvm_gpc_activate);

-void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
{
+ struct kvm *kvm = gpc->kvm;
+
if (gpc->active) {
/*
* Deactivate the cache before removing it from the list, KVM
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:33:21

by Sean Christopherson

Subject: [PATCH v2 01/16] KVM: Initialize gfn_to_pfn_cache locks in dedicated helper

From: Michal Luczaj <[email protected]>

Move the gfn_to_pfn_cache lock initialization to a dedicated helper and
call the new helper during VM/vCPU creation. Race conditions are
possible because kvm_gfn_to_pfn_cache_init() can re-initialize the
cache's locks while the cache is being accessed.

For example: a race between ioctl(KVM_XEN_HVM_EVTCHN_SEND) and
kvm_gfn_to_pfn_cache_init() leads to a corrupted shinfo gpc lock.

(thread 1) | (thread 2)
|
kvm_xen_set_evtchn_fast |
read_lock_irqsave(&gpc->lock, ...) |
| kvm_gfn_to_pfn_cache_init
| rwlock_init(&gpc->lock)
read_unlock_irqrestore(&gpc->lock, ...) |

Rename "cache_init" and "cache_destroy" to activate+deactivate to
avoid implying that the cache really is destroyed/freed.

Note, there are more races in the newly named kvm_gpc_activate() that
will be addressed separately.
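
In other words, the intended lifecycle after this patch is roughly
(sketch only, using the signatures introduced below):

  /* VM/vCPU creation: one-time lock initialization. */
  kvm_gpc_init(gpc);

  /* ioctl paths: may run any number of times, never re-init the locks. */
  kvm_gpc_activate(kvm, gpc, vcpu, KVM_HOST_USES_PFN, gpa, len);
  /* ... use the cache via check()/refresh() ... */
  kvm_gpc_deactivate(kvm, gpc);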

Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
Cc: [email protected]
Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Michal Luczaj <[email protected]>
[sean: call out that this is a bug fix]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 12 +++++----
arch/x86/kvm/xen.c | 57 +++++++++++++++++++++-------------------
include/linux/kvm_host.h | 24 ++++++++++++-----
virt/kvm/pfncache.c | 21 ++++++++-------
4 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4bd5f8a751de..943f039564e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2315,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,

/* we verify if the enable bit is set... */
if (system_time & 1) {
- kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
- KVM_HOST_USES_PFN, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info));
+ kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+ KVM_HOST_USES_PFN, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
} else {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
}

return;
@@ -3388,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)

static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
vcpu->arch.time = 0;
}

@@ -11757,6 +11757,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;

+ kvm_gpc_init(&vcpu->arch.pv_time);
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 93c628d3e3a9..b2be60c6efa4 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
int idx = srcu_read_lock(&kvm->srcu);

if (gfn == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
+ kvm_gpc_deactivate(kvm, gpc);
goto out;
}

do {
- ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
- gpa, PAGE_SIZE);
+ ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
+ PAGE_SIZE);
if (ret)
goto out;

@@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));

if (data->u.gpa == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}

- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_info));
+ r = kvm_gpc_activate(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache, NULL,
+ KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

@@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)

case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}

- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct pvclock_vcpu_time_info));
+ r = kvm_gpc_activate(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
@@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
if (data->u.gpa == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
r = 0;
break;
}

- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_runstate_info));
+ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct vcpu_runstate_info));
break;

case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -1816,7 +1815,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
{
vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
vcpu->arch.xen.poll_evtchn = 0;
+
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
}

void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -1824,18 +1828,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_stop_timer(vcpu);

- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache);
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
+
del_timer_sync(&vcpu->arch.xen.poll_timer);
}

void kvm_xen_init_vm(struct kvm *kvm)
{
idr_init(&kvm->arch.xen.evtchn_ports);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
}

void kvm_xen_destroy_vm(struct kvm *kvm)
@@ -1843,7 +1846,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
struct evtchnfd *evtchnfd;
int i;

- kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+ kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);

idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
if (!evtchnfd->deliver.port.port)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 32f259fa5801..694c4cb6caf4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1240,8 +1240,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);

/**
- * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a
- * given guest physical address.
+ * kvm_gpc_init - initialize gfn_to_pfn_cache.
+ *
+ * @gpc: struct gfn_to_pfn_cache object.
+ *
+ * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must
+ * be zero-allocated (or zeroed by the caller before init).
+ */
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
+
+/**
+ * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
+ * physical address.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1265,9 +1275,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
* kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
* accessing the target page.
*/
-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len);
+int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ gpa_t gpa, unsigned long len);

/**
* kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
@@ -1324,7 +1334,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);

/**
- * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache.
+ * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1332,7 +1342,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
* This removes a cache from the @kvm's list to be processed on MMU notifier
* invocation.
*/
-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);

void kvm_sigset_activate(struct kvm_vcpu *vcpu);
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 68ff41d39545..08f97cf97264 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -346,17 +346,20 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);

+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc)
+{
+ rwlock_init(&gpc->lock);
+ mutex_init(&gpc->refresh_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_gpc_init);

-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len)
+int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ gpa_t gpa, unsigned long len)
{
WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);

if (!gpc->active) {
- rwlock_init(&gpc->lock);
- mutex_init(&gpc->refresh_lock);
-
gpc->khva = NULL;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->uhva = KVM_HVA_ERR_BAD;
@@ -371,9 +374,9 @@ int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
}
return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
+EXPORT_SYMBOL_GPL(kvm_gpc_activate);

-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
{
if (gpc->active) {
spin_lock(&kvm->gpc_lock);
@@ -384,4 +387,4 @@ void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
gpc->active = false;
}
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy);
+EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:33:43

by Sean Christopherson

Subject: [PATCH v2 07/16] KVM: Store gfn_to_pfn_cache length as an immutable property

From: Michal Luczaj <[email protected]>

Make the length of a gfn=>pfn cache an immutable property of the cache
to clean up the APIs and avoid potential bugs, e.g. calling check() with
a larger size than refresh() could put KVM into an infinite loop.

All current (and anticipated future) users access the cache with a
predetermined size, which isn't a coincidence, as using a dedicated
cache really only makes sense when the access pattern is "fixed".
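
Concretely, the usage change is roughly as follows (adapted from the
vcpu_info cache below; other callers follow the same pattern):

  /* Before: the size is repeated at every call site. */
  kvm_gpc_activate(gpc, gpa, sizeof(struct vcpu_info));
  kvm_gpc_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info));

  /* After: the size is fixed once, at init. */
  kvm_gpc_init(gpc, kvm, NULL, KVM_HOST_USES_PFN, sizeof(struct vcpu_info));
  kvm_gpc_activate(gpc, gpa);
  kvm_gpc_check(kvm, gpc, gpc->gpa);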

Add a WARN in kvm_setup_guest_pvclock() to assert that the offset+size
matches the length of the cache, both to make it more obvious that the
length really is immutable in that case, and to detect future bugs.

No functional change intended.

Signed-off-by: Michal Luczaj <[email protected]>
Co-developed-by: Sean Christopherson <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 14 ++++++------
arch/x86/kvm/xen.c | 46 ++++++++++++++++-----------------------
include/linux/kvm_host.h | 14 +++++-------
include/linux/kvm_types.h | 1 +
virt/kvm/pfncache.c | 18 +++++++--------
5 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c68050672de..0b4fa3455f3a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2315,8 +2315,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,

/* we verify if the enable bit is set... */
if (system_time & 1)
- kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info));
+ kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL);
else
kvm_gpc_deactivate(&vcpu->arch.pv_time);

@@ -3031,13 +3030,13 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
struct pvclock_vcpu_time_info *guest_hv_clock;
unsigned long flags;

+ WARN_ON_ONCE(gpc->len != offset + sizeof(*guest_hv_clock));
+
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock))) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock)))
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -11755,7 +11754,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;

- kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN,
+ sizeof(struct pvclock_vcpu_time_info));

if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 55b7195d69d6..6f5a5507392e 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -47,7 +47,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
}

do {
- ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+ ret = kvm_gpc_activate(gpc, gpa);
if (ret)
goto out;

@@ -203,7 +203,6 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
uint64_t *user_times;
unsigned long flags;
- size_t user_len;
int *user_state;

kvm_xen_update_runstate(v, state);
@@ -211,17 +210,15 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
if (!vx->runstate_cache.active)
return;

- user_len = sizeof(struct vcpu_runstate_info);
-
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, user_len)) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

/* When invoked from kvm_sched_out() we cannot sleep */
if (state == RUNSTATE_runnable)
return;

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, user_len))
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -347,12 +344,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info)))
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -412,8 +407,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

/*
@@ -427,8 +421,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (in_atomic() || !task_is_running(current))
return 1;

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa)) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
@@ -555,7 +548,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}

r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa, sizeof(struct vcpu_info));
+ data->u.gpa);
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

@@ -569,8 +562,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}

r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
- data->u.gpa,
- sizeof(struct pvclock_vcpu_time_info));
+ data->u.gpa);
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
@@ -587,8 +579,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}

r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
- data->u.gpa,
- sizeof(struct vcpu_runstate_info));
+ data->u.gpa);
break;

case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -956,7 +947,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,

read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
- if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ if (!kvm_gpc_check(kvm, gpc, gpc->gpa))
goto out_rcu;

ret = false;
@@ -1347,7 +1338,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ if (!kvm_gpc_check(kvm, gpc, gpc->gpa))
goto out_rcu;

if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
@@ -1381,7 +1372,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
gpc = &vcpu->arch.xen.vcpu_info_cache;

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ if (!kvm_gpc_check(kvm, gpc, gpc->gpa)) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
@@ -1479,7 +1470,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
break;

idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gpc_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
+ rc = kvm_gpc_refresh(kvm, gpc, gpc->gpa);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);

@@ -1809,11 +1800,11 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);

kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
+ KVM_HOST_USES_PFN, sizeof(struct vcpu_runstate_info));
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
+ KVM_HOST_USES_PFN, sizeof(struct vcpu_info));
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
+ KVM_HOST_USES_PFN, sizeof(struct pvclock_vcpu_time_info));
}

void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -1831,7 +1822,8 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
void kvm_xen_init_vm(struct kvm *kvm)
{
idr_init(&kvm->arch.xen.evtchn_ports);
- kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN,
+ PAGE_SIZE);
}

void kvm_xen_destroy_vm(struct kvm *kvm)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e5e70607a5ef..c79f2e122ac8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1252,13 +1252,15 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
* changes!---will also force @vcpu to exit the guest and
* refresh the cache); and/or if the PFN used directly
* by KVM (and thus needs a kernel virtual mapping).
+ * @len: sanity check; the range being access must fit a single page.
*
* This sets up a gfn_to_pfn_cache by initializing locks and assigning the
* immutable attributes. Note, the cache must be zero-allocated (or zeroed by
* the caller before init).
*/
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage);
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ unsigned long len);

/**
* kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
@@ -1266,7 +1268,6 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
*
* @gpc: struct gfn_to_pfn_cache object.
* @gpa: guest physical address to map.
- * @len: sanity check; the range being access must fit a single page.
*
* @return: 0 for success.
* -EINVAL for a mapping which would cross a page boundary.
@@ -1276,7 +1277,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
* invalidations to be processed. Callers are required to use kvm_gpc_check()
* to ensure that the cache is valid before accessing the target page.
*/
-int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len);
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa);

/**
* kvm_gpc_check - check validity of a gfn_to_pfn_cache.
@@ -1284,7 +1285,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
* @gpa: current guest physical address to map.
- * @len: sanity check; the range being access must fit a single page.
*
* @return: %true if the cache is still valid and the address matches.
* %false if the cache is not valid.
@@ -1296,8 +1296,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
* Callers in IN_GUEST_MODE may do so without locking, although they should
* still hold a read lock on kvm->scru for the memslot checks.
*/
-bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
- unsigned long len);
+bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa);

/**
* kvm_gpc_refresh - update a previously initialized cache.
@@ -1317,8 +1316,7 @@ bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
* still lock and check the cache status, as this function does not return
* with the lock still held to permit access.
*/
-int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
- unsigned long len);
+int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa);

/**
* kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache.
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 76de36e56cdf..d66b276d29e0 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -74,6 +74,7 @@ struct gfn_to_pfn_cache {
struct mutex refresh_lock;
void *khva;
kvm_pfn_t pfn;
+ unsigned long len;
enum pfn_cache_usage usage;
bool active;
bool valid;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 6756dfa60d5a..34883ad12536 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -76,15 +76,14 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
}
}

-bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
- unsigned long len)
+bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
struct kvm_memslots *slots = kvm_memslots(kvm);

if (!gpc->active)
return false;

- if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+ if ((gpa & ~PAGE_MASK) + gpc->len > PAGE_SIZE)
return false;

if (gpc->gpa != gpa || gpc->generation != slots->generation ||
@@ -238,8 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
return -EFAULT;
}

-int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
- unsigned long len)
+int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
unsigned long page_offset = gpa & ~PAGE_MASK;
@@ -253,7 +251,7 @@ int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
* If must fit within a single page. The 'len' argument is
* only to enforce that.
*/
- if (page_offset + len > PAGE_SIZE)
+ if (page_offset + gpc->len > PAGE_SIZE)
return -EINVAL;

/*
@@ -358,7 +356,8 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
EXPORT_SYMBOL_GPL(kvm_gpc_unmap);

void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ unsigned long len)
{
WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);
@@ -369,10 +368,11 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
gpc->kvm = kvm;
gpc->vcpu = vcpu;
gpc->usage = usage;
+ gpc->len = len;
}
EXPORT_SYMBOL_GPL(kvm_gpc_init);

-int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
struct kvm *kvm = gpc->kvm;

@@ -395,7 +395,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
gpc->active = true;
write_unlock_irq(&gpc->lock);
}
- return kvm_gpc_refresh(kvm, gpc, gpa, len);
+ return kvm_gpc_refresh(kvm, gpc, gpa);
}
EXPORT_SYMBOL_GPL(kvm_gpc_activate);

--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:34:28

by Sean Christopherson

Subject: [PATCH v2 09/16] KVM: Clean up hva_to_pfn_retry()

From: Michal Luczaj <[email protected]>

Make hva_to_pfn_retry() use the kvm instance cached in the
gfn_to_pfn_cache.

Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Michal Luczaj <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
virt/kvm/pfncache.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 6fe76fb4d228..ef7ac1666847 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -138,7 +138,7 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
return kvm->mmu_invalidate_seq != mmu_seq;
}

-static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
{
/* Note, the new page offset may be different than the old! */
void *old_khva = gpc->khva - offset_in_page(gpc->khva);
@@ -158,7 +158,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
gpc->valid = false;

do {
- mmu_seq = kvm->mmu_invalidate_seq;
+ mmu_seq = gpc->kvm->mmu_invalidate_seq;
smp_rmb();

write_unlock_irq(&gpc->lock);
@@ -216,7 +216,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
* attempting to refresh.
*/
WARN_ON_ONCE(gpc->valid);
- } while (mmu_notifier_retry_cache(kvm, mmu_seq));
+ } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));

gpc->valid = true;
gpc->pfn = new_pfn;
@@ -293,7 +293,7 @@ int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa)
* drop the lock and do the HVA to PFN lookup again.
*/
if (!gpc->valid || old_uhva != gpc->uhva) {
- ret = hva_to_pfn_retry(kvm, gpc);
+ ret = hva_to_pfn_retry(gpc);
} else {
/* If the HVA→PFN mapping was already valid, don't unmap it. */
old_pfn = KVM_PFN_ERR_FAULT;
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:35:28

by Sean Christopherson

Subject: [PATCH v2 11/16] KVM: Drop KVM's API to allow temporarily unmapping gfn=>pfn cache

Drop kvm_gpc_unmap() as it has no users and unclear requirements. The
API was added as part of the original gfn_to_pfn_cache support, but its
sole usage[*] was never merged. Fold the guts of kvm_gpc_unmap() into
the deactivate path and drop the API. Omit acquiring refresh_lock, as
concurrent calls to kvm_gpc_deactivate() are not allowed (this is not
enforced, e.g. via lockdep, due to it being called during vCPU
destruction).

If/when temporary unmapping makes a comeback, the desirable behavior is
likely to restrict temporary unmapping to vCPU-exclusive mappings and
require that the vcpu->mutex be held to serialize unmap. Use of the
refresh_lock to protect unmapping was somewhat speculatively added by
commit 93984f19e7bc ("KVM: Fully serialize gfn=>pfn cache refresh via
mutex") to guard against concurrent unmaps, but the primary use case of
the temporary unmap, nested virtualization[*], doesn't actually need or
want concurrent unmaps.

[*] https://lore.kernel.org/all/[email protected]

Signed-off-by: Sean Christopherson <[email protected]>
---
include/linux/kvm_host.h | 12 -----------
virt/kvm/pfncache.c | 44 +++++++++++++++-------------------------
2 files changed, 16 insertions(+), 40 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b63d2abbef56..22cf43389954 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1315,18 +1315,6 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
*/
int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa);

-/**
- * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache.
- *
- * @kvm: pointer to kvm instance.
- * @gpc: struct gfn_to_pfn_cache object.
- *
- * This unmaps the referenced page. The cache is left in the invalid state
- * but at least the mapping from GPA to userspace HVA will remain cached
- * and can be reused on a subsequent refresh.
- */
-void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
-
/**
* kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
*
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 432b150bd9f1..62b47feed36c 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -328,33 +328,6 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
}
EXPORT_SYMBOL_GPL(kvm_gpc_refresh);

-void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
-{
- void *old_khva;
- kvm_pfn_t old_pfn;
-
- mutex_lock(&gpc->refresh_lock);
- write_lock_irq(&gpc->lock);
-
- gpc->valid = false;
-
- old_khva = gpc->khva - offset_in_page(gpc->khva);
- old_pfn = gpc->pfn;
-
- /*
- * We can leave the GPA → uHVA map cache intact but the PFN
- * lookup will need to be redone even for the same page.
- */
- gpc->khva = NULL;
- gpc->pfn = KVM_PFN_ERR_FAULT;
-
- write_unlock_irq(&gpc->lock);
- mutex_unlock(&gpc->refresh_lock);
-
- gpc_unmap_khva(old_pfn, old_khva);
-}
-EXPORT_SYMBOL_GPL(kvm_gpc_unmap);
-
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
unsigned long len)
@@ -402,6 +375,8 @@ EXPORT_SYMBOL_GPL(kvm_gpc_activate);
void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
{
struct kvm *kvm = gpc->kvm;
+ kvm_pfn_t old_pfn;
+ void *old_khva;

if (gpc->active) {
/*
@@ -411,13 +386,26 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
*/
write_lock_irq(&gpc->lock);
gpc->active = false;
+ gpc->valid = false;
+
+ /*
+ * Leave the GPA => uHVA cache intact, it's protected by the
+ * memslot generation. The PFN lookup needs to be redone every
+ * time as mmu_notifier protection is lost when the cache is
+ * removed from the VM's gpc_list.
+ */
+ old_khva = gpc->khva - offset_in_page(gpc->khva);
+ gpc->khva = NULL;
+
+ old_pfn = gpc->pfn;
+ gpc->pfn = KVM_PFN_ERR_FAULT;
write_unlock_irq(&gpc->lock);

spin_lock(&kvm->gpc_lock);
list_del(&gpc->list);
spin_unlock(&kvm->gpc_lock);

- kvm_gpc_unmap(kvm, gpc);
+ gpc_unmap_khva(old_pfn, old_khva);
}
}
EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:36:58

by Sean Christopherson

Subject: [PATCH v2 10/16] KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_refresh()

From: Michal Luczaj <[email protected]>

Make kvm_gpc_refresh() use the kvm instance cached in the
gfn_to_pfn_cache.

No functional change intended.

Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Michal Luczaj <[email protected]>
[sean: leave kvm_gpc_unmap() as-is]
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 2 +-
arch/x86/kvm/xen.c | 8 ++++----
include/linux/kvm_host.h | 8 +++-----
virt/kvm/pfncache.c | 6 +++---
4 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b357a84f8c49..d370d06bb07a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3036,7 +3036,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
while (!kvm_gpc_check(gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
+ if (kvm_gpc_refresh(gpc, gpc->gpa))
return;

read_lock_irqsave(&gpc->lock, flags);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index c7304f37c438..920ba5ca3016 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -218,7 +218,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
if (state == RUNSTATE_runnable)
return;

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
+ if (kvm_gpc_refresh(gpc, gpc->gpa))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -347,7 +347,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
while (!kvm_gpc_check(gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
+ if (kvm_gpc_refresh(gpc, gpc->gpa))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -421,7 +421,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (in_atomic() || !task_is_running(current))
return 1;

- if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa)) {
+ if (kvm_gpc_refresh(gpc, gpc->gpa)) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
@@ -1470,7 +1470,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
break;

idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gpc_refresh(kvm, gpc, gpc->gpa);
+ rc = kvm_gpc_refresh(gpc, gpc->gpa);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad8ef7f2d705..b63d2abbef56 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1300,22 +1300,20 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
/**
* kvm_gpc_refresh - update a previously initialized cache.
*
- * @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
* @gpa: updated guest physical address to map.
- * @len: sanity check; the range being access must fit a single page.
*
* @return: 0 for success.
* -EINVAL for a mapping which would cross a page boundary.
- * -EFAULT for an untranslatable guest physical address.
+ * -EFAULT for an untranslatable guest physical address.
*
* This will attempt to refresh a gfn_to_pfn_cache. Note that a successful
- * returm from this function does not mean the page can be immediately
+ * return from this function does not mean the page can be immediately
* accessed because it may have raced with an invalidation. Callers must
* still lock and check the cache status, as this function does not return
* with the lock still held to permit access.
*/
-int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa);
+int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa);

/**
* kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache.
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index ef7ac1666847..432b150bd9f1 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -237,9 +237,9 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
return -EFAULT;
}

-int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa)
+int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
unsigned long page_offset = gpa & ~PAGE_MASK;
bool unmap_old = false;
unsigned long old_uhva;
@@ -395,7 +395,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
gpc->active = true;
write_unlock_irq(&gpc->lock);
}
- return kvm_gpc_refresh(kvm, gpc, gpa);
+ return kvm_gpc_refresh(gpc, gpa);
}
EXPORT_SYMBOL_GPL(kvm_gpc_activate);

--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:39:15

by Sean Christopherson

Subject: [PATCH v2 04/16] KVM: Shorten gfn_to_pfn_cache function names

From: Michal Luczaj <[email protected]>

Formalize "gpc" as the acronym and use it in function names.

No functional change intended.

Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Michal Luczaj <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 8 ++++----
arch/x86/kvm/xen.c | 29 ++++++++++++++---------------
include/linux/kvm_host.h | 21 ++++++++++-----------
virt/kvm/pfncache.c | 20 ++++++++++----------
4 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 943f039564e7..fd00e6a33203 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3034,12 +3034,12 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
unsigned long flags;

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock))) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock)))
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock)))
return;

read_lock_irqsave(&gpc->lock, flags);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 9e79ef2cca99..74d9f4985f93 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -215,15 +215,14 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
user_len = sizeof(struct vcpu_runstate_info);

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- user_len)) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, user_len)) {
read_unlock_irqrestore(&gpc->lock, flags);

/* When invoked from kvm_sched_out() we cannot sleep */
if (state == RUNSTATE_runnable)
return;

- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, user_len))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -349,12 +348,12 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info)))
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info)))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -414,8 +413,8 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);

/*
@@ -429,8 +428,8 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (in_atomic() || !task_is_running(current))
return 1;

- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
@@ -963,7 +962,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,

read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
goto out_rcu;

ret = false;
@@ -1354,7 +1353,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
goto out_rcu;

if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
@@ -1388,7 +1387,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
gpc = &vcpu->arch.xen.vcpu_info_cache;

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ if (!kvm_gpc_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
@@ -1486,7 +1485,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
break;

idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
+ rc = kvm_gpc_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 694c4cb6caf4..bb020ee3b2fe 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1271,16 +1271,15 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
* -EFAULT for an untranslatable guest physical address.
*
* This primes a gfn_to_pfn_cache and links it into the @kvm's list for
- * invalidations to be processed. Callers are required to use
- * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
- * accessing the target page.
+ * invalidations to be processed. Callers are required to use kvm_gpc_check()
+ * to ensure that the cache is valid before accessing the target page.
*/
int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
gpa_t gpa, unsigned long len);

/**
- * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
+ * kvm_gpc_check - check validity of a gfn_to_pfn_cache.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1297,11 +1296,11 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
* Callers in IN_GUEST_MODE may do so without locking, although they should
* still hold a read lock on kvm->scru for the memslot checks.
*/
-bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- gpa_t gpa, unsigned long len);
+bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+ unsigned long len);

/**
- * kvm_gfn_to_pfn_cache_refresh - update a previously initialized cache.
+ * kvm_gpc_refresh - update a previously initialized cache.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1318,11 +1317,11 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
* still lock and check the cache status, as this function does not return
* with the lock still held to permit access.
*/
-int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- gpa_t gpa, unsigned long len);
+int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+ unsigned long len);

/**
- * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache.
+ * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1331,7 +1330,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
* but at least the mapping from GPA to userspace HVA will remain cached
* and can be reused on a subsequent refresh.
*/
-void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);

/**
* kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 346e47f15572..23180f1d9c1c 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -76,8 +76,8 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
}
}

-bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- gpa_t gpa, unsigned long len)
+bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+ unsigned long len)
{
struct kvm_memslots *slots = kvm_memslots(kvm);

@@ -96,7 +96,7 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,

return true;
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
+EXPORT_SYMBOL_GPL(kvm_gpc_check);

static void gpc_unmap_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva)
{
@@ -238,8 +238,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
return -EFAULT;
}

-int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- gpa_t gpa, unsigned long len)
+int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+ unsigned long len)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
unsigned long page_offset = gpa & ~PAGE_MASK;
@@ -328,9 +328,9 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,

return ret;
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_refresh);
+EXPORT_SYMBOL_GPL(kvm_gpc_refresh);

-void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
{
void *old_khva;
kvm_pfn_t old_pfn;
@@ -355,7 +355,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)

gpc_unmap_khva(kvm, old_pfn, old_khva);
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
+EXPORT_SYMBOL_GPL(kvm_gpc_unmap);

void kvm_gpc_init(struct gfn_to_pfn_cache *gpc)
{
@@ -391,7 +391,7 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
gpc->active = true;
write_unlock_irq(&gpc->lock);
}
- return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
+ return kvm_gpc_refresh(kvm, gpc, gpa, len);
}
EXPORT_SYMBOL_GPL(kvm_gpc_activate);

@@ -411,7 +411,7 @@ void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
list_del(&gpc->list);
spin_unlock(&kvm->gpc_lock);

- kvm_gfn_to_pfn_cache_unmap(kvm, gpc);
+ kvm_gpc_unmap(kvm, gpc);
}
}
EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:40:14

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH v2 14/16] KVM: Skip unnecessary "unmap" if gpc is already valid during refresh

When refreshing a gfn=>pfn cache, skip straight to unlocking if the cache
is already valid instead of stuffing the "old" variables to turn the
unmapping outro into a nop.

Signed-off-by: Sean Christopherson <[email protected]>
---
virt/kvm/pfncache.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f211c878788b..57d47f06637d 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -293,9 +293,8 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
ret = hva_to_pfn_retry(gpc);
} else {
/* If the HVA→PFN mapping was already valid, don't unmap it. */
- old_pfn = KVM_PFN_ERR_FAULT;
- old_khva = NULL;
ret = 0;
+ goto out_unlock;
}

out:
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:40:42

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH v2 13/16] KVM: Drop @gpa from exported gfn=>pfn cache check() and refresh() helpers

Drop the @gpa param from the exported check()+refresh() helpers and limit
changing the cache's GPA to the activate path. All external users just
feed in gpc->gpa, i.e. this is a fancy nop.

Allowing users to change the GPA at check()+refresh() is dangerous as
those helpers explicitly allow concurrent calls, e.g. KVM could get into
a livelock scenario. It's also unclear as to what the expected behavior
should be if multiple tasks attempt to refresh with different GPAs.

Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 4 ++--
arch/x86/kvm/xen.c | 20 ++++++++++----------
include/linux/kvm_host.h | 6 ++----
virt/kvm/pfncache.c | 16 +++++++++-------
4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d370d06bb07a..2db8515d38dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3033,10 +3033,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
WARN_ON_ONCE(gpc->len != offset + sizeof(*guest_hv_clock));

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc)) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gpc_refresh(gpc, gpc->gpa))
+ if (kvm_gpc_refresh(gpc))
return;

read_lock_irqsave(&gpc->lock, flags);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 920ba5ca3016..529d3f4c1b9d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -211,14 +211,14 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
return;

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc)) {
read_unlock_irqrestore(&gpc->lock, flags);

/* When invoked from kvm_sched_out() we cannot sleep */
if (state == RUNSTATE_runnable)
return;

- if (kvm_gpc_refresh(gpc, gpc->gpa))
+ if (kvm_gpc_refresh(gpc))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -344,10 +344,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc)) {
read_unlock_irqrestore(&gpc->lock, flags);

- if (kvm_gpc_refresh(gpc, gpc->gpa))
+ if (kvm_gpc_refresh(gpc))
return;

read_lock_irqsave(&gpc->lock, flags);
@@ -407,7 +407,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc)) {
read_unlock_irqrestore(&gpc->lock, flags);

/*
@@ -421,7 +421,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (in_atomic() || !task_is_running(current))
return 1;

- if (kvm_gpc_refresh(gpc, gpc->gpa)) {
+ if (kvm_gpc_refresh(gpc)) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
@@ -947,7 +947,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,

read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
- if (!kvm_gpc_check(gpc, gpc->gpa))
+ if (!kvm_gpc_check(gpc))
goto out_rcu;

ret = false;
@@ -1338,7 +1338,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(gpc, gpc->gpa))
+ if (!kvm_gpc_check(gpc))
goto out_rcu;

if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
@@ -1372,7 +1372,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
gpc = &vcpu->arch.xen.vcpu_info_cache;

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(gpc, gpc->gpa)) {
+ if (!kvm_gpc_check(gpc)) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
@@ -1470,7 +1470,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
break;

idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gpc_refresh(gpc, gpc->gpa);
+ rc = kvm_gpc_refresh(gpc);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 22cf43389954..92cf0be21974 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1283,7 +1283,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
* kvm_gpc_check - check validity of a gfn_to_pfn_cache.
*
* @gpc: struct gfn_to_pfn_cache object.
- * @gpa: current guest physical address to map.
*
* @return: %true if the cache is still valid and the address matches.
* %false if the cache is not valid.
@@ -1295,13 +1294,12 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
* Callers in IN_GUEST_MODE may do so without locking, although they should
* still hold a read lock on kvm->scru for the memslot checks.
*/
-bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
+bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc);

/**
* kvm_gpc_refresh - update a previously initialized cache.
*
* @gpc: struct gfn_to_pfn_cache object.
- * @gpa: updated guest physical address to map.
*
* @return: 0 for success.
* -EINVAL for a mapping which would cross a page boundary.
@@ -1313,7 +1311,7 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
* still lock and check the cache status, as this function does not return
* with the lock still held to permit access.
*/
-int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
+int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc);

/**
* kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d5b417e50ac..f211c878788b 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -76,17 +76,14 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
}
}

-bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
+bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc)
{
struct kvm_memslots *slots = kvm_memslots(gpc->kvm);

if (!gpc->active)
return false;

- if ((gpa & ~PAGE_MASK) + gpc->len > PAGE_SIZE)
- return false;
-
- if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+ if (gpc->generation != slots->generation ||
kvm_is_error_hva(gpc->uhva))
return false;

@@ -237,7 +234,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
return -EFAULT;
}

-int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
+static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
unsigned long page_offset = gpa & ~PAGE_MASK;
@@ -326,6 +323,11 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa)

return ret;
}
+
+int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc)
+{
+ return __kvm_gpc_refresh(gpc, gpc->gpa);
+}
EXPORT_SYMBOL_GPL(kvm_gpc_refresh);

void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
@@ -369,7 +371,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
gpc->active = true;
write_unlock_irq(&gpc->lock);
}
- return kvm_gpc_refresh(gpc, gpa);
+ return __kvm_gpc_refresh(gpc, gpa);
}
EXPORT_SYMBOL_GPL(kvm_gpc_activate);

--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:41:24

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH v2 08/16] KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_check()

From: Michal Luczaj <[email protected]>

Make kvm_gpc_check() use kvm instance cached in gfn_to_pfn_cache.

Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Michal Luczaj <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/x86.c | 2 +-
arch/x86/kvm/xen.c | 12 ++++++------
include/linux/kvm_host.h | 3 +--
virt/kvm/pfncache.c | 4 ++--
4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b4fa3455f3a..b357a84f8c49 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3033,7 +3033,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
WARN_ON_ONCE(gpc->len != offset + sizeof(*guest_hv_clock));

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 6f5a5507392e..c7304f37c438 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -211,7 +211,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
return;

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

/* When invoked from kvm_sched_out() we cannot sleep */
@@ -344,7 +344,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa))
@@ -407,7 +407,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa)) {
+ while (!kvm_gpc_check(gpc, gpc->gpa)) {
read_unlock_irqrestore(&gpc->lock, flags);

/*
@@ -947,7 +947,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,

read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
- if (!kvm_gpc_check(kvm, gpc, gpc->gpa))
+ if (!kvm_gpc_check(gpc, gpc->gpa))
goto out_rcu;

ret = false;
@@ -1338,7 +1338,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(kvm, gpc, gpc->gpa))
+ if (!kvm_gpc_check(gpc, gpc->gpa))
goto out_rcu;

if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
@@ -1372,7 +1372,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
gpc = &vcpu->arch.xen.vcpu_info_cache;

read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(kvm, gpc, gpc->gpa)) {
+ if (!kvm_gpc_check(gpc, gpc->gpa)) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c79f2e122ac8..ad8ef7f2d705 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1282,7 +1282,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
/**
* kvm_gpc_check - check validity of a gfn_to_pfn_cache.
*
- * @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
* @gpa: current guest physical address to map.
*
@@ -1296,7 +1295,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa);
* Callers in IN_GUEST_MODE may do so without locking, although they should
* still hold a read lock on kvm->scru for the memslot checks.
*/
-bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa);
+bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa);

/**
* kvm_gpc_refresh - update a previously initialized cache.
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 34883ad12536..6fe76fb4d228 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -76,9 +76,9 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
}
}

-bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa)
+bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memslots *slots = kvm_memslots(gpc->kvm);

if (!gpc->active)
return false;
--
2.38.0.413.g74048e4d9e-goog

2022-10-13 21:43:06

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH v2 15/16] KVM: selftests: Add tests in xen_shinfo_test to detect lock races

From: Michal Luczaj <[email protected]>

Tests for races between shinfo_cache (de)activation and hypercall+ioctl()
processing. KVM has had bugs where activating the shared info cache
multiple times and/or with concurrent users results in lock corruption,
NULL pointer dereferences, and other fun.

For the timer injection testcase (#22), re-arm the timer until the IRQ
is successfully injected. If the timer expires while the shared info
is deactivated (invalid), KVM will drop the event.

Signed-off-by: Michal Luczaj <[email protected]>
Co-developed-by: Sean Christopherson <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
.../selftests/kvm/x86_64/xen_shinfo_test.c | 140 ++++++++++++++++++
1 file changed, 140 insertions(+)

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 8a5cb800f50e..caa3f5ab9e10 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -15,9 +15,13 @@
#include <time.h>
#include <sched.h>
#include <signal.h>
+#include <pthread.h>

#include <sys/eventfd.h>

+/* Defined in include/linux/kvm_types.h */
+#define GPA_INVALID (~(ulong)0)
+
#define SHINFO_REGION_GVA 0xc0000000ULL
#define SHINFO_REGION_GPA 0xc0000000ULL
#define SHINFO_REGION_SLOT 10
@@ -44,6 +48,8 @@

#define MIN_STEAL_TIME 50000

+#define SHINFO_RACE_TIMEOUT 2 /* seconds */
+
#define __HYPERVISOR_set_timer_op 15
#define __HYPERVISOR_sched_op 29
#define __HYPERVISOR_event_channel_op 32
@@ -148,6 +154,7 @@ static void guest_wait_for_irq(void)
static void guest_code(void)
{
struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
+ int i;

__asm__ __volatile__(
"sti\n"
@@ -325,6 +332,49 @@ static void guest_code(void)
guest_wait_for_irq();

GUEST_SYNC(21);
+ /* Racing host ioctls */
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(22);
+ /* Racing vmcall against host ioctl */
+
+ ports[0] = 0;
+
+ p = (struct sched_poll) {
+ .ports = ports,
+ .nr_ports = 1,
+ .timeout = 0
+ };
+
+wait_for_timer:
+ /*
+ * Poll for a timer wake event while the worker thread is mucking with
+ * the shared info. KVM XEN drops timer IRQs if the shared info is
+ * invalid when the timer expires. Arbitrarily poll 100 times before
+ * giving up and asking the VMM to re-arm the timer. 100 polls should
+ * consume enough time to beat on KVM without taking too long if the
+ * timer IRQ is dropped due to an invalid event channel.
+ */
+ for (i = 0; i < 100 && !guest_saw_irq; i++)
+ asm volatile("vmcall"
+ : "=a" (rax)
+ : "a" (__HYPERVISOR_sched_op),
+ "D" (SCHEDOP_poll),
+ "S" (&p)
+ : "memory");
+
+ /*
+ * Re-send the timer IRQ if it was (likely) dropped due to the timer
+ * expiring while the event channel was invalid.
+ */
+ if (!guest_saw_irq) {
+ GUEST_SYNC(23);
+ goto wait_for_timer;
+ }
+ guest_saw_irq = false;
+
+ GUEST_SYNC(24);
}

static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -352,11 +402,36 @@ static void handle_alrm(int sig)
TEST_FAIL("IRQ delivery timed out");
}

+static void *juggle_shinfo_state(void *arg)
+{
+ struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+ struct kvm_xen_hvm_attr cache_init = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
+ };
+
+ struct kvm_xen_hvm_attr cache_destroy = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = GPA_INVALID
+ };
+
+ for (;;) {
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init);
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy);
+ pthread_testcancel();
+ };
+
+ return NULL;
+}
+
int main(int argc, char *argv[])
{
struct timespec min_ts, max_ts, vm_ts;
struct kvm_vm *vm;
+ pthread_t thread;
bool verbose;
+ int ret;

verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
!strncmp(argv[1], "--verbose", 10));
@@ -785,6 +860,71 @@ int main(int argc, char *argv[])
case 21:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
+ alarm(0);
+
+ if (verbose)
+ printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n");
+
+ ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm);
+ TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
+
+ struct kvm_irq_routing_xen_evtchn uxe = {
+ .port = 1,
+ .vcpu = vcpu->id,
+ .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
+ };
+
+ evtchn_irq_expected = true;
+ for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;)
+ __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
+ break;
+
+ case 22:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ if (verbose)
+ printf("Testing shinfo lock corruption (SCHEDOP_poll)\n");
+
+ shinfo->evtchn_pending[0] = 1;
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time +
+ SHINFO_RACE_TIMEOUT * 1000000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ break;
+
+ case 23:
+ /*
+ * Optional and possibly repeated sync point.
+ * Injecting the timer IRQ may fail if the
+ * shinfo is invalid when the timer expires.
+ * If the timer has expired but the IRQ hasn't
+ * been delivered, rearm the timer and retry.
+ */
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
+
+ /* Resume the guest if the timer is still pending. */
+ if (tmr.u.timer.expires_ns)
+ break;
+
+ /* All done if the IRQ was delivered. */
+ if (!evtchn_irq_expected)
+ break;
+
+ tmr.u.timer.expires_ns = rs->state_entry_time +
+ SHINFO_RACE_TIMEOUT * 1000000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ break;
+ case 24:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ ret = pthread_cancel(thread);
+ TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
+
+ ret = pthread_join(thread, 0);
+ TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
goto done;

case 0x20:
--
2.38.0.413.g74048e4d9e-goog

2022-10-27 12:11:15

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

On 10/13/22 23:12, Sean Christopherson wrote:
> Always use the size of Xen's non-compat vcpu_runstate_info struct when
> checking that the GPA+size doesn't cross a page boundary. Conceptually,
> using the current mode is more correct, but KVM isn't consistent with
> itself as kvm_xen_vcpu_set_attr() unconditionally uses the "full" size
> when activating the cache. More importantly, prior to the introduction
> of the gfn_to_pfn_cache, KVM _always_ used the full size, i.e. allowing
> the guest (userspace?) to use a poorly aligned GPA in 32-bit mode but not
> 64-bit mode is more of a bug than a feature, and fixing the bug doesn't
> break KVM's historical ABI.

I'd rather not introduce additional restrictions in KVM, mostly because
it's actually easy to avoid this patch by instead enforcing that
attributes are set in a sensible order:

- long mode cannot be changed after the shared info page is enabled
(which makes sense because the shared info page also has a compat version)

- the caches must be activated after the shared info page (which
enforces that the vCPU attributes are set after the VM attributes)

This is technically a userspace API break, but nobody is really using
this API outside Amazon so... Patches coming after I finish testing.

Paolo


> Always using the non-compat size will allow for future gfn_to_pfn_cache
> cleanups as this is (was) the only case where KVM uses a different size at
> check()+refresh() than at activate(). E.g. the length/size of the cache
> can be made immutable and dropped from check()+refresh(), which yields a
> cleaner set of APIs and avoids potential bugs that could occur if check()
> were invoked with a different size than refresh().
>
> Fixes: a795cd43c5b5 ("KVM: x86/xen: Use gfn_to_pfn_cache for runstate area")
> Cc: David Woodhouse <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/xen.c | 5 +----
> 1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index b2be60c6efa4..9e79ef2cca99 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -212,10 +212,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
> if (!vx->runstate_cache.active)
> return;
>
> - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
> - user_len = sizeof(struct vcpu_runstate_info);
> - else
> - user_len = sizeof(struct compat_vcpu_runstate_info);
> + user_len = sizeof(struct vcpu_runstate_info);
>
> read_lock_irqsave(&gpc->lock, flags);
> while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,


2022-10-27 14:59:27

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

On Thu, Oct 27, 2022, Paolo Bonzini wrote:
> On 10/13/22 23:12, Sean Christopherson wrote:
> > Always use the size of Xen's non-compat vcpu_runstate_info struct when
> > checking that the GPA+size doesn't cross a page boundary. Conceptually,
> > using the current mode is more correct, but KVM isn't consistent with
> > itself as kvm_xen_vcpu_set_attr() unconditionally uses the "full" size
> > when activating the cache. More importantly, prior to the introduction
> > of the gfn_to_pfn_cache, KVM _always_ used the full size, i.e. allowing
> > the guest (userspace?) to use a poorly aligned GPA in 32-bit mode but not
> > 64-bit mode is more of a bug than a feature, and fixing the bug doesn't
> > break KVM's historical ABI.
>
> I'd rather not introduce additional restrictions in KVM,

But KVM already has this restriction. "struct vcpu_info" is always checked for
the non-compat size, and as above, "struct vcpu_runstate_info" is checked for the
non-compat size during its initialization.

> actually easy to avoid this patch by instead enforcing that attributes are
> set in a sensible order:

I don't care about fixing XEN support, I care about forcing "length" to be immutable
in order to simplify the gfn_to_pfn_cache implementation. Avoiding this patch
prevents doing that later in this series.

> - long mode cannot be changed after the shared info page is enabled (which
> makes sense because the shared info page also has a compat version)

How is this not introducing an additional restriction? This seems way more
onerous than what is effectively a revert.

> - the caches must be activated after the shared info page (which enforces
> that the vCPU attributes are set after the VM attributes)
>
> This is technically a userspace API break, but nobody is really using this
> API outside Amazon so... Patches coming after I finish testing.

It's not just a userspace break, it affects the guest ABI as well. arch.xen.long_mode
isn't set just by userspace, it's also snapshotted when the guest changes the hypercall
page. Maybe there's something in the XEN ABI that says the hypercall page can never
be changed, but barring that I don't see how to prevent ending up with a misaligned
cache due to the guest enabling long mode.

int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
bool lm = is_long_mode(vcpu);

/* Latch long_mode for shared_info pages etc. */
vcpu->kvm->arch.xen.long_mode = lm;

2022-10-27 15:09:12

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

On 10/27/22 16:44, Sean Christopherson wrote:
>> - long mode cannot be changed after the shared info page is enabled (which
>> makes sense because the shared info page also has a compat version)
>
> How is this not introducing an additional restriction? This seems way more
> onerous than what is effectively a revert.
>
>> - the caches must be activated after the shared info page (which enforces
>> that the vCPU attributes are set after the VM attributes)
>>
>> This is technically a userspace API break, but nobody is really using this
>> API outside Amazon so... Patches coming after I finish testing.
>
> It's not just a userspace break, it affects the guest ABI as well.

Yes, I was talking of the VMM here; additional restrictions are fine there.

The guests however should be compatible with Xen, so you also need to
re-activate the cache after the hypercall page is written, but that's
two lines of code.

Paolo


2022-10-27 15:50:16

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

On Thu, Oct 27, 2022, Paolo Bonzini wrote:
> On 10/27/22 16:44, Sean Christopherson wrote:
> > > - long mode cannot be changed after the shared info page is enabled (which
> > > makes sense because the shared info page also has a compat version)
> >
> > How is this not introducing an additional restriction? This seems way more
> > onerous than what is effectively a revert.
> >
> > > - the caches must be activated after the shared info page (which enforces
> > > that the vCPU attributes are set after the VM attributes)
> > >
> > > This is technically a userspace API break, but nobody is really using this
> > > API outside Amazon so... Patches coming after I finish testing.
> >
> > It's not just a userspace break, it affects the guest ABI as well.
>
> Yes, I was talking of the VMM here; additional restrictions are fine there.

Additional restrictions are fine where?

> The guests however should be compatible with Xen, so you also need to
> re-activate the cache after the hypercall page is written, but that's two
> lines of code.

And do what if the guest transitions from 32-bit => 64-bit and the cache isn't
aligned for 64-bit? E.g. kvm_xen_set_evtchn() will silently drop events no matter
what KVM does. In other words, I don't see how KVM can provide a sane ABI without
forcing the cached pages to be aligned for the largest possible size.

2022-10-27 17:48:09

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

On Thu, Oct 27, 2022, Sean Christopherson wrote:
> On Thu, Oct 27, 2022, Paolo Bonzini wrote:
> > On 10/13/22 23:12, Sean Christopherson wrote:
> > > Always use the size of Xen's non-compat vcpu_runstate_info struct when
> > > checking that the GPA+size doesn't cross a page boundary. Conceptually,
> > > using the current mode is more correct, but KVM isn't consistent with
> > > itself as kvm_xen_vcpu_set_attr() unconditionally uses the "full" size
> > > when activating the cache. More importantly, prior to the introduction
> > > of the gfn_to_pfn_cache, KVM _always_ used the full size, i.e. allowing
> > > the guest (userspace?) to use a poorly aligned GPA in 32-bit mode but not
> > > 64-bit mode is more of a bug than a feature, and fixing the bug doesn't
> > > break KVM's historical ABI.
> >
> > I'd rather not introduce additional restrictions in KVM,
>
> But KVM already has this restriction. "struct vcpu_info" is always checked for
> the non-compat size, and as above, "struct vcpu_runstate_info" is checked for the
> non-compat size during its initialization.

Ah, I forgot those are the same size:

BUILD_BUG_ON(sizeof(struct vcpu_info) !=
sizeof(struct compat_vcpu_info));

2022-11-21 14:57:00

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH v2 07/16] KVM: Store gfn_to_pfn_cache length as an immutable property

On Thu, 2022-10-13 at 21:12 +0000, Sean Christopherson wrote:
> From: Michal Luczaj <[email protected]>
>
> Make the length of a gfn=>pfn cache an immutable property of the cache
> to cleanup the APIs and avoid potential bugs, e.g calling check() with a
> larger size than refresh() could put KVM into an infinite loop.

Hm, that's a strange hypothetical bug to be worried about, given the
pattern is usually to have the check() and refresh() within a few lines
of each other with just atomicity/locking stuff in between them.

I won't fight for it, but I quite liked the idea that each user of a
GPC would know how much space *it* is going to access, and provide that
length as a required parameter. I do note you've added a WARN_ON to one
such user, and that's great — but overall, this patch makes that
checking *optional* instead of mandatory.

> All current (and anticipated future) users access the cache with a
> predetermined size, which isn't a coincidence as using a dedicated cache
> really only make sense when the access pattern is "fixed".

In fixing up the runstate area, I've made that not true. Not only does
the runstate area change size at runtime if the guest changes between
32-bit and 64-bit mode, but it also now uses *two* GPCs to cope with a
region that crosses a page boundary, and the size of the first
therefore changes according to how much fits on the tail of the page.
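
(Illustratively, and not the exact math from the runstate patches: the
length of the first cache in such a split is simply however much of the
area fits on the tail of its page, e.g.

	len_1 = min(user_len, PAGE_SIZE - (gpa & ~PAGE_MASK));
	len_2 = user_len - len_1;	/* 0 if it all fits on one page */

with len_1/len_2 being hypothetical names, so the first length really
does vary at runtime.)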

> Add a WARN in kvm_setup_guest_pvclock() to assert that the offset+size
> matches the length of the cache, both to make it more obvious that the
> length really is immutable in that case, and to detect future bugs.
...
> @@ -3031,13 +3030,13 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
> struct pvclock_vcpu_time_info *guest_hv_clock;
> unsigned long flags;
>
> + WARN_ON_ONCE(gpc->len != offset + sizeof(*guest_hv_clock));
> +

That ought to be 'gpc->len < offset + sizeof(*guest_hv_clock)' I think?

In the case where we are writing a clock *within* a mapped Xen
vcpu_info structure, it doesn't have to be at the *end* of that
structure. I think the xen_shinfo_test should have caught that?




2022-11-21 19:20:18

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v2 07/16] KVM: Store gfn_to_pfn_cache length as an immutable property

On Mon, Nov 21, 2022, David Woodhouse wrote:
> On Thu, 2022-10-13 at 21:12 +0000, Sean Christopherson wrote:
> > From: Michal Luczaj <[email protected]>
> >
> > Make the length of a gfn=>pfn cache an immutable property of the cache
> > to cleanup the APIs and avoid potential bugs, e.g calling check() with a
> > larger size than refresh() could put KVM into an infinite loop.
>
> Hm, that's a strange hypothetical bug to be worried about, given the
> pattern is usually to have the check() and refresh() within a few lines
> of each other with just atomicity/locking stuff in between them.

Why do you say it's strange to be worried about? The GPC and Xen code is all quite
complex and has had multiple bugs, several of which are not exactly edge cases.
I don't think it's at all strange to want to make it difficult to introduce a bug
that would in many ways be worse than panicking the kernel.

But as Paolo said, the APIs themselves are to blame[*], check() and refresh()
shouldn't be split for the common case, i.e. this particular concern should largely
be a non-issue in the long run.

[*] https://lore.kernel.org/all/[email protected]

> I won't fight for it, but I quite liked the idea that each user of a
> GPC would know how much space *it* is going to access, and provide that
> length as a required parameter. I do note you've added a WARN_ON to one
> such user, and that's great — but overall, this patch makes that
> checking *optional* instead of mandatory.

I honestly don't see a meaningful difference in this case. The only practical
difference is that shoving @len into the cache makes the check a one-time thing.
The "mandatory" check at use time still relies on a human to not make a mistake.
If the check were derived from the actual access, a la get_user(), then I would
feel differently.
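
To illustrate the get_user() analogy, a purely hypothetical accessor
(this is not an existing KVM API) would derive the length from the type
of the destination, so the check can't drift from the actual access:

#define kvm_gpc_read(gpc, dst, offset)					\
({									\
	int __ret = 0;							\
	if (WARN_ON_ONCE((offset) + sizeof(*(dst)) > (gpc)->len))	\
		__ret = -EINVAL;					\
	else								\
		memcpy((dst), (gpc)->khva + (offset), sizeof(*(dst)));	\
	__ret;								\
})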

Case in point, the mandatory check didn't prevent KVM from screwing up bounds
checking in kvm_xen_schedop_poll(). The PAGE_SIZE passed in for @len is in no
way tied to actual access that's being performed, the code is simply regurgitating
the size of the cache.

> > All current (and anticipated future) users access the cache with a
> > predetermined size, which isn't a coincidence as using a dedicated cache
> > really only make sense when the access pattern is "fixed".
>
> In fixing up the runstate area, I've made that not true. Not only does
> the runstate area change size at runtime if the guest changes between
> 32-bit and 64-bit mode, but it also now uses *two* GPCs to cope with a
> region that crosses a page boundary, and the size of the first
> therefore changes according to how much fits on the tail of the page.
>
> > Add a WARN in kvm_setup_guest_pvclock() to assert that the offset+size
> > matches the length of the cache, both to make it more obvious that the
> > length really is immutable in that case, and to detect future bugs.
> ...
> > @@ -3031,13 +3030,13 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
> > struct pvclock_vcpu_time_info *guest_hv_clock;
> > unsigned long flags;
> >
> > + WARN_ON_ONCE(gpc->len != offset + sizeof(*guest_hv_clock));
> > +
>
> That ought to be 'gpc->len < offset + sizeof(*guest_hv_clock)' I think?
>
> In the case where we are writing a clock *within* a mapped Xen
> vcpu_info structure, it doesn't have to be at the *end* of that
> structure. I think the xen_shinfo_test should have caught that?

The WARN doesn't trigger a false positive because "struct pvclock_vcpu_time_info" is
placed at the end of both "struct vcpu_info" and "struct compat_vcpu_info".

I don't have a strong opinion on whether it's "!=" or "<", my goal in adding the
WARN was primarily to show that @len really is immutable in this case. Guarding
against future overrun bugs was a bonus.

2022-11-21 20:41:08

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH v2 07/16] KVM: Store gfn_to_pfn_cache length as an immutable property

On Mon, 2022-11-21 at 19:11 +0000, Sean Christopherson wrote:
> On Mon, Nov 21, 2022, David Woodhouse wrote:
> > On Thu, 2022-10-13 at 21:12 +0000, Sean Christopherson wrote:
> > > From: Michal Luczaj <[email protected]>
> > >
> > > Make the length of a gfn=>pfn cache an immutable property of the cache
> > > to cleanup the APIs and avoid potential bugs, e.g calling check() with a
> > > larger size than refresh() could put KVM into an infinite loop.
> >
> > Hm, that's a strange hypothetical bug to be worried about, given the
> > pattern is usually to have the check() and refresh() within a few lines
> > of each other with just atomicity/locking stuff in between them.
>
> Why do you say it's strange to be worried about? The GPC and Xen code is all quite
> complex and has had multiple bugs, several of which are not exactly edge cases.
> I don't think it's at all strange to want to make it difficult to introduce a bug
> that would in many ways be worse than panicking the kernel.

The check() and refresh() calls are within a few lines of each other,
and it'd be really strange for them to have a *different* idea about
what the length is, surely?

> But as Paolo said, the APIs themselves are to blame[*], check() and refresh()
> shouldn't be split for the common case, i.e. this particular concern should largely
> be a non-issue in the long run.
>
> [*] https://lore.kernel.org/all/[email protected]

Yeah. As I said to Paul, I've been tempted by that. I've so far not
done it because although they look broadly similar, a bunch of the
sites do end up with *different* code between the check() and the
refresh(), for various locking and atomicity reasons.
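
For reference, the pattern in question (condensed from the call sites in
the diffs above, not a verbatim copy of any one of them) is roughly:

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc)) {
		read_unlock_irqrestore(&gpc->lock, flags);

		if (kvm_gpc_refresh(gpc))
			return;		/* error handling varies per site */

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* ... access the mapping via gpc->khva ... */

	read_unlock_irqrestore(&gpc->lock, flags);

and it's the bits in and around that loop that differ between users.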

> > I won't fight for it, but I quite liked the idea that each user of a
> > GPC would know how much space *it* is going to access, and provide that
> > length as a required parameter. I do note you've added a WARN_ON to one
> > such user, and that's great — but overall, this patch makes that
> > checking *optional* instead of mandatory.
>
> I honestly don't see a meaningful difference in this case. The only practical
> difference is that shoving @len into the cache makes the check a one-time thing.
> The "mandatory" check at use time still relies on a human to not make a mistake.
> If the check were derived from the actual access, a la get_user(), then I would
> feel differently.
>
> Case in point, the mandatory check didn't prevent KVM from screwing up bounds
> checking in kvm_xen_schedop_poll(). The PAGE_SIZE passed in for @len is in no
> way tied to actual access that's being performed, the code is simply regurgitating
> the size of the cache.

True, but that's a different class of bug, and the human needs to make
a more *egregious* mistake.

If the function itself writes outside the size that *it* thinks *it's*
going to write, right there and then in that function, that's utterly
hosed (and the SCHEDOP_poll thing was indeed so hosed).

The mandatory check *did* save us from configuring a 32-bit runstate
area at the end of a page, then *writing* to it in 64-bit mode (where
it's larger) and running off the end of the page.

It saved us from "knowing", a few seconds ago under different
circumstances, what the size of the runstate area was... and then it
actually being different when it's written.

But that's not the common case, so again, I won't fight for it.

I've reworked the unapplied parts of this series on top of the poll and
runstate fixes in my tree, *except* for this one making the length
immutable, and I'm running some tests.

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/gpc-fixes

I'm happy to reinstate the immutable length thing in some form on top.

Given that the runstate code already calculates for itself how many
bytes it can fit onto the first page, it really doesn't care about the
length field in the GPC. As a nasty hack, the runstate code could
probably even get away with setting 'len' to zero. That's kind of
awful, but maybe we could introduce a __kvm_gpc_activate() which does
take a new length, leaving kvm_gpc_activate() without it?
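
A minimal sketch of that shape (hypothetical, not from any posted patch)
would be an inner helper that takes the length, with the existing
kvm_gpc_activate() keeping whatever was set at init:

int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
		       unsigned long len)
{
	if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
		return -EINVAL;

	gpc->len = len;
	return kvm_gpc_activate(gpc, gpa);
}

so only the runstate code would ever pass an explicit length.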

> > > All current (and anticipated future) users access the cache with a
> > > predetermined size, which isn't a coincidence as using a dedicated cache
> > > really only make sense when the access pattern is "fixed".
> >
> > In fixing up the runstate area, I've made that not true. Not only does
> > the runstate area change size at runtime if the guest changes between
> > 32-bit and 64-bit mode, but it also now uses *two* GPCs to cope with a
> > region that crosses a page boundary, and the size of the first
> > therefore changes according to how much fits on the tail of the page.
> >
> > > Add a WARN in kvm_setup_guest_pvclock() to assert that the offset+size
> > > matches the length of the cache, both to make it more obvious that the
> > > length really is immutable in that case, and to detect future bugs.
> >
> > ...
> > > @@ -3031,13 +3030,13 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
> > > struct pvclock_vcpu_time_info *guest_hv_clock;
> > > unsigned long flags;
> > >
> > > + WARN_ON_ONCE(gpc->len != offset + sizeof(*guest_hv_clock));
> > > +
> >
> > That ought to be 'gpc->len < offset + sizeof(*guest_hv_clock)' I think?
> >
> > In the case where we are writing a clock *within* a mapped Xen
> > vcpu_info structure, it doesn't have to be at the *end* of that
> > structure. I think the xen_shinfo_test should have caught that?
>
> The WARN doesn't get false positive because "struct pvclock_vcpu_time_info" is
> placed at the end of "struct vcpu_info" and "struct compat_vcpu_info".
>
> I don't have a strong opinion on whether it's "!=" or "<", my goal in adding the
> WARN was primarily to show that @len really is immutable in this case. Guarding
> against future overrun bugs was a bonus.

Ah right, I think I was looking at the pvclock_wall_clock field in the
shared_info, not the time field in the vcpu_info.



2022-11-22 19:32:14

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v2 07/16] KVM: Store gfn_to_pfn_cache length as an immutable property

On Mon, Nov 21, 2022, David Woodhouse wrote:
> On Mon, 2022-11-21 at 19:11 +0000, Sean Christopherson wrote:
> > On Mon, Nov 21, 2022, David Woodhouse wrote:
> > > I won't fight for it, but I quite liked the idea that each user of a
> > > GPC would know how much space *it* is going to access, and provide that
> > > length as a required parameter. I do note you've added a WARN_ON to one
> > > such user, and that's great — but overall, this patch makes that
> > > checking *optional* instead of mandatory.
> >
> > I honestly don't see a meaningful difference in this case. The only practical
> > difference is that shoving @len into the cache makes the check a one-time thing.
> > The "mandatory" check at use time still relies on a human to not make a mistake.
> > If the check were derived from the actual access, a la get_user(), then I would
> > feel differently.
> >
> > Case in point, the mandatory check didn't prevent KVM from screwing up bounds
> > checking in kvm_xen_schedop_poll(). The PAGE_SIZE passed in for @len is in no
> > way tied to actual access that's being performed, the code is simply regurgitating
> > the size of the cache.
>
> True, but that's a different class of bug, and the human needs to make
> a more *egregious* mistake.
>
> If the function itself writes outside the size that *it* thinks *it's*
> going to write, right there and then in that function, that's utterly
> hosed (and the SCHEDOP_poll thing was indeed so hosed).

Yes, such mistakes are more egregious in the sense they are harder to find and
have more severe consequences, but I don't think the mistakes are necessarily
harder to make. Bugs in simple usage patterns are easy to spot, and at the same
time those patterns are also less likely to be buggy because they're simpler.

> The mandatory check *did* save us from configuring a 32-bit runstate
> area at the end of a page, then *writing* to it in 64-bit mode (where
> it's larger) and running off the end of the page.

Only because the length/capacity wasn't immutable, i.e. that particular bug couldn't
have been introduced in the first place if kvm_gpc_activate() were the only "public"
API that allowed "changing" the length.

That's really what I dislike. I have no objection to adding a sanity check, what
I think is broken and dangerous is allowing a gpc->gpa to effectively become valid
by refreshing with a smaller length.
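
Concretely (the placement below is illustrative): a compat-sized
runstate area at the very end of a page passes the boundary check when
it's refreshed with the compat size,

	gpa = page_base + PAGE_SIZE - sizeof(struct compat_vcpu_runstate_info);
	kvm_gpc_refresh(kvm, gpc, gpa, sizeof(struct compat_vcpu_runstate_info));

but a later 64-bit-mode update that assumes the full vcpu_runstate_info
size writes past the end of the page even though the cache is "valid".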

The gfn_to_hva_cache APIs have the same problem, but they get away with it because
they don't support concurrent usage and don't have to deal with invalidation events.

Lastly, if we keep "length" then we also need to keep "gpa", otherwise the resulting
API is all kinds of funky.

E.g. I'd be totally ok with something like this that would allow users to opt-in
to sanity checking their usage.

int __kvm_gpc_lock(struct gfn_to_pfn_cache *gpc)
{
int r;

read_lock_irqsave(&gpc->lock, gpc->flags);

while (kvm_gpc_check(gpc)) {
read_unlock_irqrestore(&gpc->lock, gpc->flags);

r = kvm_gpc_refresh(gpc);
if (r)
return r;

read_lock_irqsave(&gpc->lock, gpc->flags);
}

return 0;
}

int kvm_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
	if (WARN_ON_ONCE(gpa < gpc->gpa ||
			 ((gpa & ~PAGE_MASK) + len > PAGE_SIZE) ||
			 ((gpa & PAGE_MASK) != (gpc->gpa & PAGE_MASK))))
return -EINVAL;

return __kvm_gpc_lock(gpc);
}
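
A caller would then presumably look something like this (again untested,
just to show the shape):

	if (kvm_gpc_lock(gpc, gpc->gpa, sizeof(struct vcpu_info)))
		return;

	/* ... access gpc->khva while holding gpc->lock ... */

	read_unlock_irqrestore(&gpc->lock, gpc->flags);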

2022-12-02 10:38:10

by Like Xu

[permalink] [raw]
Subject: Re: [PATCH v2 05/16] KVM: x86: Remove unused argument in gpc_unmap_khva()

On 14/10/2022 5:12 am, Sean Christopherson wrote:
> Remove the unused @kvm argument from gpc_unmap_khva().

Nit: the caller kvm_gpc_unmap() can also get rid of the unused @kvm argument.

2022-12-02 13:10:40

by Michal Luczaj

[permalink] [raw]
Subject: Re: [PATCH v2 05/16] KVM: x86: Remove unused argument in gpc_unmap_khva()

On 12/2/22 10:28, Like Xu wrote:
> On 14/10/2022 5:12 am, Sean Christopherson wrote:
>> Remove the unused @kvm argument from gpc_unmap_khva().
>
> Nit: the caller kvm_gpc_unmap() can also get rid of the unused @kvm argument.

Right, the initial series cleaned up kvm_gpc_unmap() in a separate patch.
Current iteration removes kvm_gpc_unmap() later in the series:
https://lore.kernel.org/kvm/[email protected]/

Michal

2022-12-02 14:30:32

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH v2 05/16] KVM: x86: Remove unused argument in gpc_unmap_khva()

On Fri, 2022-12-02 at 11:57 +0100, Michal Luczaj wrote:
> On 12/2/22 10:28, Like Xu wrote:
> > On 14/10/2022 5:12 am, Sean Christopherson wrote:
> > > Remove the unused @kvm argument from gpc_unmap_khva().
> >
> > Nit: the caller kvm_gpc_unmap() can also get rid of the unused @kvm argument.
>
> Right, the initial series cleaned up kvm_gpc_unmap() in a separate patch.
> Current iteration removes kvm_gpc_unmap() later in the series:
> https://lore.kernel.org/kvm/[email protected]/

I have been keeping that series up to date in
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/gpc-fixes

Now that the dust has settled on the Xen runstate area, I may post it
as v3 of the series.

Or I may attempt to resolve the gpc->len immutability thing first. I'm
still not really convinced Sean has won me round on that; I'm still
quite attached to the TOCTOU benefit of checking the length right there
at the moment you're going to use the pointer — especially given that
it *doesn't* have bounds checks like get_user() does, as Sean points
out.



2022-12-02 17:21:30

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v2 05/16] KVM: x86: Remove unused argument in gpc_unmap_khva()

On Fri, Dec 02, 2022, David Woodhouse wrote:
> On Fri, 2022-12-02 at 11:57 +0100, Michal Luczaj wrote:
> > On 12/2/22 10:28, Like Xu wrote:
> > > On 14/10/2022 5:12 am, Sean Christopherson wrote:
> > > > Remove the unused @kvm argument from gpc_unmap_khva().
> > >
> > > Nit: the caller kvm_gpc_unmap() can also get rid of the unused @kvm argument.
> >
> > Right, the initial series cleaned up kvm_gpc_unmap() in a separate patch.
> > Current iteration removes kvm_gpc_unmap() later in the series:
> > https://lore.kernel.org/kvm/[email protected]/
>
> I have been keeping that series up to date in
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/gpc-fixes
>
> Now that the dust has settled on the Xen runstate area, I may post it
> as v3 of the series.
>
> Or I may attempt to resolve the gpc->len immutability thing first. I'm
> still not really convinced Sean has won me round on that;

Ya, I agree that storing "len" is undesirable, but so is storing "gpa" instead of
"gfn".

> I'm still quite attached to the TOCTOU benefit of checking the length right
> there at the moment you're going to use the pointer — especially given that
> it *doesn't* have bounds checks like get_user() does, as Sean points out.

I'm in favor of keeping the length checks if we modify the cache to store the
gfn, not the gpa, and require the gpa (or maybe just offset?) in the "get a kernel
pointer" API.

So, how about this for a set of APIs? Obviously not tested whatsoever, but I
think they address the Xen use cases, and will fit the nested virt cases too
(which want to stuff a pfn into a VMCS/VMCB).

void *kvm_gpc_get_kmap(struct gfn_to_pfn_cache *gpc, gpa_t offset,
unsigned long len, bool atomic)
{
<lock + refresh>

return gpc->khva + offset;
}
EXPORT_SYMBOL_GPL(kvm_gpc_get_kmap);

kvm_pfn_t kvm_gpc_get_pfn(struct gfn_to_pfn_cache *gpc, bool atomic)
{
<lock + refresh of full page>

return gpc->pfn;
}
EXPORT_SYMBOL_GPL(kvm_gpc_get_pfn);

void kvm_gpc_put(struct gfn_to_pfn_cache *gpc)
{
<unlock>
}
EXPORT_SYMBOL_GPL(kvm_gpc_put);

int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc)
{
return __kvm_gpc_refresh(gpc, gfn_to_gpa(gpc->gfn), PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(kvm_gpc_refresh);
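
E.g. a runstate-style user would become roughly this (purely illustrative;
"runstate_off" is whatever page offset the caller tracks, "now" is a local,
and this assumes get_kmap() reports a failed refresh via ERR_PTR()):

	struct vcpu_runstate_info *rs;

	rs = kvm_gpc_get_kmap(gpc, runstate_off, sizeof(*rs), false);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	/* Write directly through the cached kernel mapping. */
	rs->state = RUNSTATE_running;
	rs->state_entry_time = now;

	kvm_gpc_put(gpc);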


And then __kvm_gpc_refresh() would do something like:

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba677830..b2dd2eda4b56 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -236,22 +236,19 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
return -EFAULT;
}

-static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
unsigned long len)
{
struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
- unsigned long page_offset = gpa & ~PAGE_MASK;
bool unmap_old = false;
unsigned long old_uhva;
kvm_pfn_t old_pfn;
void *old_khva;
+ gfn_t gfn;
int ret;

- /*
- * If must fit within a single page. The 'len' argument is
- * only to enforce that.
- */
- if (page_offset + len > PAGE_SIZE)
+ /* An individual cache doesn't support page splits. */
+ if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
return -EINVAL;

/*
@@ -268,16 +265,16 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
goto out_unlock;
}

+ gfn = gpa_to_gfn(gpa);
+
old_pfn = gpc->pfn;
- old_khva = gpc->khva - offset_in_page(gpc->khva);
+ old_khva = gpc->khva;
old_uhva = gpc->uhva;

/* If the userspace HVA is invalid, refresh that first */
- if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+ if (gpc->gfn != gfn || gpc->generation != slots->generation ||
kvm_is_error_hva(gpc->uhva)) {
- gfn_t gfn = gpa_to_gfn(gpa);
-
- gpc->gpa = gpa;
+ gpc->gfn = gfn;
gpc->generation = slots->generation;
gpc->memslot = __gfn_to_memslot(slots, gfn);
gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
@@ -295,12 +292,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
if (!gpc->valid || old_uhva != gpc->uhva) {
ret = hva_to_pfn_retry(gpc);
} else {
- /*
- * If the HVA→PFN mapping was already valid, don't unmap it.
- * But do update gpc->khva because the offset within the page
- * may have changed.
- */
- gpc->khva = old_khva + page_offset;
+ /* If the HVA→PFN mapping was already valid, don't unmap it. */
ret = 0;
goto out_unlock;
}