From: David Stevens <[email protected]>
This series replaces the usage of kvm_vcpu_map in vmx with
gfn_to_pfn_cache. See [1] for details on why kvm_vcpu_map is broken.
The presence of kvm_vcpu_map blocks another series I would like to
try to merge [2]. Although I'm not familiar with the internals of vmx,
I've gone ahead and taken a stab at this cleanup. I've done some manual
testing with nested VMs, and KVM selftests pass, but thorough feedback
would be appreciated. Once this cleanup is done, I'll take a look at
removing kvm_vcpu_map from svm.
[1] https://lore.kernel.org/all/[email protected]/
[2] https://lore.kernel.org/all/[email protected]/
David Stevens (3):
KVM: Support sharing gpc locks
KVM: use gfn=>pfn cache in nested_get_vmcs12_pages
KVM: use gfn=>pfn cache for evmcs
arch/x86/kvm/vmx/hyperv.c | 41 ++++-
arch/x86/kvm/vmx/hyperv.h | 2 +
arch/x86/kvm/vmx/nested.c | 329 +++++++++++++++++++++++++++++---------
arch/x86/kvm/vmx/vmx.c | 48 +++++-
arch/x86/kvm/vmx/vmx.h | 14 +-
arch/x86/kvm/x86.c | 8 +-
arch/x86/kvm/xen.c | 58 +++----
include/linux/kvm_host.h | 12 ++
include/linux/kvm_types.h | 3 +-
virt/kvm/pfncache.c | 37 +++--
10 files changed, 418 insertions(+), 134 deletions(-)
--
2.39.1.456.gfc5497dd1b-goog
From: David Stevens <[email protected]>
Support initializing a gfn_to_pfn_cache with an external lock instead of
its embedded lock. This allows groups of gpcs that are accessed together
to share a lock, which can greatly simplify locking.
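As an illustration, here is a minimal sketch of the intended usage (the
demo_caches struct and functions are made up for this example; only
kvm_gpc_init_with_lock(), kvm_gpc_check() and the lock accessors come from
this series): two caches that are always accessed together are initialized
against one rwlock and then validated in a single critical section.

        /* Hypothetical example only: two gpcs guarded by one shared rwlock. */
        struct demo_caches {
                rwlock_t lock;
                struct gfn_to_pfn_cache a;
                struct gfn_to_pfn_cache b;
        };

        static void demo_caches_init(struct demo_caches *dc, struct kvm *kvm,
                                     struct kvm_vcpu *vcpu)
        {
                rwlock_init(&dc->lock);
                kvm_gpc_init_with_lock(&dc->a, kvm, vcpu, KVM_HOST_USES_PFN, &dc->lock);
                kvm_gpc_init_with_lock(&dc->b, kvm, vcpu, KVM_HOST_USES_PFN, &dc->lock);
        }

        static void demo_caches_read(struct demo_caches *dc, unsigned long len)
        {
                unsigned long flags;

                /* One lock/unlock pair now covers both caches. */
                read_lock_irqsave(dc->a.lock, flags);
                if (kvm_gpc_check(&dc->a, len) && kvm_gpc_check(&dc->b, len)) {
                        /* ... access dc->a.khva and dc->b.khva consistently ... */
                }
                read_unlock_irqrestore(dc->a.lock, flags);
        }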
Signed-off-by: David Stevens <[email protected]>
---
arch/x86/kvm/x86.c | 8 +++---
arch/x86/kvm/xen.c | 58 +++++++++++++++++++--------------------
include/linux/kvm_host.h | 12 ++++++++
include/linux/kvm_types.h | 3 +-
virt/kvm/pfncache.c | 37 +++++++++++++++----------
5 files changed, 70 insertions(+), 48 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 508074e47bc0..ec0de9bc2eae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3047,14 +3047,14 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
struct pvclock_vcpu_time_info *guest_hv_clock;
unsigned long flags;
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
return;
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
}
guest_hv_clock = (void *)(gpc->khva + offset);
@@ -3083,7 +3083,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
guest_hv_clock->version = ++vcpu->hv_clock.version;
mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2681e2007e39..fa8ab23271d3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -59,12 +59,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
/* It could be invalid again already, so we need to check */
- read_lock_irq(&gpc->lock);
+ read_lock_irq(gpc->lock);
if (gpc->valid)
break;
- read_unlock_irq(&gpc->lock);
+ read_unlock_irq(gpc->lock);
} while (1);
/* Paranoia checks on the 32-bit struct layout */
@@ -101,7 +101,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
smp_wmb();
wc->version = wc_version + 1;
- read_unlock_irq(&gpc->lock);
+ read_unlock_irq(gpc->lock);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
@@ -274,15 +274,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
*/
if (atomic) {
local_irq_save(flags);
- if (!read_trylock(&gpc1->lock)) {
+ if (!read_trylock(gpc1->lock)) {
local_irq_restore(flags);
return;
}
} else {
- read_lock_irqsave(&gpc1->lock, flags);
+ read_lock_irqsave(gpc1->lock, flags);
}
while (!kvm_gpc_check(gpc1, user_len1)) {
- read_unlock_irqrestore(&gpc1->lock, flags);
+ read_unlock_irqrestore(gpc1->lock, flags);
/* When invoked from kvm_sched_out() we cannot sleep */
if (atomic)
@@ -291,7 +291,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
if (kvm_gpc_refresh(gpc1, user_len1))
return;
- read_lock_irqsave(&gpc1->lock, flags);
+ read_lock_irqsave(gpc1->lock, flags);
}
if (likely(!user_len2)) {
@@ -316,19 +316,19 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* takes them more than one at a time. Set a subclass on the
* gpc1 lock to make lockdep shut up about it.
*/
- lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
if (atomic) {
- if (!read_trylock(&gpc2->lock)) {
- read_unlock_irqrestore(&gpc1->lock, flags);
+ if (!read_trylock(gpc2->lock)) {
+ read_unlock_irqrestore(gpc1->lock, flags);
return;
}
} else {
- read_lock(&gpc2->lock);
+ read_lock(gpc2->lock);
}
if (!kvm_gpc_check(gpc2, user_len2)) {
- read_unlock(&gpc2->lock);
- read_unlock_irqrestore(&gpc1->lock, flags);
+ read_unlock(gpc2->lock);
+ read_unlock_irqrestore(gpc1->lock, flags);
/* When invoked from kvm_sched_out() we cannot sleep */
if (atomic)
@@ -428,9 +428,9 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
}
if (user_len2)
- read_unlock(&gpc2->lock);
+ read_unlock(gpc2->lock);
- read_unlock_irqrestore(&gpc1->lock, flags);
+ read_unlock_irqrestore(gpc1->lock, flags);
mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
if (user_len2)
@@ -505,14 +505,14 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* does anyway. Page it in and retry the instruction. We're just a
* little more honest about it.
*/
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
return;
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
}
/* Now gpc->khva is a valid kernel address for the vcpu_info */
@@ -540,7 +540,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
: "0" (evtchn_pending_sel32));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
@@ -568,9 +568,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
/*
* This function gets called from kvm_vcpu_block() after setting the
@@ -590,11 +590,11 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
*/
return 0;
}
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
}
rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
return rc;
}
@@ -1172,7 +1172,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
int idx, i;
idx = srcu_read_lock(&kvm->srcu);
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
@@ -1193,7 +1193,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
}
out_rcu:
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
srcu_read_unlock(&kvm->srcu, idx);
return ret;
@@ -1576,7 +1576,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
@@ -1607,10 +1607,10 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
} else {
rc = 1; /* Delivered to the bitmap in shared_info. */
/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
gpc = &vcpu->arch.xen.vcpu_info_cache;
- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(gpc->lock, flags);
if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
@@ -1644,7 +1644,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
}
out_rcu:
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock_irqrestore(gpc->lock, flags);
srcu_read_unlock(&kvm->srcu, idx);
if (kick_vcpu) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 109b18e2789c..7d1f9c6561e3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1279,6 +1279,18 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
struct kvm_vcpu *vcpu, enum pfn_cache_usage usage);
+/**
+ * kvm_gpc_init_with_lock - initialize gfn_to_pfn_cache with an external lock.
+ *
+ * @lock: an initialized rwlock
+ *
+ * See kvm_gpc_init. Allows multiple gfn_to_pfn_cache structs to share the
+ * same lock.
+ */
+void kvm_gpc_init_with_lock(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ rwlock_t *lock);
+
/**
* kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
* physical address.
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 76de36e56cdf..b6432c8cc19c 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -70,7 +70,8 @@ struct gfn_to_pfn_cache {
struct kvm *kvm;
struct kvm_vcpu *vcpu;
struct list_head list;
- rwlock_t lock;
+ rwlock_t *lock;
+ rwlock_t _lock;
struct mutex refresh_lock;
void *khva;
kvm_pfn_t pfn;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba677830..2c6a2edaca9f 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -31,7 +31,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
spin_lock(&kvm->gpc_lock);
list_for_each_entry(gpc, &kvm->gpc_list, list) {
- write_lock_irq(&gpc->lock);
+ write_lock_irq(gpc->lock);
/* Only a single page so no need to care about length */
if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
@@ -50,7 +50,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
}
}
- write_unlock_irq(&gpc->lock);
+ write_unlock_irq(gpc->lock);
}
spin_unlock(&kvm->gpc_lock);
@@ -147,7 +147,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
lockdep_assert_held(&gpc->refresh_lock);
- lockdep_assert_held_write(&gpc->lock);
+ lockdep_assert_held_write(gpc->lock);
/*
* Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
@@ -160,7 +160,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
mmu_seq = gpc->kvm->mmu_invalidate_seq;
smp_rmb();
- write_unlock_irq(&gpc->lock);
+ write_unlock_irq(gpc->lock);
/*
* If the previous iteration "failed" due to an mmu_notifier
@@ -208,7 +208,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
}
}
- write_lock_irq(&gpc->lock);
+ write_lock_irq(gpc->lock);
/*
* Other tasks must wait for _this_ refresh to complete before
@@ -231,7 +231,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
return 0;
out_error:
- write_lock_irq(&gpc->lock);
+ write_lock_irq(gpc->lock);
return -EFAULT;
}
@@ -261,7 +261,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
*/
mutex_lock(&gpc->refresh_lock);
- write_lock_irq(&gpc->lock);
+ write_lock_irq(gpc->lock);
if (!gpc->active) {
ret = -EINVAL;
@@ -321,7 +321,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
unmap_old = (old_pfn != gpc->pfn);
out_unlock:
- write_unlock_irq(&gpc->lock);
+ write_unlock_irq(gpc->lock);
mutex_unlock(&gpc->refresh_lock);
@@ -339,20 +339,29 @@ EXPORT_SYMBOL_GPL(kvm_gpc_refresh);
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
+{
+ rwlock_init(&gpc->_lock);
+ kvm_gpc_init_with_lock(gpc, kvm, vcpu, usage, &gpc->_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_gpc_init);
+
+void kvm_gpc_init_with_lock(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ rwlock_t *lock)
{
WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);
- rwlock_init(&gpc->lock);
mutex_init(&gpc->refresh_lock);
gpc->kvm = kvm;
gpc->vcpu = vcpu;
+ gpc->lock = lock;
gpc->usage = usage;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->uhva = KVM_HVA_ERR_BAD;
}
-EXPORT_SYMBOL_GPL(kvm_gpc_init);
+EXPORT_SYMBOL_GPL(kvm_gpc_init_with_lock);
int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
@@ -371,9 +380,9 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
* refresh must not establish a mapping until the cache is
* reachable by mmu_notifier events.
*/
- write_lock_irq(&gpc->lock);
+ write_lock_irq(gpc->lock);
gpc->active = true;
- write_unlock_irq(&gpc->lock);
+ write_unlock_irq(gpc->lock);
}
return __kvm_gpc_refresh(gpc, gpa, len);
}
@@ -391,7 +400,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
* must stall mmu_notifier events until all users go away, i.e.
* until gpc->lock is dropped and refresh is guaranteed to fail.
*/
- write_lock_irq(&gpc->lock);
+ write_lock_irq(gpc->lock);
gpc->active = false;
gpc->valid = false;
@@ -406,7 +415,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
old_pfn = gpc->pfn;
gpc->pfn = KVM_PFN_ERR_FAULT;
- write_unlock_irq(&gpc->lock);
+ write_unlock_irq(gpc->lock);
spin_lock(&kvm->gpc_lock);
list_del(&gpc->list);
--
2.39.1.456.gfc5497dd1b-goog
From: David Stevens <[email protected]>
Use gfn_to_pfn_cache to access guest pages needed by
nested_get_vmcs12_pages. This replaces kvm_vcpu_map, which doesn't
properly handle updates to the HVA->GFN mapping.
The MSR bitmap is only accessed in nested_vmx_prepare_msr_bitmap, so it
could potentially be accessed directly through the HVA. However, using a
persistent gpc should be more efficient, and the gpc can easily be
maintained alongside the other gpcs.
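For reference, the generic gpc access pattern being adopted here has the
same shape as the existing users in x86.c and xen.c; a rough sketch
(demo_gpc_read is a made-up name, not code from this patch):

        static int demo_gpc_read(struct gfn_to_pfn_cache *gpc, unsigned long len)
        {
                unsigned long flags;

                read_lock_irqsave(gpc->lock, flags);
                while (!kvm_gpc_check(gpc, len)) {
                        read_unlock_irqrestore(gpc->lock, flags);

                        /* May sleep; re-establishes the gpa=>pfn/khva mapping. */
                        if (kvm_gpc_refresh(gpc, len))
                                return -EFAULT;

                        read_lock_irqsave(gpc->lock, flags);
                }

                /* gpc->khva and gpc->pfn are stable until the read lock is dropped. */

                read_unlock_irqrestore(gpc->lock, flags);
                return 0;
        }

The vmentry path cannot sleep with the gpc lock held, so instead of
refreshing inline it fails the check, drops the lock, refreshes via
nested_get_vmcs12_pages_refresh(), and then retries.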
Signed-off-by: David Stevens <[email protected]>
---
arch/x86/kvm/vmx/nested.c | 206 ++++++++++++++++++++++++++++++--------
arch/x86/kvm/vmx/vmx.c | 38 ++++++-
arch/x86/kvm/vmx/vmx.h | 11 +-
3 files changed, 204 insertions(+), 51 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 557b9c468734..cb41113caa8a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -324,9 +324,10 @@ static void free_nested(struct kvm_vcpu *vcpu)
* page's backing page (yeah, confusing) shouldn't actually be accessed,
* and if it is written, the contents are irrelevant.
*/
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ kvm_gpc_deactivate(&vmx->nested.apic_access_gpc);
+ kvm_gpc_deactivate(&vmx->nested.virtual_apic_gpc);
+ kvm_gpc_deactivate(&vmx->nested.pi_desc_gpc);
+ kvm_gpc_deactivate(&vmx->nested.msr_bitmap_gpc);
vmx->nested.pi_desc = NULL;
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
@@ -558,19 +559,22 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
msr_bitmap_l0, msr);
}
+static bool nested_vmcs12_gpc_check(struct gfn_to_pfn_cache *gpc,
+ gpa_t gpa, unsigned long len, bool *try_refresh);
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
*/
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+ struct vmcs12 *vmcs12,
+ bool *try_refresh)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
- struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
@@ -590,10 +594,11 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
return true;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
+ if (!nested_vmcs12_gpc_check(&vmx->nested.msr_bitmap_gpc,
+ vmcs12->msr_bitmap, PAGE_SIZE, try_refresh))
return false;
- msr_bitmap_l1 = (unsigned long *)map->hva;
+ msr_bitmap_l1 = vmx->nested.msr_bitmap_gpc.khva;
/*
* To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -654,8 +659,6 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_PRED_CMD, MSR_TYPE_W);
- kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
-
vmx->nested.force_msr_bitmap_recalc = false;
return true;
@@ -3184,11 +3187,59 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
return true;
}
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+static bool nested_vmcs12_gpc_check(struct gfn_to_pfn_cache *gpc,
+ gpa_t gpa, unsigned long len, bool *try_refresh)
+{
+ bool check;
+
+ if (gpc->gpa != gpa || !gpc->active)
+ return false;
+ check = kvm_gpc_check(gpc, len);
+ if (!check)
+ *try_refresh = true;
+ return check;
+}
+
+static void nested_vmcs12_gpc_refresh(struct gfn_to_pfn_cache *gpc,
+ gpa_t gpa, unsigned long len)
+{
+ if (gpc->gpa != gpa || !gpc->active) {
+ kvm_gpc_deactivate(gpc);
+
+ if (kvm_gpc_activate(gpc, gpa, len))
+ kvm_gpc_deactivate(gpc);
+ } else {
+ if (kvm_gpc_refresh(gpc, len))
+ kvm_gpc_deactivate(gpc);
+ }
+}
+
+static void nested_get_vmcs12_pages_refresh(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ nested_vmcs12_gpc_refresh(&vmx->nested.apic_access_gpc,
+ vmcs12->apic_access_addr, PAGE_SIZE);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ nested_vmcs12_gpc_refresh(&vmx->nested.virtual_apic_gpc,
+ vmcs12->virtual_apic_page_addr, PAGE_SIZE);
+
+ if (nested_cpu_has_posted_intr(vmcs12))
+ nested_vmcs12_gpc_refresh(&vmx->nested.pi_desc_gpc,
+ vmcs12->posted_intr_desc_addr, sizeof(struct pi_desc));
+
+ if (cpu_has_vmx_msr_bitmap() && nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ nested_vmcs12_gpc_refresh(&vmx->nested.msr_bitmap_gpc,
+ vmcs12->msr_bitmap, PAGE_SIZE);
+}
+
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, bool *try_refresh)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_host_map *map;
if (!vcpu->arch.pdptrs_from_userspace &&
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
@@ -3197,16 +3248,19 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* the guest CR3 might be restored prior to setting the nested
* state which can lead to a load of wrong PDPTRs.
*/
- if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) {
+ *try_refresh = false;
return false;
+ }
}
-
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- map = &vmx->nested.apic_access_page_map;
-
- if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
- vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+ if (nested_vmcs12_gpc_check(&vmx->nested.apic_access_gpc,
+ vmcs12->apic_access_addr, PAGE_SIZE, try_refresh)) {
+ vmcs_write64(APIC_ACCESS_ADDR,
+ pfn_to_hpa(vmx->nested.apic_access_gpc.pfn));
+ } else if (*try_refresh) {
+ return false;
} else {
pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
__func__);
@@ -3219,10 +3273,13 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- map = &vmx->nested.virtual_apic_map;
-
- if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+ if (nested_vmcs12_gpc_check(&vmx->nested.virtual_apic_gpc,
+ vmcs12->virtual_apic_page_addr, PAGE_SIZE,
+ try_refresh)) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ pfn_to_hpa(vmx->nested.virtual_apic_gpc.pfn));
+ } else if (*try_refresh) {
+ return false;
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3245,14 +3302,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- map = &vmx->nested.pi_desc_map;
-
- if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+ if (nested_vmcs12_gpc_check(&vmx->nested.pi_desc_gpc,
+ vmcs12->posted_intr_desc_addr,
+ sizeof(struct pi_desc), try_refresh)) {
vmx->nested.pi_desc =
- (struct pi_desc *)(((void *)map->hva) +
- offset_in_page(vmcs12->posted_intr_desc_addr));
+ (struct pi_desc *)vmx->nested.pi_desc_gpc.khva;
vmcs_write64(POSTED_INTR_DESC_ADDR,
- pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+ pfn_to_hpa(vmx->nested.pi_desc_gpc.pfn) +
+ offset_in_page(vmx->nested.pi_desc_gpc.gpa));
+ } else if (*try_refresh) {
+ return false;
} else {
/*
* Defer the KVM_INTERNAL_EXIT until KVM tries to
@@ -3264,16 +3323,22 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
}
}
- if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
+ if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12, try_refresh)) {
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
- else
+ } else {
+ if (*try_refresh)
+ return false;
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+ }
return true;
}
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
+ bool success, try_refresh;
+ int idx;
+
/*
* Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
* in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
@@ -3291,8 +3356,24 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
return false;
}
- if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
- return false;
+ if (!is_guest_mode(vcpu))
+ return true;
+
+ try_refresh = true;
+retry:
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ success = nested_get_vmcs12_pages(vcpu, &try_refresh);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!success) {
+ if (try_refresh) {
+ nested_get_vmcs12_pages_refresh(vcpu);
+ try_refresh = false;
+ goto retry;
+ } else {
+ return false;
+ }
+ }
return true;
}
@@ -3389,6 +3470,8 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
.failed_vmentry = 1,
};
u32 failed_index;
+ bool success, try_refresh;
+ unsigned long flags;
trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
vmx->nested.current_vmptr,
@@ -3441,13 +3524,26 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
if (from_vmentry) {
- if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
- vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ try_refresh = true;
+retry:
+ read_lock_irqsave(vmx->nested.apic_access_gpc.lock, flags);
+ success = nested_get_vmcs12_pages(vcpu, &try_refresh);
+
+ if (unlikely(!success)) {
+ read_unlock_irqrestore(vmx->nested.apic_access_gpc.lock, flags);
+ if (try_refresh) {
+ nested_get_vmcs12_pages_refresh(vcpu);
+ try_refresh = false;
+ goto retry;
+ } else {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ }
}
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ read_unlock_irqrestore(vmx->nested.apic_access_gpc.lock, flags);
return NVMX_VMENTRY_VMFAIL;
}
@@ -3455,12 +3551,16 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
&entry_failure_code)) {
exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
+ read_unlock_irqrestore(vmx->nested.apic_access_gpc.lock, flags);
goto vmentry_fail_vmexit;
}
}
enter_guest_mode(vcpu);
+ if (from_vmentry)
+ read_unlock_irqrestore(vmx->nested.apic_access_gpc.lock, flags);
+
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
@@ -3810,9 +3910,10 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int max_irr;
+ int max_irr, idx;
void *vapic_page;
u16 status;
+ bool success;
if (!vmx->nested.pi_pending)
return 0;
@@ -3827,7 +3928,17 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
- vapic_page = vmx->nested.virtual_apic_map.hva;
+retry:
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ success = kvm_gpc_check(&vmx->nested.virtual_apic_gpc, PAGE_SIZE);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!success) {
+ if (kvm_gpc_refresh(&vmx->nested.virtual_apic_gpc, PAGE_SIZE))
+ goto mmio_needed;
+ goto retry;
+ }
+ vapic_page = vmx->nested.virtual_apic_gpc.khva;
if (!vapic_page)
goto mmio_needed;
@@ -4827,12 +4938,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_update_cpu_dirty_logging(vcpu);
}
- /* Unpin physical memory we referred to in vmcs02 */
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
- vmx->nested.pi_desc = NULL;
-
if (vmx->nested.reload_vmcs01_apic_access_page) {
vmx->nested.reload_vmcs01_apic_access_page = false;
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
@@ -5246,6 +5351,12 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
vmx->nested.current_vmptr = INVALID_GPA;
+
+ kvm_gpc_deactivate(&vmx->nested.apic_access_gpc);
+ kvm_gpc_deactivate(&vmx->nested.virtual_apic_gpc);
+ kvm_gpc_deactivate(&vmx->nested.pi_desc_gpc);
+ kvm_gpc_deactivate(&vmx->nested.msr_bitmap_gpc);
+ vmx->nested.pi_desc = NULL;
}
/* Emulate the VMXOFF instruction */
@@ -5620,6 +5731,17 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
+ kvm_gpc_activate(&vmx->nested.apic_access_gpc,
+ vmx->nested.cached_vmcs12->apic_access_addr, PAGE_SIZE);
+ kvm_gpc_activate(&vmx->nested.virtual_apic_gpc,
+ vmx->nested.cached_vmcs12->virtual_apic_page_addr,
+ PAGE_SIZE);
+ kvm_gpc_activate(&vmx->nested.pi_desc_gpc,
+ vmx->nested.cached_vmcs12->posted_intr_desc_addr,
+ sizeof(struct pi_desc));
+ kvm_gpc_activate(&vmx->nested.msr_bitmap_gpc,
+ vmx->nested.cached_vmcs12->msr_bitmap, PAGE_SIZE);
+
set_current_vmptr(vmx, vmptr);
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c788aa382611..1bb8252d40aa 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4097,16 +4097,27 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
void *vapic_page;
u32 vppr;
- int rvi;
+ int rvi, idx;
+ bool success;
if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_gpc.gpa))
return false;
rvi = vmx_get_rvi();
+retry:
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ success = kvm_gpc_check(&vmx->nested.virtual_apic_gpc, PAGE_SIZE);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
- vapic_page = vmx->nested.virtual_apic_map.hva;
+ if (!success) {
+ if (kvm_gpc_refresh(&vmx->nested.virtual_apic_gpc, PAGE_SIZE))
+ return false;
+ goto retry;
+ }
+
+ vapic_page = vmx->nested.virtual_apic_gpc.khva;
vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
return ((rvi & 0xf0) > (vppr & 0xf0));
@@ -4804,6 +4815,27 @@ static void init_vmcs(struct vcpu_vmx *vmx)
}
vmx_setup_uret_msrs(vmx);
+
+ if (nested) {
+ memset(&vmx->nested.apic_access_gpc, 0, sizeof(vmx->nested.apic_access_gpc));
+ kvm_gpc_init(&vmx->nested.apic_access_gpc, kvm, &vmx->vcpu,
+ KVM_GUEST_USES_PFN);
+
+ memset(&vmx->nested.virtual_apic_gpc, 0, sizeof(vmx->nested.virtual_apic_gpc));
+ kvm_gpc_init_with_lock(&vmx->nested.virtual_apic_gpc, kvm, &vmx->vcpu,
+ KVM_GUEST_AND_HOST_USE_PFN,
+ vmx->nested.apic_access_gpc.lock);
+
+ memset(&vmx->nested.pi_desc_gpc, 0, sizeof(vmx->nested.pi_desc_gpc));
+ kvm_gpc_init_with_lock(&vmx->nested.pi_desc_gpc, kvm, &vmx->vcpu,
+ KVM_GUEST_AND_HOST_USE_PFN,
+ vmx->nested.apic_access_gpc.lock);
+
+ memset(&vmx->nested.msr_bitmap_gpc, 0, sizeof(vmx->nested.msr_bitmap_gpc));
+ kvm_gpc_init_with_lock(&vmx->nested.msr_bitmap_gpc, kvm, &vmx->vcpu,
+ KVM_HOST_USES_PFN,
+ vmx->nested.apic_access_gpc.lock);
+ }
}
static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a3da84f4ea45..e067730a0222 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -207,13 +207,12 @@ struct nested_vmx {
/*
* Guest pages referred to in the vmcs02 with host-physical
- * pointers, so we must keep them pinned while L2 runs.
+ * pointers.
*/
- struct kvm_host_map apic_access_page_map;
- struct kvm_host_map virtual_apic_map;
- struct kvm_host_map pi_desc_map;
-
- struct kvm_host_map msr_bitmap_map;
+ struct gfn_to_pfn_cache apic_access_gpc;
+ struct gfn_to_pfn_cache virtual_apic_gpc;
+ struct gfn_to_pfn_cache pi_desc_gpc;
+ struct gfn_to_pfn_cache msr_bitmap_gpc;
struct pi_desc *pi_desc;
bool pi_pending;
--
2.39.1.456.gfc5497dd1b-goog
From: David Stevens <[email protected]>
Use gfn_to_pfn_cache to access evmcs. This replaces kvm_vcpu_map, which
doesn't properly handle updates to the HVA->GFN mapping.
This change introduces a number of new failure cases, since refreshing a
gpc can fail. Because the evmcs is sometimes accessed alongside the
vmcs12 pages, the evmcs gpc is initialized to share the vmcs12 pages'
gpc lock for simplicity. This locking is coarser than necessary, but the
lock should rarely be taken outside of the vcpu thread, so the impact
should be minimal.
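For reference, a rough sketch of how callers use the new
nested_evmcs_lock_and_acquire() helper (the function below is made up for
illustration; error handling is trimmed):

        /* Hypothetical example: read one evmcs field via the gpc-backed mapping. */
        static bool demo_evmcs_msr_bitmap_enabled(struct kvm_vcpu *vcpu)
        {
                struct vcpu_vmx *vmx = to_vmx(vcpu);
                struct hv_enlightened_vmcs *evmcs;
                unsigned long flags;
                bool ret;

                /*
                 * Checks the gpc and refreshes it if necessary; NULL means the
                 * gpc could not be refreshed and has been deactivated.
                 */
                evmcs = nested_evmcs_lock_and_acquire(vcpu, &flags);
                if (!evmcs)
                        return false;

                ret = evmcs->hv_enlightenments_control.msr_bitmap;

                /* The helper returns with the shared gpc read lock held. */
                read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
                return ret;
        }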
Signed-off-by: David Stevens <[email protected]>
---
arch/x86/kvm/vmx/hyperv.c | 41 ++++++++++-
arch/x86/kvm/vmx/hyperv.h | 2 +
arch/x86/kvm/vmx/nested.c | 151 +++++++++++++++++++++++++++-----------
arch/x86/kvm/vmx/vmx.c | 10 +++
arch/x86/kvm/vmx/vmx.h | 3 +-
5 files changed, 158 insertions(+), 49 deletions(-)
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index 22daca752797..1b140ef1d4db 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -554,12 +554,21 @@ bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *evmcs;
+ unsigned long flags;
+ bool nested_flush_hypercall;
- if (!hv_vcpu || !evmcs)
+ if (!hv_vcpu || !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return false;
- if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ evmcs = nested_evmcs_lock_and_acquire(vcpu, &flags);
+ if (!evmcs)
+ return false;
+
+ nested_flush_hypercall = evmcs->hv_enlightenments_control.nested_flush_hypercall;
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
+
+ if (!nested_flush_hypercall)
return false;
return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
@@ -569,3 +578,29 @@ void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
{
nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
}
+
+struct hv_enlightened_vmcs *nested_evmcs_lock_and_acquire(struct kvm_vcpu *vcpu,
+ unsigned long *flags_out)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+retry:
+ read_lock_irqsave(vmx->nested.hv_evmcs_gpc.lock, flags);
+ if (!kvm_gpc_check(&vmx->nested.hv_evmcs_gpc, sizeof(struct hv_enlightened_vmcs))) {
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
+ if (!vmx->nested.hv_evmcs_gpc.active)
+ return NULL;
+
+ if (kvm_gpc_refresh(&vmx->nested.hv_evmcs_gpc,
+ sizeof(struct hv_enlightened_vmcs))) {
+ kvm_gpc_deactivate(&vmx->nested.hv_evmcs_gpc);
+ return NULL;
+ }
+
+ goto retry;
+ }
+
+ *flags_out = flags;
+ return vmx->nested.hv_evmcs_gpc.khva;
+}
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index ab08a9b9ab7d..43a9488f9a38 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -306,5 +306,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+struct hv_enlightened_vmcs *nested_evmcs_lock_and_acquire(struct kvm_vcpu *vcpu,
+ unsigned long *flags_out);
#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index cb41113caa8a..b8fff71583c9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -229,10 +229,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
- kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs = NULL;
- }
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ kvm_gpc_deactivate(&vmx->nested.hv_evmcs_gpc);
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
@@ -574,7 +572,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *evmcs;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
@@ -589,10 +587,18 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
* - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
* and tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
- evmcs->hv_enlightenments_control.msr_bitmap &&
- evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
- return true;
+ if (!vmx->nested.force_msr_bitmap_recalc && vmx->nested.hv_evmcs_gpc.active) {
+ if (!kvm_gpc_check(&vmx->nested.hv_evmcs_gpc,
+ sizeof(struct hv_enlightened_vmcs))) {
+ *try_refresh = true;
+ return false;
+ }
+
+ evmcs = vmx->nested.hv_evmcs_gpc.khva;
+ if (evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+ }
if (!nested_vmcs12_gpc_check(&vmx->nested.msr_bitmap_gpc,
vmcs12->msr_bitmap, PAGE_SIZE, try_refresh))
@@ -1573,11 +1579,18 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
vmcs_load(vmx->loaded_vmcs->vmcs);
}
-static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
+static bool copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, bool full_copy)
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *evmcs;
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
+ unsigned long flags;
+ u32 hv_clean_fields;
+
+ evmcs = nested_evmcs_lock_and_acquire(&vmx->vcpu, &flags);
+ if (!evmcs)
+ return false;
+ hv_clean_fields = full_copy ? 0 : evmcs->hv_clean_fields;
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
@@ -1814,13 +1827,25 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
* vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
*/
- return;
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
+ return true;
}
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *evmcs;
+ unsigned long flags;
+
+ evmcs = nested_evmcs_lock_and_acquire(&vmx->vcpu, &flags);
+ if (WARN_ON_ONCE(!evmcs)) {
+ /*
+ * We can't sync, so the state is now invalid. This isn't an immediate
+ * problem, but further accesses will be errors. Failing to acquire the
+ * evmcs gpc deactivates it, so any subsequent attempts will also fail.
+ */
+ return;
+ }
/*
* Should not be changed by KVM:
@@ -1988,6 +2013,8 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
+
return;
}
@@ -2001,6 +2028,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
+ struct hv_enlightened_vmcs *hv_evmcs;
+ unsigned long flags;
if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
@@ -2016,11 +2045,14 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
nested_release_evmcs(vcpu);
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
- &vmx->nested.hv_evmcs_map))
+ if (kvm_gpc_activate(&vmx->nested.hv_evmcs_gpc, evmcs_gpa, PAGE_SIZE)) {
+ kvm_gpc_deactivate(&vmx->nested.hv_evmcs_gpc);
return EVMPTRLD_ERROR;
+ }
- vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
+ hv_evmcs = nested_evmcs_lock_and_acquire(&vmx->vcpu, &flags);
+ if (!hv_evmcs)
+ return EVMPTRLD_ERROR;
/*
* Currently, KVM only supports eVMCS version 1
@@ -2044,9 +2076,10 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
* eVMCS version or VMCS12 revision_id as valid values for first
* u32 field of eVMCS.
*/
- if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
- (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
+ if (hv_evmcs->revision_id != KVM_EVMCS_VERSION &&
+ hv_evmcs->revision_id != VMCS12_REVISION) {
nested_release_evmcs(vcpu);
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
return EVMPTRLD_VMFAIL;
}
@@ -2072,8 +2105,15 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
if (from_launch || evmcs_gpa_changed) {
- vmx->nested.hv_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ if (!evmcs_gpa_changed) {
+ hv_evmcs = nested_evmcs_lock_and_acquire(&vmx->vcpu, &flags);
+ if (!hv_evmcs)
+ return EVMPTRLD_ERROR;
+ }
+
+ hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
vmx->nested.force_msr_bitmap_recalc = true;
}
@@ -2399,9 +2439,10 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
}
}
-static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_rare(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ struct hv_enlightened_vmcs *hv_evmcs)
{
- struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
@@ -2534,13 +2575,17 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool load_guest_pdptrs_vmcs12 = false;
+ struct hv_enlightened_vmcs *hv_evmcs = NULL;
+
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ hv_evmcs = vmx->nested.hv_evmcs_gpc.khva;
if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
- prepare_vmcs02_rare(vmx, vmcs12);
+ prepare_vmcs02_rare(vcpu, vmcs12, hv_evmcs);
vmx->nested.dirty_vmcs12 = false;
load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
- !(vmx->nested.hv_evmcs->hv_clean_fields &
+ !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
@@ -2663,8 +2708,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* here.
*/
if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
- vmx->nested.hv_evmcs->hv_clean_fields |=
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ hv_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
return 0;
}
@@ -3214,7 +3258,7 @@ static void nested_vmcs12_gpc_refresh(struct gfn_to_pfn_cache *gpc,
}
}
-static void nested_get_vmcs12_pages_refresh(struct kvm_vcpu *vcpu)
+static bool nested_get_vmcs12_pages_refresh(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3231,9 +3275,24 @@ static void nested_get_vmcs12_pages_refresh(struct kvm_vcpu *vcpu)
nested_vmcs12_gpc_refresh(&vmx->nested.pi_desc_gpc,
vmcs12->posted_intr_desc_addr, sizeof(struct pi_desc));
- if (cpu_has_vmx_msr_bitmap() && nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ if (cpu_has_vmx_msr_bitmap() && nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) {
+ if (vmx->nested.hv_evmcs_gpc.active) {
+ if (kvm_gpc_refresh(&vmx->nested.hv_evmcs_gpc, PAGE_SIZE)) {
+ kvm_gpc_deactivate(&vmx->nested.hv_evmcs_gpc);
+ pr_debug_ratelimited("%s: no backing for evmcs\n", __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+ }
+
nested_vmcs12_gpc_refresh(&vmx->nested.msr_bitmap_gpc,
vmcs12->msr_bitmap, PAGE_SIZE);
+ }
+
+ return true;
}
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, bool *try_refresh)
@@ -3366,13 +3425,11 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (!success) {
- if (try_refresh) {
- nested_get_vmcs12_pages_refresh(vcpu);
+ if (try_refresh && nested_get_vmcs12_pages_refresh(vcpu)) {
try_refresh = false;
goto retry;
- } else {
- return false;
}
+ return false;
}
return true;
@@ -3531,14 +3588,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (unlikely(!success)) {
read_unlock_irqrestore(vmx->nested.apic_access_gpc.lock, flags);
- if (try_refresh) {
- nested_get_vmcs12_pages_refresh(vcpu);
+ if (try_refresh && nested_get_vmcs12_pages_refresh(vcpu)) {
try_refresh = false;
goto retry;
- } else {
- vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
}
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
}
if (nested_vmx_check_vmentry_hw(vcpu)) {
@@ -3680,7 +3735,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return nested_vmx_failInvalid(vcpu);
if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
- copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
+ if (!copy_enlightened_to_vmcs12(vmx, false))
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
/* Enlightened VMCS doesn't have launch state */
vmcs12->launch_state = !launch;
} else if (enable_shadow_vmcs) {
@@ -5421,7 +5477,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12,
launch_state),
&zero, sizeof(zero));
- } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
+ } else if (vmx->nested.hv_evmcs_gpc.active && vmptr == vmx->nested.hv_evmcs_vmptr) {
nested_release_evmcs(vcpu);
}
@@ -5448,8 +5504,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs;
struct x86_exception e;
- unsigned long field;
+ unsigned long field, flags;
u64 value;
gva_t gva = 0;
short offset;
@@ -5498,8 +5555,13 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ evmcs = nested_evmcs_lock_and_acquire(&vmx->vcpu, &flags);
+ if (!evmcs)
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+
/* Read the field, zero-extended to a u64 value */
- value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ value = evmcs_read_any(evmcs, field, offset);
+ read_unlock_irqrestore(vmx->nested.hv_evmcs_gpc.lock, flags);
}
/*
@@ -6604,7 +6666,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
} else {
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
if (!vmx->nested.need_vmcs12_to_shadow_sync) {
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
/*
* L1 hypervisor is not obliged to keep eVMCS
* clean fields data always up-to-date while
@@ -6612,8 +6674,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
* supposed to be actual upon vmentry so we need
* to ignore it here and do full copy.
*/
- copy_enlightened_to_vmcs12(vmx, 0);
- else if (enable_shadow_vmcs)
+ if (!copy_enlightened_to_vmcs12(vmx, true))
+ return -EFAULT;
+ } else if (enable_shadow_vmcs)
copy_shadow_to_vmcs12(vmx);
}
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1bb8252d40aa..1c13fc1b7b5e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4835,6 +4835,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
kvm_gpc_init_with_lock(&vmx->nested.msr_bitmap_gpc, kvm, &vmx->vcpu,
KVM_HOST_USES_PFN,
vmx->nested.apic_access_gpc.lock);
+
+ memset(&vmx->nested.hv_evmcs_gpc, 0, sizeof(vmx->nested.hv_evmcs_gpc));
+ /*
+ * Share the same lock for simpler locking. Taking the lock
+ * outside of the vcpu thread should be rare, so the cost of
+ * the coarser locking should be minimal
+ */
+ kvm_gpc_init_with_lock(&vmx->nested.hv_evmcs_gpc, kvm, &vmx->vcpu,
+ KVM_GUEST_AND_HOST_USE_PFN,
+ vmx->nested.apic_access_gpc.lock);
}
}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e067730a0222..71e52daf60af 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -252,9 +252,8 @@ struct nested_vmx {
bool guest_mode;
} smm;
+ struct gfn_to_pfn_cache hv_evmcs_gpc;
gpa_t hv_evmcs_vmptr;
- struct kvm_host_map hv_evmcs_map;
- struct hv_enlightened_vmcs *hv_evmcs;
};
struct vcpu_vmx {
--
2.39.1.456.gfc5497dd1b-goog
On Fri, Jan 27, 2023, David Stevens wrote:
> From: David Stevens <[email protected]>
>
> This series replaces the usage of kvm_vcpu_map in vmx with
> gfn_to_pfn_cache. See [1] for details on why kvm_vcpu_map is broken.
>
> The presence of kvm_vcpu_map blocks another series I would like to
> try to merge [2]. Although I'm not familiar with the internals of vmx,
> I've gone ahead and taken a stab at this cleanup. I've done some manual
> testing with nested VMs, and KVM selftests pass, but thorough feedback
> would be appreciated. Once this cleanup is done, I'll take a look at
> removing kvm_vcpu_map from svm.
Woot, been waiting for someone to take this one, thanks! It'll likely be a week
or two until I get 'round to this, but it's definitely something I want to get
merged sooner than later.
Hi David,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on kvm/queue]
[also build test ERROR on linus/master v6.2-rc5 next-20230127]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link: https://lore.kernel.org/r/20230127044500.680329-2-stevensd%40google.com
patch subject: [PATCH 1/3] KVM: Support sharing gpc locks
config: i386-randconfig-a003 (https://download.01.org/0day-ci/archive/20230129/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/14d62b317199184bb71256cc5f652b04fb2f9107
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
git checkout 14d62b317199184bb71256cc5f652b04fb2f9107
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=i386 olddefconfig
make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash arch/x86/ drivers/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
arch/x86/kvm/xen.c: In function 'kvm_xen_update_runstate_guest':
>> arch/x86/kvm/xen.c:319:45: error: 'gpc1->lock' is a pointer; did you mean to use '->'?
319 | lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
| ^
| ->
vim +319 arch/x86/kvm/xen.c
172
173 static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
174 {
175 struct kvm_vcpu_xen *vx = &v->arch.xen;
176 struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
177 struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
178 size_t user_len, user_len1, user_len2;
179 struct vcpu_runstate_info rs;
180 unsigned long flags;
181 size_t times_ofs;
182 uint8_t *update_bit = NULL;
183 uint64_t entry_time;
184 uint64_t *rs_times;
185 int *rs_state;
186
187 /*
188 * The only difference between 32-bit and 64-bit versions of the
189 * runstate struct is the alignment of uint64_t in 32-bit, which
190 * means that the 64-bit version has an additional 4 bytes of
191 * padding after the first field 'state'. Let's be really really
192 * paranoid about that, and matching it with our internal data
193 * structures that we memcpy into it...
194 */
195 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
196 BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
197 BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
198 #ifdef CONFIG_X86_64
199 /*
200 * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
201 * so each subsequent field is shifted by 4, and it's 4 bytes longer.
202 */
203 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
204 offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
205 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
206 offsetof(struct compat_vcpu_runstate_info, time) + 4);
207 BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
208 #endif
209 /*
210 * The state field is in the same place at the start of both structs,
211 * and is the same size (int) as vx->current_runstate.
212 */
213 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
214 offsetof(struct compat_vcpu_runstate_info, state));
215 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
216 sizeof(vx->current_runstate));
217 BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
218 sizeof(vx->current_runstate));
219
220 /*
221 * The state_entry_time field is 64 bits in both versions, and the
222 * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
223 * is little-endian means that it's in the last *byte* of the word.
224 * That detail is important later.
225 */
226 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
227 sizeof(uint64_t));
228 BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
229 sizeof(uint64_t));
230 BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
231
232 /*
233 * The time array is four 64-bit quantities in both versions, matching
234 * the vx->runstate_times and immediately following state_entry_time.
235 */
236 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
237 offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
238 BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
239 offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
240 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
241 sizeof_field(struct compat_vcpu_runstate_info, time));
242 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
243 sizeof(vx->runstate_times));
244
245 if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
246 user_len = sizeof(struct vcpu_runstate_info);
247 times_ofs = offsetof(struct vcpu_runstate_info,
248 state_entry_time);
249 } else {
250 user_len = sizeof(struct compat_vcpu_runstate_info);
251 times_ofs = offsetof(struct compat_vcpu_runstate_info,
252 state_entry_time);
253 }
254
255 /*
256 * There are basically no alignment constraints. The guest can set it
257 * up so it crosses from one page to the next, and at arbitrary byte
258 * alignment (and the 32-bit ABI doesn't align the 64-bit integers
259 * anyway, even if the overall struct had been 64-bit aligned).
260 */
261 if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
262 user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
263 user_len2 = user_len - user_len1;
264 } else {
265 user_len1 = user_len;
266 user_len2 = 0;
267 }
268 BUG_ON(user_len1 + user_len2 != user_len);
269
270 retry:
271 /*
272 * Attempt to obtain the GPC lock on *both* (if there are two)
273 * gfn_to_pfn caches that cover the region.
274 */
275 if (atomic) {
276 local_irq_save(flags);
277 if (!read_trylock(gpc1->lock)) {
278 local_irq_restore(flags);
279 return;
280 }
281 } else {
282 read_lock_irqsave(gpc1->lock, flags);
283 }
284 while (!kvm_gpc_check(gpc1, user_len1)) {
285 read_unlock_irqrestore(gpc1->lock, flags);
286
287 /* When invoked from kvm_sched_out() we cannot sleep */
288 if (atomic)
289 return;
290
291 if (kvm_gpc_refresh(gpc1, user_len1))
292 return;
293
294 read_lock_irqsave(gpc1->lock, flags);
295 }
296
297 if (likely(!user_len2)) {
298 /*
299 * Set up three pointers directly to the runstate_info
300 * struct in the guest (via the GPC).
301 *
302 * • @rs_state → state field
303 * • @rs_times → state_entry_time field.
304 * • @update_bit → last byte of state_entry_time, which
305 * contains the XEN_RUNSTATE_UPDATE bit.
306 */
307 rs_state = gpc1->khva;
308 rs_times = gpc1->khva + times_ofs;
309 if (v->kvm->arch.xen.runstate_update_flag)
310 update_bit = ((void *)(&rs_times[1])) - 1;
311 } else {
312 /*
313 * The guest's runstate_info is split across two pages and we
314 * need to hold and validate both GPCs simultaneously. We can
315 * declare a lock ordering GPC1 > GPC2 because nothing else
316 * takes them more than one at a time. Set a subclass on the
317 * gpc1 lock to make lockdep shut up about it.
318 */
> 319 lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
320 if (atomic) {
321 if (!read_trylock(gpc2->lock)) {
322 read_unlock_irqrestore(gpc1->lock, flags);
323 return;
324 }
325 } else {
326 read_lock(gpc2->lock);
327 }
328
329 if (!kvm_gpc_check(gpc2, user_len2)) {
330 read_unlock(gpc2->lock);
331 read_unlock_irqrestore(gpc1->lock, flags);
332
333 /* When invoked from kvm_sched_out() we cannot sleep */
334 if (atomic)
335 return;
336
337 /*
338 * Use kvm_gpc_activate() here because if the runstate
339 * area was configured in 32-bit mode and only extends
340 * to the second page now because the guest changed to
341 * 64-bit mode, the second GPC won't have been set up.
342 */
343 if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
344 user_len2))
345 return;
346
347 /*
348 * We dropped the lock on GPC1 so we have to go all the
349 * way back and revalidate that too.
350 */
351 goto retry;
352 }
353
354 /*
355 * In this case, the runstate_info struct will be assembled on
356 * the kernel stack (compat or not as appropriate) and will
357 * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
358 * rs pointers accordingly.
359 */
360 rs_times = &rs.state_entry_time;
361
362 /*
363 * The rs_state pointer points to the start of what we'll
364 * copy to the guest, which in the case of a compat guest
365 * is the 32-bit field that the compiler thinks is padding.
366 */
367 rs_state = ((void *)rs_times) - times_ofs;
368
369 /*
370 * The update_bit is still directly in the guest memory,
371 * via one GPC or the other.
372 */
373 if (v->kvm->arch.xen.runstate_update_flag) {
374 if (user_len1 >= times_ofs + sizeof(uint64_t))
375 update_bit = gpc1->khva + times_ofs +
376 sizeof(uint64_t) - 1;
377 else
378 update_bit = gpc2->khva + times_ofs +
379 sizeof(uint64_t) - 1 - user_len1;
380 }
381
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
Hi David,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on kvm/queue]
[also build test ERROR on linus/master v6.2-rc5 next-20230127]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link: https://lore.kernel.org/r/20230127044500.680329-2-stevensd%40google.com
patch subject: [PATCH 1/3] KVM: Support sharing gpc locks
config: x86_64-randconfig-a014 (https://download.01.org/0day-ci/archive/20230129/[email protected]/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/14d62b317199184bb71256cc5f652b04fb2f9107
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
git checkout 14d62b317199184bb71256cc5f652b04fb2f9107
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
If you fix the issue, kindly add the following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
>> arch/x86/kvm/xen.c:319:31: error: member reference type 'rwlock_t *' is a pointer; did you mean to use '->'?
lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
~~~~~~~~~~^
->
>> arch/x86/kvm/xen.c:319:21: error: passing 'struct lockdep_map' to parameter of incompatible type 'struct lockdep_map *'; take the address with &
lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
^~~~~~~~~~~~~~~~~~
&
include/linux/lockdep.h:296:58: note: passing argument to parameter 'lock' here
static inline void lock_set_subclass(struct lockdep_map *lock,
^
2 errors generated.
vim +319 arch/x86/kvm/xen.c
172
173 static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
174 {
175 struct kvm_vcpu_xen *vx = &v->arch.xen;
176 struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
177 struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
178 size_t user_len, user_len1, user_len2;
179 struct vcpu_runstate_info rs;
180 unsigned long flags;
181 size_t times_ofs;
182 uint8_t *update_bit = NULL;
183 uint64_t entry_time;
184 uint64_t *rs_times;
185 int *rs_state;
186
187 /*
188 * The only difference between 32-bit and 64-bit versions of the
189 * runstate struct is the alignment of uint64_t in 32-bit, which
190 * means that the 64-bit version has an additional 4 bytes of
191 * padding after the first field 'state'. Let's be really really
192 * paranoid about that, and matching it with our internal data
193 * structures that we memcpy into it...
194 */
195 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
196 BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
197 BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
198 #ifdef CONFIG_X86_64
199 /*
200 * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
201 * so each subsequent field is shifted by 4, and it's 4 bytes longer.
202 */
203 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
204 offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
205 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
206 offsetof(struct compat_vcpu_runstate_info, time) + 4);
207 BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
208 #endif
209 /*
210 * The state field is in the same place at the start of both structs,
211 * and is the same size (int) as vx->current_runstate.
212 */
213 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
214 offsetof(struct compat_vcpu_runstate_info, state));
215 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
216 sizeof(vx->current_runstate));
217 BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
218 sizeof(vx->current_runstate));
219
220 /*
221 * The state_entry_time field is 64 bits in both versions, and the
222 * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
223 * is little-endian means that it's in the last *byte* of the word.
224 * That detail is important later.
225 */
226 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
227 sizeof(uint64_t));
228 BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
229 sizeof(uint64_t));
230 BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
231
232 /*
233 * The time array is four 64-bit quantities in both versions, matching
234 * the vx->runstate_times and immediately following state_entry_time.
235 */
236 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
237 offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
238 BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
239 offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
240 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
241 sizeof_field(struct compat_vcpu_runstate_info, time));
242 BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
243 sizeof(vx->runstate_times));
244
245 if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
246 user_len = sizeof(struct vcpu_runstate_info);
247 times_ofs = offsetof(struct vcpu_runstate_info,
248 state_entry_time);
249 } else {
250 user_len = sizeof(struct compat_vcpu_runstate_info);
251 times_ofs = offsetof(struct compat_vcpu_runstate_info,
252 state_entry_time);
253 }
254
255 /*
256 * There are basically no alignment constraints. The guest can set it
257 * up so it crosses from one page to the next, and at arbitrary byte
258 * alignment (and the 32-bit ABI doesn't align the 64-bit integers
259 * anyway, even if the overall struct had been 64-bit aligned).
260 */
261 if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
262 user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
263 user_len2 = user_len - user_len1;
264 } else {
265 user_len1 = user_len;
266 user_len2 = 0;
267 }
268 BUG_ON(user_len1 + user_len2 != user_len);
269
270 retry:
271 /*
272 * Attempt to obtain the GPC lock on *both* (if there are two)
273 * gfn_to_pfn caches that cover the region.
274 */
275 if (atomic) {
276 local_irq_save(flags);
277 if (!read_trylock(gpc1->lock)) {
278 local_irq_restore(flags);
279 return;
280 }
281 } else {
282 read_lock_irqsave(gpc1->lock, flags);
283 }
284 while (!kvm_gpc_check(gpc1, user_len1)) {
285 read_unlock_irqrestore(gpc1->lock, flags);
286
287 /* When invoked from kvm_sched_out() we cannot sleep */
288 if (atomic)
289 return;
290
291 if (kvm_gpc_refresh(gpc1, user_len1))
292 return;
293
294 read_lock_irqsave(gpc1->lock, flags);
295 }
296
297 if (likely(!user_len2)) {
298 /*
299 * Set up three pointers directly to the runstate_info
300 * struct in the guest (via the GPC).
301 *
302 * • @rs_state → state field
303 * • @rs_times → state_entry_time field.
304 * • @update_bit → last byte of state_entry_time, which
305 * contains the XEN_RUNSTATE_UPDATE bit.
306 */
307 rs_state = gpc1->khva;
308 rs_times = gpc1->khva + times_ofs;
309 if (v->kvm->arch.xen.runstate_update_flag)
310 update_bit = ((void *)(&rs_times[1])) - 1;
311 } else {
312 /*
313 * The guest's runstate_info is split across two pages and we
314 * need to hold and validate both GPCs simultaneously. We can
315 * declare a lock ordering GPC1 > GPC2 because nothing else
316 * takes them more than one at a time. Set a subclass on the
317 * gpc1 lock to make lockdep shut up about it.
318 */
> 319 lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
320 if (atomic) {
321 if (!read_trylock(gpc2->lock)) {
322 read_unlock_irqrestore(gpc1->lock, flags);
323 return;
324 }
325 } else {
326 read_lock(gpc2->lock);
327 }
328
329 if (!kvm_gpc_check(gpc2, user_len2)) {
330 read_unlock(gpc2->lock);
331 read_unlock_irqrestore(gpc1->lock, flags);
332
333 /* When invoked from kvm_sched_out() we cannot sleep */
334 if (atomic)
335 return;
336
337 /*
338 * Use kvm_gpc_activate() here because if the runstate
339 * area was configured in 32-bit mode and only extends
340 * to the second page now because the guest changed to
341 * 64-bit mode, the second GPC won't have been set up.
342 */
343 if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
344 user_len2))
345 return;
346
347 /*
348 * We dropped the lock on GPC1 so we have to go all the
349 * way back and revalidate that too.
350 */
351 goto retry;
352 }
353
354 /*
355 * In this case, the runstate_info struct will be assembled on
356 * the kernel stack (compat or not as appropriate) and will
357 * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
358 * rs pointers accordingly.
359 */
360 rs_times = &rs.state_entry_time;
361
362 /*
363 * The rs_state pointer points to the start of what we'll
364 * copy to the guest, which in the case of a compat guest
365 * is the 32-bit field that the compiler thinks is padding.
366 */
367 rs_state = ((void *)rs_times) - times_ofs;
368
369 /*
370 * The update_bit is still directly in the guest memory,
371 * via one GPC or the other.
372 */
373 if (v->kvm->arch.xen.runstate_update_flag) {
374 if (user_len1 >= times_ofs + sizeof(uint64_t))
375 update_bit = gpc1->khva + times_ofs +
376 sizeof(uint64_t) - 1;
377 else
378 update_bit = gpc2->khva + times_ofs +
379 sizeof(uint64_t) - 1 - user_len1;
380 }
381
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
(Resending as the corporate email system has taken to adding redundant
information in transit as an HTML part — to a message that didn't need
it, which already had it in the Organization: header anyway, and was
previously plain text only.)
On Fri, 2023-01-27 at 13:44 +0900, David Stevens wrote:
> @@ -316,19 +316,19 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
> * takes them more than one at a time. Set a subclass on the
> * gpc1 lock to make lockdep shut up about it.
> */
> - lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
> + lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
The test robot already pointed out this should be &gpc1->lock->dep_map,
but if we do what I said below in the first attempt at this email, this
all goes away anyway.
> if (atomic) {
> - if (!read_trylock(&gpc2->lock)) {
> - read_unlock_irqrestore(&gpc1->lock, flags);
> + if (!read_trylock(gpc2->lock)) {
> + read_unlock_irqrestore(gpc1->lock, flags);
> return;
> }
LGTM without having had time to go through it in detail. As Sean said,
we were waiting for someone (perhaps even ourselves) to do this.
Thanks.
For the runstate one above, this is crying out for gpc1 and gpc2 to
*share* a lock, and ditch the relocking and the whole comment about
being able to declare a lock ordering between the two, which is half
shown in the context cited above.
Probably worth doing that in a separate patch on top of this; I'll take
a look at it when my feet next hit the ground on Tuesday if you haven't
already done so. It's easy to test with the xen_shinfo_test self-test.
Or with this if you want more fun:
https://lore.kernel.org/qemu-devel/[email protected]/
:)
From: David Stevens <[email protected]>
Simplify locking in the case where the guest's runstate_info is split
across two pages by sharing a single lock for the two gpcs.
Signed-off-by: David Stevens <[email protected]>
---
I tested this patch with xen_shinfo_test as suggested, and it still
passes. I agree that it makes sense to do this as a separate patch. For
the bot-reported issue, looks like I forgot to build with lockdep
enabled. I'll fix the issue with that patch in the next revision of the
series, so that there aren't any commits which don't build.
arch/x86/kvm/xen.c | 25 ++++---------------------
1 file changed, 4 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index fa8ab23271d3..9251f88a4e0d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -310,24 +310,10 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
update_bit = ((void *)(&rs_times[1])) - 1;
} else {
/*
- * The guest's runstate_info is split across two pages and we
- * need to hold and validate both GPCs simultaneously. We can
- * declare a lock ordering GPC1 > GPC2 because nothing else
- * takes them more than one at a time. Set a subclass on the
- * gpc1 lock to make lockdep shut up about it.
+ * The GPCs for both pages which comprise the guest's
+ * runstate_info share a lock, and it's already locked.
*/
- lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
- if (atomic) {
- if (!read_trylock(gpc2->lock)) {
- read_unlock_irqrestore(gpc1->lock, flags);
- return;
- }
- } else {
- read_lock(gpc2->lock);
- }
-
if (!kvm_gpc_check(gpc2, user_len2)) {
- read_unlock(gpc2->lock);
read_unlock_irqrestore(gpc1->lock, flags);
/* When invoked from kvm_sched_out() we cannot sleep */
@@ -427,9 +413,6 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2)
- read_unlock(gpc2->lock);
-
read_unlock_irqrestore(gpc1->lock, flags);
mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
@@ -2056,8 +2039,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
+ kvm_gpc_init_with_lock(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN, vcpu->arch.xen.runstate_cache.lock);
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
KVM_HOST_USES_PFN);
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
--
2.39.1.456.gfc5497dd1b-goog
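
For readers who don't have patch 1 in front of them: kvm_gpc_init_with_lock() is
the new helper from "KVM: Support sharing gpc locks". Inferring its shape purely
from the call site above (this is a sketch, not the declaration taken from the
patch itself), it is roughly:

        /*
         * Sketch only: initialize the cache as usual, then point gpc->lock at
         * an external rwlock so that two caches can share a single lock.
         */
        void kvm_gpc_init_with_lock(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
                                    struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
                                    rwlock_t *lock)
        {
                kvm_gpc_init(gpc, kvm, vcpu, usage);
                gpc->lock = lock;
        }

With that, gpc1->lock and gpc2->lock in kvm_xen_update_runstate_guest() are
literally the same rwlock_t, which is why the hunk above can drop the gpc2
read_lock/read_unlock and the lockdep subclass dance entirely.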
(resending from non-broken system)
On Mon, 2023-01-30 at 14:25 +0900, David Stevens wrote:
> From: David Stevens <[email protected]>
>
> Simplify locking in the case where the guest's runstate_info is split
> across two pages by sharing a single lock for the two gpcs.
>
> Signed-off-by: David Stevens <[email protected]>
> ---
> I tested this patch with xen_shinfo_test as suggested, and it still
> passes. I agree that it makes sense to do this as a separate patch. For
> the bot-reported issue, looks like I forgot to build with lockdep
> enabled. I'll fix the issue with that patch in the next revision of the
> series, so that there aren't any commits which don't build.
Ack. Running actual Xen guests also works (but it's the xen_shinfo_test
which is going to exercise cross-page runstate and is most
interesting).
Tested-by: David Woodhouse <[email protected]>
Reviewed-by: David Woodhouse <[email protected]>
On Sat, Jan 28, 2023 at 4:19 AM Sean Christopherson <[email protected]> wrote:
>
> On Fri, Jan 27, 2023, David Stevens wrote:
> > From: David Stevens <[email protected]>
> >
> > This series replaces the usage of kvm_vcpu_map in vmx with
> > gfn_to_pfn_cache. See [1] for details on why kvm_vcpu_map is broken.
> >
> > The presence of kvm_vcpu_map blocks another series I would like to
> > try to merge [2]. Although I'm not familiar with the internals of vmx,
> > I've gone ahead and taken a stab at this cleanup. I've done some manual
> > testing with nested VMs, and KVM selftests pass, but thorough feedback
> > would be appreciated. Once this cleanup is done, I'll take a look at
> > removing kvm_vcpu_map from svm.
>
> Woot, been waiting for someone to take this one, thanks! It'll likely be a week
> or two until I get 'round to this, but it's definitely something I want to get
> merged sooner than later.
Sean, will you be able to get to this in the next few weeks?
Thanks,
David
On Mon, Mar 06, 2023, David Stevens wrote:
> On Sat, Jan 28, 2023 at 4:19 AM Sean Christopherson <[email protected]> wrote:
> >
> > On Fri, Jan 27, 2023, David Stevens wrote:
> > > From: David Stevens <[email protected]>
> > >
> > > This series replaces the usage of kvm_vcpu_map in vmx with
> > > gfn_to_pfn_cache. See [1] for details on why kvm_vcpu_map is broken.
> > >
> > > The presence of kvm_vcpu_map blocks another series I would like to
> > > try to merge [2]. Although I'm not familiar with the internals of vmx,
> > > I've gone ahead and taken a stab at this cleanup. I've done some manual
> > > testing with nested VMs, and KVM selftests pass, but thorough feedback
> > > would be appreciated. Once this cleanup is done, I'll take a look at
> > > removing kvm_vcpu_map from svm.
> >
> > Woot, been waiting for someone to take this one, thanks! It'll likely be a week
> > or two until I get 'round to this, but it's definitely something I want to get
> > merged sooner than later.
>
> Sean, will you be able to get to this in the next few weeks?
Yes, so long as your definition of "few" means 2-4 ;-)
On Fri, Jan 27, 2023, David Stevens wrote:
> From: David Stevens <[email protected]>
>
> This series replaces the usage of kvm_vcpu_map in vmx with
> gfn_to_pfn_cache. See [1] for details on why kvm_vcpu_map is broken.
>
> The presence of kvm_vcpu_map blocks another series I would like to
> try to merge [2]. Although I'm not familiar with the internals of vmx,
> I've gone ahead and taken a stab at this cleanup. I've done some manual
> testing with nested VMs, and KVM selftests pass, but thorough feedback
> would be appreciated. Once this cleanup is done, I'll take a look at
> removing kvm_vcpu_map from svm.
First off, thank you for writing this code and posting the series. Seeing the
actual impact on the nVMX code was super helpful. Second, this response is likely
going to be very critical, but the criticism is not directed at you, it's directed
at the existing mess in KVM. On to the review...
I went fairly deep into review before coming to the conclusion that adding generic
support for mapping unpinned pages outside of the MMU is likely a waste of time
and effort, and will saddle KVM with a pile of complexity that has no real usage
outside of KVM selftests.
There are bugs all over the nVMX code in this series. Some are due to subtleties
in nVMX that aren't super difficult to resolve, but many of them are because the
core gfn_to_pfn_cache code is half-baked. E.g. KVM_REQ_OUTSIDE_GUEST_MODE is
completely useless as kicking the guest does absolutely nothing to guarantee KVM
refreshes the cache before re-entering the guest. That's not too horrible to
solve, e.g. the gpc struct could be expanded to track the KVM request associated
with guest usage so that nVMX (and nSVM) could trigger KVM_REQ_GET_NESTED_STATE_PAGES.
But then nested_get_vmcs12_pages() becomes even more complex because only some of
the pages/pfns it manages are actually plugged into vmcs02. E.g. the vmcs12 MSR
bitmap is read only at VM-Enter, what gets stuffed into vmcs02 is a KVM-allocated
bitmap. Using kvm_vcpu_map() is purely an optimization+simplification that allows
KVM to use a common set of helpers for accessing MSR bitmaps, e.g. as opposed to
having to do a big pile of uaccesses or copy the entire bitmaps into kernel memory.
The PDPTRs, which aren't actually managed through a gpc cache, would also need to
be separated out.
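
For readers without nested.c open, the kvm_vcpu_map() pattern being described is
roughly the following (a sketch of the generic API usage, not the exact upstream
code):

        /* Sketch of the generic kvm_vcpu_map() dance; not the exact nested.c code. */
        struct kvm_host_map map;

        /* Temporarily map L1's page, e.g. the gpa from vmcs12->msr_bitmap. */
        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
                return false;

        /* ... read L1's bitmap via map.hva and build the vmcs02 bitmap ... */

        kvm_vcpu_unmap(vcpu, &map, false);

The mapping is transient and what gets plugged into vmcs02 is KVM's own bitmap,
which is the point being made above.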
More problems arise in the use of kvm_gpc_check(). In hindsight, not adding a
lockdep assertion that gpc->lock is held in kvm_gpc_check() is a glaring flaw.
vmx_get_nested_state_pages() and vmx_complete_nested_posted_interrupt() fail to
take the gpc->lock when checking the cache. Again, not super difficult to resolve,
but the complexity continues to mount, especially when trying to reason about what
will happen if an invalidation occurs and sets KVM_REQ_GET_NESTED_STATE_PAGES while
vmx_get_nested_state_pages() is refreshing a cache (and thus has dropped gpc->lock).
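The missing assertion itself is a one-liner; with this series making gpc->lock a
pointer, something along these lines at the top of kvm_gpc_check() would do
(sketch, not an actual hunk):

        /* Sketch: all callers must hold gpc->lock, read or write, across check + use. */
        lockdep_assert_held(gpc->lock);

lockdep_assert_held() is satisfied by either readers or the writer, so it would
catch both of the unlocked callers named above as well as any future ones.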
Even more subtly broken is nested.pi_desc, which is snapshot from
vmx->nested.pi_desc_gpc.khva, but is not protected by the gpc stuff. I honestly
don't even want to think about what vmx_get_nested_state_pages() would look like
if it were fully protected.
Lastly, the number of nasty, fatal bugs we had to fix in the KVM Xen code just to
get KVM usage of gfn_to_pfn_cache healthy doesn't exactly instill me with confidence
that stabilizing guest usage will be all rainbows and butterflies.
So, I propose that we scrap this series along with the entire concept of
KVM_GUEST_USES_PFN, and instead add an off-by-default module param (could be a
per-VM capability in the future if necessary) that controls whether or not
kvm_vcpu_map() is allowed to map non-refcounted memory.
AIUI, AWS cares about mapping arbitrary memory via kvm_vcpu_map(), but uses a
static setup and trusts userspace, at least to the extent where guarding against
userspace unmapping (or moving, etc.) memory that is mapped via kvm_vcpu_map() is
a non-goal.
And on the ChromeOS side, the use case doesn't require playing nice with
kvm_vcpu_map(), the goal is purely to allow mapping non-refcounted "struct page"
memory into the guest so that it can be accessed by a vGPU (or some oddball
pass-through GPU setup?).
Unless I'm missing something, that approach would even give us something that is
consumable for stable backports, e.g. I believe the diff at the bottom is all that
is needed to close the existing holes (feel free to suggest a less scary name).
Then the only thing David S. and company need to sort out for their GPU use case
is teaching KVM to do the right thing with non-refcounted struct page memory.
And I don't feel guilty for asking them to do that work because it's actually
germane to their use case, as opposed to fixing the nVMX/nSVM mess.
And if anyone comes along with a use for unmapping/migrating/swapping memory that
is plugged directly into vmcs02, they get to have the honor of adding support and
convincing everyone that it's worth maintaining.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d255964ec331..c7669807cf73 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -93,6 +93,9 @@ unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
+static bool allow_unsafe_kmap;
+module_param(allow_unsafe_kmap, bool, 0444);
+
/*
* Ordering of locks:
*
@@ -2838,7 +2841,7 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
page = pfn_to_page(pfn);
hva = kmap(page);
#ifdef CONFIG_HAS_IOMEM
- } else {
+ } else if (allow_unsafe_kmap) {
hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
#endif
}
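
If something like this does get merged, the knob would presumably also want a
MODULE_PARM_DESC so it shows up in modinfo; a minimal sketch, with the name and
the read-only 0444 permission being placeholders (per the "less scary name"
caveat above):

        /* Sketch: name and 0444 perm are placeholders, not from the original mail. */
        static bool allow_unsafe_kmap;
        module_param(allow_unsafe_kmap, bool, 0444);
        MODULE_PARM_DESC(allow_unsafe_kmap,
                         "Allow kvm_vcpu_map() to memremap() pfns that have no struct page");

Since kvm_main.c is built into kvm.ko, an opted-in host would boot with
kvm.allow_unsafe_kmap=1 (or pass it to modprobe); with the default off,
kvm_vcpu_map() simply refuses non-struct-page memory, which is the behavior
change the stable backport is after.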