2021-03-30 17:01:43

by Paolo Bonzini

Subject: [PATCH 0/2] KVM: x86: fix lockdep splat due to Xen runstate update

pvclock_gtod_sync_lock can be taken with interrupts disabled if the
preempt notifier calls get_kvmclock_ns to update the Xen
runstate information:

spin_lock include/linux/spinlock.h:354 [inline]
get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587
kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69
kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100
kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline]
kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062

So change the users of the spinlock to spin_lock_irqsave and
spin_unlock_irqrestore. Before doing that, patch 1 just makes
the pvclock_gtod_sync_lock critical sections as small as possible
so that the resulting irq-disabled sections are easier to review.
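For illustration, the conversion pattern is the following (a simplified
sketch with made-up names, not the actual x86.c code): every user of the
lock gains a flags variable, so interrupts are disabled for the whole
critical section regardless of the state the caller arrives in.

#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Simplified sketch, not the actual KVM code: demo_lock and demo_value
 * stand in for pvclock_gtod_sync_lock and the masterclock state it
 * protects.
 */
static DEFINE_SPINLOCK(demo_lock);
static u64 demo_value;

static u64 demo_read(void)
{
	unsigned long flags;
	u64 val;

	/*
	 * Was spin_lock()/spin_unlock(); the irqsave variant makes the
	 * lock safe to take from callers that already run with
	 * interrupts disabled, such as the preempt notifier path in
	 * the backtrace above.
	 */
	spin_lock_irqsave(&demo_lock, flags);
	val = demo_value;
	spin_unlock_irqrestore(&demo_lock, flags);

	return val;
}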

Paolo

Paolo Bonzini (2):
KVM: x86: reduce pvclock_gtod_sync_lock critical sections
KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken

arch/x86/kvm/x86.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)

--
2.26.2


2021-03-30 17:02:00

by Paolo Bonzini

Subject: [PATCH 2/2] KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken

pvclock_gtod_sync_lock can be taken with interrupts disabled if the
preempt notifier calls get_kvmclock_ns to update the Xen
runstate information:

spin_lock include/linux/spinlock.h:354 [inline]
get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587
kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69
kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100
kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline]
kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062

So change the users of the spinlock to spin_lock_irqsave and
spin_unlock_irqrestore.

Reported-by: [email protected]
Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
Cc: David Woodhouse <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/x86.c | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a83eff40b43..2bfd00da465f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2329,7 +2329,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);

- spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (!already_matched) {
@@ -2337,7 +2337,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
}

kvm_track_tsc_matching(vcpu);
- spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}

static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2559,15 +2559,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
int i;
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;
+ unsigned long flags;

kvm_hv_invalidate_tsc_page(kvm);

kvm_make_mclock_inprogress_request(kvm);

/* no guest entries from this point */
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2582,17 +2583,18 @@ u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ unsigned long flags;
u64 ret;

- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
return get_kvmclock_base_ns() + ka->kvmclock_offset;
}

hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
@@ -2686,13 +2688,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
use_master_clock = ka->use_master_clock;
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
}
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -7724,6 +7726,7 @@ static void kvm_hyperv_tsc_notifier(void)
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int cpu;
+ unsigned long flags;

mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
@@ -7739,9 +7742,9 @@ static void kvm_hyperv_tsc_notifier(void)
list_for_each_entry(kvm, &vm_list, vm_list) {
struct kvm_arch *ka = &kvm->arch;

- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
--
2.26.2

2021-03-31 01:43:51

by Wanpeng Li

Subject: Re: [PATCH 2/2] KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken

On Wed, 31 Mar 2021 at 01:01, Paolo Bonzini <[email protected]> wrote:
>
> pvclock_gtod_sync_lock can be taken with interrupts disabled if the
> preempt notifier calls get_kvmclock_ns to update the Xen
> runstate information:
>
> spin_lock include/linux/spinlock.h:354 [inline]
> get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587
> kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69
> kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100
> kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline]
> kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062
>
> So change the users of the spinlock to spin_lock_irqsave and
> spin_unlock_irqrestore.
>
> Reported-by: [email protected]
> Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
> Cc: David Woodhouse <[email protected]>
> Cc: Marcelo Tosatti <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>

Reviewed-by: Wanpeng Li <[email protected]>

> ---
> arch/x86/kvm/x86.c | 25 ++++++++++++++-----------
> 1 file changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0a83eff40b43..2bfd00da465f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2329,7 +2329,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
> kvm_vcpu_write_tsc_offset(vcpu, offset);
> raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>
> - spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
> + spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
> if (!matched) {
> kvm->arch.nr_vcpus_matched_tsc = 0;
> } else if (!already_matched) {
> @@ -2337,7 +2337,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
> }
>
> kvm_track_tsc_matching(vcpu);
> - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
> }
>
> static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
> @@ -2559,15 +2559,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> int i;
> struct kvm_vcpu *vcpu;
> struct kvm_arch *ka = &kvm->arch;
> + unsigned long flags;
>
> kvm_hv_invalidate_tsc_page(kvm);
>
> kvm_make_mclock_inprogress_request(kvm);
>
> /* no guest entries from this point */
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> pvclock_update_vm_gtod_copy(kvm);
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> kvm_for_each_vcpu(i, vcpu, kvm)
> kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> @@ -2582,17 +2583,18 @@ u64 get_kvmclock_ns(struct kvm *kvm)
> {
> struct kvm_arch *ka = &kvm->arch;
> struct pvclock_vcpu_time_info hv_clock;
> + unsigned long flags;
> u64 ret;
>
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> if (!ka->use_master_clock) {
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> return get_kvmclock_base_ns() + ka->kvmclock_offset;
> }
>
> hv_clock.tsc_timestamp = ka->master_cycle_now;
> hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> /* both __this_cpu_read() and rdtsc() should be on the same cpu */
> get_cpu();
> @@ -2686,13 +2688,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> * If the host uses TSC clock, then passthrough TSC as stable
> * to the guest.
> */
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> use_master_clock = ka->use_master_clock;
> if (use_master_clock) {
> host_tsc = ka->master_cycle_now;
> kernel_ns = ka->master_kernel_ns;
> }
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> /* Keep irq disabled to prevent changes to the clock */
> local_irq_save(flags);
> @@ -7724,6 +7726,7 @@ static void kvm_hyperv_tsc_notifier(void)
> struct kvm *kvm;
> struct kvm_vcpu *vcpu;
> int cpu;
> + unsigned long flags;
>
> mutex_lock(&kvm_lock);
> list_for_each_entry(kvm, &vm_list, vm_list)
> @@ -7739,9 +7742,9 @@ static void kvm_hyperv_tsc_notifier(void)
> list_for_each_entry(kvm, &vm_list, vm_list) {
> struct kvm_arch *ka = &kvm->arch;
>
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> pvclock_update_vm_gtod_copy(kvm);
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> kvm_for_each_vcpu(cpu, vcpu, kvm)
> kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> --
> 2.26.2
>

2021-04-01 18:33:26

by David Woodhouse

Subject: Re: [EXTERNAL] [PATCH 2/2] KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken

On Tue, 2021-03-30 at 12:59 -0400, Paolo Bonzini wrote:
> @@ -2686,13 +2688,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> * If the host uses TSC clock, then passthrough TSC as stable
> * to the guest.
> */
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> use_master_clock = ka->use_master_clock;
> if (use_master_clock) {
> host_tsc = ka->master_cycle_now;
> kernel_ns = ka->master_kernel_ns;
> }
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> /* Keep irq disabled to prevent changes to the clock */
> local_irq_save(flags);

That seems a little gratuitous at the end: restoring the flags as part
of the spin_unlock_irqrestore() and then immediately calling
local_irq_save() again.

Is something going to complain if we just use spin_unlock() there and
then later restore the flags with the existing local_irq_restore()?

Or should we move the local_irq_save() up above the existing
spin_lock() and leave the spin lock/unlock as they are?
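
To make that second option concrete, a sketch of what this hunk of
kvm_guest_time_update() could look like (illustrative only, not a
tested patch; it reuses the ka, flags, use_master_clock, host_tsc and
kernel_ns variables already present in the function):

	/*
	 * Disable interrupts once, up front; the plain lock/unlock can
	 * then stay as it is, and the existing local_irq_save() below
	 * this hunk would be dropped.
	 */
	local_irq_save(flags);

	spin_lock(&ka->pvclock_gtod_sync_lock);
	use_master_clock = ka->use_master_clock;
	if (use_master_clock) {
		host_tsc = ka->master_cycle_now;
		kernel_ns = ka->master_kernel_ns;
	}
	spin_unlock(&ka->pvclock_gtod_sync_lock);

	/* Interrupts stay disabled to prevent changes to the clock,
	 * as the existing comment says; the later local_irq_restore()
	 * is unchanged. */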



2021-10-23 19:37:01

by David Woodhouse

Subject: Re: [EXTERNAL] [PATCH 2/2] KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken

On Tue, 2021-03-30 at 12:59 -0400, Paolo Bonzini wrote:
> pvclock_gtod_sync_lock can be taken with interrupts disabled if the
> preempt notifier calls get_kvmclock_ns to update the Xen
> runstate information:
>
> spin_lock include/linux/spinlock.h:354 [inline]
> get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587
> kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69
> kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100
> kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline]
> kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062
>
> So change the users of the spinlock to spin_lock_irqsave and
> spin_unlock_irqrestore.

Apologies, I didn't spot this at the time. Looks sane enough (if we
ignore the elephant in the room that kvm_xen_update_runstate_guest() is
also writing to userspace with interrupts disabled on this preempted
code path, but I have a fix for that in the works¹).

However, in 5.15-rc5 I'm still seeing the warning below when I run
xen_shinfo_test. I confess I'm not entirely sure what it's telling me.


[ 89.138354] =============================
[ 89.138356] [ BUG: Invalid wait context ]
[ 89.138358] 5.15.0-rc5+ #834 Tainted: G S I E
[ 89.138360] -----------------------------
[ 89.138361] xen_shinfo_test/2575 is trying to lock:
[ 89.138363] ffffa34a0364efd8 (&kvm->arch.pvclock_gtod_sync_lock){....}-{3:3}, at: get_kvmclock_ns+0x1f/0x130 [kvm]
[ 89.138442] other info that might help us debug this:
[ 89.138444] context-{5:5}
[ 89.138445] 4 locks held by xen_shinfo_test/2575:
[ 89.138447] #0: ffff972bdc3b8108 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0x77/0x6f0 [kvm]
[ 89.138483] #1: ffffa34a03662e90 (&kvm->srcu){....}-{0:0}, at: kvm_arch_vcpu_ioctl_run+0xdc/0x8b0 [kvm]
[ 89.138526] #2: ffff97331fdbac98 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0xff/0xbd0
[ 89.138534] #3: ffffa34a03662e90 (&kvm->srcu){....}-{0:0}, at: kvm_arch_vcpu_put+0x26/0x170 [kvm]
[ 89.138576] stack backtrace:
[ 89.138577] CPU: 27 PID: 2575 Comm: xen_shinfo_test Tainted: G S I E 5.15.0-rc5+ #834
[ 89.138580] Hardware name: Intel Corporation S2600CW/S2600CW, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015
[ 89.138582] Call Trace:
[ 89.138585] dump_stack_lvl+0x6a/0x9a
[ 89.138592] __lock_acquire.cold+0x2ac/0x2d5
[ 89.138597] ? __lock_acquire+0x578/0x1f80
[ 89.138604] lock_acquire+0xc0/0x2d0
[ 89.138608] ? get_kvmclock_ns+0x1f/0x130 [kvm]
[ 89.138648] ? find_held_lock+0x2b/0x80
[ 89.138653] _raw_spin_lock_irqsave+0x48/0x60
[ 89.138656] ? get_kvmclock_ns+0x1f/0x130 [kvm]
[ 89.138695] get_kvmclock_ns+0x1f/0x130 [kvm]
[ 89.138734] kvm_xen_update_runstate+0x14/0x90 [kvm]
[ 89.138783] kvm_xen_update_runstate_guest+0x15/0xd0 [kvm]
[ 89.138830] kvm_arch_vcpu_put+0xe6/0x170 [kvm]
[ 89.138870] kvm_sched_out+0x2f/0x40 [kvm]
[ 89.138900] __schedule+0x5de/0xbd0
[ 89.138904] ? kvm_mmu_topup_memory_cache+0x21/0x70 [kvm]
[ 89.138937] __cond_resched+0x34/0x50
[ 89.138941] kmem_cache_alloc+0x228/0x2e0
[ 89.138946] kvm_mmu_topup_memory_cache+0x21/0x70 [kvm]
[ 89.138979] mmu_topup_memory_caches+0x1d/0x70 [kvm]
[ 89.139024] kvm_mmu_load+0x2d/0x750 [kvm]
[ 89.139070] ? kvm_cpu_has_extint+0x15/0x90 [kvm]
[ 89.139113] ? kvm_cpu_has_injectable_intr+0xe/0x50 [kvm]
[ 89.139155] vcpu_enter_guest+0xc77/0x1210 [kvm]
[ 89.139195] ? kvm_arch_vcpu_ioctl_run+0x146/0x8b0 [kvm]
[ 89.139235] kvm_arch_vcpu_ioctl_run+0x146/0x8b0 [kvm]
[ 89.139274] kvm_vcpu_ioctl+0x279/0x6f0 [kvm]
[ 89.139306] ? find_held_lock+0x2b/0x80
[ 89.139312] __x64_sys_ioctl+0x83/0xb0
[ 89.139316] do_syscall_64+0x3b/0x90
[ 89.139320] entry_SYSCALL_64_after_hwframe+0x44/0xae

¹ https://git.infradead.org/users/dwmw2/linux.git/commitdiff/ec22c08258



2021-10-23 20:35:46

by David Woodhouse

Subject: [PATCH] KVM: x86: switch pvclock_gtod_sync_lock to a raw spinlock

From: David Woodhouse <[email protected]>

On the preemption path when updating a Xen guest's runstate times, this
lock is taken inside the scheduler rq->lock, which is a raw spinlock.
This was shown in a lockdep warning:

[ 89.138354] =============================
[ 89.138356] [ BUG: Invalid wait context ]
[ 89.138358] 5.15.0-rc5+ #834 Tainted: G S I E
[ 89.138360] -----------------------------
[ 89.138361] xen_shinfo_test/2575 is trying to lock:
[ 89.138363] ffffa34a0364efd8 (&kvm->arch.pvclock_gtod_sync_lock){....}-{3:3}, at: get_kvmclock_ns+0x1f/0x130 [kvm]
[ 89.138442] other info that might help us debug this:
[ 89.138444] context-{5:5}
[ 89.138445] 4 locks held by xen_shinfo_test/2575:
[ 89.138447] #0: ffff972bdc3b8108 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0x77/0x6f0 [kvm]
[ 89.138483] #1: ffffa34a03662e90 (&kvm->srcu){....}-{0:0}, at: kvm_arch_vcpu_ioctl_run+0xdc/0x8b0 [kvm]
[ 89.138526] #2: ffff97331fdbac98 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0xff/0xbd0
[ 89.138534] #3: ffffa34a03662e90 (&kvm->srcu){....}-{0:0}, at: kvm_arch_vcpu_put+0x26/0x170 [kvm]
...
[ 89.138695] get_kvmclock_ns+0x1f/0x130 [kvm]
[ 89.138734] kvm_xen_update_runstate+0x14/0x90 [kvm]
[ 89.138783] kvm_xen_update_runstate_guest+0x15/0xd0 [kvm]
[ 89.138830] kvm_arch_vcpu_put+0xe6/0x170 [kvm]
[ 89.138870] kvm_sched_out+0x2f/0x40 [kvm]
[ 89.138900] __schedule+0x5de/0xbd0

Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
Signed-off-by: David Woodhouse <[email protected]>
---
> However, in 5.15-rc5 I'm still seeing the warning below when I run
> xen_shinfo_test. I confess I'm not entirely sure what it's telling
> me.

I think this is what it was telling me...
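
For anyone else decoding the splat: the {2:2} / {3:3} pairs are lockdep
wait-type classes. Roughly, rq->__lock is a raw_spinlock_t (wait type 2)
while pvclock_gtod_sync_lock was a spinlock_t (wait type 3), which turns
into a sleeping lock on PREEMPT_RT; taking it while holding a raw
spinlock is therefore invalid, even though it happens to work on a
non-RT build. A minimal sketch of the rule, with made-up lock names:

#include <linux/spinlock.h>

/*
 * Illustrative only: outer_raw plays the role of the scheduler's
 * rq->__lock, inner the role of the old spinlock_t
 * pvclock_gtod_sync_lock, inner_raw the converted lock.
 */
static DEFINE_RAW_SPINLOCK(outer_raw);
static DEFINE_SPINLOCK(inner);
static DEFINE_RAW_SPINLOCK(inner_raw);

static void bad_nesting(void)
{
	raw_spin_lock(&outer_raw);
	spin_lock(&inner);	/* lockdep: "Invalid wait context":
				 * spinlock_t may sleep on PREEMPT_RT,
				 * but a raw spinlock is held here */
	spin_unlock(&inner);
	raw_spin_unlock(&outer_raw);
}

static void ok_nesting(void)
{
	raw_spin_lock(&outer_raw);
	raw_spin_lock(&inner_raw);	/* raw inside raw is allowed */
	raw_spin_unlock(&inner_raw);
	raw_spin_unlock(&outer_raw);
}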

arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/x86.c | 28 ++++++++++++++--------------
2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8f48a7ec577..70771376e246 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1097,7 +1097,7 @@ struct kvm_arch {
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;

- spinlock_t pvclock_gtod_sync_lock;
+ raw_spinlock_t pvclock_gtod_sync_lock;
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aabd3a2ec1bc..f0874c3d12ce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2542,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);

- spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+ raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (!already_matched) {
@@ -2550,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
}

kvm_track_tsc_matching(vcpu);
- spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}

static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2780,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
kvm_make_mclock_inprogress_request(kvm);

/* no guest entries from this point */
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2800,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
unsigned long flags;
u64 ret;

- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
return get_kvmclock_base_ns() + ka->kvmclock_offset;
}

hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
@@ -2902,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
use_master_clock = ka->use_master_clock;
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
}
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -6100,13 +6100,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
* is slightly ahead) here we risk going negative on unsigned
* 'system_time' when 'user_ns.clock' is very small.
*/
- spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+ raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
if (kvm->arch.use_master_clock)
now_ns = ka->master_kernel_ns;
else
now_ns = get_kvmclock_base_ns();
ka->kvmclock_offset = user_ns.clock - now_ns;
- spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+ raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);

kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
break;
@@ -8139,9 +8139,9 @@ static void kvm_hyperv_tsc_notifier(void)
list_for_each_entry(kvm, &vm_list, vm_list) {
struct kvm_arch *ka = &kvm->arch;

- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);

kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -11182,7 +11182,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)

raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+ raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);

kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
pvclock_update_vm_gtod_copy(kvm);
--
2.25.1




2021-10-25 13:52:01

by Paolo Bonzini

Subject: Re: [PATCH] KVM: x86: switch pvclock_gtod_sync_lock to a raw spinlock

On 23/10/21 22:29, David Woodhouse wrote:
> From: David Woodhouse <[email protected]>
>
> On the preemption path when updating a Xen guest's runstate times, this
> lock is taken inside the scheduler rq->lock, which is a raw spinlock.
> This was shown in a lockdep warning:
>
> [ 89.138354] =============================
> [ 89.138356] [ BUG: Invalid wait context ]
> [ 89.138358] 5.15.0-rc5+ #834 Tainted: G S I E
> [ 89.138360] -----------------------------
> [ 89.138361] xen_shinfo_test/2575 is trying to lock:
> [ 89.138363] ffffa34a0364efd8 (&kvm->arch.pvclock_gtod_sync_lock){....}-{3:3}, at: get_kvmclock_ns+0x1f/0x130 [kvm]
> [ 89.138442] other info that might help us debug this:
> [ 89.138444] context-{5:5}
> [ 89.138445] 4 locks held by xen_shinfo_test/2575:
> [ 89.138447] #0: ffff972bdc3b8108 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0x77/0x6f0 [kvm]
> [ 89.138483] #1: ffffa34a03662e90 (&kvm->srcu){....}-{0:0}, at: kvm_arch_vcpu_ioctl_run+0xdc/0x8b0 [kvm]
> [ 89.138526] #2: ffff97331fdbac98 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0xff/0xbd0
> [ 89.138534] #3: ffffa34a03662e90 (&kvm->srcu){....}-{0:0}, at: kvm_arch_vcpu_put+0x26/0x170 [kvm]
> ...
> [ 89.138695] get_kvmclock_ns+0x1f/0x130 [kvm]
> [ 89.138734] kvm_xen_update_runstate+0x14/0x90 [kvm]
> [ 89.138783] kvm_xen_update_runstate_guest+0x15/0xd0 [kvm]
> [ 89.138830] kvm_arch_vcpu_put+0xe6/0x170 [kvm]
> [ 89.138870] kvm_sched_out+0x2f/0x40 [kvm]
> [ 89.138900] __schedule+0x5de/0xbd0
>
> Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
> Signed-off-by: David Woodhouse <[email protected]>
> ---
>> However, in 5.15-rc5 I'm still seeing the warning below when I run
>> xen_shinfo_test. I confess I'm not entirely sure what it's telling
>> me.
>
> I think this is what it was telling me...

Yes, indeed - I will commit this to 5.15 (with a 'fake' merge commit to
kvm/next to inform git that the pvclock_gtod_sync_lock is gone in 5.16 -
and the replacement is already a raw spinlock for partly unrelated reasons).

Paolo

> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/kvm/x86.c | 28 ++++++++++++++--------------
> 2 files changed, 15 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f8f48a7ec577..70771376e246 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1097,7 +1097,7 @@ struct kvm_arch {
> u64 cur_tsc_generation;
> int nr_vcpus_matched_tsc;
>
> - spinlock_t pvclock_gtod_sync_lock;
> + raw_spinlock_t pvclock_gtod_sync_lock;
> bool use_master_clock;
> u64 master_kernel_ns;
> u64 master_cycle_now;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index aabd3a2ec1bc..f0874c3d12ce 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2542,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
> kvm_vcpu_write_tsc_offset(vcpu, offset);
> raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>
> - spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
> + raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
> if (!matched) {
> kvm->arch.nr_vcpus_matched_tsc = 0;
> } else if (!already_matched) {
> @@ -2550,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
> }
>
> kvm_track_tsc_matching(vcpu);
> - spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
> + raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
> }
>
> static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
> @@ -2780,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> kvm_make_mclock_inprogress_request(kvm);
>
> /* no guest entries from this point */
> - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> pvclock_update_vm_gtod_copy(kvm);
> - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> kvm_for_each_vcpu(i, vcpu, kvm)
> kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> @@ -2800,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
> unsigned long flags;
> u64 ret;
>
> - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> if (!ka->use_master_clock) {
> - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> return get_kvmclock_base_ns() + ka->kvmclock_offset;
> }
>
> hv_clock.tsc_timestamp = ka->master_cycle_now;
> hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> /* both __this_cpu_read() and rdtsc() should be on the same cpu */
> get_cpu();
> @@ -2902,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> * If the host uses TSC clock, then passthrough TSC as stable
> * to the guest.
> */
> - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> use_master_clock = ka->use_master_clock;
> if (use_master_clock) {
> host_tsc = ka->master_cycle_now;
> kernel_ns = ka->master_kernel_ns;
> }
> - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> /* Keep irq disabled to prevent changes to the clock */
> local_irq_save(flags);
> @@ -6100,13 +6100,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
> * is slightly ahead) here we risk going negative on unsigned
> * 'system_time' when 'user_ns.clock' is very small.
> */
> - spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> + raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> if (kvm->arch.use_master_clock)
> now_ns = ka->master_kernel_ns;
> else
> now_ns = get_kvmclock_base_ns();
> ka->kvmclock_offset = user_ns.clock - now_ns;
> - spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> + raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
>
> kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> break;
> @@ -8139,9 +8139,9 @@ static void kvm_hyperv_tsc_notifier(void)
> list_for_each_entry(kvm, &vm_list, vm_list) {
> struct kvm_arch *ka = &kvm->arch;
>
> - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> pvclock_update_vm_gtod_copy(kvm);
> - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
> kvm_for_each_vcpu(cpu, vcpu, kvm)
> kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> @@ -11182,7 +11182,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>
> raw_spin_lock_init(&kvm->arch.tsc_write_lock);
> mutex_init(&kvm->arch.apic_map_lock);
> - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
> + raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
>
> kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
> pvclock_update_vm_gtod_copy(kvm);
>