When examining a contended spinlock in a crash dump, we can only find
out the tail CPU in the MCS wait queue. There is no simple way to find
out what other CPUs are waiting for the spinlock and which CPU is the
lock owner.
Make it easier to figure out this information by saving the previous node
data into the mcs_spinlock structure. This will allow us to reconstruct
the MCS wait queue from tail to head. In order not to expand the size
of mcs_spinlock, the original count field is split into two 16-bit
chunks. The first chunk is for count and the second one is the new
prev_node value.
bits 0-1 : qnode index
bits 2-15: CPU number + 1
This prev_node value may be truncated if there are 16k or more CPUs in
the system.
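For illustration, a post-mortem tool could decode a saved prev_node value
roughly as follows (a sketch only, assuming the bit layout above; the
helper name is made up here):

#include <stdio.h>

/*
 * Sketch for dump analysis only: decode a saved prev_node value,
 * assuming bits 0-1 hold the qnode index and bits 2-15 hold the
 * CPU number + 1, as described above.
 */
static void decode_prev_node(unsigned short prev_node)
{
	int idx = prev_node & 0x3;		/* bits 0-1: qnode index */
	int cpu = (prev_node >> 2) - 1;		/* bits 2-15: CPU number + 1 */

	if (prev_node)
		printf("previous waiter: CPU %d, qnode index %d\n", cpu, idx);
	else
		printf("no previous waiter recorded\n");
}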
The locked value in the queue head is also repurposed to hold an encoded
qspinlock owner CPU number when acquiring the lock in the qspinlock
slowpath of a contended lock.
This lock owner information will not be available when the lock is
acquired directly in the fast path or in the pending code path. There
is no easy way around that.
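As a rough sketch (again, for dump analysis only; the helper is
hypothetical), the owner can be recovered from a queue-head node whose
locked value is non-zero:

/*
 * Sketch: recover the encoded qspinlock owner CPU from a queue-head
 * mcs_spinlock whose ->locked was written with MCS_LOCKED (owner CPU + 1).
 * A zero ->locked means no owner was recorded in this node.
 */
static int mcs_locked_to_owner(int locked)
{
	return locked ? locked - 1 : -1;
}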
These changes should make analysis of a contended spinlock in a crash
dump easier.
Signed-off-by: Waiman Long <[email protected]>
---
kernel/locking/mcs_spinlock.h | 13 ++++++++++---
kernel/locking/qspinlock.c | 8 ++++++++
2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 85251d8771d9..cbe6f07dff2e 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -13,12 +13,19 @@
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H
+/*
+ * Save an encoded version of the current MCS lock owner CPU to the
+ * mcs_spinlock structure of the next lock owner.
+ */
+#define MCS_LOCKED (smp_processor_id() + 1)
+
#include <asm/mcs_spinlock.h>
struct mcs_spinlock {
struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
- int count; /* nesting count, see qspinlock.c */
+ int locked; /* non-zero if lock acquired */
+ short count; /* nesting count, see qspinlock.c */
+ short prev_node; /* encoded previous node value */
};
#ifndef arch_mcs_spin_lock_contended
@@ -42,7 +49,7 @@ do { \
* unlocking.
*/
#define arch_mcs_spin_unlock_contended(l) \
- smp_store_release((l), 1)
+ smp_store_release((l), MCS_LOCKED)
#endif
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8ec7cb3..df78d13efa3d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -436,6 +436,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
node->locked = 0;
node->next = NULL;
+ node->prev_node = 0;
pv_init_node(node);
/*
@@ -463,6 +464,13 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
old = xchg_tail(lock, tail);
next = NULL;
+ /*
+ * The prev_node value is saved for crash dump analysis purpose only,
+ * it is not used within the qspinlock code. The encoded node value
+ * may be truncated if there are 16k or more CPUs in the system.
+ */
+ node->prev_node = old >> _Q_TAIL_IDX_OFFSET;
+
/*
* if there was a previous node; link it and wait until reaching the
* head of the waitqueue.
--
2.39.3
On Fri, May 03, 2024 at 10:41:06PM -0400, Waiman Long wrote:
Not much of a fan of this thing, but I suppose it won't hurt too much ...
> diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
> index 85251d8771d9..cbe6f07dff2e 100644
> --- a/kernel/locking/mcs_spinlock.h
> +++ b/kernel/locking/mcs_spinlock.h
> @@ -13,12 +13,19 @@
> #ifndef __LINUX_MCS_SPINLOCK_H
> #define __LINUX_MCS_SPINLOCK_H
>
> +/*
> + * Save an encoded version of the current MCS lock owner CPU to the
> + * mcs_spinlock structure of the next lock owner.
> + */
> +#define MCS_LOCKED (smp_processor_id() + 1)
> +
> #include <asm/mcs_spinlock.h>
>
> struct mcs_spinlock {
> struct mcs_spinlock *next;
> - int locked; /* 1 if lock acquired */
> - int count; /* nesting count, see qspinlock.c */
> + int locked; /* non-zero if lock acquired */
> + short count; /* nesting count, see qspinlock.c */
> + short prev_node; /* encoded previous node value */
Strictly speaking, count shouldn't ever go much higher than 4, so I
suppose a byte should be sufficient. That would then leave 24bit for the
prev thing, but you'll get to use bitfields or some other horrible
thing.
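For illustration only, such a layout might look something like this
(untested sketch, field widths per the suggestion above):

struct mcs_spinlock {
	struct mcs_spinlock *next;
	int locked;			/* non-zero if lock acquired */
	unsigned int count : 8;		/* nesting count, see qspinlock.c */
	unsigned int prev_node : 24;	/* encoded previous node value */
};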
> };
>
> #ifndef arch_mcs_spin_lock_contended
> @@ -42,7 +49,7 @@ do { \
> * unlocking.
> */
> #define arch_mcs_spin_unlock_contended(l) \
> - smp_store_release((l), 1)
> + smp_store_release((l), MCS_LOCKED)
This leaves ARM up a creek I suppose... Also, there's but a single
MCS_LOCKED user.
> #endif
>
> /*
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ebe6b8ec7cb3..df78d13efa3d 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -436,6 +436,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>
> node->locked = 0;
> node->next = NULL;
> + node->prev_node = 0;
> pv_init_node(node);
>
> /*
> @@ -463,6 +464,13 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> old = xchg_tail(lock, tail);
> next = NULL;
>
> + /*
> + * The prev_node value is saved for crash dump analysis purpose only,
> + * it is not used within the qspinlock code. The encoded node value
> + * may be truncated if there are 16k or more CPUs in the system.
> + */
> + node->prev_node = old >> _Q_TAIL_IDX_OFFSET;
> +
> /*
> * if there was a previous node; link it and wait until reaching the
> * head of the waitqueue.
> --
> 2.39.3
>
On 5/6/24 06:36, Peter Zijlstra wrote:
> On Fri, May 03, 2024 at 10:41:06PM -0400, Waiman Long wrote:
>
> Not much of a fan of this thing, but I suppose it won't hurt too much ...
I purposely don't touch the fast path and only add code to the slow path
to make sure there shouldn't be any noticeable performance impact.
>> diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
>> index 85251d8771d9..cbe6f07dff2e 100644
>> --- a/kernel/locking/mcs_spinlock.h
>> +++ b/kernel/locking/mcs_spinlock.h
>> @@ -13,12 +13,19 @@
>> #ifndef __LINUX_MCS_SPINLOCK_H
>> #define __LINUX_MCS_SPINLOCK_H
>>
>> +/*
>> + * Save an encoded version of the current MCS lock owner CPU to the
>> + * mcs_spinlock structure of the next lock owner.
>> + */
>> +#define MCS_LOCKED (smp_processor_id() + 1)
>> +
>> #include <asm/mcs_spinlock.h>
>>
>> struct mcs_spinlock {
>> struct mcs_spinlock *next;
>> - int locked; /* 1 if lock acquired */
>> - int count; /* nesting count, see qspinlock.c */
>> + int locked; /* non-zero if lock acquired */
>> + short count; /* nesting count, see qspinlock.c */
>> + short prev_node; /* encoded previous node value */
> Strictly speaking, count shouldn't ever go much higher than 4, so I
> suppose a byte should be sufficient. That would then leave 24bit for the
> prev thing, but you'll get to use bitfields or some other horrible
> thing.
Using a bit field will be more expensive, so I am deferring that until
it becomes necessary.
>> };
>>
>> #ifndef arch_mcs_spin_lock_contended
>> @@ -42,7 +49,7 @@ do { \
>> * unlocking.
>> */
>> #define arch_mcs_spin_unlock_contended(l) \
>> - smp_store_release((l), 1)
>> + smp_store_release((l), MCS_LOCKED)
> This leaves ARM up a creek I suppose... Also, there's but a single
> MCS_LOCKED user.
I am aware that ARM has its own arch_mcs_spin_unlock_contended() macro
defined. That is why I define the MCS_LOCKED macro before including
asm/mcs_spinlock.h. I am planning to send an ARM patch to change 1 to
MCS_LOCKED if this patch is merged, to close the gap. No other arches
have the arch_mcs_spin_unlock_contended() macro defined.
Cheers, Longman