2020-10-27 17:28:01

by Ben Gardon

Subject: [PATCH 1/3] locking/rwlocks: Add contention detection for rwlocks

rwlocks do not currently have any facility to detect contention
like spinlocks do. In order to allow users of rwlocks to better manage
latency, add contention detection for queued rwlocks.
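
A holder of the write lock can then, for example, back off in the middle of
a long batch of updates when another CPU is queued on the lock. The loop
below is purely illustrative; "table", its fields and update_entry() are
placeholders, not part of this patch:

	write_lock(&table->lock);
	for (i = 0; i < table->nr_entries; i++) {
		update_entry(table, i);		/* hypothetical per-entry work */

		/* Briefly drop the lock if a reader or writer is queued. */
		if (rwlock_is_contended(&table->lock)) {
			write_unlock(&table->lock);
			cpu_relax();
			write_lock(&table->lock);
		}
	}
	write_unlock(&table->lock);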

Signed-off-by: Ben Gardon <[email protected]>
---
include/asm-generic/qrwlock.h | 23 +++++++++++++++++------
include/linux/rwlock.h | 7 +++++++
2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 3aefde23dceab..c4be4673a6ab2 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -116,15 +116,26 @@ static inline void queued_write_unlock(struct qrwlock *lock)
smp_store_release(&lock->wlocked, 0);
}

+/**
+ * queued_rwlock_is_contended - check if the lock is contended
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+ return arch_spin_is_locked(&lock->wait_lock);
+}
+
/*
* Remapping rwlock architecture specific functions to the corresponding
* queue rwlock functions.
*/
-#define arch_read_lock(l) queued_read_lock(l)
-#define arch_write_lock(l) queued_write_lock(l)
-#define arch_read_trylock(l) queued_read_trylock(l)
-#define arch_write_trylock(l) queued_write_trylock(l)
-#define arch_read_unlock(l) queued_read_unlock(l)
-#define arch_write_unlock(l) queued_write_unlock(l)
+#define arch_read_lock(l) queued_read_lock(l)
+#define arch_write_lock(l) queued_write_lock(l)
+#define arch_read_trylock(l) queued_read_trylock(l)
+#define arch_write_trylock(l) queued_write_trylock(l)
+#define arch_read_unlock(l) queued_read_unlock(l)
+#define arch_write_unlock(l) queued_write_unlock(l)
+#define arch_rwlock_is_contended(l) queued_rwlock_is_contended(l)

#endif /* __ASM_GENERIC_QRWLOCK_H */
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 3dcd617e65ae9..7ce9a51ae5c04 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -128,4 +128,11 @@ do { \
1 : ({ local_irq_restore(flags); 0; }); \
})

+#ifdef arch_rwlock_is_contended
+#define rwlock_is_contended(lock) \
+ arch_rwlock_is_contended(&(lock)->raw_lock)
+#else
+#define rwlock_is_contended(lock) ((void)(lock), 0)
+#endif /* arch_rwlock_is_contended */
+
#endif /* __LINUX_RWLOCK_H */
--
2.29.0.rc2.309.g374f81d7ae-goog


2020-10-28 15:56:32

by Ben Gardon

Subject: [PATCH 2/3] sched: Add needbreak for rwlocks

Contention awareness while holding a spin lock is essential for reducing
latency when long-running kernel operations can hold that lock. Add the
same contention detection interface, rwlock_needbreak(), for read/write
spin locks (rwlocks).
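
Like spin_needbreak(), the new rwlock_needbreak() is safe to call
unconditionally: it evaluates to 0 when the kernel is not preemptible or
when the rwlock implementation cannot report contention. A long-running
read-side loop could use it as sketched below; "table" and scan_entry()
are placeholders, not part of this patch:

	read_lock(&table->lock);
	for (i = 0; i < table->nr_entries; i++) {
		scan_entry(table, i);		/* hypothetical per-entry work */
		if (rwlock_needbreak(&table->lock) || need_resched())
			break;			/* back off and retry later */
	}
	read_unlock(&table->lock);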

Signed-off-by: Ben Gardon <[email protected]>
---
include/linux/sched.h | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 063cd120b4593..77179160ec3ab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1870,6 +1870,23 @@ static inline int spin_needbreak(spinlock_t *lock)
#endif
}

+/*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+static inline int rwlock_needbreak(rwlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+ return rwlock_is_contended(lock);
+#else
+ return 0;
+#endif
+}
+
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
--
2.29.0.rc2.309.g374f81d7ae-goog

2020-10-28 15:57:16

by Ben Gardon

Subject: [PATCH 3/3] sched: Add cond_resched_rwlock

Rescheduling while holding a spin lock is essential for keeping long
running kernel operations running smoothly. Add the facility to
cond_resched rwlocks.
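
For example, a long walk over a large structure can drop a contended
rwlock, or yield when preemption is requested, without open-coding the
unlock/relock dance. The loop below is purely illustrative; "tree" and
zap_node() are placeholders, not part of this patch:

	write_lock(&tree->lock);
	for (i = 0; i < tree->nr_nodes; i++) {
		zap_node(tree, i);		/* hypothetical per-node work */

		/*
		 * Drops and reacquires tree->lock if it is contended or
		 * this task needs to reschedule; the caller must tolerate
		 * the lock having been released here.
		 */
		cond_resched_rwlock_write(&tree->lock);
	}
	write_unlock(&tree->lock);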

Signed-off-by: Ben Gardon <[email protected]>
---
include/linux/sched.h | 12 ++++++++++++
kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77179160ec3ab..2eb0c53fce115 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1841,12 +1841,24 @@ static inline int _cond_resched(void) { return 0; }
})

extern int __cond_resched_lock(spinlock_t *lock);
+extern int __cond_resched_rwlock_read(rwlock_t *lock);
+extern int __cond_resched_rwlock_write(rwlock_t *lock);

#define cond_resched_lock(lock) ({ \
___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
__cond_resched_lock(lock); \
})

+#define cond_resched_rwlock_read(lock) ({ \
+ __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+ __cond_resched_rwlock_read(lock); \
+})
+
+#define cond_resched_rwlock_write(lock) ({ \
+ __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+ __cond_resched_rwlock_write(lock); \
+})
+
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7d5ab55..ac58e7829a063 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6152,6 +6152,46 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);

+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
/**
* yield - yield the current processor to other threads.
*
--
2.29.0.rc2.309.g374f81d7ae-goog

2020-10-28 18:08:26

by Sean Christopherson

Subject: Re: [PATCH 3/3] sched: Add cond_resched_rwlock

On Tue, Oct 27, 2020 at 09:49:50AM -0700, Ben Gardon wrote:
> Rescheduling while holding a spin lock is essential for keeping long
> running kernel operations running smoothly. Add the facility to
> cond_resched rwlocks.

This adds two new exports and two new macros without any in-tree users, which
is generally frowned upon. You and I know these will be used by KVM's new
TDP MMU, but the non-KVM folks, and more importantly the maintainers of this
code, are undoubtedly going to ask "why". I.e. these patches probably belong
in the KVM series to switch to a rwlock for the TDP MMU.

Regarding the code, it's all copy-pasted from the spinlock code and darn near
identical. It might be worth adding builder macros for these.
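
E.g. something like the below (completely untested, just to sketch the
shape; the macro name and parameter list are made up):

#define DEFINE_COND_RESCHED_LOCK(name, locktype, held_assert, needbreak, \
				 unlock, relock)			\
int __cond_resched_##name(locktype *lock)				\
{									\
	int resched = should_resched(PREEMPT_LOCK_OFFSET);		\
	int ret = 0;							\
									\
	held_assert(lock);						\
									\
	if (needbreak(lock) || resched) {				\
		unlock(lock);						\
		if (resched)						\
			preempt_schedule_common();			\
		else							\
			cpu_relax();					\
		ret = 1;						\
		relock(lock);						\
	}								\
	return ret;							\
}									\
EXPORT_SYMBOL(__cond_resched_##name)

DEFINE_COND_RESCHED_LOCK(lock, spinlock_t, lockdep_assert_held,
			 spin_needbreak, spin_unlock, spin_lock);
DEFINE_COND_RESCHED_LOCK(rwlock_read, rwlock_t, lockdep_assert_held_read,
			 rwlock_needbreak, read_unlock, read_lock);
DEFINE_COND_RESCHED_LOCK(rwlock_write, rwlock_t, lockdep_assert_held_write,
			 rwlock_needbreak, write_unlock, write_lock);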

> Signed-off-by: Ben Gardon <[email protected]>
> ---
> include/linux/sched.h | 12 ++++++++++++
> kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 52 insertions(+)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77179160ec3ab..2eb0c53fce115 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1841,12 +1841,24 @@ static inline int _cond_resched(void) { return 0; }
> })
>
> extern int __cond_resched_lock(spinlock_t *lock);
> +extern int __cond_resched_rwlock_read(rwlock_t *lock);
> +extern int __cond_resched_rwlock_write(rwlock_t *lock);
>
> #define cond_resched_lock(lock) ({ \
> ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
> __cond_resched_lock(lock); \
> })
>
> +#define cond_resched_rwlock_read(lock) ({ \
> + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
> + __cond_resched_rwlock_read(lock); \
> +})
> +
> +#define cond_resched_rwlock_write(lock) ({ \
> + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
> + __cond_resched_rwlock_write(lock); \
> +})
> +
> static inline void cond_resched_rcu(void)
> {
> #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d2003a7d5ab55..ac58e7829a063 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6152,6 +6152,46 @@ int __cond_resched_lock(spinlock_t *lock)
> }
> EXPORT_SYMBOL(__cond_resched_lock);
>
> +int __cond_resched_rwlock_read(rwlock_t *lock)
> +{
> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> + int ret = 0;
> +
> + lockdep_assert_held(lock);
> +
> + if (rwlock_needbreak(lock) || resched) {
> + read_unlock(lock);
> + if (resched)
> + preempt_schedule_common();
> + else
> + cpu_relax();
> + ret = 1;

AFAICT, this rather odd code flow from __cond_resched_lock() is an artifact of
code changes over the years and not intentionally weird. IMO, it would be
cleaner and easier to read as:

int resched = should_resched(PREEMPT_LOCK_OFFSET);

lockdep_assert_held(lock);

if (!rwlock_needbreak(lock) && !resched)
return 0;

read_unlock(lock);
if (resched)
preempt_schedule_common();
else
cpu_relax();
read_lock(lock);
return 1;


> + read_lock(lock);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(__cond_resched_rwlock_read);
> +
> +int __cond_resched_rwlock_write(rwlock_t *lock)
> +{
> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> + int ret = 0;
> +
> + lockdep_assert_held(lock);

This should be lockdep_assert_held_write().

> +
> + if (rwlock_needbreak(lock) || resched) {
> + write_unlock(lock);
> + if (resched)
> + preempt_schedule_common();
> + else
> + cpu_relax();
> + ret = 1;
> + write_lock(lock);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(__cond_resched_rwlock_write);
> +
> /**
> * yield - yield the current processor to other threads.
> *
> --
> 2.29.0.rc2.309.g374f81d7ae-goog
>

2020-10-28 20:08:08

by Peter Zijlstra

Subject: Re: [PATCH 3/3] sched: Add cond_resched_rwlock

On Tue, Oct 27, 2020 at 09:49:50AM -0700, Ben Gardon wrote:

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d2003a7d5ab55..ac58e7829a063 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6152,6 +6152,46 @@ int __cond_resched_lock(spinlock_t *lock)
> }
> EXPORT_SYMBOL(__cond_resched_lock);
>
> +int __cond_resched_rwlock_read(rwlock_t *lock)
> +{
> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> + int ret = 0;
> +
> + lockdep_assert_held(lock);

lockdep_assert_held_read(lock);

> +
> + if (rwlock_needbreak(lock) || resched) {
> + read_unlock(lock);
> + if (resched)
> + preempt_schedule_common();
> + else
> + cpu_relax();
> + ret = 1;
> + read_lock(lock);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(__cond_resched_rwlock_read);
> +
> +int __cond_resched_rwlock_write(rwlock_t *lock)
> +{
> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> + int ret = 0;
> +
> + lockdep_assert_held(lock);

lockdep_assert_held_write(lock);

> +
> + if (rwlock_needbreak(lock) || resched) {
> + write_unlock(lock);
> + if (resched)
> + preempt_schedule_common();
> + else
> + cpu_relax();
> + ret = 1;
> + write_lock(lock);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(__cond_resched_rwlock_write);

If this is the only feedback (the patches look fine to me), don't bother
resending, I'll edit them when applying.

2020-10-28 20:14:15

by Paolo Bonzini

Subject: Re: [PATCH 3/3] sched: Add cond_resched_rwlock

On 27/10/20 19:44, Peter Zijlstra wrote:
> On Tue, Oct 27, 2020 at 09:49:50AM -0700, Ben Gardon wrote:
>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index d2003a7d5ab55..ac58e7829a063 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6152,6 +6152,46 @@ int __cond_resched_lock(spinlock_t *lock)
>> }
>> EXPORT_SYMBOL(__cond_resched_lock);
>>
>> +int __cond_resched_rwlock_read(rwlock_t *lock)
>> +{
>> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
>> + int ret = 0;
>> +
>> + lockdep_assert_held(lock);
>
> lockdep_assert_held_read(lock);
>
>> +
>> + if (rwlock_needbreak(lock) || resched) {
>> + read_unlock(lock);
>> + if (resched)
>> + preempt_schedule_common();
>> + else
>> + cpu_relax();
>> + ret = 1;
>> + read_lock(lock);
>> + }
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(__cond_resched_rwlock_read);
>> +
>> +int __cond_resched_rwlock_write(rwlock_t *lock)
>> +{
>> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
>> + int ret = 0;
>> +
>> + lockdep_assert_held(lock);
>
> lockdep_assert_held_write(lock);
>
>> +
>> + if (rwlock_needbreak(lock) || resched) {
>> + write_unlock(lock);
>> + if (resched)
>> + preempt_schedule_common();
>> + else
>> + cpu_relax();
>> + ret = 1;
>> + write_lock(lock);
>> + }
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(__cond_resched_rwlock_write);
>
> If this is the only feedback (the patches look fine to me), don't bother
> resending, I'll edit them when applying.
>

If that is an Acked-by, I'll merge them through the KVM tree when the time
comes.

Paolo

2020-10-28 20:37:11

by Peter Zijlstra

Subject: Re: [PATCH 3/3] sched: Add cond_resched_rwlock

On Tue, Oct 27, 2020 at 10:56:36AM -0700, Sean Christopherson wrote:
> On Tue, Oct 27, 2020 at 09:49:50AM -0700, Ben Gardon wrote:
> > Rescheduling while holding a spin lock is essential for keeping long
> > running kernel operations running smoothly. Add the facility to
> > cond_resched rwlocks.
>
> This adds two new exports and two new macros without any in-tree users, which
> is generally frowned upon. You and I know these will be used by KVM's new
> TDP MMU, but the non-KVM folks, and more importantly the maintainers of this
> code, are undoubtedly going to ask "why". I.e. these patches probably belong
> in the KVM series to switch to a rwlock for the TDP MMU.

I was informed about this ;-)

> Regarding the code, it's all copy-pasted from the spinlock code and darn near
> identical. It might be worth adding builder macros for these.

I considered mentioning them; I'm typically a fan of them, but I'm not
quite sure it's worth the effort here.

> > +int __cond_resched_rwlock_read(rwlock_t *lock)
> > +{
> > + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> > + int ret = 0;
> > +
> > + lockdep_assert_held(lock);
> > +
> > + if (rwlock_needbreak(lock) || resched) {
> > + read_unlock(lock);
> > + if (resched)
> > + preempt_schedule_common();
> > + else
> > + cpu_relax();
> > + ret = 1;
>
> AFAICT, this rather odd code flow from __cond_resched_lock() is an artifact of
> code changes over the years and not intentionally weird. IMO, it would be
> cleaner and easier to read as:
>
> int resched = should_resched(PREEMPT_LOCK_OFFSET);
>
> lockdep_assert_held(lock);

lockdep_assert_held_read() :-)

>
> if (!rwlock_needbreak(lock) && !resched)
> return 0;
>
> read_unlock(lock);
> if (resched)
> preempt_schedule_common();
> else
> cpu_relax();
> read_lock(lock);
> return 1;
>

I suppose that works, but then also change the existing one.

2020-10-28 21:35:21

by Davidlohr Bueso

Subject: Re: [PATCH 3/3] sched: Add cond_resched_rwlock

On Tue, 27 Oct 2020, Sean Christopherson wrote:

>On Tue, Oct 27, 2020 at 09:49:50AM -0700, Ben Gardon wrote:
>> Rescheduling while holding a spin lock is essential for keeping long
>> running kernel operations running smoothly. Add the facility to
>> cond_resched rwlocks.

Nit: I would start the paragraph with 'Safely rescheduling ...'
While obvious when reading the code, 'Rescheduling while holding
a spin lock' can throw the reader off.

>
>This adds two new exports and two new macros without any in-tree users, which
>is generally frowned upon. You and I know these will be used by KVM's new
>TDP MMU, but the non-KVM folks, and more importantly the maintainers of this
>code, are undoubtedly going to ask "why". I.e. these patches probably belong
>in the KVM series to switch to a rwlock for the TDP MMU.
>
>Regarding the code, it's all copy-pasted from the spinlock code and darn near
>identical. It might be worth adding builder macros for these.

Agreed, all three could be nicely consolidated. Otherwise this series looks
sane, feel free to add my:

Acked-by: Davidlohr Bueso <[email protected]>

2020-10-30 17:11:51

by Waiman Long

Subject: Re: [PATCH 3/3] sched: Add cond_resched_rwlock

On 10/27/20 12:49 PM, Ben Gardon wrote:
> Rescheduling while holding a spin lock is essential for keeping long
> running kernel operations running smoothly. Add the facility to
> cond_resched rwlocks.
>
> Signed-off-by: Ben Gardon <[email protected]>
> ---
> include/linux/sched.h | 12 ++++++++++++
> kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 52 insertions(+)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77179160ec3ab..2eb0c53fce115 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1841,12 +1841,24 @@ static inline int _cond_resched(void) { return 0; }
> })
>
> extern int __cond_resched_lock(spinlock_t *lock);
> +extern int __cond_resched_rwlock_read(rwlock_t *lock);
> +extern int __cond_resched_rwlock_write(rwlock_t *lock);
>
> #define cond_resched_lock(lock) ({ \
> ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
> __cond_resched_lock(lock); \
> })
>
> +#define cond_resched_rwlock_read(lock) ({ \
> + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
> + __cond_resched_rwlock_read(lock); \
> +})
> +
> +#define cond_resched_rwlock_write(lock) ({ \
> + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
> + __cond_resched_rwlock_write(lock); \
> +})
> +
> static inline void cond_resched_rcu(void)
> {
> #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d2003a7d5ab55..ac58e7829a063 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6152,6 +6152,46 @@ int __cond_resched_lock(spinlock_t *lock)
> }
> EXPORT_SYMBOL(__cond_resched_lock);
>
> +int __cond_resched_rwlock_read(rwlock_t *lock)
> +{
> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> + int ret = 0;
> +
> + lockdep_assert_held(lock);
> +
> + if (rwlock_needbreak(lock) || resched) {
> + read_unlock(lock);
> + if (resched)
> + preempt_schedule_common();
> + else
> + cpu_relax();
> + ret = 1;
> + read_lock(lock);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(__cond_resched_rwlock_read);
> +
> +int __cond_resched_rwlock_write(rwlock_t *lock)
> +{
> + int resched = should_resched(PREEMPT_LOCK_OFFSET);
> + int ret = 0;
> +
> + lockdep_assert_held(lock);
> +
> + if (rwlock_needbreak(lock) || resched) {
> + write_unlock(lock);
> + if (resched)
> + preempt_schedule_common();
> + else
> + cpu_relax();
> + ret = 1;
> + write_lock(lock);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(__cond_resched_rwlock_write);
> +
> /**
> * yield - yield the current processor to other threads.
> *

Other than the lockdep_assert_held() changes spotted by others, this
patch looks good to me.

Acked-by: Waiman Long <[email protected]>