Subject: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

This is a revert of commit
a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")

which claims the only reason for using RCU-sched is
"rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"

and
"As the RCU critical sections are extremely short, using sched-RCU
shouldn't have any latency implications."

The problem with using RCU-sched here is that it disables preemption and
the callback must not acquire any sleeping locks like spinlock_t on
PREEMPT_RT which is the case with some of the users.

Using rcu_read_lock() on PREEMPTION=n kernels is not any different
compared to rcu_read_lock_sched(). On PREEMPTION=y kernels there are
already performance issues due to additional preemption points.
Looking at the code, the rcu_read_lock() is just an increment and unlock
is almost just a decrement unless there is something special to do. Both
are functions while disabling preemption is inlined.
Doing a small benchmark, the minimal amount of time required was mostly
the same. The average time required was higher due to the higher MAX
value (which could be preemption). With DEBUG_PREEMPT=y it is
rcu_read_lock_sched() that takes a little longer due to the additional
debug code.

Convert back to normal RCU.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---

Benchmark https://breakpoint.cc/percpu_test.patch

include/linux/percpu-refcount.h | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 7aef0abc194a2..390031e816dcd 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
 {
 	unsigned long __percpu *percpu_count;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count))
 		this_cpu_add(*percpu_count, nr);
 	else
 		atomic_long_add(nr, &ref->count);
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 }
 
 /**
@@ -223,7 +223,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 	unsigned long __percpu *percpu_count;
 	bool ret;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count)) {
 		this_cpu_inc(*percpu_count);
@@ -232,7 +232,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 		ret = atomic_long_inc_not_zero(&ref->count);
 	}
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -257,7 +257,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 	unsigned long __percpu *percpu_count;
 	bool ret = false;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count)) {
 		this_cpu_inc(*percpu_count);
@@ -266,7 +266,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 		ret = atomic_long_inc_not_zero(&ref->count);
 	}
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -285,14 +285,14 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
 {
 	unsigned long __percpu *percpu_count;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count))
 		this_cpu_sub(*percpu_count, nr);
 	else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
 		ref->release(ref);
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 }
 
 /**
--
2.23.0
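
As background for the "just an increment" versus "inlined preempt_disable()"
comparison in the commit message above, here is a simplified sketch of what the
two flavours amount to on a CONFIG_PREEMPT=y kernel. Lockdep annotations and
the unlock slow path are omitted, so treat this as an illustration of the idea,
not the kernel's actual implementation:

/* RCU-sched: an inlined preemption toggle. */
static inline void rcu_read_lock_sched_sketch(void)
{
	preempt_disable();
}

static inline void rcu_read_unlock_sched_sketch(void)
{
	preempt_enable();
}

/* Preemptible RCU: essentially a per-task nesting counter. */
static void rcu_read_lock_sketch(void)
{
	current->rcu_read_lock_nesting++;
	barrier();
}

static void rcu_read_unlock_sketch(void)
{
	barrier();
	if (--current->rcu_read_lock_nesting == 0) {
		/*
		 * The real __rcu_read_unlock() only takes a slow path here
		 * if the task was preempted or blocked inside the
		 * read-side section.
		 */
	}
}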


2019-10-02 16:06:49

by Paul E. McKenney

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On Wed, Oct 02, 2019 at 01:22:53PM +0200, Sebastian Andrzej Siewior wrote:
> This is a revert of commit
> a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")
>
> which claims the only reason for using RCU-sched is
> "rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"
>
> and
> "As the RCU critical sections are extremely short, using sched-RCU
> shouldn't have any latency implications."
>
> The problem with using RCU-sched here is that it disables preemption and
> the callback must not acquire any sleeping locks like spinlock_t on
> PREEMPT_RT which is the case with some of the users.

Looks good in general, but changing to RCU-preempt does not change the
fact that the callbacks execute with bh disabled. There is a newish
queue_rcu_work() that invokes a workqueue handler after a grace period.

Or am I missing your point here?

Thanx, Paul
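
As an aside on the queue_rcu_work() mechanism Paul mentions, a minimal usage
sketch, assuming a hypothetical object whose free path should run in process
context after a grace period; the type and function names are invented for
illustration and are not taken from this thread:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_obj {
	struct rcu_work free_rwork;
	/* ... object payload ... */
};

static void my_obj_free_workfn(struct work_struct *work)
{
	struct my_obj *obj = container_of(to_rcu_work(work),
					  struct my_obj, free_rwork);

	/* Runs in process context, so sleeping locks are fine here. */
	kfree(obj);
}

static void my_obj_release(struct my_obj *obj)
{
	INIT_RCU_WORK(&obj->free_rwork, my_obj_free_workfn);
	/* The work item runs only after a full RCU grace period. */
	queue_rcu_work(system_wq, &obj->free_rwork);
}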

> Using rcu_read_lock() on PREEMPTION=n kernels is not any different
> compared to rcu_read_lock_sched(). On PREEMPTION=y kernels there are
> already performance issues due to additional preemption points.
> Looking at the code, the rcu_read_lock() is just an increment and unlock
> is almost just a decrement unless there is something special to do. Both
> are functions while disabling preemption is inlined.
> Doing a small benchmark, the minimal amount of time required was mostly
> the same. The average time required was higher due to the higher MAX
> value (which could be preemption). With DEBUG_PREEMPT=y it is
> rcu_read_lock_sched() that takes a little longer due to the additional
> debug code.
>
> Convert back to normal RCU.
>
> Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> ---
>
> Benchmark https://breakpoint.cc/percpu_test.patch
>
> include/linux/percpu-refcount.h | 16 ++++++++--------
> 1 file changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
> index 7aef0abc194a2..390031e816dcd 100644
> --- a/include/linux/percpu-refcount.h
> +++ b/include/linux/percpu-refcount.h
> @@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
> {
> unsigned long __percpu *percpu_count;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count))
> this_cpu_add(*percpu_count, nr);
> else
> atomic_long_add(nr, &ref->count);
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
> }
>
> /**
> @@ -223,7 +223,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
> unsigned long __percpu *percpu_count;
> bool ret;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count)) {
> this_cpu_inc(*percpu_count);
> @@ -232,7 +232,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
> ret = atomic_long_inc_not_zero(&ref->count);
> }
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
>
> return ret;
> }
> @@ -257,7 +257,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
> unsigned long __percpu *percpu_count;
> bool ret = false;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count)) {
> this_cpu_inc(*percpu_count);
> @@ -266,7 +266,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
> ret = atomic_long_inc_not_zero(&ref->count);
> }
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
>
> return ret;
> }
> @@ -285,14 +285,14 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
> {
> unsigned long __percpu *percpu_count;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count))
> this_cpu_sub(*percpu_count, nr);
> else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
> ref->release(ref);
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
> }
>
> /**
> --
> 2.23.0
>

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On 2019-10-02 08:08:52 [-0700], Paul E. McKenney wrote:
> On Wed, Oct 02, 2019 at 01:22:53PM +0200, Sebastian Andrzej Siewior wrote:
> > This is a revert of commit
> > a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")
> >
> > which claims the only reason for using RCU-sched is
> > "rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"
> >
> > and
> > "As the RCU critical sections are extremely short, using sched-RCU
> > shouldn't have any latency implications."
> >
> > The problem with using RCU-sched here is that it disables preemption and
> > the callback must not acquire any sleeping locks like spinlock_t on
> > PREEMPT_RT which is the case with some of the users.
>
> Looks good in general, but changing to RCU-preempt does not change the
> fact that the callbacks execute with bh disabled. There is a newish
> queue_rcu_work() that invokes a workqueue handler after a grace period.
>
> Or am I missing your point here?

That is fine; I do not mean the RCU callback. The problem is that
percpu_ref_put_many() as of now does:

	rcu_read_lock_sched();	/* aka preempt_disable(); */
	if (__ref_is_percpu(ref, &percpu_count))
		this_cpu_sub(*percpu_count, nr);
	else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
		ref->release(ref);

and then the callback invoked via ref->release() acquires a spinlock_t
with disabled preemption.
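
To make the conflict concrete, a sketch of a hypothetical percpu_ref user whose
release callback takes a spinlock_t; spinlock_t is a sleeping lock on
PREEMPT_RT, so taking it with preemption disabled is not allowed. The names
below are invented for illustration and do not refer to any particular in-tree
user:

#include <linux/percpu-refcount.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_table_lock);	/* a sleeping lock on PREEMPT_RT */

static void my_ref_release(struct percpu_ref *ref)
{
	/*
	 * Invoked from percpu_ref_put_many() under rcu_read_lock_sched(),
	 * i.e. with preemption disabled.  On PREEMPT_RT spin_lock() may
	 * sleep, which is invalid in this context.
	 */
	spin_lock(&my_table_lock);
	/* remove the object from some lookup structure here */
	spin_unlock(&my_table_lock);
}

/*
 * Registered elsewhere with:
 *	percpu_ref_init(&obj->ref, my_ref_release, 0, GFP_KERNEL);
 */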

> Thanx, Paul

Sebastian

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On 2019-10-02 13:22:53 [+0200], To [email protected] wrote:
> This is a revert of commit
> a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")
>
> which claims the only reason for using RCU-sched is
> "rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"
>
> and
> "As the RCU critical sections are extremely short, using sched-RCU
> shouldn't have any latency implications."
>
> The problem with using RCU-sched here is that it disables preemption and
> the callback must not acquire any sleeping locks like spinlock_t on
> PREEMPT_RT which is the case with some of the users.
>
> Using rcu_read_lock() on PREEMPTION=n kernels is not any different
> compared to rcu_read_lock_sched(). On PREEMPTION=y kernels there are
> already performance issues due to additional preemption points.
> Looking at the code, the rcu_read_lock() is just an increment and unlock
> is almost just a decrement unless there is something special to do. Both
> are functions while disabling preemption is inlined.
> Doing a small benchmark, the minimal amount of time required was mostly
> the same. The average time required was higher due to the higher MAX
> value (which could be preemption). With DEBUG_PREEMPT=y it is
> rcu_read_lock_sched() that takes a little longer due to the additional
> debug code.
>
> Convert back to normal RCU.

a gentle ping.

> Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> ---
>
> Benchmark https://breakpoint.cc/percpu_test.patch


Sebastian

2019-11-07 16:21:01

by Dennis Zhou

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On Thu, Nov 07, 2019 at 10:13:19AM +0100, Sebastian Andrzej Siewior wrote:
> On 2019-10-02 13:22:53 [+0200], To [email protected] wrote:
> > This is a revert of commit
> > a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")
> >
> > which claims the only reason for using RCU-sched is
> > "rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"
> >
> > and
> > "As the RCU critical sections are extremely short, using sched-RCU
> > shouldn't have any latency implications."
> >
> > The problem with using RCU-sched here is that it disables preemption and
> > the callback must not acquire any sleeping locks like spinlock_t on
> > PREEMPT_RT which is the case with some of the users.
> >
> > Using rcu_read_lock() on PREEMPTION=n kernels is not any different
> > compared to rcu_read_lock_sched(). On PREEMPTION=y kernels there are
> > already performance issues due to additional preemption points.
> > Looking at the code, the rcu_read_lock() is just an increment and unlock
> > is almost just a decrement unless there is something special to do. Both
> > are functions while disabling preemption is inlined.
> > Doing a small benchmark, the minimal amount of time required was mostly
> > the same. The average time required was higher due to the higher MAX
> > value (which could be preemption). With DEBUG_PREEMPT=y it is
> > rcu_read_lock_sched() that takes a little longer due to the additional
> > debug code.
> >
> > Convert back to normal RCU.
>
> a gentle ping.
>
> > Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> > ---
> >
> > Benchmark https://breakpoint.cc/percpu_test.patch
>
>
> Sebastian

Hello,

I just want to clarify a little bit. Is this patch aimed at fixing an
issue with RT kernels specifically? It'd also be nice to have the
numbers as well as if the kernel was RT or non-RT.

Thanks,
Dennis

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On 2019-11-07 11:17:49 [-0500], Dennis Zhou wrote:
> Hello,

Hi,

> I just want to clarify a little bit. Is this patch aimed at fixing an
> issue with RT kernels specifically?

Due to the implications of preempt_disable() on RT kernels it fixes
problems with RT kernels.

> It'd also be nice to have the
> numbers as well as if the kernel was RT or non-RT.

The benchmark was done on a CONFIG_PREEMPT kernel. As said in the commit
log, the numbers were mostly the same, I can re-run the test and post
numbers if you want them.
This patch makes no difference on PREEMPT_NONE or PREEMPT_VOLUNTARY
kernels.
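
The linked percpu_test.patch is not reproduced in this thread. Purely as an
illustration of the kind of measurement being discussed, a hypothetical
in-kernel micro-benchmark might look like the sketch below; the numbers
referred to above come from Sebastian's own test patch, not from this sketch:

#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/rcupdate.h>

#define NSAMPLES	100000

static int __init rcu_lock_bench_init(void)
{
	u64 t, tmin = U64_MAX, tmax = 0, tsum = 0;
	int i;

	for (i = 0; i < NSAMPLES; i++) {
		t = ktime_get_ns();
		rcu_read_lock();
		rcu_read_unlock();
		t = ktime_get_ns() - t;

		tmin = min(tmin, t);
		tmax = max(tmax, t);
		tsum += t;
	}
	pr_info("rcu_read_lock/unlock: min=%llu avg=%llu max=%llu ns\n",
		tmin, div_u64(tsum, NSAMPLES), tmax);

	/* Repeat the loop with rcu_read_lock_sched()/_unlock_sched() to compare. */
	return -ENODEV;	/* nothing to keep loaded */
}
module_init(rcu_lock_bench_init);
MODULE_LICENSE("GPL");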

> Thanks,
> Dennis

Sebastian

2019-11-07 16:56:45

by Dennis Zhou

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On Thu, Nov 07, 2019 at 05:28:42PM +0100, Sebastian Andrzej Siewior wrote:
> On 2019-11-07 11:17:49 [-0500], Dennis Zhou wrote:
> > Hello,
>
> Hi,
>
> > I just want to clarify a little bit. Is this patch aimed at fixing an
> > issue with RT kernels specifically?
>
> Due to the implications of preempt_disable() on RT kernels it fixes
> problems with RT kernels.
>

Great, do you mind adding this explanation with what the implications
are in the commit message?


> > It'd also be nice to have the
> > numbers as well as if the kernel was RT or non-RT.
>
> The benchmark was done on a CONFIG_PREEMPT kernel. As said in the commit
> log, the numbers were mostly the same, I can re-run the test and post
> numbers if you want them.
> This patch makes no difference on PREEMPT_NONE or PREEMPT_VOLUNTARY
> kernels.
>

I think a more explicit explanation in the commit message would suffice.

Thanks,
Dennis

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On 2019-11-07 11:55:19 [-0500], Dennis Zhou wrote:
> On Thu, Nov 07, 2019 at 05:28:42PM +0100, Sebastian Andrzej Siewior wrote:
> > > I just want to clarify a little bit. Is this patch aimed at fixing an
> > > issue with RT kernels specifically?
> >
> > Due to the implications of preempt_disable() on RT kernels it fixes
> > problems with RT kernels.
> >
>
> Great, do you mind adding this explanation with what the implications
> are in the commit message?

some RCU section here invoke callbacks which acquire spinlock_t locks.
This does not work on RT with disabled preemption.

> > > It'd also be nice to have the
> > > numbers as well as if the kernel was RT or non-RT.
> >
> > The benchmark was done on a CONFIG_PREEMPT kernel. As said in the commit
> > log, the numbers were mostly the same, I can re-run the test and post
> > numbers if you want them.
> > This patch makes no difference on PREEMPT_NONE or PREEMPT_VOLUNTARY
> > kernels.
> >
>
> I think a more explicit explanation in the commit message would suffice.

What do you mean by "more explicit explanation"? The part with the
numbers or that it makes no difference for PREEMPT_NONE and
PREEMPT_VOLUNTARY?

> Thanks,
> Dennis

Sebastian

2019-11-07 17:40:12

by Dennis Zhou

Subject: Re: [PATCH] percpu-refcount: Use normal instead of RCU-sched"

On Thu, Nov 07, 2019 at 06:24:34PM +0100, Sebastian Andrzej Siewior wrote:
> On 2019-11-07 11:55:19 [-0500], Dennis Zhou wrote:
> > On Thu, Nov 07, 2019 at 05:28:42PM +0100, Sebastian Andrzej Siewior wrote:
> > > > I just want to clarify a little bit. Is this patch aimed at fixing an
> > > > issue with RT kernels specifically?
> > >
> > > Due to the implications of preempt_disable() on RT kernels it fixes
> > > problems with RT kernels.
> > >
> >
> > Great, do you mind adding this explanation with what the implications
> > are in the commit message?
>
> some RCU section here invoke callbacks which acquire spinlock_t locks.
> This does not work on RT with disabled preemption.
>

Yeah, so adding a bit in the commit message about why it's an issue for
RT kernels with disabled preemption as I don't believe this is an issue
for non-RT kernels.


> > > > It'd also be nice to have the
> > > > numbers as well as if the kernel was RT or non-RT.
> > >
> > > The benchmark was done on a CONFIG_PREEMPT kernel. As said in the commit
> > > log, the numbers were mostly the same, I can re-run the test and post
> > > numbers if you want them.
> > > This patch makes no difference on PREEMPT_NONE or PREEMPT_VOLUNTARY
> > > kernels.
> > >
> >
> > I think a more explicit explanation in the commit message would suffice.
>
> What do you mean by "more explicit explanation"? The part with the
> numbers or that it makes no difference for PREEMPT_NONE and
> PREEMPT_VOLUNTARY?
>

I just meant the above, the benchmarking is fine.

Thanks,
Dennis

Subject: [PATCH v2] percpu-refcount: Use normal instead of RCU-sched"

This is a revert of commit
a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")

which claims the only reason for using RCU-sched is
"rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"

and
"As the RCU critical sections are extremely short, using sched-RCU
shouldn't have any latency implications."

The problem with using RCU-sched here is that it disables preemption and
the release callback (called from percpu_ref_put_many()) must not
acquire any sleeping locks like spinlock_t. This breaks PREEMPT_RT
because some of the users acquire spinlock_t locks in their callbacks.

Using rcu_read_lock() on PREEMPTION=n kernels is not any different
compared to rcu_read_lock_sched(). On PREEMPTION=y kernels there are
already performance issues due to additional preemption points.
Looking at the code, the rcu_read_lock() is just an increment and unlock
is almost just a decrement unless there is something special to do. Both
are functions while disabling preemption is inlined.
Doing a small benchmark, the minimal amount of time required was mostly
the same. The average time required was higher due to the higher MAX
value (which could be preemption). With DEBUG_PREEMPT=y it is
rcu_read_lock_sched() that takes a little longer due to the additional
debug code.

Convert back to normal RCU.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
On 2019-11-07 12:36:53 [-0500], Dennis Zhou wrote:
> > some RCU section here invoke callbacks which acquire spinlock_t locks.
> > This does not work on RT with disabled preemption.
> >
>
> Yeah, so adding a bit in the commit message about why it's an issue for
> RT kernels with disabled preemption as I don't believe this is an issue
> for non-RT kernels.

I realized that I already had part of this in the commit message, so I rewrote
the second paragraph; hopefully it now covers it all more explicitly.

v1…v2: Slightly rewriting the second paragraph regarding RT
implications.

include/linux/percpu-refcount.h | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 7aef0abc194a2..390031e816dcd 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
 {
 	unsigned long __percpu *percpu_count;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count))
 		this_cpu_add(*percpu_count, nr);
 	else
 		atomic_long_add(nr, &ref->count);
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 }
 
 /**
@@ -223,7 +223,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 	unsigned long __percpu *percpu_count;
 	bool ret;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count)) {
 		this_cpu_inc(*percpu_count);
@@ -232,7 +232,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 		ret = atomic_long_inc_not_zero(&ref->count);
 	}
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -257,7 +257,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 	unsigned long __percpu *percpu_count;
 	bool ret = false;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count)) {
 		this_cpu_inc(*percpu_count);
@@ -266,7 +266,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 		ret = atomic_long_inc_not_zero(&ref->count);
 	}
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -285,14 +285,14 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
 {
 	unsigned long __percpu *percpu_count;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 
 	if (__ref_is_percpu(ref, &percpu_count))
 		this_cpu_sub(*percpu_count, nr);
 	else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
 		ref->release(ref);
 
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 }
 
 /**
--
2.24.0


Subject: Re: [PATCH v2] percpu-refcount: Use normal instead of RCU-sched"

On Fri, 8 Nov 2019, Sebastian Andrzej Siewior wrote:

> diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
> index 7aef0abc194a2..390031e816dcd 100644
> --- a/include/linux/percpu-refcount.h
> +++ b/include/linux/percpu-refcount.h
> @@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
> {
> unsigned long __percpu *percpu_count;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count))
> this_cpu_add(*percpu_count, nr);

You can use

__this_cpu_add()

instead since rcu_read_lock implies preempt disable.

This will not change the code for x86, but other platforms that do not use an
atomic operation here will be able to avoid including the code to disable
preemption for the per-cpu operations.

Same is valid for all other per cpu operations in the patch.
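
For context on the suggestion, a stand-alone sketch of the difference between
the two families (the variable and function names are invented for
illustration): this_cpu_add() is safe against preemption on its own, while
__this_cpu_add() documents that the caller has already disabled preemption and,
with CONFIG_DEBUG_PREEMPT, complains otherwise — which is what the follow-up
below runs into:

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, my_counter);

static void my_count(void)
{
	/* Preemption-safe on every architecture. */
	this_cpu_add(my_counter, 1);

	/*
	 * The caller is responsible for disabling preemption; with
	 * CONFIG_DEBUG_PREEMPT the check fires otherwise (see the
	 * "BUG: using __this_cpu_add() in preemptible" splat below).
	 */
	preempt_disable();
	__this_cpu_add(my_counter, 1);
	preempt_enable();
}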

Subject: Re: [PATCH v2] percpu-refcount: Use normal instead of RCU-sched"

On 2019-11-08 18:17:47 [+0000], Christopher Lameter wrote:
> On Fri, 8 Nov 2019, Sebastian Andrzej Siewior wrote:
>
> > diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
> > index 7aef0abc194a2..390031e816dcd 100644
> > --- a/include/linux/percpu-refcount.h
> > +++ b/include/linux/percpu-refcount.h
> > @@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
> > {
> > unsigned long __percpu *percpu_count;
> >
> > - rcu_read_lock_sched();
> > + rcu_read_lock();
> >
> > if (__ref_is_percpu(ref, &percpu_count))
> > this_cpu_add(*percpu_count, nr);
>
> You can use
>
> __this_cpu_add()
>
> instead since rcu_read_lock implies preempt disable.

Paul may correct me but rcu_read_lock() does not imply
preempt_disable(). rcu_read_lock() does preempt_disable() on preemption
models other than "Low-Latency Desktop". You can be preempted in a
RCU-read section.

> This will not change the code for x86, but other platforms that do not use an
> atomic operation here will be able to avoid including the code to disable
> preemption for the per-cpu operations.

x86 triggers this warning with the suggested change:

| BUG: using __this_cpu_add() in preemptible [00000000] code: login/2370
| caller is blk_mq_get_request+0x74/0x4c0
| CPU: 0 PID: 2370 Comm: login Not tainted 5.4.0-rc7+ #82
| Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 04/01/2014
| Call Trace:
| dump_stack+0x7a/0xaa
| __this_cpu_preempt_check.cold+0x49/0x4e
| blk_mq_get_request+0x74/0x4c0
| blk_mq_make_request+0x111/0x890
| generic_make_request+0xd3/0x3f0
| submit_bio+0x42/0x140
| submit_bh_wbc.isra.0+0x13f/0x170
| ll_rw_block+0xa0/0xb0
| __breadahead+0x3f/0x70
| __ext4_get_inode_loc+0x40a/0x520
| __ext4_iget+0x10a/0xcf0
| ext4_lookup+0x106/0x1f0
| lookup_open+0x267/0x8e0
| path_openat+0x7c8/0xcb0
| do_filp_open+0x8c/0x100
| do_sys_open+0x17a/0x230
| __x64_sys_openat+0x1b/0x20
| do_syscall_64+0x5a/0x230
| entry_SYSCALL_64_after_hwframe+0x49/0xbe

> Same is valid for all other per cpu operations in the patch.

Sebastian

2019-11-11 13:40:55

by Paul E. McKenney

Subject: Re: [PATCH v2] percpu-refcount: Use normal instead of RCU-sched"

On Mon, Nov 11, 2019 at 11:44:03AM +0100, Sebastian Andrzej Siewior wrote:
> On 2019-11-08 18:17:47 [+0000], Christopher Lameter wrote:
> > On Fri, 8 Nov 2019, Sebastian Andrzej Siewior wrote:
> >
> > > diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
> > > index 7aef0abc194a2..390031e816dcd 100644
> > > --- a/include/linux/percpu-refcount.h
> > > +++ b/include/linux/percpu-refcount.h
> > > @@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
> > > {
> > > unsigned long __percpu *percpu_count;
> > >
> > > - rcu_read_lock_sched();
> > > + rcu_read_lock();
> > >
> > > if (__ref_is_percpu(ref, &percpu_count))
> > > this_cpu_add(*percpu_count, nr);
> >
> > You can use
> >
> > __this_cpu_add()
> >
> > instead since rcu_read_lock implies preempt disable.
>
> Paul may correct me but rcu_read_lock() does not imply
> preempt_disable(). rcu_read_lock() does preempt_disable() on preemption
> models other than "Low-Latency Desktop". You can be preempted in a
> RCU-read section.

Sebastian is quite correct, rcu_read_lock() definitely does not imply
preempt_disable() when CONFIG_PREEMPT=y. So the splat below is expected
behavior, and not just on x86.

Thanx, Paul

> > This will not change the code for x86, but other platforms that do not use an
> > atomic operation here will be able to avoid including the code to disable
> > preemption for the per-cpu operations.
>
> x86 triggers this warning with the suggested change:
>
> | BUG: using __this_cpu_add() in preemptible [00000000] code: login/2370
> | caller is blk_mq_get_request+0x74/0x4c0
> | CPU: 0 PID: 2370 Comm: login Not tainted 5.4.0-rc7+ #82
> | Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 04/01/2014
> | Call Trace:
> | dump_stack+0x7a/0xaa
> | __this_cpu_preempt_check.cold+0x49/0x4e
> | blk_mq_get_request+0x74/0x4c0
> | blk_mq_make_request+0x111/0x890
> | generic_make_request+0xd3/0x3f0
> | submit_bio+0x42/0x140
> | submit_bh_wbc.isra.0+0x13f/0x170
> | ll_rw_block+0xa0/0xb0
> | __breadahead+0x3f/0x70
> | __ext4_get_inode_loc+0x40a/0x520
> | __ext4_iget+0x10a/0xcf0
> | ext4_lookup+0x106/0x1f0
> | lookup_open+0x267/0x8e0
> | path_openat+0x7c8/0xcb0
> | do_filp_open+0x8c/0x100
> | do_sys_open+0x17a/0x230
> | __x64_sys_openat+0x1b/0x20
> | do_syscall_64+0x5a/0x230
> | entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> > Same is valid for all other per cpu operations in the patch.
>
> Sebastian

2019-11-17 04:14:51

by Dennis Zhou

Subject: Re: [PATCH v2] percpu-refcount: Use normal instead of RCU-sched"

On Fri, Nov 08, 2019 at 06:35:53PM +0100, Sebastian Andrzej Siewior wrote:
> This is a revert of commit
> a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")
>
> which claims the only reason for using RCU-sched is
> "rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"
>
> and
> "As the RCU critical sections are extremely short, using sched-RCU
> shouldn't have any latency implications."
>
> The problem with using RCU-sched here is that it disables preemption and
> the release callback (called from percpu_ref_put_many()) must not
> acquire any sleeping locks like spinlock_t. This breaks PREEMPT_RT
> because some of the users acquire spinlock_t locks in their callbacks.
>
> Using rcu_read_lock() on PREEMPTION=n kernels is not any different
> compared to rcu_read_lock_sched(). On PREEMPTION=y kernels there are
> already performance issues due to additional preemption points.
> Looking at the code, the rcu_read_lock() is just an increment and unlock
> is almost just a decrement unless there is something special to do. Both
> are functions while disabling preemption is inlined.
> Doing a small benchmark, the minimal amount of time required was mostly
> the same. The average time required was higher due to the higher MAX
> value (which could be preemption). With DEBUG_PREEMPT=y it is
> rcu_read_lock_sched() that takes a little longer due to the additional
> debug code.
>
> Convert back to normal RCU.
>
> Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> ---
> On 2019-11-07 12:36:53 [-0500], Dennis Zhou wrote:
> > > some RCU section here invoke callbacks which acquire spinlock_t locks.
> > > This does not work on RT with disabled preemption.
> > >
> >
> > Yeah, so adding a bit in the commit message about why it's an issue for
> > RT kernels with disabled preemption as I don't believe this is an issue
> > for non-RT kernels.
>
> I realized that I already had part of this in the commit message, so I rewrote
> the second paragraph; hopefully it now covers it all more explicitly.
>
> v1…v2: Slightly rewriting the second paragraph regarding RT
> implications.
>
> include/linux/percpu-refcount.h | 16 ++++++++--------
> 1 file changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
> index 7aef0abc194a2..390031e816dcd 100644
> --- a/include/linux/percpu-refcount.h
> +++ b/include/linux/percpu-refcount.h
> @@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
> {
> unsigned long __percpu *percpu_count;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count))
> this_cpu_add(*percpu_count, nr);
> else
> atomic_long_add(nr, &ref->count);
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
> }
>
> /**
> @@ -223,7 +223,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
> unsigned long __percpu *percpu_count;
> bool ret;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count)) {
> this_cpu_inc(*percpu_count);
> @@ -232,7 +232,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
> ret = atomic_long_inc_not_zero(&ref->count);
> }
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
>
> return ret;
> }
> @@ -257,7 +257,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
> unsigned long __percpu *percpu_count;
> bool ret = false;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count)) {
> this_cpu_inc(*percpu_count);
> @@ -266,7 +266,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
> ret = atomic_long_inc_not_zero(&ref->count);
> }
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
>
> return ret;
> }
> @@ -285,14 +285,14 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
> {
> unsigned long __percpu *percpu_count;
>
> - rcu_read_lock_sched();
> + rcu_read_lock();
>
> if (__ref_is_percpu(ref, &percpu_count))
> this_cpu_sub(*percpu_count, nr);
> else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
> ref->release(ref);
>
> - rcu_read_unlock_sched();
> + rcu_read_unlock();
> }
>
> /**
> --
> 2.24.0
>
>

Sorry for sitting on this for so long. I've applied it to for-5.5.

Thanks,
Dennis