Under aarch64, __this_cpu_inc() is neither IRQ-safe nor atomic, so
when percpu_up_read() is invoked under IRQ-context (e.g. aio completion),
and it interrupts the process on the same CPU which is invoking
percpu_down_read(), the decreasement on read_count may lost and
the final value of read_count on the CPU will be unexpected
as shown below:
CPU 0 CPU 0
io_submit_one
__sb_start_write
percpu_down_read
__this_cpu_inc
// there is already an inflight IO, so
// reading *raw_cpu_ptr(&pcp) returns 1
// half complete, then being interrupted
*raw_cpu_ptr(&pcp)) += 1
nvme_irq
nvme_complete_cqes
blk_mq_complete_request
nvme_pci_complete_rq
nvme_complete_rq
blk_mq_end_request
blk_update_request
bio_endio
dio_bio_end_aio
aio_complete_rw
__sb_end_write
percpu_up_read
*raw_cpu_ptr(&pcp)) -= 1
// *raw_cpu_ptr(&pcp) is 0
// the decreasement is overwritten by the increasement
*raw_cpu_ptr(&pcp)) += 1
// the final value is 1 + 1 = 2 instead of 1
Fixing it by using the IRQ-safe helper this_cpu_inc|dec() for
operations on read_count.
Another plausible fix is to state that percpu-rwsem can NOT be
used under IRQ context and convert all users which may
use it under IRQ context.
Signed-off-by: Hou Tao <[email protected]>
---
include/linux/percpu-rwsem.h | 8 ++++----
kernel/locking/percpu-rwsem.c | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5e033fe1ff4e9..5fda40f97fe91 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -60,7 +60,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
* anything we did within this RCU-sched read-size critical section.
*/
if (likely(rcu_sync_is_idle(&sem->rss)))
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
else
__percpu_down_read(sem, false); /* Unconditional memory barrier */
/*
@@ -79,7 +79,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
* Same as in percpu_down_read().
*/
if (likely(rcu_sync_is_idle(&sem->rss)))
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
else
ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
preempt_enable();
@@ -103,7 +103,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
* Same as in percpu_down_read().
*/
if (likely(rcu_sync_is_idle(&sem->rss))) {
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
} else {
/*
* slowpath; reader will only ever wake a single blocked
@@ -115,7 +115,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
* aggregate zero, as that is the only time it matters) they
* will also see our critical section.
*/
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
rcuwait_wake_up(&sem->writer);
}
preempt_enable();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3e5203d..70a32a576f3f2 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
/*
* Due to having preemption disabled the decrement happens on
@@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
if (likely(!atomic_read_acquire(&sem->block)))
return true;
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
/* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);
--
2.25.0.4.g0ad7144999
On Wed, Sep 16, 2020 at 08:32:20PM +0800, Hou Tao wrote:
> I have simply test the performance impact on both x86 and aarch64.
>
> There is no degradation under x86 (2 sockets, 18 core per sockets, 2 threads per core)
Yeah, x86 is magical here, it's the same single instruction for both ;-)
But it is, afaik, unique in this position, no other arch can pull that
off.
> However the performance degradation is huge under aarch64 (4 sockets, 24 core per sockets): nearly 60% lost.
>
> v4.19.111
> no writer, reader cn | 24 | 48 | 72 | 96
> the rate of down_read/up_read per second | 166129572 | 166064100 | 165963448 | 165203565
> the rate of down_read/up_read per second (patched) | 63863506 | 63842132 | 63757267 | 63514920
Teh hurt :/
On Thu, Sep 17, 2020 at 04:46:38PM +0300, Boaz Harrosh wrote:
> From my totally subjective experience on the filesystem side (user of
> bio_endio) all HW block drivers I used including Nvme isci, sata... etc. end
> up calling bio_endio in softirq. The big exception to that is the vdX
> drivers under KVM. Which is very Ironic to me.
NVMe normally calls it from hardirq or IPI context. The only time it
would use softirq context is if you have a single I/O queue, which is
very unusual.
The following commit has been merged into the locking/urgent branch of tip:
Commit-ID: e6b1a44eccfcab5e5e280be376f65478c3b2c7a2
Gitweb: https://git.kernel.org/tip/e6b1a44eccfcab5e5e280be376f65478c3b2c7a2
Author: Hou Tao <[email protected]>
AuthorDate: Tue, 15 Sep 2020 22:07:50 +08:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Wed, 16 Sep 2020 16:26:56 +02:00
locking/percpu-rwsem: Use this_cpu_{inc,dec}() for read_count
The __this_cpu*() accessors are (in general) IRQ-unsafe which, given
that percpu-rwsem is a blocking primitive, should be just fine.
However, file_end_write() is used from IRQ context and will cause
load-store issues on architectures where the per-cpu accessors are not
natively irq-safe.
Fix it by using the IRQ-safe this_cpu_*() for operations on
read_count. This will generate more expensive code on a number of
platforms, which might cause a performance regression for some of the
other percpu-rwsem users.
If any such is reported, we can consider alternative solutions.
Fixes: 70fe2f48152e ("aio: fix freeze protection of aio writes")
Signed-off-by: Hou Tao <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Will Deacon <[email protected]>
Acked-by: Oleg Nesterov <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/percpu-rwsem.h | 8 ++++----
kernel/locking/percpu-rwsem.c | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5e033fe..5fda40f 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -60,7 +60,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
* anything we did within this RCU-sched read-size critical section.
*/
if (likely(rcu_sync_is_idle(&sem->rss)))
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
else
__percpu_down_read(sem, false); /* Unconditional memory barrier */
/*
@@ -79,7 +79,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
* Same as in percpu_down_read().
*/
if (likely(rcu_sync_is_idle(&sem->rss)))
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
else
ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
preempt_enable();
@@ -103,7 +103,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
* Same as in percpu_down_read().
*/
if (likely(rcu_sync_is_idle(&sem->rss))) {
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
} else {
/*
* slowpath; reader will only ever wake a single blocked
@@ -115,7 +115,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
* aggregate zero, as that is the only time it matters) they
* will also see our critical section.
*/
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
rcuwait_wake_up(&sem->writer);
}
preempt_enable();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3..70a32a5 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
/*
* Due to having preemption disabled the decrement happens on
@@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
if (likely(!atomic_read_acquire(&sem->block)))
return true;
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
/* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);