LinuxLists.cc - Re: [PATCH V3 11/27] csky: Atomic operations

2018-09-12 15:56:56

Subject: Re: [PATCH V3 11/27] csky: Atomic operations

On Wed, Sep 12, 2018 at 09:24:45PM +0800, Guo Ren wrote:

> +#define ATOMIC_OP(op, c_op) \
> +static inline void atomic_##op(int i, atomic_t *v) \
> +{ \
> + unsigned long tmp; \
> + \
> + smp_mb(); \
> + asm volatile ( \
> + "1: ldex.w %0, (%2) \n" \
> + " " #op " %0, %1 \n" \
> + " stex.w %0, (%2) \n" \
> + " bez %0, 1b \n" \
> + : "=&r" (tmp) \
> + : "r" (i), "r"(&v->counter) \
> + : "memory"); \
> + smp_mb(); \
> +}

ATOMIC_OP doesn't need to imply any smp_mb()s whatsoever.

> +#define ATOMIC_OP_RETURN(op, c_op) \
> +static inline int atomic_##op##_return(int i, atomic_t *v) \
> +{ \
> + unsigned long tmp, ret; \
> + \
> + smp_mb(); \
> + asm volatile ( \
> + "1: ldex.w %0, (%3) \n" \
> + " " #op " %0, %2 \n" \
> + " mov %1, %0 \n" \
> + " stex.w %0, (%3) \n" \
> + " bez %0, 1b \n" \
> + : "=&r" (tmp), "=&r" (ret) \
> + : "r" (i), "r"(&v->counter) \
> + : "memory"); \
> + smp_mb(); \
> + \
> + return ret; \
> +}
> +
> +#define ATOMIC_FETCH_OP(op, c_op) \
> +static inline int atomic_fetch_##op(int i, atomic_t *v) \
> +{ \
> + unsigned long tmp, ret; \
> + \
> + smp_mb(); \
> + asm volatile ( \
> + "1: ldex.w %0, (%3) \n" \
> + " mov %1, %0 \n" \
> + " " #op " %0, %2 \n" \
> + " stex.w %0, (%3) \n" \
> + " bez %0, 1b \n" \
> + : "=&r" (tmp), "=&r" (ret) \
> + : "r" (i), "r"(&v->counter) \
> + : "memory"); \
> + smp_mb(); \
> + \
> + return ret; \
> +}

For these you could generate _relaxed variants and not provide smp_mb()
inside them.

> +#else /* CONFIG_CPU_HAS_LDSTEX */
> +
> +#include <linux/irqflags.h>
> +

> +#define ATOMIC_OP(op, c_op) \
> +static inline void atomic_##op(int i, atomic_t *v) \
> +{ \
> + unsigned long tmp, flags; \
> + \
> + raw_local_irq_save(flags); \
> + \
> + asm volatile ( \
> + " ldw %0, (%2) \n" \
> + " " #op " %0, %1 \n" \
> + " stw %0, (%2) \n" \
> + : "=&r" (tmp) \
> + : "r" (i), "r"(&v->counter) \
> + : "memory"); \
> + \
> + raw_local_irq_restore(flags); \
> +}

Is this really 'better' than the generic UP fallback implementation?

> diff --git a/arch/csky/include/asm/spinlock.h b/arch/csky/include/asm/spinlock.h
> new file mode 100644
> index 0000000..f1081bb
> --- /dev/null
> +++ b/arch/csky/include/asm/spinlock.h
> @@ -0,0 +1,286 @@
> +#ifndef __ASM_CSKY_SPINLOCK_H
> +#define __ASM_CSKY_SPINLOCK_H
> +
> +#include <linux/spinlock_types.h>
> +#include <asm/barrier.h>
> +
> +#ifdef CONFIG_QUEUED_RWLOCKS
> +
> +/*
> + * Ticket-based spin-locking.
> + */
> +static inline void arch_spin_lock(arch_spinlock_t *lock)
> +{
> + arch_spinlock_t lockval;
> + u32 ticket_next = 1 << TICKET_NEXT;
> + u32 *p = &lock->lock;
> + u32 tmp;
> +
> + smp_mb();

spin_lock() doesn't need smp_mb() before.

> + asm volatile (
> + "1: ldex.w %0, (%2) \n"
> + " mov %1, %0 \n"
> + " add %0, %3 \n"
> + " stex.w %0, (%2) \n"
> + " bez %0, 1b \n"
> + : "=&r" (tmp), "=&r" (lockval)
> + : "r"(p), "r"(ticket_next)
> + : "cc");
> +
> + while (lockval.tickets.next != lockval.tickets.owner) {
> + lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> + }
> +
> + smp_mb();
> +}
> +
> +static inline int arch_spin_trylock(arch_spinlock_t *lock)
> +{
> + u32 tmp, contended, res;
> + u32 ticket_next = 1 << TICKET_NEXT;
> + u32 *p = &lock->lock;
> +
> + smp_mb();

idem.

> + do {
> + asm volatile (
> + " ldex.w %0, (%3) \n"
> + " movi %2, 1 \n"
> + " rotli %1, %0, 16 \n"
> + " cmpne %1, %0 \n"
> + " bt 1f \n"
> + " movi %2, 0 \n"
> + " add %0, %0, %4 \n"
> + " stex.w %0, (%3) \n"
> + "1: \n"
> + : "=&r" (res), "=&r" (tmp), "=&r" (contended)
> + : "r"(p), "r"(ticket_next)
> + : "cc");
> + } while (!res);
> +
> + if (!contended)
> + smp_mb();
> +
> + return !contended;
> +}
> +
> +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> +{
> + smp_mb();
> + lock->tickets.owner++;
> + smp_mb();

spin_unlock() doesn't need smp_mb() after.

> +}
> +
> +static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
> +{
> + return lock.tickets.owner == lock.tickets.next;
> +}
> +
> +static inline int arch_spin_is_locked(arch_spinlock_t *lock)
> +{
> + return !arch_spin_value_unlocked(READ_ONCE(*lock));
> +}
> +
> +static inline int arch_spin_is_contended(arch_spinlock_t *lock)
> +{
> + struct __raw_tickets tickets = READ_ONCE(lock->tickets);
> + return (tickets.next - tickets.owner) > 1;
> +}
> +#define arch_spin_is_contended arch_spin_is_contended
> +
> +#include <asm/qrwlock.h>
> +
> +/* See include/linux/spinlock.h */
> +#define smp_mb__after_spinlock() smp_mb()
> +
> +#else /* CONFIG_QUEUED_RWLOCKS */
> +
> +/*
> + * Test-and-set spin-locking.
> + */

Why retain that?

same comments; it has far too many smp_mb()s in.

> +#endif /* CONFIG_QUEUED_RWLOCKS */
> +#endif /* __ASM_CSKY_SPINLOCK_H */
> diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h
> new file mode 100644
> index 0000000..7e825c2
> --- /dev/null
> +++ b/arch/csky/include/asm/spinlock_types.h
> @@ -0,0 +1,35 @@
> +#ifndef __ASM_CSKY_SPINLOCK_TYPES_H
> +#define __ASM_CSKY_SPINLOCK_TYPES_H
> +
> +#ifndef __LINUX_SPINLOCK_TYPES_H
> +# error "please don't include this file directly"
> +#endif
> +
> +#define TICKET_NEXT 16
> +
> +typedef struct {
> + union {
> + u32 lock;
> + struct __raw_tickets {
> + /* little endian */
> + u16 owner;
> + u16 next;
> + } tickets;
> + };
> +} arch_spinlock_t;
> +
> +#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
> +
> +#ifdef CONFIG_QUEUED_RWLOCKS
> +#include <asm-generic/qrwlock_types.h>
> +
> +#else /* CONFIG_NR_CPUS > 2 */
> +
> +typedef struct {
> + u32 lock;
> +} arch_rwlock_t;
> +
> +#define __ARCH_RW_LOCK_UNLOCKED { 0 }
> +
> +#endif /* CONFIG_QUEUED_RWLOCKS */
> +#endif /* __ASM_CSKY_SPINLOCK_TYPES_H */

2018-09-15 14:56:10

by Guo Ren

[permalink] [raw]

Subject: Re: [PATCH V3 11/27] csky: Atomic operations

Thx for the review, that's very helpful.

On Wed, Sep 12, 2018 at 05:55:14PM +0200, Peter Zijlstra wrote:
> On Wed, Sep 12, 2018 at 09:24:45PM +0800, Guo Ren wrote:
>
> > +#define ATOMIC_OP(op, c_op) \
> > +static inline void atomic_##op(int i, atomic_t *v) \
> > +{ \
> > + unsigned long tmp; \
> > + \
> > + smp_mb(); \
> > + asm volatile ( \
> > + "1: ldex.w %0, (%2) \n" \
> > + " " #op " %0, %1 \n" \
> > + " stex.w %0, (%2) \n" \
> > + " bez %0, 1b \n" \
> > + : "=&r" (tmp) \
> > + : "r" (i), "r"(&v->counter) \
> > + : "memory"); \
> > + smp_mb(); \
> > +}
>
> ATOMIC_OP doesn't need to imply any smp_mb()'s what so ever.
Ok.

> > +#define ATOMIC_OP_RETURN(op, c_op) \
> > +static inline int atomic_##op##_return(int i, atomic_t *v) \
> > +{ \
> > + unsigned long tmp, ret; \
> > + \
> > + smp_mb(); \
> > + asm volatile ( \
> > + "1: ldex.w %0, (%3) \n" \
> > + " " #op " %0, %2 \n" \
> > + " mov %1, %0 \n" \
> > + " stex.w %0, (%3) \n" \
> > + " bez %0, 1b \n" \
> > + : "=&r" (tmp), "=&r" (ret) \
> > + : "r" (i), "r"(&v->counter) \
> > + : "memory"); \
> > + smp_mb(); \
> > + \
> > + return ret; \
> > +}
> > +
> > +#define ATOMIC_FETCH_OP(op, c_op) \
> > +static inline int atomic_fetch_##op(int i, atomic_t *v) \
> > +{ \
> > + unsigned long tmp, ret; \
> > + \
> > + smp_mb(); \
> > + asm volatile ( \
> > + "1: ldex.w %0, (%3) \n" \
> > + " mov %1, %0 \n" \
> > + " " #op " %0, %2 \n" \
> > + " stex.w %0, (%3) \n" \
> > + " bez %0, 1b \n" \
> > + : "=&r" (tmp), "=&r" (ret) \
> > + : "r" (i), "r"(&v->counter) \
> > + : "memory"); \
> > + smp_mb(); \
> > + \
> > + return ret; \
> > +}
>
> For these you could generate _relaxed variants and not provide smp_mb()
> inside them.
Ok, but I'll modify it in next commit.

> > +#else /* CONFIG_CPU_HAS_LDSTEX */
> > +
> > +#include <linux/irqflags.h>
> > +
>
> > +#define ATOMIC_OP(op, c_op) \
> > +static inline void atomic_##op(int i, atomic_t *v) \
> > +{ \
> > + unsigned long tmp, flags; \
> > + \
> > + raw_local_irq_save(flags); \
> > + \
> > + asm volatile ( \
> > + " ldw %0, (%2) \n" \
> > + " " #op " %0, %1 \n" \
> > + " stw %0, (%2) \n" \
> > + : "=&r" (tmp) \
> > + : "r" (i), "r"(&v->counter) \
> > + : "memory"); \
> > + \
> > + raw_local_irq_restore(flags); \
> > +}
>
> Is this really 'better' than the generic UP fallback implementation?
There is an IRQ-locking instruction, "idly4", that works without irq_save, e.g.:
asm volatile ( \
" idly4 \n" \
" ldw %0, (%2) \n" \
" " #op " %0, %1 \n" \
" stw %0, (%2) \n" \
I'll change to that after it has been fully tested.

> > +static inline void arch_spin_lock(arch_spinlock_t *lock)
> > +{
> > + arch_spinlock_t lockval;
> > + u32 ticket_next = 1 << TICKET_NEXT;
> > + u32 *p = &lock->lock;
> > + u32 tmp;
> > +
> > + smp_mb();
>
> spin_lock() doesn't need smp_mb() before.
read_lock and write_lock also don't need smp_mb() before, do they?

> > +
> > +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + smp_mb();
> > + lock->tickets.owner++;
> > + smp_mb();
>
> spin_unlock() doesn't need smp_mb() after.
read_unlock and write_unlock also don't need smp_mb() after, do they?

> > +#else /* CONFIG_QUEUED_RWLOCKS */
> > +
> > +/*
> > + * Test-and-set spin-locking.
> > + */
>
> Why retain that?
>
> same comments; it has far too many smp_mb()s in.
I'm not sure about queued_rwlocks, and for a 2-core SMP system test-and-set is
faster and simpler, isn't it?

Best Regards
Guo Ren

2018-09-17 08:18:35

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH V3 11/27] csky: Atomic operations

On Sat, Sep 15, 2018 at 10:55:13PM +0800, Guo Ren wrote:
> > > +#define ATOMIC_OP_RETURN(op, c_op) \

> > > +#define ATOMIC_FETCH_OP(op, c_op) \

> > For these you could generate _relaxed variants and not provide smp_mb()
> > inside them.
> Ok, but I'll modify it in next commit.

That's fine. Just wanted to let you know about _relaxed() since it will
benefit your platform.

> > > +#define ATOMIC_OP(op, c_op) \
> > > +static inline void atomic_##op(int i, atomic_t *v) \
> > > +{ \
> > > + unsigned long tmp, flags; \
> > > + \
> > > + raw_local_irq_save(flags); \
> > > + \
> > > + asm volatile ( \
> > > + " ldw %0, (%2) \n" \
> > > + " " #op " %0, %1 \n" \
> > > + " stw %0, (%2) \n" \
> > > + : "=&r" (tmp) \
> > > + : "r" (i), "r"(&v->counter) \
> > > + : "memory"); \
> > > + \
> > > + raw_local_irq_restore(flags); \
> > > +}
> >
> > Is this really 'better' than the generic UP fallback implementation?
> There is a lock irq instruction "idly4" with out irq_save. eg:
> asm volatile ( \
> " idly4 \n" \
> " ldw %0, (%2) \n" \
> " " #op " %0, %1 \n" \
> " stw %0, (%2) \n" \
> I'll change to that after full tested.

That is pretty nifty; could you explain (or refer me to an arch doc
that does) the exact semantics of that "idly4" instruction?

> > > +static inline void arch_spin_lock(arch_spinlock_t *lock)
> > > +{
> > > + arch_spinlock_t lockval;
> > > + u32 ticket_next = 1 << TICKET_NEXT;
> > > + u32 *p = &lock->lock;
> > > + u32 tmp;
> > > +
> > > + smp_mb();
> >
> > spin_lock() doesn't need smp_mb() before.
> read_lock and write_lock also needn't smp_mb() before, isn't it?

Correct. The various *_lock() functions only need imply an ACQUIRE
barrier, such that the critical section happens after the lock is taken.

> > > +
> > > +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> > > +{
> > > + smp_mb();
> > > + lock->tickets.owner++;
> > > + smp_mb();
> >
> > spin_unlock() doesn't need smp_mb() after.
> read_unlock and write_unlock also needn't smp_mb() after, isn't it?

Indeed so, the various *_unlock() functions only need imply a RELEASE
barrier, such that the critical section happened before the lock is
released.

In both cases (lock and unlock) there is a great amount of subtle
details, but most of that is irrelevant if all you have is smp_mb().

> > > +/*
> > > + * Test-and-set spin-locking.
> > > + */
> >
> > Why retain that?
> >
> > same comments; it has far too many smp_mb()s in.
> I'm not sure about queued_rwlocks and just for 2-cores-smp test-and-set is
> faster and simpler, isn't it?

Even on 2 cores I think you can create starvation cases with
test-and-set spinlocks. And the maintenance overhead of carrying two lock
implementations is non-trivial.

As to performance, I cannot say, but the ticket lock isn't very
expensive; you could benchmark it, of course.

2018-09-17 15:08:03

by Guo Ren

[permalink] [raw]

Subject: Re: [PATCH V3 11/27] csky: Atomic operations

On Mon, Sep 17, 2018 at 10:17:55AM +0200, Peter Zijlstra wrote:
> On Sat, Sep 15, 2018 at 10:55:13PM +0800, Guo Ren wrote:
> > > > +#define ATOMIC_OP_RETURN(op, c_op) \
>
> > > > +#define ATOMIC_FETCH_OP(op, c_op) \
>
> > > For these you could generate _relaxed variants and not provide smp_mb()
> > > inside them.
> > Ok, but I'll modify it in next commit.
>
> That's fine. Just wanted to let you know about _relaxed() since it will
> benefit your platform.
Thank you.

> > > > +#define ATOMIC_OP(op, c_op) \
> > > > +static inline void atomic_##op(int i, atomic_t *v) \
> > > > +{ \
> > > > + unsigned long tmp, flags; \
> > > > + \
> > > > + raw_local_irq_save(flags); \
> > > > + \
> > > > + asm volatile ( \
> > > > + " ldw %0, (%2) \n" \
> > > > + " " #op " %0, %1 \n" \
> > > > + " stw %0, (%2) \n" \
> > > > + : "=&r" (tmp) \
> > > > + : "r" (i), "r"(&v->counter) \
> > > > + : "memory"); \
> > > > + \
> > > > + raw_local_irq_restore(flags); \
> > > > +}
> > >
> > > Is this really 'better' than the generic UP fallback implementation?
> > There is a lock irq instruction "idly4" with out irq_save. eg:
> > asm volatile ( \
> > " idly4 \n" \
> > " ldw %0, (%2) \n" \
> > " " #op " %0, %1 \n" \
> > " stw %0, (%2) \n" \
> > I'll change to that after full tested.
>
> That is pretty nifty, could you explain (or reference me to a arch doc
> that does) the exact semantics of that "idly4" instruction?
The idly4 instruction lets the 4 instructions following it run without responding to interrupts.
If the ldw takes an exception, it will cause the carry to be 1. So I need to
prepare the assembly like this:
1: cmpne r0, r0
idly4
ldw %0, (%2)
bt 1b
" #op " ...
stw ...

I need to do more stress testing on it, and then I'll switch to it.

> > > > +static inline void arch_spin_lock(arch_spinlock_t *lock)
> > > > +{
> > > > + arch_spinlock_t lockval;
> > > > + u32 ticket_next = 1 << TICKET_NEXT;
> > > > + u32 *p = &lock->lock;
> > > > + u32 tmp;
> > > > +
> > > > + smp_mb();
> > >
> > > spin_lock() doesn't need smp_mb() before.
> > read_lock and write_lock also needn't smp_mb() before, isn't it?
>
> Correct. The various *_lock() functions only need imply an ACQUIRE
> barrier, such that the critical section happens after the lock is taken.
>
> > > > +
> > > > +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> > > > +{
> > > > + smp_mb();
> > > > + lock->tickets.owner++;
> > > > + smp_mb();
> > >
> > > spin_unlock() doesn't need smp_mb() after.
> > read_unlock and write_unlock also needn't smp_mb() after, isn't it?
>
> Indeed so, the various *_unlock() functions only need imply a RELEASE
> barrier, such that the critical section happend before the lock is
> released.
>
> In both cases (lock and unlock) there is a great amount of subtle
> details, but most of that is irrelevant if all you have is smp_mb().
Got it, Thx for the explanation.

>
>
> > > > +/*
> > > > + * Test-and-set spin-locking.
> > > > + */
> > >
> > > Why retain that?
> > >
> > > same comments; it has far too many smp_mb()s in.
> > I'm not sure about queued_rwlocks and just for 2-cores-smp test-and-set is
> > faster and simpler, isn't it?
>
> Even on 2 cores I think you can create starvation cases with
> test-and-set spinlocks. And the maintenace overhead of carrying two lock
> implementations is non trivial.
>
> As to performance; I cannot say, but the ticket lock isn't very
> expensive, you could benchmark of course.
The ticket lock is good.
But how about queued_rwlocks vs. my_test_set_rwlock?
I'm not sure about queued_rwlocks; I have only implemented the ticket spinlock.

Best Regards
Guo Ren