2013-09-25 22:10:55

by Tim Chen

Subject: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

We will need the MCS lock code for doing optimistic spinning for rwsem.
Extracting the MCS code from mutex.c and putting it into its own file allows us
to reuse this code easily for rwsem.

Signed-off-by: Tim Chen <[email protected]>
Signed-off-by: Davidlohr Bueso <[email protected]>
---
 include/linux/mcslock.h |   58 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/mutex.c          |   58 +++++-----------------------------------------
 2 files changed, 65 insertions(+), 51 deletions(-)
create mode 100644 include/linux/mcslock.h

diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
new file mode 100644
index 0000000..20fd3f0
--- /dev/null
+++ b/include/linux/mcslock.h
@@ -0,0 +1,58 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ */
+#ifndef __LINUX_MCSLOCK_H
+#define __LINUX_MCSLOCK_H
+
+struct mcs_spin_node {
+        struct mcs_spin_node *next;
+        int locked; /* 1 if lock acquired */
+};
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static noinline
+void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
+{
+        struct mcs_spin_node *prev;
+
+        /* Init node */
+        node->locked = 0;
+        node->next = NULL;
+
+        prev = xchg(lock, node);
+        if (likely(prev == NULL)) {
+                /* Lock acquired */
+                node->locked = 1;
+                return;
+        }
+        ACCESS_ONCE(prev->next) = node;
+        smp_wmb();
+        /* Wait until the lock holder passes the lock down */
+        while (!ACCESS_ONCE(node->locked))
+                arch_mutex_cpu_relax();
+}
+
+static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
+{
+        struct mcs_spin_node *next = ACCESS_ONCE(node->next);
+
+        if (likely(!next)) {
+                /*
+                 * Release the lock by setting it to NULL
+                 */
+                if (cmpxchg(lock, node, NULL) == node)
+                        return;
+                /* Wait until the next pointer is set */
+                while (!(next = ACCESS_ONCE(node->next)))
+                        arch_mutex_cpu_relax();
+        }
+        ACCESS_ONCE(next->locked) = 1;
+        smp_wmb();
+}
+
+#endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 6d647ae..1b6ba3f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -25,6 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
+#include <linux/mcslock.h>

 /*
  * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -111,54 +112,9 @@ EXPORT_SYMBOL(mutex_lock);
  * more or less simultaneously, the spinners need to acquire a MCS lock
  * first before spinning on the owner field.
  *
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
  */
-struct mspin_node {
-        struct mspin_node *next ;
-        int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))

-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
-        struct mspin_node *prev;
-
-        /* Init node */
-        node->locked = 0;
-        node->next = NULL;
-
-        prev = xchg(lock, node);
-        if (likely(prev == NULL)) {
-                /* Lock acquired */
-                node->locked = 1;
-                return;
-        }
-        ACCESS_ONCE(prev->next) = node;
-        smp_wmb();
-        /* Wait until the lock holder passes the lock down */
-        while (!ACCESS_ONCE(node->locked))
-                arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
-        struct mspin_node *next = ACCESS_ONCE(node->next);
-
-        if (likely(!next)) {
-                /*
-                 * Release the lock by setting it to NULL
-                 */
-                if (cmpxchg(lock, node, NULL) == node)
-                        return;
-                /* Wait until the next pointer is set */
-                while (!(next = ACCESS_ONCE(node->next)))
-                        arch_mutex_cpu_relax();
-        }
-        ACCESS_ONCE(next->locked) = 1;
-        smp_wmb();
-}
+#define MLOCK(mutex) ((struct mcs_spin_node **)&((mutex)->spin_mlock))

 /*
  * Mutex spinning code migrated from kernel/sched/core.c
@@ -448,7 +404,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,

         for (;;) {
                 struct task_struct *owner;
-                struct mspin_node node;
+                struct mcs_spin_node node;

                 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
                         struct ww_mutex *ww;
@@ -470,10 +426,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                  * If there's an owner, wait for it to either
                  * release the lock or go to sleep.
                  */
-                mspin_lock(MLOCK(lock), &node);
+                mcs_spin_lock(MLOCK(lock), &node);
                 owner = ACCESS_ONCE(lock->owner);
                 if (owner && !mutex_spin_on_owner(lock, owner)) {
-                        mspin_unlock(MLOCK(lock), &node);
+                        mcs_spin_unlock(MLOCK(lock), &node);
                         goto slowpath;
                 }

@@ -488,11 +444,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 }

                 mutex_set_owner(lock);
-                mspin_unlock(MLOCK(lock), &node);
+                mcs_spin_unlock(MLOCK(lock), &node);
                 preempt_enable();
                 return 0;
         }
-        mspin_unlock(MLOCK(lock), &node);
+        mcs_spin_unlock(MLOCK(lock), &node);

         /*
          * When there's no owner, we might have preempted between the
--
1.7.4.4



2013-09-26 06:46:34

by Ingo Molnar

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file


* Tim Chen <[email protected]> wrote:

> We will need the MCS lock code for doing optimistic spinning for rwsem.
> Extracting the MCS code from mutex.c and putting it into its own file allows us
> to reuse this code easily for rwsem.
>
> Signed-off-by: Tim Chen <[email protected]>
> Signed-off-by: Davidlohr Bueso <[email protected]>
> ---
> include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> kernel/mutex.c | 58 +++++-----------------------------------------
> 2 files changed, 65 insertions(+), 51 deletions(-)
> create mode 100644 include/linux/mcslock.h
>
> diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> new file mode 100644
> index 0000000..20fd3f0
> --- /dev/null
> +++ b/include/linux/mcslock.h
> @@ -0,0 +1,58 @@
> +/*
> + * MCS lock defines
> + *
> + * This file contains the main data structure and API definitions of MCS lock.

A (very) short blurb about what an MCS lock is would be nice here.
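Something along these lines would do (a sketch only; exact wording up to
you):

 * An MCS lock is a queue-based spinlock: contending CPUs queue up on a
 * linked list of per-CPU nodes, and each spins on its own node's ->locked
 * flag rather than on a shared word, so the cacheline being spun on stays
 * local; on unlock, the holder hands the lock to the next CPU in the queue.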

> + */
> +#ifndef __LINUX_MCSLOCK_H
> +#define __LINUX_MCSLOCK_H
> +
> +struct mcs_spin_node {
> + struct mcs_spin_node *next;
> + int locked; /* 1 if lock acquired */
> +};

The vertical alignment looks broken here.

> +
> +/*
> + * We don't inline mcs_spin_lock() so that perf can correctly account for the
> + * time spent in this lock function.
> + */
> +static noinline
> +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> +{
> + struct mcs_spin_node *prev;
> +
> + /* Init node */
> + node->locked = 0;
> + node->next = NULL;
> +
> + prev = xchg(lock, node);
> + if (likely(prev == NULL)) {
> + /* Lock acquired */
> + node->locked = 1;
> + return;
> + }
> + ACCESS_ONCE(prev->next) = node;
> + smp_wmb();
> + /* Wait until the lock holder passes the lock down */
> + while (!ACCESS_ONCE(node->locked))
> + arch_mutex_cpu_relax();
> +}
> +
> +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> +{
> + struct mcs_spin_node *next = ACCESS_ONCE(node->next);
> +
> + if (likely(!next)) {
> + /*
> + * Release the lock by setting it to NULL
> + */
> + if (cmpxchg(lock, node, NULL) == node)
> + return;
> + /* Wait until the next pointer is set */
> + while (!(next = ACCESS_ONCE(node->next)))
> + arch_mutex_cpu_relax();
> + }
> + ACCESS_ONCE(next->locked) = 1;
> + smp_wmb();
> +}
> +
> +#endif

We typically close header guards not via a plain #endif but like this:

#endif /* __LINUX_SPINLOCK_H */

#endif /* __LINUX_SPINLOCK_TYPES_H */

etc.
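So here, presumably:

#endif /* __LINUX_MCSLOCK_H */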

Thanks,

Ingo

2013-09-26 08:40:50

by Peter Zijlstra

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, Sep 26, 2013 at 08:46:29AM +0200, Ingo Molnar wrote:
> > +/*
> > + * MCS lock defines
> > + *
> > + * This file contains the main data structure and API definitions of MCS lock.
>
> A (very) short blurb about what an MCS lock is would be nice here.

A while back I suggested including a link to something like:

http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf

It's a fairly concise write-up of the idea; only 6 pages. The sad part
about linking to the web is that links tend to go dead after a while.

2013-09-26 09:37:27

by Ingo Molnar

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file


* Peter Zijlstra <[email protected]> wrote:

> On Thu, Sep 26, 2013 at 08:46:29AM +0200, Ingo Molnar wrote:
> > > +/*
> > > + * MCS lock defines
> > > + *
> > > + * This file contains the main data structure and API definitions of MCS lock.
> >
> > A (very) short blurb about what an MCS lock is would be nice here.
>
> A while back I suggested including a link to something like:
>
> http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
>
> It's a fairly concise write-up of the idea; only 6 pages. The sad part
> about linking to the web is that links tend to go dead after a while.

So what I wanted to see was to add just a few sentences summing up the
concept - so that people blundering into this file in include/linux/ have
an idea what it's all about!

Thanks,

Ingo

2013-09-26 18:18:49

by Tim Chen

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 10:40 +0200, Peter Zijlstra wrote:
> On Thu, Sep 26, 2013 at 08:46:29AM +0200, Ingo Molnar wrote:
> > > +/*
> > > + * MCS lock defines
> > > + *
> > > + * This file contains the main data structure and API definitions of MCS lock.
> >
> > A (very) short blurb about what an MCS lock is would be nice here.
>
> A while back I suggested including a link to something like:
>
> http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
>
> It's a fairly concise write-up of the idea; only 6 pages. The sad part
> about linking to the web is that links tend to go dead after a while.

Link rot is a problem. If I provide a few details about the MCS lock, I
think people should be able to google for it.

Tim


2013-09-26 19:27:26

by Jason Low

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> We will need the MCS lock code for doing optimistic spinning for rwsem.
> Extracting the MCS code from mutex.c and putting it into its own file allows us
> to reuse this code easily for rwsem.
>
> [...]
>
> +static noinline
> +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> +{
> + struct mcs_spin_node *prev;
> +
> + /* Init node */
> + node->locked = 0;
> + node->next = NULL;
> +
> + prev = xchg(lock, node);
> + if (likely(prev == NULL)) {
> + /* Lock acquired */
> + node->locked = 1;

If we don't spin on the local node, is it necessary to set this variable?

> + return;
> + }
> + ACCESS_ONCE(prev->next) = node;
> + smp_wmb();
> + /* Wait until the lock holder passes the lock down */
> + while (!ACCESS_ONCE(node->locked))
> + arch_mutex_cpu_relax();
> +}
> +
> +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> +{
> + struct mcs_spin_node *next = ACCESS_ONCE(node->next);
> +
> + if (likely(!next)) {
> + /*
> + * Release the lock by setting it to NULL
> + */
> + if (cmpxchg(lock, node, NULL) == node)

And can we make this check likely()?
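That is, something like:

        if (likely(cmpxchg(lock, node, NULL) == node))
                return;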

2013-09-26 20:06:54

by Davidlohr Bueso

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > to reuse this code easily for rwsem.
> >
> > [...]
> >
> > + prev = xchg(lock, node);
> > + if (likely(prev == NULL)) {
> > + /* Lock acquired */
> > + node->locked = 1;
>
> If we don't spin on the local node, is it necessary to set this variable?

I don't follow, the whole idea is to spin on the local variable.

2013-09-26 20:23:26

by Jason Low

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > to reuse this code easily for rwsem.
> > >
> > > [...]
> > >
> > > + prev = xchg(lock, node);
> > > + if (likely(prev == NULL)) {
> > > + /* Lock acquired */
> > > + node->locked = 1;
> >
> > If we don't spin on the local node, is it necessary to set this variable?
>
> I don't follow, the whole idea is to spin on the local variable.

If prev == NULL, doesn't that mean it won't proceed to spin on the
variable because the lock is already free and we call return? In that
case where we directly acquire the lock, I was wondering if it is
necessary to set node->locked = 1.

2013-09-26 20:41:08

by Davidlohr Bueso

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 13:23 -0700, Jason Low wrote:
> On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> > On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > > to reuse this code easily for rwsem.
> > > >
> > > > [...]
> > > >
> > > > + prev = xchg(lock, node);
> > > > + if (likely(prev == NULL)) {
> > > > + /* Lock acquired */
> > > > + node->locked = 1;
> > >
> > > If we don't spin on the local node, is it necessary to set this variable?
> >
> > I don't follow, the whole idea is to spin on the local variable.
>
> If prev == NULL, doesn't that mean it won't proceed to spin on the
> variable because the lock is already free and we call return? In that
> case where we directly acquire the lock, I was wondering if it is
> necessary to set node->locked = 1.

Yes, that's true, but we need to flag the lock as acquired (the node's
lock is initially set to unlocked), otherwise others trying to acquire
the lock can spin forever:

        /* Wait until the lock holder passes the lock down */
        while (!ACCESS_ONCE(node->locked))
                arch_mutex_cpu_relax();

The ->locked variable in this implementation refers to whether the lock is
acquired, and *not* to whether busy-waiting is necessary.

Thanks,
Davidlohr

2013-09-26 21:10:00

by Jason Low

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 13:40 -0700, Davidlohr Bueso wrote:
> On Thu, 2013-09-26 at 13:23 -0700, Jason Low wrote:
> > On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> > > On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > > > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > > > to reuse this code easily for rwsem.
> > > > >
> > > > > [...]
> > > > >
> > > > > + prev = xchg(lock, node);
> > > > > + if (likely(prev == NULL)) {
> > > > > + /* Lock acquired */
> > > > > + node->locked = 1;
> > > >
> > > > If we don't spin on the local node, is it necessary to set this variable?
> > >
> > > I don't follow, the whole idea is to spin on the local variable.
> >
> > If prev == NULL, doesn't that mean it won't proceed to spin on the
> > variable because the lock is already free and we call return? In that
> > case where we directly acquire the lock, I was wondering if it is
> > necessary to set node->locked = 1.
>
> Yes, that's true, but we need to flag the lock as acquired (the node's
> lock is initially set to unlocked), otherwise others trying to acquire
> the lock can spin forever:
>
> /* Wait until the lock holder passes the lock down */
> while (!ACCESS_ONCE(node->locked))
> arch_mutex_cpu_relax();
>
> The ->locked variable in this implementation refers to whether the lock is
> acquired, and *not* to whether busy-waiting is necessary.

hmm, other threads acquiring the lock will be spinning on their own
local nodes, not this node's node->locked. And if prev == NULL, the
current thread won't be reading its node->locked either, since we return.
So what other thread is going to be reading this node's node->locked?

Thanks,
Jason

2013-09-26 21:41:48

by Tim Chen

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 14:09 -0700, Jason Low wrote:
> On Thu, 2013-09-26 at 13:40 -0700, Davidlohr Bueso wrote:
> > On Thu, 2013-09-26 at 13:23 -0700, Jason Low wrote:
> > > On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> > > > On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > > > > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > > > > to reuse this code easily for rwsem.
> > > > > >
> > > > > > [...]
> > > > > >
> > > > > > + prev = xchg(lock, node);
> > > > > > + if (likely(prev == NULL)) {
> > > > > > + /* Lock acquired */
> > > > > > + node->locked = 1;
> > > > >
> > > > > If we don't spin on the local node, is it necessary to set this variable?
> > > >
> > > > I don't follow, the whole idea is to spin on the local variable.
> > >
> > > If prev == NULL, doesn't that mean it won't proceed to spin on the
> > > variable because the lock is already free and we call return? In that
> > > case where we directly acquire the lock, I was wondering if it is
> > > necessary to set node->locked = 1.
> >
> > Yes, that's true, but we need to flag the lock as acquired (the node's
> > lock is initially set to unlocked), otherwise others trying to acquire
> > the lock can spin forever:
> >
> > /* Wait until the lock holder passes the lock down */
> > while (!ACCESS_ONCE(node->locked))
> > arch_mutex_cpu_relax();
> >
> > The ->locked variable in this implementation refers to whether the lock is
> > acquired, and *not* to whether busy-waiting is necessary.
>
> hmm, other threads acquiring the lock will be spinning on their own
> local nodes, not this node's node->locked. And if prev == NULL, the
> current thread won't be reading its node->locked either, since we return.
> So what other thread is going to be reading this node's node->locked?
>
> Thanks,
> Jason

I think setting node->locked = 1 for the prev==NULL case is not
necessary functionally, but was done for semantics consistency.

Tim

2013-09-26 22:22:18

by Davidlohr Bueso

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 14:09 -0700, Jason Low wrote:
> On Thu, 2013-09-26 at 13:40 -0700, Davidlohr Bueso wrote:
> > On Thu, 2013-09-26 at 13:23 -0700, Jason Low wrote:
> > > On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> > > > On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > > > > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > > > > to reuse this code easily for rwsem.
> > > > > >
> > > > > > [...]
> > > > > >
> > > > > > + prev = xchg(lock, node);
> > > > > > + if (likely(prev == NULL)) {
> > > > > > + /* Lock acquired */
> > > > > > + node->locked = 1;
> > > > >
> > > > > If we don't spin on the local node, is it necessary to set this variable?
> > > >
> > > > I don't follow, the whole idea is to spin on the local variable.
> > >
> > > If prev == NULL, doesn't that mean it won't proceed to spin on the
> > > variable because the lock is already free and we call return? In that
> > > case where we directly acquire the lock, I was wondering if it is
> > > necessary to set node->locked = 1.
> >
> > Yes, that's true, but we need to flag the lock as acquired (the node's
> > lock is initially set to unlocked), otherwise others trying to acquire
> > the lock can spin forever:
> >
> > /* Wait until the lock holder passes the lock down */
> > while (!ACCESS_ONCE(node->locked))
> > arch_mutex_cpu_relax();
> >
> > The ->locked variable in this implementation refers to whether the lock is
> > acquired, and *not* to whether busy-waiting is necessary.
>
> hmm, other threads acquiring the lock will be spinning on their own
> local nodes, not this node's node->locked.

But we're xchg'ing memory locations, not values - so after the xchg()
call, when we modify node->locked, we're also changing lock, which isn't
local.

> And if prev == NULL, the
> current thread won't be reading its node->locked either, since we return.
> So what other thread is going to be reading this node's node->locked?




>
> Thanks,
> Jason
>

2013-09-26 22:42:18

by Jason Low

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:
> On Thu, 2013-09-26 at 14:09 -0700, Jason Low wrote:
> > On Thu, 2013-09-26 at 13:40 -0700, Davidlohr Bueso wrote:
> > > On Thu, 2013-09-26 at 13:23 -0700, Jason Low wrote:
> > > > On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> > > > > On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > > > > > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > > > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > > > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > > > > > to reuse this code easily for rwsem.
> > > > > > >
> > > > > > > [...]
> > > > > > >
> > > > > > > + prev = xchg(lock, node);
> > > > > > > + if (likely(prev == NULL)) {
> > > > > > > + /* Lock acquired */
> > > > > > > + node->locked = 1;
> > > > > >
> > > > > > If we don't spin on the local node, is it necessary to set this variable?
> > > > >
> > > > > I don't follow, the whole idea is to spin on the local variable.
> > > >
> > > > If prev == NULL, doesn't that mean it won't proceed to spin on the
> > > > variable because the lock is already free and we call return? In that
> > > > case where we directly acquire the lock, I was wondering if it is
> > > > necessary to set node->locked = 1.
> > >
> > > Yes, that's true, but we need to flag the lock as acquired (the node's
> > > lock is initially set to unlocked), otherwise others trying to acquire
> > > the lock can spin forever:
> > >
> > > /* Wait until the lock holder passes the lock down */
> > > while (!ACCESS_ONCE(node->locked))
> > > arch_mutex_cpu_relax();
> > >
> > > The ->locked variable in this implementation refers to whether the lock is
> > > acquired, and *not* to whether busy-waiting is necessary.
> >
> > hmm, other threads acquiring the lock will be spinning on their own
> > local nodes, not this node's node->locked. And if prev == NULL, the
> > current thread won't be reading its node->locked either, since we return.
> > So what other thread is going to be reading this node's node->locked?
> >
> > Thanks,
> > Jason
>
> I think setting node->locked = 1 for the prev==NULL case is not
> necessary functionally, but was done for semantics consistency.

Okay, that would make sense for consistency because we always
first set node->locked = 0 at the top of the function.

If we prefer to optimize this a bit though, perhaps we can
first move the node->locked = 0 so that it gets executed after the
"if (likely(prev == NULL)) {}" code block and then delete
"node->locked = 1" inside the code block.

static noinline
void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
        struct mcs_spin_node *prev;

        /* Init node */
        node->next = NULL;

        prev = xchg(lock, node);
        if (likely(prev == NULL)) {
                /* Lock acquired */
                return;
        }
        node->locked = 0;
        ACCESS_ONCE(prev->next) = node;
        smp_wmb();
        /* Wait until the lock holder passes the lock down */
        while (!ACCESS_ONCE(node->locked))
                arch_mutex_cpu_relax();
}

2013-09-26 22:57:51

by Tim Chen

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Thu, 2013-09-26 at 15:42 -0700, Jason Low wrote:
> On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:
> > On Thu, 2013-09-26 at 14:09 -0700, Jason Low wrote:
> > > On Thu, 2013-09-26 at 13:40 -0700, Davidlohr Bueso wrote:
> > > > On Thu, 2013-09-26 at 13:23 -0700, Jason Low wrote:
> > > > > On Thu, 2013-09-26 at 13:06 -0700, Davidlohr Bueso wrote:
> > > > > > On Thu, 2013-09-26 at 12:27 -0700, Jason Low wrote:
> > > > > > > On Wed, Sep 25, 2013 at 3:10 PM, Tim Chen <[email protected]> wrote:
> > > > > > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > > > > > Extracting the MCS code from mutex.c and putting it into its own file allows us
> > > > > > > > to reuse this code easily for rwsem.
> > > > > > > >
> > > > > > > > [...]
> > > > > > > >
> > > > > > > > + prev = xchg(lock, node);
> > > > > > > > + if (likely(prev == NULL)) {
> > > > > > > > + /* Lock acquired */
> > > > > > > > + node->locked = 1;
> > > > > > >
> > > > > > > If we don't spin on the local node, is it necessary to set this variable?
> > > > > >
> > > > > > I don't follow, the whole idea is to spin on the local variable.
> > > > >
> > > > > If prev == NULL, doesn't that mean it won't proceed to spin on the
> > > > > variable because the lock is already free and we call return? In that
> > > > > case where we directly acquire the lock, I was wondering if it is
> > > > > necessary to set node->locked = 1.
> > > >
> > > > Yes, that's true, but we need to flag the lock as acquired (the node's
> > > > lock is initially set to unlocked), otherwise others trying to acquire
> > > > the lock can spin forever:
> > > >
> > > > /* Wait until the lock holder passes the lock down */
> > > > while (!ACCESS_ONCE(node->locked))
> > > > arch_mutex_cpu_relax();
> > > >
> > > > The ->locked variable in this implementation refers to whether the lock is
> > > > acquired, and *not* to whether busy-waiting is necessary.
> > >
> > > hmm, other threads acquiring the lock will be spinning on their own
> > > local nodes, not this node's node->locked. And if prev == NULL, the
> > > current thread won't be reading its node->locked either, since we return.
> > > So what other thread is going to be reading this node's node->locked?
> > >
> > > Thanks,
> > > Jason
> >
> > I think setting node->locked = 1 for the prev==NULL case is not
> > necessary functionally, but was done for semantics consistency.
>
> Okay, that would make sense for consistency because we always
> first set node->locked = 0 at the top of the function.
>
> If we prefer to optimize this a bit though, perhaps we can
> first move the node->locked = 0 so that it gets executed after the
> "if (likely(prev == NULL)) {}" code block and then delete
> "node->locked = 1" inside the code block.

I suppose we can save a single assignment.
The gain is probably not noticeable as, once we set node->next to NULL,
node->locked is likely in the local cache line and the assignment
operation is cheap.

Regards,
Tim


2013-09-27 06:02:20

by Ingo Molnar

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file


* Tim Chen <[email protected]> wrote:

> > If we prefer to optimize this a bit though, perhaps we can first move
> > the node->locked = 0 so that it gets executed after the "if (likely(prev
> > == NULL)) {}" code block and then delete "node->locked = 1" inside the
> > code block.
>
> I suppose we can save a single assignment. The gain is probably not
> noticeable as, once we set node->next to NULL, node->locked is likely in
> the local cache line and the assignment operation is cheap.

Would be nice to have this as a separate, add-on patch. Every single
instruction removal that has no downside is an upside!

You can add a comment that explains it.

Thanks,

Ingo

2013-09-27 06:27:03

by Jason Low

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 08:02 +0200, Ingo Molnar wrote:
> * Tim Chen <[email protected]> wrote:
>
> > > If we prefer to optimize this a bit though, perhaps we can first move
> > > the node->locked = 0 so that it gets executed after the "if (likely(prev
> > > == NULL)) {}" code block and then delete "node->locked = 1" inside the
> > > code block.
> >
> > I suppose we can save a single assignment. The gain is probably not
> > noticeable as, once we set node->next to NULL, node->locked is likely in
> > the local cache line and the assignment operation is cheap.
>
> Would be nice to have this as a separate, add-on patch. Every single
> instruction removal that has no downside is an upside!
>
> You can add a comment that explains it.

Yup, especially a spin lock (and one that I have found to be used
very frequently when running workloads on big machines).

2013-09-27 11:24:00

by Peter Zijlstra

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 08:02:13AM +0200, Ingo Molnar wrote:
> Would be nice to have this as a separate, add-on patch. Every single
> instruction removal that has no downside is an upside!
>
> You can add a comment that explains it.

If someone is going to do add-on patches to the mcslock.h file, please
also consider doing a patch that adds comments to the memory barriers in
there.

Also, checkpatch.pl should really warn about that; and it appears there is
code in there for that; however:

# grep -C3 smp_mb scripts/checkpatch.pl
                }
        }
# check for memory barriers without a comment.
        if ($line =~ /\b(mb|rmb|wmb|read_barrier_depends|smp_mb|smp_rmb|smp_wmb|smp_read_barrier_depends)\(/) {
                if (!ctx_has_comment($first_line, $linenr)) {
                        CHK("MEMORY_BARRIER",
                            "memory barrier without comment\n" . $herecurr);
# grep -C3 smp_wmb kernel/mutex.c
                return;
        }
        ACCESS_ONCE(prev->next) = node;
        smp_wmb();
        /* Wait until the lock holder passes the lock down */
        while (!ACCESS_ONCE(node->locked))
                arch_mutex_cpu_relax();
--
                arch_mutex_cpu_relax();
        }
        ACCESS_ONCE(next->locked) = 1;
        smp_wmb();
}

/*
# scripts/checkpatch.pl -f kernel/mutex.c 2>&1 | grep memory
#

so that appears to be completely broken :/

Joe, any clue what's up with that?

2013-09-27 13:45:07

by Joe Perches

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 13:23 +0200, Peter Zijlstra wrote:
> checkpatch.pl should really warn about that; and it appears there is
> code in there for that; however:
>
> # grep -C3 smp_mb scripts/checkpatch.pl
[]
> # check for memory barriers without a comment.
> if ($line =~ /\b(mb|rmb|wmb|read_barrier_depends|smp_mb|smp_rmb|smp_wmb|smp_read_barrier_depends)\(/) {
> if (!ctx_has_comment($first_line, $linenr)) {
> CHK("MEMORY_BARRIER",
> "memory barrier without comment\n" . $herecurr);
[]
> # scripts/checkpatch.pl -f kernel/mutex.c 2>&1 | grep memory
> #
>
> so that appears to be completely broken :/
>
> Joe, any clue what's up with that?

It's a CHK test, so it's only tested with --strict

$ scripts/checkpatch.pl -f --strict kernel/mutex.c 2>&1 | grep memory
CHECK: memory barrier without comment
CHECK: memory barrier without comment

It could be changed to WARN so it's always on.

2013-09-27 13:48:34

by Peter Zijlstra

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 06:44:55AM -0700, Joe Perches wrote:
> It's a CHK test, so it's only tested with --strict
>
> $ scripts/checkpatch.pl -f --strict kernel/mutex.c 2>&1 | grep memory
> CHECK: memory barrier without comment
> CHECK: memory barrier without comment
>
> It could be changed to WARN so it's always on.

Yes please, we can't be too careful with memory barriers.

2013-09-27 14:05:10

by Joe Perches

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 15:48 +0200, Peter Zijlstra wrote:
> On Fri, Sep 27, 2013 at 06:44:55AM -0700, Joe Perches wrote:
> > It's a CHK test, so it's only tested with --strict
> >
> > $ scripts/checkpatch.pl -f --strict kernel/mutex.c 2>&1 | grep memory
> > CHECK: memory barrier without comment
> > CHECK: memory barrier without comment
> >
> > It could be changed to WARN so it's always on.
>
> Yes please, we can't be too careful with memory barriers.

I'll send the patch separately.

It seems a pretty noisy test.
There are 13 hits just in arch/x86/kernel/

$ ./scripts/checkpatch.pl -f arch/x86/kernel/*.c | grep -A3 "memory barrier"
WARNING: memory barrier without comment
#685: FILE: x86/kernel/alternative.c:685:
+ smp_wmb();

--
WARNING: memory barrier without comment
#401: FILE: x86/kernel/kvm.c:401:
+ rmb();

WARNING: memory barrier without comment
#403: FILE: x86/kernel/kvm.c:403:
+ rmb();

--
WARNING: memory barrier without comment
#702: FILE: x86/kernel/kvm.c:702:
+ smp_wmb();

WARNING: memory barrier without comment
#704: FILE: x86/kernel/kvm.c:704:
+ smp_wmb();

--
WARNING: memory barrier without comment
#62: FILE: x86/kernel/ldt.c:62:
+ wmb();

WARNING: memory barrier without comment
#64: FILE: x86/kernel/ldt.c:64:
+ wmb();

--
WARNING: memory barrier without comment
#204: FILE: x86/kernel/smpboot.c:204:
+ wmb();

WARNING: memory barrier without comment
#265: FILE: x86/kernel/smpboot.c:265:
+ wmb();

--
WARNING: memory barrier without comment
#557: FILE: x86/kernel/smpboot.c:557:
+ mb();

--
WARNING: memory barrier without comment
#1065: FILE: x86/kernel/smpboot.c:1065:
+ mb();

--
WARNING: memory barrier without comment
#1321: FILE: x86/kernel/smpboot.c:1321:
+ mb();

WARNING: memory barrier without comment
#1399: FILE: x86/kernel/smpboot.c:1399:
+ mb();


2013-09-27 14:14:26

by Joe Perches

Subject: [PATCH] checkpatch: Make the memory barrier test noisier

Peter Zijlstra prefers that comments be required near uses
of memory barriers.

Change the message level for memory barrier uses from a
--strict test only to a normal WARN so it's always emitted.

This might produce false positives around insertions of
memory barriers when a comment is outside the patch context
block.

And checkpatch is still stupid, it only looks for existence
of any comment, not at the comment content.

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Joe Perches <[email protected]>
---
scripts/checkpatch.pl | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index c03e427..bd4103a 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3816,8 +3816,8 @@ sub string_find_replace {
 # check for memory barriers without a comment.
         if ($line =~ /\b(mb|rmb|wmb|read_barrier_depends|smp_mb|smp_rmb|smp_wmb|smp_read_barrier_depends)\(/) {
                 if (!ctx_has_comment($first_line, $linenr)) {
-                        CHK("MEMORY_BARRIER",
-                            "memory barrier without comment\n" . $herecurr);
+                        WARN("MEMORY_BARRIER",
+                             "memory barrier without comment\n" . $herecurr);
                 }
         }
 # check of hardware specific defines


2013-09-27 14:19:15

by Peter Zijlstra

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 07:05:00AM -0700, Joe Perches wrote:
> On Fri, 2013-09-27 at 15:48 +0200, Peter Zijlstra wrote:
> > On Fri, Sep 27, 2013 at 06:44:55AM -0700, Joe Perches wrote:
> > > It's a CHK test, so it's only tested with --strict
> > >
> > > $ scripts/checkpatch.pl -f --strict kernel/mutex.c 2>&1 | grep memory
> > > CHECK: memory barrier without comment
> > > CHECK: memory barrier without comment
> > >
> > > It could be changed to WARN so it's always on.
> >
> > Yes please, we can't be too careful with memory barriers.
>
> I'll send the patch separately.
>
> It seems a pretty noisy test.
> There are 13 hits just in arch/x86/kernel/

Urgh, that wants fixing. We really need to stop getting more, and that's
what checkpatch is good at.

At the very bare minimum the comment should mention where the pairing
barrier is; but ideally it should describe the actual ordering.
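For example, a minimal sketch of the style being asked for, with made-up
names (the ready flag and the consumer side are hypothetical):

        /*
         * Make the payload stores visible before the ready flag is set;
         * pairs with the smp_rmb() on the consumer side, between its read
         * of the flag and its reads of the payload.
         */
        smp_wmb();
        ACCESS_ONCE(ready) = 1;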

2013-09-27 14:26:28

by Peter Zijlstra

Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, Sep 27, 2013 at 07:14:17AM -0700, Joe Perches wrote:
> Peter Zijlstra prefers that comments be required near uses
> of memory barriers.
>
> Change the message level for memory barrier uses from a
> --strict test only to a normal WARN so it's always emitted.
>
> This might produce false positives around insertions of
> memory barriers when a comment is outside the patch context
> block.

One would argue that in that case they're too far away in any case :-)

> And checkpatch is still stupid, it only looks for existence
> of any comment, not at the comment content.

Could we try and alleviate this by giving a slightly more verbose
warning?

Maybe something like:

memory barrier without comment; please refer to the pairing barrier and
describe the ordering requirements.

> Suggested-by: Peter Zijlstra <[email protected]>
> Signed-off-by: Joe Perches <[email protected]>

Acked-by: Peter Zijlstra <[email protected]>

2013-09-27 14:35:11

by Joe Perches

Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, 2013-09-27 at 16:26 +0200, Peter Zijlstra wrote:
> On Fri, Sep 27, 2013 at 07:14:17AM -0700, Joe Perches wrote:
> > Peter Zijlstra prefers that comments be required near uses
> > of memory barriers.
> >
> > Change the message level for memory barrier uses from a
> > --strict test only to a normal WARN so it's always emitted.
> >
> > This might produce false positives around insertions of
> > memory barriers when a comment is outside the patch context
> > block.
>
> One would argue that in that case they're too far away in any case :-)
>
> > And checkpatch is still stupid, it only looks for existence
> > of any comment, not at the comment content.
>
> Could we try and alleviate this by giving a slightly more verbose
> warning?

> Maybe something like:
>
> memory barrier without comment; please refer to the pairing barrier and
> describe the ordering requirements.

That would make it seem as if all barriers are SMP, no?

Maybe just refer to Documentation/memory-barriers.txt
and/or say something like "please document appropriately"

2013-09-27 14:50:38

by Peter Zijlstra

Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, Sep 27, 2013 at 07:34:55AM -0700, Joe Perches wrote:
> That would make it seem as if all barriers are SMP no?

I would think any memory barrier is ordering against someone else; if
not smp then a device/hardware -- like for instance the hardware page
table walker.

Barriers are fundamentally about order; and order only makes sense if
there's more than 1 party to the game.

> Maybe just refer to Documentation/memory-barriers.txt
> and/or say something like "please document appropriately"

Documentation/memory-barriers.txt is always good; appropriately doesn't
seem to quantify anything much at all. Someone might think:

        /* */
        smp_mb();

appropriate...

2013-09-27 15:18:01

by Paul E. McKenney

Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, Sep 27, 2013 at 04:50:07PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 27, 2013 at 07:34:55AM -0700, Joe Perches wrote:
> > That would make it seem as if all barriers are SMP no?
>
> I would think any memory barrier is ordering against someone else; if
> not smp then a device/hardware -- like for instance the hardware page
> table walker.
>
> Barriers are fundamentally about order; and order only makes sense if
> there's more than 1 party to the game.

Oddly enough, there is one exception that proves the rule... On Itanium,
suppose we have the following code, with x initially equal to zero:

CPU 1: ACCESS_ONCE(x) = 1;

CPU 2: r1 = ACCESS_ONCE(x); r2 = ACCESS_ONCE(x);

Itanium architects have told me that it really is possible for CPU 2 to
see r1==1 and r2==0. Placing a memory barrier between CPU 2's pair of
fetches prevents this, but without any other memory barrier to pair with.
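In other words, the fixed-up sequence would look something like this
(a sketch; the only point is the barrier placement):

CPU 2: r1 = ACCESS_ONCE(x); smp_mb(); r2 = ACCESS_ONCE(x);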

> > Maybe just refer to Documentation/memory-barriers.txt
> > and/or say something like "please document appropriately"
>
> Documentation/memory-barriers.txt is always good; appropriately doesn't
> seem to quantify anything much at all. Someone might think:
>
>         /* */
>         smp_mb();
>
> appropriate...

I end up doing this:

        /* */
        smp_mb(); /* See above block comment. */

But it would be nice for the prior comment to be recognized as belonging
to the memory barrier without the additional "See above" comment.

In any case, please feel free to add:

Acked-by: Paul E. McKenney <[email protected]>

to the original checkpatch.pl patch.

Thanx, Paul

2013-09-27 15:30:08

by Paul E. McKenney

Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
> We will need the MCS lock code for doing optimistic spinning for rwsem.
> Extracting the MCS code from mutex.c and putting it into its own file allows us
> to reuse this code easily for rwsem.
>
> [...]
>
> +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> +{
> +        struct mcs_spin_node *next = ACCESS_ONCE(node->next);
> +
> +        if (likely(!next)) {
> +                /*
> +                 * Release the lock by setting it to NULL
> +                 */
> +                if (cmpxchg(lock, node, NULL) == node)
> +                        return;
> +                /* Wait until the next pointer is set */
> +                while (!(next = ACCESS_ONCE(node->next)))
> +                        arch_mutex_cpu_relax();
> +        }
> +        ACCESS_ONCE(next->locked) = 1;
> +        smp_wmb();

Shouldn't the memory barrier precede the "ACCESS_ONCE(next->locked) = 1;"?
Maybe in an "else" clause of the prior "if" statement, given that the
cmpxchg() does it otherwise.

Otherwise, in the case where the "if" condition is false, the critical
section could bleed out past the unlock.
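
Concretely, the reordering being suggested would make the unlock tail
look like this (the same shape as the fix Jason posts later in this
thread; the cmpxchg() path already implies a full barrier):

	if (likely(!next)) {
		/* Release the lock by setting it to NULL */
		if (cmpxchg(lock, node, NULL) == node)
			return;
		/* Wait until the next pointer is set */
		while (!(next = ACCESS_ONCE(node->next)))
			arch_mutex_cpu_relax();
	}
	smp_wmb();	/* order the critical section before the handoff */
	ACCESS_ONCE(next->locked) = 1;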

Thanx, Paul

> +}
> +
> +#endif
> diff --git a/kernel/mutex.c b/kernel/mutex.c
> index 6d647ae..1b6ba3f 100644
> --- a/kernel/mutex.c
> +++ b/kernel/mutex.c
> @@ -25,6 +25,7 @@
> #include <linux/spinlock.h>
> #include <linux/interrupt.h>
> #include <linux/debug_locks.h>
> +#include <linux/mcslock.h>
>
> /*
> * In the DEBUG case we are using the "NULL fastpath" for mutexes,
> @@ -111,54 +112,9 @@ EXPORT_SYMBOL(mutex_lock);
> * more or less simultaneously, the spinners need to acquire a MCS lock
> * first before spinning on the owner field.
> *
> - * We don't inline mspin_lock() so that perf can correctly account for the
> - * time spent in this lock function.
> */
> -struct mspin_node {
> - struct mspin_node *next ;
> - int locked; /* 1 if lock acquired */
> -};
> -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
>
> -static noinline
> -void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
> -{
> - struct mspin_node *prev;
> -
> - /* Init node */
> - node->locked = 0;
> - node->next = NULL;
> -
> - prev = xchg(lock, node);
> - if (likely(prev == NULL)) {
> - /* Lock acquired */
> - node->locked = 1;
> - return;
> - }
> - ACCESS_ONCE(prev->next) = node;
> - smp_wmb();
> - /* Wait until the lock holder passes the lock down */
> - while (!ACCESS_ONCE(node->locked))
> - arch_mutex_cpu_relax();
> -}
> -
> -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
> -{
> - struct mspin_node *next = ACCESS_ONCE(node->next);
> -
> - if (likely(!next)) {
> - /*
> - * Release the lock by setting it to NULL
> - */
> - if (cmpxchg(lock, node, NULL) == node)
> - return;
> - /* Wait until the next pointer is set */
> - while (!(next = ACCESS_ONCE(node->next)))
> - arch_mutex_cpu_relax();
> - }
> - ACCESS_ONCE(next->locked) = 1;
> - smp_wmb();
> -}
> +#define MLOCK(mutex) ((struct mcs_spin_node **)&((mutex)->spin_mlock))
>
> /*
> * Mutex spinning code migrated from kernel/sched/core.c
> @@ -448,7 +404,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>
> for (;;) {
> struct task_struct *owner;
> - struct mspin_node node;
> + struct mcs_spin_node node;
>
> if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
> struct ww_mutex *ww;
> @@ -470,10 +426,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
> * If there's an owner, wait for it to either
> * release the lock or go to sleep.
> */
> - mspin_lock(MLOCK(lock), &node);
> + mcs_spin_lock(MLOCK(lock), &node);
> owner = ACCESS_ONCE(lock->owner);
> if (owner && !mutex_spin_on_owner(lock, owner)) {
> - mspin_unlock(MLOCK(lock), &node);
> + mcs_spin_unlock(MLOCK(lock), &node);
> goto slowpath;
> }
>
> @@ -488,11 +444,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
> }
>
> mutex_set_owner(lock);
> - mspin_unlock(MLOCK(lock), &node);
> + mcs_spin_unlock(MLOCK(lock), &node);
> preempt_enable();
> return 0;
> }
> - mspin_unlock(MLOCK(lock), &node);
> + mcs_spin_unlock(MLOCK(lock), &node);
>
> /*
> * When there's no owner, we might have preempted between the
> --
> 1.7.4.4
>
>
>

2013-09-27 15:35:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, Sep 27, 2013 at 08:17:50AM -0700, Paul E. McKenney wrote:
> > Barriers are fundamentally about order; and order only makes sense if
> > there's more than 1 party to the game.
>
> Oddly enough, there is one exception that proves the rule... On Itanium,
> suppose we have the following code, with x initially equal to zero:
>
> CPU 1: ACCESS_ONCE(x) = 1;
>
> CPU 2: r1 = ACCESS_ONCE(x); r2 = ACCESS_ONCE(x);
>
> Itanium architects have told me that it really is possible for CPU 2 to
> see r1==1 and r2==0. Placing a memory barrier between CPU 2's pair of
> fetches prevents this, but without any other memory barrier to pair with.

Oh man.. it's really past time to sink that itanic already.

I suppose it allows the cpu to reorder the reads in its pipeline and the
memory barrier disallows this. Curious.. does our memory-barriers.txt
file mention this 'fun' fact?

2013-09-27 16:04:26

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, Sep 27, 2013 at 05:34:34PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 27, 2013 at 08:17:50AM -0700, Paul E. McKenney wrote:
> > > Barriers are fundamentally about order; and order only makes sense if
> > > there's more than 1 party to the game.
> >
> > Oddly enough, there is one exception that proves the rule... On Itanium,
> > suppose we have the following code, with x initially equal to zero:
> >
> > CPU 1: ACCESS_ONCE(x) = 1;
> >
> > CPU 2: r1 = ACCESS_ONCE(x); r2 = ACCESS_ONCE(x);
> >
> > Itanium architects have told me that it really is possible for CPU 2 to
> > see r1==1 and r2==0. Placing a memory barrier between CPU 2's pair of
> > fetches prevents this, but without any other memory barrier to pair with.
>
> Oh man.. it's really past time to sink that itanic already.
>
> I suppose it allows the cpu to reorder the reads in its pipeline and the
> memory barrier disallows this. Curious.. does our memory-barriers.txt
> file mention this 'fun' fact?

Probably not. I was recently reminded of it by some people on the C++
standards committee. I had first heard of it about 5 years ago, but
hadn't heard definitively until quite recently.

I defer to the Itanium maintainers to actually make the required changes,
should they choose to do so. I suppose that one way to handle it in the
Linux kernel would be to make ACCESS_ONCE() be architecture specific,
with Itanium placing a memory barrier either before or after -- either
would work. But since Itanium seems to run Linux reliably, I am guessing
that the probability of misordering is quite low. But again, the ball
is firmly in the Itanium maintainers' court, and I have added them on CC.

Thanx, Paul

2013-09-27 16:12:13

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 08:02 +0200, Ingo Molnar wrote:
> Would be nice to have this as a separate, add-on patch. Every single
> instruction removal that has no downside is an upside!

Okay, so here is a patch. Tim, would you like to add this to v7?

...
Subject: MCS lock: Remove and reorder unnecessary assignments in mcs_spin_lock()

In mcs_spin_lock(), if (likely(prev == NULL)) is true, then the lock is free
and we won't spin on the local node. In that case, we don't have to assign
node->locked because it won't be used. We can also move the node->locked = 0
assignment so that it occurs after the if (likely(prev == NULL)) check.

This might also help make it clearer as to how the node->locked variable
is used in MCS locks.

Signed-off-by: Jason Low <[email protected]>
---
include/linux/mcslock.h | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
index 20fd3f0..1167d57 100644
--- a/include/linux/mcslock.h
+++ b/include/linux/mcslock.h
@@ -21,15 +21,14 @@ void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
struct mcs_spin_node *prev;

/* Init node */
- node->locked = 0;
node->next = NULL;

prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
+ node->locked = 0;
ACCESS_ONCE(prev->next) = node;
smp_wmb();
/* Wait until the lock holder passes the lock down */
--
1.7.1

2013-09-27 16:19:06

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 09:12 -0700, Jason Low wrote:
> On Fri, 2013-09-27 at 08:02 +0200, Ingo Molnar wrote:
> > Would be nice to have this as a separate, add-on patch. Every single
> > instruction removal that has no downside is an upside!
>
> Okay, so here is a patch. Tim, would you like to add this to v7?

Okay. Will do.

Tim

>
> ...
> Subject: MCS lock: Remove and reorder unnecessary assignments in mcs_spin_lock()
>
> In mcs_spin_lock(), if (likely(prev == NULL)) is true, then the lock is free
> and we won't spin on the local node. In that case, we don't have to assign
> node->locked because it won't be used. We can also move the node->locked = 0
> assignment so that it occurs after the if (likely(prev == NULL)) check.
>
> This might also help make it clearer as to how the node->locked variable
> is used in MCS locks.
>
> Signed-off-by: Jason Low <[email protected]>
> ---
> include/linux/mcslock.h | 3 +--
> 1 files changed, 1 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> index 20fd3f0..1167d57 100644
> --- a/include/linux/mcslock.h
> +++ b/include/linux/mcslock.h
> @@ -21,15 +21,14 @@ void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> struct mcs_spin_node *prev;
>
> /* Init node */
> - node->locked = 0;
> node->next = NULL;
>
> prev = xchg(lock, node);
> if (likely(prev == NULL)) {
> /* Lock acquired */
> - node->locked = 1;
> return;
> }
> + node->locked = 0;
> ACCESS_ONCE(prev->next) = node;
> smp_wmb();
> /* Wait until the lock holder passes the lock down */

2013-09-27 18:09:23

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 08:29 -0700, Paul E. McKenney wrote:
> On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
> > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > Extracting the MCS code from mutex.c and put into its own file allow us
> > to reuse this code easily for rwsem.
> >
> > Signed-off-by: Tim Chen <[email protected]>
> > Signed-off-by: Davidlohr Bueso <[email protected]>
> > ---
> > include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> > kernel/mutex.c | 58 +++++-----------------------------------------
> > 2 files changed, 65 insertions(+), 51 deletions(-)
> > create mode 100644 include/linux/mcslock.h
> >
> > diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> > new file mode 100644
> > index 0000000..20fd3f0
> > --- /dev/null
> > +++ b/include/linux/mcslock.h
> > @@ -0,0 +1,58 @@
> > +/*
> > + * MCS lock defines
> > + *
> > + * This file contains the main data structure and API definitions of MCS lock.
> > + */
> > +#ifndef __LINUX_MCSLOCK_H
> > +#define __LINUX_MCSLOCK_H
> > +
> > +struct mcs_spin_node {
> > + struct mcs_spin_node *next;
> > + int locked; /* 1 if lock acquired */
> > +};
> > +
> > +/*
> > + * We don't inline mcs_spin_lock() so that perf can correctly account for the
> > + * time spent in this lock function.
> > + */
> > +static noinline
> > +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > +{
> > + struct mcs_spin_node *prev;
> > +
> > + /* Init node */
> > + node->locked = 0;
> > + node->next = NULL;
> > +
> > + prev = xchg(lock, node);
> > + if (likely(prev == NULL)) {
> > + /* Lock acquired */
> > + node->locked = 1;
> > + return;
> > + }
> > + ACCESS_ONCE(prev->next) = node;
> > + smp_wmb();
> > + /* Wait until the lock holder passes the lock down */
> > + while (!ACCESS_ONCE(node->locked))
> > + arch_mutex_cpu_relax();
> > +}
> > +
> > +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > +{
> > + struct mcs_spin_node *next = ACCESS_ONCE(node->next);
> > +
> > + if (likely(!next)) {
> > + /*
> > + * Release the lock by setting it to NULL
> > + */
> > + if (cmpxchg(lock, node, NULL) == node)
> > + return;
> > + /* Wait until the next pointer is set */
> > + while (!(next = ACCESS_ONCE(node->next)))
> > + arch_mutex_cpu_relax();
> > + }
> > + ACCESS_ONCE(next->locked) = 1;
> > + smp_wmb();
>
> Shouldn't the memory barrier precede the "ACCESS_ONCE(next->locked) = 1;"?
> Maybe in an "else" clause of the prior "if" statement, given that the
> cmpxchg() does it otherwise.
>
> Otherwise, in the case where the "if" condition is false, the critical
> section could bleed out past the unlock.

Yes, I agree with you that the smp_wmb should be moved before the
ACCESS_ONCE to prevent the critical section from bleeding out. Copying Waiman
who is the original author of the mcs code to see if he has any comments
on things we may have missed.

Tim

2013-09-27 19:38:59

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 08:29 -0700, Paul E. McKenney wrote:
> On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
> > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > Extracting the MCS code from mutex.c and put into its own file allow us
> > to reuse this code easily for rwsem.
> >
> > Signed-off-by: Tim Chen <[email protected]>
> > Signed-off-by: Davidlohr Bueso <[email protected]>
> > ---
> > include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> > kernel/mutex.c | 58 +++++-----------------------------------------
> > 2 files changed, 65 insertions(+), 51 deletions(-)
> > create mode 100644 include/linux/mcslock.h
> >
> > diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> > new file mode 100644
> > index 0000000..20fd3f0
> > --- /dev/null
> > +++ b/include/linux/mcslock.h
> > @@ -0,0 +1,58 @@
> > +/*
> > + * MCS lock defines
> > + *
> > + * This file contains the main data structure and API definitions of MCS lock.
> > + */
> > +#ifndef __LINUX_MCSLOCK_H
> > +#define __LINUX_MCSLOCK_H
> > +
> > +struct mcs_spin_node {
> > + struct mcs_spin_node *next;
> > + int locked; /* 1 if lock acquired */
> > +};
> > +
> > +/*
> > + * We don't inline mcs_spin_lock() so that perf can correctly account for the
> > + * time spent in this lock function.
> > + */
> > +static noinline
> > +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > +{
> > + struct mcs_spin_node *prev;
> > +
> > + /* Init node */
> > + node->locked = 0;
> > + node->next = NULL;
> > +
> > + prev = xchg(lock, node);
> > + if (likely(prev == NULL)) {
> > + /* Lock acquired */
> > + node->locked = 1;
> > + return;
> > + }
> > + ACCESS_ONCE(prev->next) = node;
> > + smp_wmb();

BTW, is the above memory barrier necessary? It seems like the xchg
instruction already provides a memory barrier.

Now if we made the changes that Jason suggested:


/* Init node */
- node->locked = 0;
node->next = NULL;

prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
+ node->locked = 0;
ACCESS_ONCE(prev->next) = node;
smp_wmb();

We are probably still okay as other cpus do not read the value of
node->locked, which is a local variable.

Tim

> > + /* Wait until the lock holder passes the lock down */
> > + while (!ACCESS_ONCE(node->locked))
> > + arch_mutex_cpu_relax();
> > +}
> > +
> > +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > +{
> > + struct mcs_spin_node *next = ACCESS_ONCE(node->next);
> > +
> > + if (likely(!next)) {
> > + /*
> > + * Release the lock by setting it to NULL
> > + */
> > + if (cmpxchg(lock, node, NULL) == node)
> > + return;
> > + /* Wait until the next pointer is set */
> > + while (!(next = ACCESS_ONCE(node->next)))
> > + arch_mutex_cpu_relax();
> > + }
> > + ACCESS_ONCE(next->locked) = 1;
> > + smp_wmb();
>
> Shouldn't the memory barrier precede the "ACCESS_ONCE(next->locked) = 1;"?
> Maybe in an "else" clause of the prior "if" statement, given that the
> cmpxchg() does it otherwise.
>
> > Otherwise, in the case where the "if" condition is false, the critical
> section could bleed out past the unlock.
>
> Thanx, Paul
>

2013-09-27 20:16:22

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 12:38 PM, Tim Chen <[email protected]> wrote:

> BTW, is the above memory barrier necessary? It seems like the xchg
> instruction already provides a memory barrier.
>
> Now if we made the changes that Jason suggested:
>
>
> /* Init node */
> - node->locked = 0;
> node->next = NULL;
>
> prev = xchg(lock, node);
> if (likely(prev == NULL)) {
> /* Lock acquired */
> - node->locked = 1;
> return;
> }
> + node->locked = 0;
> ACCESS_ONCE(prev->next) = node;
> smp_wmb();
>
> We are probably still okay as other cpus do not read the value of
> node->locked, which is a local variable.

Similarly, I was wondering if we should also move the smp_wmb() so that it
is before the "ACCESS_ONCE(prev->next) = node" and after the
"node->locked = 0". Would we want to guarantee that node->locked
gets set before the node is added to the linked list, where a previous
thread calling mcs_spin_unlock() could potentially modify node->locked?
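
In code terms, the ordering being asked about (the same sequence Tim
writes out later in the thread) would be:

	node->locked = 0;
	smp_wmb();			/* publish locked = 0 ... */
	ACCESS_ONCE(prev->next) = node;	/* ... before the unlocker can see us */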

2013-09-27 20:39:10

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 12:38:53PM -0700, Tim Chen wrote:
> On Fri, 2013-09-27 at 08:29 -0700, Paul E. McKenney wrote:
> > On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
> > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > Extracting the MCS code from mutex.c and put into its own file allow us
> > > to reuse this code easily for rwsem.
> > >
> > > Signed-off-by: Tim Chen <[email protected]>
> > > Signed-off-by: Davidlohr Bueso <[email protected]>
> > > ---
> > > include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> > > kernel/mutex.c | 58 +++++-----------------------------------------
> > > 2 files changed, 65 insertions(+), 51 deletions(-)
> > > create mode 100644 include/linux/mcslock.h
> > >
> > > diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> > > new file mode 100644
> > > index 0000000..20fd3f0
> > > --- /dev/null
> > > +++ b/include/linux/mcslock.h
> > > @@ -0,0 +1,58 @@
> > > +/*
> > > + * MCS lock defines
> > > + *
> > > + * This file contains the main data structure and API definitions of MCS lock.
> > > + */
> > > +#ifndef __LINUX_MCSLOCK_H
> > > +#define __LINUX_MCSLOCK_H
> > > +
> > > +struct mcs_spin_node {
> > > + struct mcs_spin_node *next;
> > > + int locked; /* 1 if lock acquired */
> > > +};
> > > +
> > > +/*
> > > + * We don't inline mcs_spin_lock() so that perf can correctly account for the
> > > + * time spent in this lock function.
> > > + */
> > > +static noinline
> > > +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > > +{
> > > + struct mcs_spin_node *prev;
> > > +
> > > + /* Init node */
> > > + node->locked = 0;
> > > + node->next = NULL;
> > > +
> > > + prev = xchg(lock, node);
> > > + if (likely(prev == NULL)) {
> > > + /* Lock acquired */
> > > + node->locked = 1;
> > > + return;
> > > + }
> > > + ACCESS_ONCE(prev->next) = node;
> > > + smp_wmb();
>
> BTW, is the above memory barrier necessary? It seems like the xchg
> instruction already provides a memory barrier.
>
> Now if we made the changes that Jason suggested:
>
>
> /* Init node */
> - node->locked = 0;
> node->next = NULL;
>
> prev = xchg(lock, node);
> if (likely(prev == NULL)) {
> /* Lock acquired */
> - node->locked = 1;
> return;
> }
> + node->locked = 0;
> ACCESS_ONCE(prev->next) = node;
> smp_wmb();
>
> We are probably still okay as other cpus do not read the value of
> node->locked, which is a local variable.

I don't immediately see the need for the smp_wmb() in either case.

> Tim
>
> > > + /* Wait until the lock holder passes the lock down */
> > > + while (!ACCESS_ONCE(node->locked))
> > > + arch_mutex_cpu_relax();

However, you do need a full memory barrier here in order to ensure that
you see the effects of the previous lock holder's critical section.

Thanx, Paul

> > > +}
> > > +
> > > +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > > +{
> > > + struct mcs_spin_node *next = ACCESS_ONCE(node->next);
> > > +
> > > + if (likely(!next)) {
> > > + /*
> > > + * Release the lock by setting it to NULL
> > > + */
> > > + if (cmpxchg(lock, node, NULL) == node)
> > > + return;
> > > + /* Wait until the next pointer is set */
> > > + while (!(next = ACCESS_ONCE(node->next)))
> > > + arch_mutex_cpu_relax();
> > > + }
> > > + ACCESS_ONCE(next->locked) = 1;
> > > + smp_wmb();
> >
> > Shouldn't the memory barrier precede the "ACCESS_ONCE(next->locked) = 1;"?
> > Maybe in an "else" clause of the prior "if" statement, given that the
> > cmpxchg() does it otherwise.
> >
> > Otherwise, in the case where the "if" condition is false, the critical
> > section could bleed out past the unlock.
> >
> > Thanx, Paul
> >
>
>

2013-09-27 22:46:53

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 13:38 -0700, Paul E. McKenney wrote:
> On Fri, Sep 27, 2013 at 12:38:53PM -0700, Tim Chen wrote:
> > On Fri, 2013-09-27 at 08:29 -0700, Paul E. McKenney wrote:
> > > On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
> > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > Extracting the MCS code from mutex.c and put into its own file allow us
> > > > to reuse this code easily for rwsem.
> > > >
> > > > Signed-off-by: Tim Chen <[email protected]>
> > > > Signed-off-by: Davidlohr Bueso <[email protected]>
> > > > ---
> > > > include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> > > > kernel/mutex.c | 58 +++++-----------------------------------------
> > > > 2 files changed, 65 insertions(+), 51 deletions(-)
> > > > create mode 100644 include/linux/mcslock.h
> > > >
> > > > diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> > > > new file mode 100644
> > > > index 0000000..20fd3f0
> > > > --- /dev/null
> > > > +++ b/include/linux/mcslock.h
> > > > @@ -0,0 +1,58 @@
> > > > +/*
> > > > + * MCS lock defines
> > > > + *
> > > > + * This file contains the main data structure and API definitions of MCS lock.
> > > > + */
> > > > +#ifndef __LINUX_MCSLOCK_H
> > > > +#define __LINUX_MCSLOCK_H
> > > > +
> > > > +struct mcs_spin_node {
> > > > + struct mcs_spin_node *next;
> > > > + int locked; /* 1 if lock acquired */
> > > > +};
> > > > +
> > > > +/*
> > > > + * We don't inline mcs_spin_lock() so that perf can correctly account for the
> > > > + * time spent in this lock function.
> > > > + */
> > > > +static noinline
> > > > +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > > > +{
> > > > + struct mcs_spin_node *prev;
> > > > +
> > > > + /* Init node */
> > > > + node->locked = 0;
> > > > + node->next = NULL;
> > > > +
> > > > + prev = xchg(lock, node);
> > > > + if (likely(prev == NULL)) {
> > > > + /* Lock acquired */
> > > > + node->locked = 1;
> > > > + return;
> > > > + }
> > > > + ACCESS_ONCE(prev->next) = node;
> > > > + smp_wmb();
> >
> > BTW, is the above memory barrier necessary? It seems like the xchg
> > instruction already provides a memory barrier.
> >
> > Now if we made the changes that Jason suggested:
> >
> >
> > /* Init node */
> > - node->locked = 0;
> > node->next = NULL;
> >
> > prev = xchg(lock, node);
> > if (likely(prev == NULL)) {
> > /* Lock acquired */
> > - node->locked = 1;
> > return;
> > }
> > + node->locked = 0;
> > ACCESS_ONCE(prev->next) = node;
> > smp_wmb();
> >
> > We are probably still okay as other cpus do not read the value of
> > node->locked, which is a local variable.
>
> I don't immediately see the need for the smp_wmb() in either case.


Thinking a bit more, the following could happen with Jason's
initial patch proposal. In this case the variable "prev" referenced
by CPU1 points to the "node" referenced by CPU2:

CPU 1 (calling lock)                     CPU 2 (calling unlock)
ACCESS_ONCE(prev->next) = node;
                                         next = ACCESS_ONCE(node->next);
                                         ACCESS_ONCE(next->locked) = 1;
node->locked = 0;

Then we will be spinning forever on CPU1, as we overwrite the lock
handoff from CPU2 before we check it. The original code, which assigns
"node->locked = 0" before the xchg, does not have this issue.
Moving the smp_wmb immediately after the node->locked assignment
(as suggested by Jason), i.e.

node->locked = 0;
smp_wmb();
ACCESS_ONCE(prev->next) = node;

could avoid the problem, but it will need closer scrutiny to see if
there are other pitfalls when the wmb happens before

ACCESS_ONCE(prev->next) = node;


> >
> > > > + /* Wait until the lock holder passes the lock down */
> > > > + while (!ACCESS_ONCE(node->locked))
> > > > + arch_mutex_cpu_relax();
>
> However, you do need a full memory barrier here in order to ensure that
> you see the effects of the previous lock holder's critical section.

Is it necessary to add a memory barrier after acquiring
the lock if the previous lock holder executes smp_wmb before passing
the lock?

Thanks.

Tim

2013-09-27 23:01:45

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 03:46:45PM -0700, Tim Chen wrote:
> On Fri, 2013-09-27 at 13:38 -0700, Paul E. McKenney wrote:
> > On Fri, Sep 27, 2013 at 12:38:53PM -0700, Tim Chen wrote:
> > > On Fri, 2013-09-27 at 08:29 -0700, Paul E. McKenney wrote:
> > > > On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
> > > > > We will need the MCS lock code for doing optimistic spinning for rwsem.
> > > > > Extracting the MCS code from mutex.c and put into its own file allow us
> > > > > to reuse this code easily for rwsem.
> > > > >
> > > > > Signed-off-by: Tim Chen <[email protected]>
> > > > > Signed-off-by: Davidlohr Bueso <[email protected]>
> > > > > ---
> > > > > include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> > > > > kernel/mutex.c | 58 +++++-----------------------------------------
> > > > > 2 files changed, 65 insertions(+), 51 deletions(-)
> > > > > create mode 100644 include/linux/mcslock.h
> > > > >
> > > > > diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> > > > > new file mode 100644
> > > > > index 0000000..20fd3f0
> > > > > --- /dev/null
> > > > > +++ b/include/linux/mcslock.h
> > > > > @@ -0,0 +1,58 @@
> > > > > +/*
> > > > > + * MCS lock defines
> > > > > + *
> > > > > + * This file contains the main data structure and API definitions of MCS lock.
> > > > > + */
> > > > > +#ifndef __LINUX_MCSLOCK_H
> > > > > +#define __LINUX_MCSLOCK_H
> > > > > +
> > > > > +struct mcs_spin_node {
> > > > > + struct mcs_spin_node *next;
> > > > > + int locked; /* 1 if lock acquired */
> > > > > +};
> > > > > +
> > > > > +/*
> > > > > + * We don't inline mcs_spin_lock() so that perf can correctly account for the
> > > > > + * time spent in this lock function.
> > > > > + */
> > > > > +static noinline
> > > > > +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > > > > +{
> > > > > + struct mcs_spin_node *prev;
> > > > > +
> > > > > + /* Init node */
> > > > > + node->locked = 0;
> > > > > + node->next = NULL;
> > > > > +
> > > > > + prev = xchg(lock, node);
> > > > > + if (likely(prev == NULL)) {
> > > > > + /* Lock acquired */
> > > > > + node->locked = 1;
> > > > > + return;
> > > > > + }
> > > > > + ACCESS_ONCE(prev->next) = node;
> > > > > + smp_wmb();
> > >
> > > BTW, is the above memory barrier necessary? It seems like the xchg
> > > instruction already provides a memory barrier.
> > >
> > > Now if we made the changes that Jason suggested:
> > >
> > >
> > > /* Init node */
> > > - node->locked = 0;
> > > node->next = NULL;
> > >
> > > prev = xchg(lock, node);
> > > if (likely(prev == NULL)) {
> > > /* Lock acquired */
> > > - node->locked = 1;
> > > return;
> > > }
> > > + node->locked = 0;
> > > ACCESS_ONCE(prev->next) = node;
> > > smp_wmb();
> > >
> > > We are probably still okay as other cpus do not read the value of
> > > node->locked, which is a local variable.
> >
> > I don't immediately see the need for the smp_wmb() in either case.
>
>
> Thinking a bit more, the following could happen with Jason's
> initial patch proposal. In this case the variable "prev" referenced
> by CPU1 points to the "node" referenced by CPU2:
>
> CPU 1 (calling lock)                     CPU 2 (calling unlock)
> ACCESS_ONCE(prev->next) = node;
>                                          next = ACCESS_ONCE(node->next);
>                                          ACCESS_ONCE(next->locked) = 1;
> node->locked = 0;
>
> Then we will be spinning forever on CPU1, as we overwrite the lock
> handoff from CPU2 before we check it. The original code, which assigns
> "node->locked = 0" before the xchg, does not have this issue.
> Moving the smp_wmb immediately after the node->locked assignment
> (as suggested by Jason), i.e.
>
> node->locked = 0;
> smp_wmb();
> ACCESS_ONCE(prev->next) = node;
>
> could avoid the problem, but it will need closer scrutiny to see if
> there are other pitfalls when the wmb happens before
>
> ACCESS_ONCE(prev->next) = node;

I could believe that an smp_wmb() might be needed before the
"ACCESS_ONCE(prev->next) = node;", just not after.

> > > > > + /* Wait until the lock holder passes the lock down */
> > > > > + while (!ACCESS_ONCE(node->locked))
> > > > > + arch_mutex_cpu_relax();
> >
> > However, you do need a full memory barrier here in order to ensure that
> > you see the effects of the previous lock holder's critical section.
>
> Is it necessary to add a memory barrier after acquiring
> the lock if the previous lock holder executes smp_wmb before passing
> the lock?

Yep. The previous lock holder's smp_wmb() won't keep either the compiler
or the CPU from reordering things for the new lock holder. They could for
example reorder the critical section to precede the node->locked check,
which would be very bad.
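
Concretely, that means a full barrier after the spin on the acquire
side, as in the patch Jason posts next:

	/* Wait until the lock holder passes the lock down */
	while (!ACCESS_ONCE(node->locked))
		arch_mutex_cpu_relax();
	smp_mb();	/* keep the critical section from floating above the check */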

Thanx, Paul

2013-09-27 23:40:35

by Oliver Neukum

[permalink] [raw]
Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Fri, 2013-09-27 at 16:50 +0200, Peter Zijlstra wrote:
> On Fri, Sep 27, 2013 at 07:34:55AM -0700, Joe Perches wrote:
> > That would make it seem as if all barriers are SMP no?
>
> I would think any memory barrier is ordering against someone else; if
> not smp then a device/hardware -- like for instance the hardware page
> table walker.
>
> Barriers are fundamentally about order; and order only makes sense if
> there's more than 1 party to the game.

But not necessarily more than one kind of party. It is perfectly
possible to have a barrier against other threads running the same
function.

Regards
Oliver

2013-09-27 23:54:10

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 4:01 PM, Paul E. McKenney
<[email protected]> wrote:
> Yep. The previous lock holder's smp_wmb() won't keep either the compiler
> or the CPU from reordering things for the new lock holder. They could for
> example reorder the critical section to precede the node->locked check,
> which would be very bad.

Paul, Tim, Longman,

How would you like the proposed changes below?

---
Subject: [PATCH] MCS: optimizations and barrier corrections

Delete the node->locked = 1 assignment if the lock is free as it won't be used.

Delete the smp_wmb() in mcs_spin_lock() and add a full memory barrier at the
end of the mcs_spin_lock() function. As Paul McKenney suggested, "you do need a
full memory barrier here in order to ensure that you see the effects of the
previous lock holder's critical section." And in the mcs_spin_unlock(), move the
memory barrier so that it is before the "ACCESS_ONCE(next->locked) = 1;".

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
include/linux/mcslock.h | 7 +++----
1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
index 20fd3f0..edd57d2 100644
--- a/include/linux/mcslock.h
+++ b/include/linux/mcslock.h
@@ -26,15 +26,14 @@ void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)

prev = xchg(lock, node);
if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
+ /* Lock acquired. No need to set node->locked since it won't be used */
return;
}
ACCESS_ONCE(prev->next) = node;
- smp_wmb();
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
+ smp_mb();
}

static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
@@ -51,8 +50,8 @@ static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *n
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
}
- ACCESS_ONCE(next->locked) = 1;
smp_wmb();
+ ACCESS_ONCE(next->locked) = 1;
}

#endif
--
1.7.1

2013-09-28 00:02:40

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 16:54 -0700, Jason Low wrote:
> On Fri, Sep 27, 2013 at 4:01 PM, Paul E. McKenney
> <[email protected]> wrote:
> > Yep. The previous lock holder's smp_wmb() won't keep either the compiler
> > or the CPU from reordering things for the new lock holder. They could for
> > example reorder the critical section to precede the node->locked check,
> > which would be very bad.
>
> Paul, Tim, Longman,
>
> How would you like the proposed changes below?
>
> ---
> Subject: [PATCH] MCS: optimizations and barrier corrections

We *really* need to comment those barriers - explicitly that is :)

>
> Delete the node->locked = 1 assignment if the lock is free as it won't be used.
>
> Delete the smp_wmb() in mcs_spin_lock() and add a full memory barrier at the
> end of the mcs_spin_lock() function. As Paul McKenney suggested, "you do need a
> full memory barrier here in order to ensure that you see the effects of the
> previous lock holder's critical section." And in the mcs_spin_unlock(), move the
> memory barrier so that it is before the "ACCESS_ONCE(next->locked) = 1;".
>
> Signed-off-by: Jason Low <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> Signed-off-by: Tim Chen <[email protected]>
> ---
> include/linux/mcslock.h | 7 +++----
> 1 files changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> index 20fd3f0..edd57d2 100644
> --- a/include/linux/mcslock.h
> +++ b/include/linux/mcslock.h
> @@ -26,15 +26,14 @@ void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>
> prev = xchg(lock, node);
> if (likely(prev == NULL)) {
> - /* Lock acquired */
> - node->locked = 1;
> + /* Lock acquired. No need to set node->locked since it won't be used */

Then we need to explain/comment on the relationship between this
situation and the locked flag being set in mspin_unlock(), passing the
lock down the list.

> return;
> }
> ACCESS_ONCE(prev->next) = node;
> - smp_wmb();
> /* Wait until the lock holder passes the lock down */
> while (!ACCESS_ONCE(node->locked))
> arch_mutex_cpu_relax();
> + smp_mb();
> }
>
> static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> @@ -51,8 +50,8 @@ static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *n
> while (!(next = ACCESS_ONCE(node->next)))
> arch_mutex_cpu_relax();
> }
> - ACCESS_ONCE(next->locked) = 1;
> smp_wmb();
> + ACCESS_ONCE(next->locked) = 1;
> }
>
> #endif

2013-09-28 02:19:55

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 04:54:06PM -0700, Jason Low wrote:
> On Fri, Sep 27, 2013 at 4:01 PM, Paul E. McKenney
> <[email protected]> wrote:
> > Yep. The previous lock holder's smp_wmb() won't keep either the compiler
> > or the CPU from reordering things for the new lock holder. They could for
> > example reorder the critical section to precede the node->locked check,
> > which would be very bad.
>
> Paul, Tim, Longman,
>
> How would you like the proposed changes below?

Could you point me at what this applies to? I can find flaws looking
at random pieces, given a little luck, but at some point I need to look
at the whole thing. ;-)

Thanx, Paul

> ---
> Subject: [PATCH] MCS: optimizations and barrier corrections
>
> Delete the node->locked = 1 assignment if the lock is free as it won't be used.
>
> Delete the smp_wmb() in mcs_spin_lock() and add a full memory barrier at the
> end of the mcs_spin_lock() function. As Paul McKenney suggested, "you do need a
> full memory barrier here in order to ensure that you see the effects of the
> previous lock holder's critical section." And in the mcs_spin_unlock(), move the
> memory barrier so that it is before the "ACCESS_ONCE(next->locked) = 1;".
>
> Signed-off-by: Jason Low <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> Signed-off-by: Tim Chen <[email protected]>
> ---
> include/linux/mcslock.h | 7 +++----
> 1 files changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> index 20fd3f0..edd57d2 100644
> --- a/include/linux/mcslock.h
> +++ b/include/linux/mcslock.h
> @@ -26,15 +26,14 @@ void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>
> prev = xchg(lock, node);
> if (likely(prev == NULL)) {
> - /* Lock acquired */
> - node->locked = 1;
> + /* Lock acquired. No need to set node->locked since it won't be used */
> return;
> }
> ACCESS_ONCE(prev->next) = node;
> - smp_wmb();
> /* Wait until the lock holder passes the lock down */
> while (!ACCESS_ONCE(node->locked))
> arch_mutex_cpu_relax();
> + smp_mb();
> }
>
> static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> @@ -51,8 +50,8 @@ static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *n
> while (!(next = ACCESS_ONCE(node->next)))
> arch_mutex_cpu_relax();
> }
> - ACCESS_ONCE(next->locked) = 1;
> smp_wmb();
> + ACCESS_ONCE(next->locked) = 1;
> }
>
> #endif
> --
> 1.7.1
>

2013-09-28 02:59:17

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 09/27/2013 02:09 PM, Tim Chen wrote:
> On Fri, 2013-09-27 at 08:29 -0700, Paul E. McKenney wrote:
>> On Wed, Sep 25, 2013 at 03:10:49PM -0700, Tim Chen wrote:
>>> We will need the MCS lock code for doing optimistic spinning for rwsem.
>>> Extracting the MCS code from mutex.c and put into its own file allow us
>>> to reuse this code easily for rwsem.
>>>
>>> Signed-off-by: Tim Chen<[email protected]>
>>> Signed-off-by: Davidlohr Bueso<[email protected]>
>>> ---
>>> include/linux/mcslock.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
>>> kernel/mutex.c | 58 +++++-----------------------------------------
>>> 2 files changed, 65 insertions(+), 51 deletions(-)
>>> create mode 100644 include/linux/mcslock.h
>>>
>>> diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
>>> new file mode 100644
>>> index 0000000..20fd3f0
>>> --- /dev/null
>>> +++ b/include/linux/mcslock.h
>>> @@ -0,0 +1,58 @@
>>> +/*
>>> + * MCS lock defines
>>> + *
>>> + * This file contains the main data structure and API definitions of MCS lock.
>>> + */
>>> +#ifndef __LINUX_MCSLOCK_H
>>> +#define __LINUX_MCSLOCK_H
>>> +
>>> +struct mcs_spin_node {
>>> + struct mcs_spin_node *next;
>>> + int locked; /* 1 if lock acquired */
>>> +};
>>> +
>>> +/*
>>> + * We don't inline mcs_spin_lock() so that perf can correctly account for the
>>> + * time spent in this lock function.
>>> + */
>>> +static noinline
>>> +void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>>> +{
>>> + struct mcs_spin_node *prev;
>>> +
>>> + /* Init node */
>>> + node->locked = 0;
>>> + node->next = NULL;
>>> +
>>> + prev = xchg(lock, node);
>>> + if (likely(prev == NULL)) {
>>> + /* Lock acquired */
>>> + node->locked = 1;
>>> + return;
>>> + }
>>> + ACCESS_ONCE(prev->next) = node;
>>> + smp_wmb();
>>> + /* Wait until the lock holder passes the lock down */
>>> + while (!ACCESS_ONCE(node->locked))
>>> + arch_mutex_cpu_relax();
>>> +}
>>> +
>>> +static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>>> +{
>>> + struct mcs_spin_node *next = ACCESS_ONCE(node->next);
>>> +
>>> + if (likely(!next)) {
>>> + /*
>>> + * Release the lock by setting it to NULL
>>> + */
>>> + if (cmpxchg(lock, node, NULL) == node)
>>> + return;
>>> + /* Wait until the next pointer is set */
>>> + while (!(next = ACCESS_ONCE(node->next)))
>>> + arch_mutex_cpu_relax();
>>> + }
>>> + ACCESS_ONCE(next->locked) = 1;
>>> + smp_wmb();
>> Shouldn't the memory barrier precede the "ACCESS_ONCE(next->locked) = 1;"?
>> Maybe in an "else" clause of the prior "if" statement, given that the
>> cmpxchg() does it otherwise.
>>
>> Otherwise, in the case where the "if" condition is false, the critical
>> section could bleed out past the unlock.
> Yes, I agree with you that the smp_wmb should be moved before the
> ACCESS_ONCE to prevent the critical section from bleeding out. Copying Waiman
> who is the original author of the mcs code to see if he has any comments
> on things we may have missed.
>
> Tim

As a more general lock/unlock mechanism, I also agree that we should
move the smp_wmb() before the ACCESS_ONCE(). For the mutex case, it is
used as a queuing mechanism rather than guarding a critical section, so
it doesn't really matter.

Regards,
Longman

2013-09-28 04:34:10

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, Sep 27, 2013 at 7:19 PM, Paul E. McKenney
<[email protected]> wrote:
> On Fri, Sep 27, 2013 at 04:54:06PM -0700, Jason Low wrote:
>> On Fri, Sep 27, 2013 at 4:01 PM, Paul E. McKenney
>> <[email protected]> wrote:
>> > Yep. The previous lock holder's smp_wmb() won't keep either the compiler
>> > or the CPU from reordering things for the new lock holder. They could for
>> > example reorder the critical section to precede the node->locked check,
>> > which would be very bad.
>>
>> Paul, Tim, Longman,
>>
>> How would you like the proposed changes below?
>
> Could you point me at what this applies to? I can find flaws looking
> at random pieces, given a little luck, but at some point I need to look
> at the whole thing. ;-)

Sure. Here is a link to the patch we are trying to modify:
https://lkml.org/lkml/2013/9/25/532

Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
functions would look like after applying the proposed changes.

static noinline
void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
struct mcs_spin_node *prev;

/* Init node */
node->locked = 0;
node->next = NULL;

prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired. No need to set node->locked since it won't be used */
return;
}
ACCESS_ONCE(prev->next) = node;
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
smp_mb();
}

static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
struct mcs_spin_node *next = ACCESS_ONCE(node->next);

if (likely(!next)) {
/*
* Release the lock by setting it to NULL
*/
if (cmpxchg(lock, node, NULL) == node)
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
}
smp_wmb();
ACCESS_ONCE(next->locked) = 1;
}

2013-09-28 07:54:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] checkpatch: Make the memory barrier test noisier

On Sat, Sep 28, 2013 at 01:40:27AM +0200, Oliver Neukum wrote:
> On Fri, 2013-09-27 at 16:50 +0200, Peter Zijlstra wrote:
> > On Fri, Sep 27, 2013 at 07:34:55AM -0700, Joe Perches wrote:
> > > That would make it seem as if all barriers are SMP no?
> >
> > I would think any memory barrier is ordering against someone else; if
> > not smp then a device/hardware -- like for instance the hardware page
> > table walker.
> >
> > Barriers are fundamentally about order; and order only makes sense if
> > there's more than 1 party to the game.
>
> But not necessarily more than one kind of party. It is perfectly
> possible to have a barrier against other threads running the same
> function.

Then that makes a good comment ;-)

2013-09-30 15:52:04

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 09/28/2013 12:34 AM, Jason Low wrote:
>> Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
>> functions would look like after applying the proposed changes.
>>
>> static noinline
>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>> {
>> struct mcs_spin_node *prev;
>>
>> /* Init node */
>> node->locked = 0;
>> node->next = NULL;
>>
>> prev = xchg(lock, node);
>> if (likely(prev == NULL)) {
>> /* Lock acquired. No need to set node->locked since it won't be used */
>> return;
>> }
>> ACCESS_ONCE(prev->next) = node;
>> /* Wait until the lock holder passes the lock down */
>> while (!ACCESS_ONCE(node->locked))
>> arch_mutex_cpu_relax();
>> smp_mb();

I wonder if a memory barrier is really needed here.

>> }
>>
>> static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>> {
>> struct mcs_spin_node *next = ACCESS_ONCE(node->next);
>>
>> if (likely(!next)) {
>> /*
>> * Release the lock by setting it to NULL
>> */
>> if (cmpxchg(lock, node, NULL) == node)
>> return;
>> /* Wait until the next pointer is set */
>> while (!(next = ACCESS_ONCE(node->next)))
>> arch_mutex_cpu_relax();
>> }
>> smp_wmb();
>> ACCESS_ONCE(next->locked) = 1;
>> }

Instead, I think what we need may be:

if (likely(!next)) {
....
} else
smp_mb();
ACCESS_ONCE(next->locked) = 1;

That will ensure a memory barrier in the unlock path.
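
Filling in the elided lines from the existing unlock path, Waiman's
suggestion would read as follows (relying, as Paul noted earlier, on the
cmpxchg() supplying the barrier on the other path):

static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
	struct mcs_spin_node *next = ACCESS_ONCE(node->next);

	if (likely(!next)) {
		/* Release the lock by setting it to NULL */
		if (cmpxchg(lock, node, NULL) == node)
			return;
		/* Wait until the next pointer is set */
		while (!(next = ACCESS_ONCE(node->next)))
			arch_mutex_cpu_relax();
	} else
		smp_mb();	/* order the critical section before the handoff */
	ACCESS_ONCE(next->locked) = 1;
}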

Regards,
Longman

2013-09-30 16:10:50

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:
> On 09/28/2013 12:34 AM, Jason Low wrote:
> >> Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
> >> functions would look like after applying the proposed changes.
> >>
> >> static noinline
> >> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> >> {
> >> struct mcs_spin_node *prev;
> >>
> >> /* Init node */
> >> node->locked = 0;
> >> node->next = NULL;
> >>
> >> prev = xchg(lock, node);
> >> if (likely(prev == NULL)) {
> >> /* Lock acquired. No need to set node->locked since it won't be used */
> >> return;
> >> }
> >> ACCESS_ONCE(prev->next) = node;
> >> /* Wait until the lock holder passes the lock down */
> >> while (!ACCESS_ONCE(node->locked))
> >> arch_mutex_cpu_relax();
> >> smp_mb();
>
> I wonder if a memory barrier is really needed here.

If the compiler can reorder the while (!ACCESS_ONCE(node->locked)) check
so that the check occurs after an instruction in the critical section,
then the barrier may be necessary.

2013-09-30 16:28:35

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Fri, 2013-09-27 at 19:19 -0700, Paul E. McKenney wrote:
> On Fri, Sep 27, 2013 at 04:54:06PM -0700, Jason Low wrote:
> > On Fri, Sep 27, 2013 at 4:01 PM, Paul E. McKenney
> > <[email protected]> wrote:
> > > Yep. The previous lock holder's smp_wmb() won't keep either the compiler
> > > or the CPU from reordering things for the new lock holder. They could for
> > > example reorder the critical section to precede the node->locked check,
> > > which would be very bad.
> >
> > Paul, Tim, Longman,
> >
> > How would you like the proposed changes below?
>
> Could you point me at what this applies to? I can find flaws looking
> at random pieces, given a little luck, but at some point I need to look
> at the whole thing. ;-)
>
> Thanx, Paul

Jason's patch is on top of the following patchset:

https://lkml.org/lkml/2013/9/26/674

Thanks.

Tim

>
> > ---
> > Subject: [PATCH] MCS: optimizations and barrier corrections
> >
> > Delete the node->locked = 1 assignment if the lock is free as it won't be used.
> >
> > Delete the smp_wmb() in mcs_spin_lock() and add a full memory barrier at the
> > end of the mcs_spin_lock() function. As Paul McKenney suggested, "you do need a
> > full memory barrier here in order to ensure that you see the effects of the
> > previous lock holder's critical section." And in the mcs_spin_unlock(), move the
> > memory barrier so that it is before the "ACCESS_ONCE(next->locked) = 1;".
> >
> > Signed-off-by: Jason Low <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > Signed-off-by: Tim Chen <[email protected]>
> > ---
> > include/linux/mcslock.h | 7 +++----
> > 1 files changed, 3 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/mcslock.h b/include/linux/mcslock.h
> > index 20fd3f0..edd57d2 100644
> > --- a/include/linux/mcslock.h
> > +++ b/include/linux/mcslock.h
> > @@ -26,15 +26,14 @@ void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> >
> > prev = xchg(lock, node);
> > if (likely(prev == NULL)) {
> > - /* Lock acquired */
> > - node->locked = 1;
> > + /* Lock acquired. No need to set node->locked since it won't be used */
> > return;
> > }
> > ACCESS_ONCE(prev->next) = node;
> > - smp_wmb();
> > /* Wait until the lock holder passes the lock down */
> > while (!ACCESS_ONCE(node->locked))
> > arch_mutex_cpu_relax();
> > + smp_mb();
> > }
> >
> > static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> > @@ -51,8 +50,8 @@ static void mcs_spin_unlock(struct mcs_spin_node **lock, struct mcs_spin_node *n
> > while (!(next = ACCESS_ONCE(node->next)))
> > arch_mutex_cpu_relax();
> > }
> > - ACCESS_ONCE(next->locked) = 1;
> > smp_wmb();
> > + ACCESS_ONCE(next->locked) = 1;
> > }
> >
> > #endif
> > --
> > 1.7.1
> >
>

2013-09-30 16:37:09

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 09/30/2013 12:10 PM, Jason Low wrote:
> On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:
>> On 09/28/2013 12:34 AM, Jason Low wrote:
>>>> Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
>>>> functions would look like after applying the proposed changes.
>>>>
>>>> static noinline
>>>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>>>> {
>>>> struct mcs_spin_node *prev;
>>>>
>>>> /* Init node */
>>>> node->locked = 0;
>>>> node->next = NULL;
>>>>
>>>> prev = xchg(lock, node);
>>>> if (likely(prev == NULL)) {
>>>> /* Lock acquired. No need to set node->locked since it won't be used */
>>>> return;
>>>> }
>>>> ACCESS_ONCE(prev->next) = node;
>>>> /* Wait until the lock holder passes the lock down */
>>>> while (!ACCESS_ONCE(node->locked))
>>>> arch_mutex_cpu_relax();
>>>> smp_mb();
>> I wonder if a memory barrier is really needed here.
> If the compiler can reorder the while (!ACCESS_ONCE(node->locked)) check
> so that the check occurs after an instruction in the critical section,
> then the barrier may be necessary.
>

In that case, just a barrier() call should be enough.

-Longman

2013-10-01 16:48:18

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Mon, 2013-09-30 at 12:36 -0400, Waiman Long wrote:
> On 09/30/2013 12:10 PM, Jason Low wrote:
> > On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:
> >> On 09/28/2013 12:34 AM, Jason Low wrote:
> >>>> Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
> >>>> functions would look like after applying the proposed changes.
> >>>>
> >>>> static noinline
> >>>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> >>>> {
> >>>> struct mcs_spin_node *prev;
> >>>>
> >>>> /* Init node */
> >>>> node->locked = 0;
> >>>> node->next = NULL;
> >>>>
> >>>> prev = xchg(lock, node);
> >>>> if (likely(prev == NULL)) {
> >>>> /* Lock acquired. No need to set node->locked since it won't be used */
> >>>> return;
> >>>> }
> >>>> ACCESS_ONCE(prev->next) = node;
> >>>> /* Wait until the lock holder passes the lock down */
> >>>> while (!ACCESS_ONCE(node->locked))
> >>>> arch_mutex_cpu_relax();
> >>>> smp_mb();
> >> I wonder if a memory barrier is really needed here.
> > If the compiler can reorder the while (!ACCESS_ONCE(node->locked)) check
> > so that the check occurs after an instruction in the critical section,
> > then the barrier may be necessary.
> >
>
> In that case, just a barrier() call should be enough.

The cpu could still be executing out-of-order load instructions from the
critical section before checking node->locked, so smp_mb() is probably
still needed.

Tim

2013-10-01 20:01:22

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 10/01/2013 12:48 PM, Tim Chen wrote:
> On Mon, 2013-09-30 at 12:36 -0400, Waiman Long wrote:
>> On 09/30/2013 12:10 PM, Jason Low wrote:
>>> On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:
>>>> On 09/28/2013 12:34 AM, Jason Low wrote:
>>>>>> Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
>>>>>> functions would look like after applying the proposed changes.
>>>>>>
>>>>>> static noinline
>>>>>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>>>>>> {
>>>>>> struct mcs_spin_node *prev;
>>>>>>
>>>>>> /* Init node */
>>>>>> node->locked = 0;
>>>>>> node->next = NULL;
>>>>>>
>>>>>> prev = xchg(lock, node);
>>>>>> if (likely(prev == NULL)) {
>>>>>> /* Lock acquired. No need to set node->locked since it won't be used */
>>>>>> return;
>>>>>> }
>>>>>> ACCESS_ONCE(prev->next) = node;
>>>>>> /* Wait until the lock holder passes the lock down */
>>>>>> while (!ACCESS_ONCE(node->locked))
>>>>>> arch_mutex_cpu_relax();
>>>>>> smp_mb();
>>>> I wonder if a memory barrier is really needed here.
>>> If the compiler can reorder the while (!ACCESS_ONCE(node->locked)) check
>>> so that the check occurs after an instruction in the critical section,
>>> then the barrier may be necessary.
>>>
>> In that case, just a barrier() call should be enough.
> The cpu could still be executing out-of-order load instructions from the
> critical section before checking node->locked, so smp_mb() is probably
> still needed.
>
> Tim

But this is the lock function; a barrier() call should be enough to
prevent the critical section from creeping up there. We certainly need
some kind of memory barrier at the end of the unlock function.
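
A sketch of the compiler-only variant being argued for here (barrier()
stops the compiler from hoisting critical-section accesses above the
locked check, but emits no fence instruction for the CPU):

	/* Wait until the lock holder passes the lock down */
	while (!ACCESS_ONCE(node->locked))
		arch_mutex_cpu_relax();
	barrier();	/* compiler barrier only -- no ordering against the CPU */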

-Longman

2013-10-01 21:16:34

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Tue, 2013-10-01 at 16:01 -0400, Waiman Long wrote:
> On 10/01/2013 12:48 PM, Tim Chen wrote:
> > On Mon, 2013-09-30 at 12:36 -0400, Waiman Long wrote:
> >> On 09/30/2013 12:10 PM, Jason Low wrote:
> >>> On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:
> >>>> On 09/28/2013 12:34 AM, Jason Low wrote:
> >>>>>> Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
> >>>>>> functions would look like after applying the proposed changes.
> >>>>>>
> >>>>>> static noinline
> >>>>>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
> >>>>>> {
> >>>>>> struct mcs_spin_node *prev;
> >>>>>>
> >>>>>> /* Init node */
> >>>>>> node->locked = 0;
> >>>>>> node->next = NULL;
> >>>>>>
> >>>>>> prev = xchg(lock, node);
> >>>>>> if (likely(prev == NULL)) {
> >>>>>> /* Lock acquired. No need to set node->locked since it won't be used */
> >>>>>> return;
> >>>>>> }
> >>>>>> ACCESS_ONCE(prev->next) = node;
> >>>>>> /* Wait until the lock holder passes the lock down */
> >>>>>> while (!ACCESS_ONCE(node->locked))
> >>>>>> arch_mutex_cpu_relax();
> >>>>>> smp_mb();
> >>>> I wonder if a memory barrier is really needed here.
> >>> If the compiler can reorder the while (!ACCESS_ONCE(node->locked)) check
> >>> so that the check occurs after an instruction in the critical section,
> >>> then the barrier may be necessary.
> >>>
> >> In that case, just a barrier() call should be enough.
> > The CPU could still be executing out-of-order load instructions from the
> > critical section before checking node->locked? Probably smp_mb() is
> > still needed.
> >
> > Tim
>
> But this is the lock function; a barrier() call should be enough to
> prevent the critical section from creeping up there. We certainly need
> some kind of memory barrier at the end of the unlock function.

I may be missing something. My understanding is that barrier() only
prevents the compiler from rearranging instructions, but not CPU
out-of-order execution (which smp_mb() does). So the CPU could read memory
in the next critical section before node->locked is true (i.e., before the
unlock has completed). If we only have a simple barrier() at the end of
mcs_lock, then say the code on CPU1 is

mcs_lock
x = 1;
...
x = 2;
mcs_unlock

and CPU 2 is

mcs_lock
y = x;
...
mcs_unlock

We expect y to be 2 after the "y = x" assignment. But we
may execute the code as

CPU1 CPU2

x = 1;
... y = x; ( y=1, out of order load)
x = 2
mcs_unlock
Check node->locked==true
continue executing critical section (y=1 when we expect y=2)

So we get y to be 1 when we expect it to be 2. Adding an smp_mb()
after the node->locked check in the lock code

ACCESS_ONCE(prev->next) = node;
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
smp_mb();

should prevent this scenario.

Thanks.
Tim

>
> -Longman
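
The spin-then-smp_mb() pattern Tim argues for is an acquire operation: no
load in the critical section may be satisfied before the load that saw
node->locked become true. As an aside (an anachronism here: the helper was
only merged after this thread), later kernels can express this directly:

    /* Sketch: acquire semantics folded into the spin itself */
    while (!smp_load_acquire(&node->locked))
        arch_mutex_cpu_relax();

    /* No separate smp_mb()/smp_rmb() is needed after the loop; the
       acquire load orders the whole critical section after it. */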

2013-10-02 01:25:21

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 10/01/2013 05:16 PM, Tim Chen wrote:
> On Tue, 2013-10-01 at 16:01 -0400, Waiman Long wrote:
>>>
> >>> The CPU could still be executing out-of-order load instructions from the
> >>> critical section before checking node->locked? Probably smp_mb() is
> >>> still needed.
>>>
>>> Tim
> >> But this is the lock function; a barrier() call should be enough to
> >> prevent the critical section from creeping up there. We certainly need
> >> some kind of memory barrier at the end of the unlock function.
> I may be missing something. My understanding is that barrier() only
> prevents the compiler from rearranging instructions, but not CPU
> out-of-order execution (which smp_mb() does). So the CPU could read memory
> in the next critical section before node->locked is true (i.e., before the
> unlock has completed). If we only have a simple barrier() at the end of
> mcs_lock, then say the code on CPU1 is
>
> mcs_lock
> x = 1;
> ...
> x = 2;
> mcs_unlock
>
> and CPU 2 is
>
> mcs_lock
> y = x;
> ...
> mcs_unlock
>
> We expect y to be 2 after the "y = x" assignment. But we
> may execute the code as
>
> CPU1 CPU2
>
> x = 1;
> ... y = x; ( y=1, out of order load)
> x = 2
> mcs_unlock
> Check node->locked==true
> continue executing critical section (y=1 when we expect y=2)
>
> So we get y to be 1 when we expect it to be 2. Adding an smp_mb()
> after the node->locked check in the lock code
>
> ACCESS_ONCE(prev->next) = node;
> /* Wait until the lock holder passes the lock down */
> while (!ACCESS_ONCE(node->locked))
> arch_mutex_cpu_relax();
> smp_mb();
>
> should prevent this scenario.
>
> Thanks.
> Tim

If the lock and unlock functions are done right, there should be no
overlap of critical sections. So it is the job of the lock/unlock functions
to make sure that critical-section code won't leak out. There should be
some kind of memory barrier at the beginning of the lock function and at
the end of the unlock function.

The critical section is also likely to have branches. The CPU may
speculatively execute code on both branches, but one of them will be
discarded once the branch condition is known. Also,
arch_mutex_cpu_relax() is a compiler barrier by itself, so we may not
need a barrier() after all. The while statement is a branch instruction;
any code after it can only be speculatively executed and cannot be
committed until the branch is done.

On x86, the smp_mb() function translates to an mfence instruction, which
costs time. That is why I try to get rid of it when it is not necessary.

Regards,
Longman
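
The claim that arch_mutex_cpu_relax() is a compiler barrier holds on x86,
where it defaults to cpu_relax(). A sketch of that definition (from memory,
not quoted from the tree):

    /* arch/x86: cpu_relax() is a PAUSE ("rep; nop") with a "memory"
       clobber, which makes it a compiler barrier as well. */
    static inline void cpu_relax(void)
    {
        asm volatile("rep; nop" ::: "memory");
    }

Note that this constrains only the compiler; it does nothing about the CPU
reordering Tim is worried about, which is the crux of the disagreement.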

2013-10-02 18:43:26

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Tue, 2013-10-01 at 21:25 -0400, Waiman Long wrote:
> On 10/01/2013 05:16 PM, Tim Chen wrote:
> > On Tue, 2013-10-01 at 16:01 -0400, Waiman Long wrote:
> >>>
> >>> The CPU could still be executing out-of-order load instructions from the
> >>> critical section before checking node->locked? Probably smp_mb() is
> >>> still needed.
> >>>
> >>> Tim
> >> But this is the lock function; a barrier() call should be enough to
> >> prevent the critical section from creeping up there. We certainly need
> >> some kind of memory barrier at the end of the unlock function.
> > I may be missing something. My understanding is that barrier() only
> > prevents the compiler from rearranging instructions, but not CPU
> > out-of-order execution (which smp_mb() does). So the CPU could read memory
> > in the next critical section before node->locked is true (i.e., before the
> > unlock has completed). If we only have a simple barrier() at the end of
> > mcs_lock, then say the code on CPU1 is
> >
> > mcs_lock
> > x = 1;
> > ...
> > x = 2;
> > mcs_unlock
> >
> > and CPU 2 is
> >
> > mcs_lock
> > y = x;
> > ...
> > mcs_unlock
> >
> > We expect y to be 2 after the "y = x" assignment. But we
> > may execute the code as
> >
> > CPU1 CPU2
> >
> > x = 1;
> > ... y = x; ( y=1, out of order load)
> > x = 2
> > mcs_unlock
> > Check node->locked==true
> > continue executing critical section (y=1 when we expect y=2)
> >
> > So we get y to be 1 when we expect it to be 2. Adding an smp_mb()
> > after the node->locked check in the lock code
> >
> > ACCESS_ONCE(prev->next) = node;
> > /* Wait until the lock holder passes the lock down */
> > while (!ACCESS_ONCE(node->locked))
> > arch_mutex_cpu_relax();
> > smp_mb();
> >
> > should prevent this scenario.
> >
> > Thanks.
> > Tim
>
> If the lock and unlock functions are done right, there should be no
> overlap of critical sections. So it is the job of the lock/unlock functions
> to make sure that critical-section code won't leak out. There should be
> some kind of memory barrier at the beginning of the lock function and at
> the end of the unlock function.
>
> The critical section is also likely to have branches. The CPU may
> speculatively execute code on both branches, but one of them will be
> discarded once the branch condition is known. Also,
> arch_mutex_cpu_relax() is a compiler barrier by itself, so we may not
> need a barrier() after all. The while statement is a branch instruction;
> any code after it can only be speculatively executed and cannot be
> committed until the branch is done.

But the condition could be checked only after the speculative execution?
The condition may not be true during the speculative execution and only
turn true when we check the condition and take that branch?

The thing that bothers me is that without a memory barrier after the while
statement, we could speculatively execute before confirming the lock is in
the acquired state; then when we check the lock, it has been set to the
acquired state in the meantime.
We could be loading some memory value *before*
node->locked has been set true. I think an smp_rmb (if not an
smp_mb) should be placed after the while statement.

At first I was also thinking that the memory barrier is not
necessary but Paul convinced me otherwise in a previous email.
https://lkml.org/lkml/2013/9/27/523

>
> On x86, the smp_mb() function translates to an mfence instruction, which
> costs time. That is why I try to get rid of it when it is not necessary.
>

I also hope that the memory barrier is not necessary and I am missing
something obvious. But I haven't been able to persuade myself.

Tim
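
Concretely, the placement Tim proposes would look like this (a sketch, with
smp_rmb() substituted for the smp_mb() in the excerpt above):

    ACCESS_ONCE(prev->next) = node;
    /* Wait until the lock holder passes the lock down */
    while (!ACCESS_ONCE(node->locked))
        arch_mutex_cpu_relax();
    /* Keep critical-section loads from being satisfied before the
       load that observed node->locked becoming true */
    smp_rmb();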


2013-10-02 19:19:46

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 09/26/2013 06:42 PM, Jason Low wrote:
> On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:
>> Okay, that would make sense for consistency, because we always
>> first set node->locked = 0 at the top of the function.
>>
>> If we prefer to optimize this a bit though, perhaps we can
>> first move the node->locked = 0 so that it gets executed after the
>> "if (likely(prev == NULL)) {}" code block and then delete
>> "node->locked = 1" inside the code block.
>>
>> static noinline
>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
>> {
>> struct mcs_spin_node *prev;
>>
>> /* Init node */
>> node->next = NULL;
>>
>> prev = xchg(lock, node);
>> if (likely(prev == NULL)) {
>> /* Lock acquired */
>> return;
>> }
>> node->locked = 0;

You can remove the locked flag setting statement inside if (prev ==
NULL), but you can't clear the locked flag after xchg(). In the interval
between xchg() and locked=0, the previous lock owner may come in and set
the flag. Now if you clear it, the thread will loop forever. You have
to clear it before xchg().

-Longman
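
The race Waiman describes, annotated as a timeline on the waiter's CPU (a
sketch of the broken ordering, not real code):

    prev = xchg(lock, node);        /* node is now visible to the
                                       previous lock owner */
    /* ... the previous owner can run its unlock path right here
       and set node->locked = 1 ... */
    node->locked = 0;               /* the handoff is wiped out */
    while (!ACCESS_ONCE(node->locked))
        arch_mutex_cpu_relax();     /* spins forever */

Hence the locked flag must be cleared before the xchg() that publishes the
node, as Waiman says.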

2013-10-02 19:30:04

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On Wed, Oct 2, 2013 at 12:19 PM, Waiman Long <[email protected]> wrote:
> On 09/26/2013 06:42 PM, Jason Low wrote:
>>
>> On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:
>>>
>>> Okay, that would make sense for consistency, because we always
>>> first set node->locked = 0 at the top of the function.
>>>
>>> If we prefer to optimize this a bit though, perhaps we can
>>> first move the node->locked = 0 so that it gets executed after the
>>> "if (likely(prev == NULL)) {}" code block and then delete
>>> "node->locked = 1" inside the code block.
>>>
>>> static noinline
>>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node
>>> *node)
>>> {
>>> struct mcs_spin_node *prev;
>>>
>>> /* Init node */
>>> node->next = NULL;
>>>
>>> prev = xchg(lock, node);
>>> if (likely(prev == NULL)) {
>>> /* Lock acquired */
>>> return;
>>> }
>>> node->locked = 0;
>
>
> You can remove the locked flag setting statement inside if (prev == NULL),
> but you can't clear the locked flag after xchg(). In the interval between
> xchg() and locked=0, the previous lock owner may come in and set the flag.
> Now if you clear it, the thread will loop forever. You have to clear it
> before xchg().

Yes, in my most recent version, I left locked = 0 in its original
place so that the xchg() can act as a barrier for it.

The other option would have been to put another barrier after locked =
0. I went with leaving locked = 0 in its original place so that we
don't need that extra barrier.

2013-10-02 19:32:37

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 10/02/2013 02:43 PM, Tim Chen wrote:
> On Tue, 2013-10-01 at 21:25 -0400, Waiman Long wrote:
>>
>> If the lock and unlock functions are done right, there should be no
>> overlap of critical sections. So it is the job of the lock/unlock functions
>> to make sure that critical-section code won't leak out. There should be
>> some kind of memory barrier at the beginning of the lock function and at
>> the end of the unlock function.
>>
>> The critical section is also likely to have branches. The CPU may
>> speculatively execute code on both branches, but one of them will be
>> discarded once the branch condition is known. Also,
>> arch_mutex_cpu_relax() is a compiler barrier by itself, so we may not
>> need a barrier() after all. The while statement is a branch instruction;
>> any code after it can only be speculatively executed and cannot be
>> committed until the branch is done.
> But the condition could be checked only after the speculative execution?
> The condition may not be true during the speculative execution and only
> turn true when we check the condition and take that branch?
>
> The thing that bothers me is that without a memory barrier after the while
> statement, we could speculatively execute before confirming the lock is in
> the acquired state; then when we check the lock, it has been set to the
> acquired state in the meantime.
> We could be loading some memory value *before*
> node->locked has been set true. I think an smp_rmb (if not an
> smp_mb) should be placed after the while statement.

Yes, I think an smp_rmb() makes sense here, corresponding to the smp_wmb()
in the unlock path.

BTW, you need to move the node->locked = 0; statement before xchg() if
you haven't done so.

-Longman
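
Sketched end to end, the pairing Waiman refers to would be (note: for the
write side to pair with the read side, the smp_wmb() must separate the
critical-section stores from the handoff store, i.e. come before it; the
placements in the posted v6 hunk differ):

    /* lock slowpath: reader side of the pairing */
    while (!ACCESS_ONCE(node->locked))
        arch_mutex_cpu_relax();
    smp_rmb();                      /* pairs with the smp_wmb() below */

    /* unlock handoff: writer side of the pairing */
    smp_wmb();                      /* critical-section stores first */
    ACCESS_ONCE(next->locked) = 1;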

2013-10-02 19:37:43

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

On 10/02/2013 03:30 PM, Jason Low wrote:
> On Wed, Oct 2, 2013 at 12:19 PM, Waiman Long<[email protected]> wrote:
>> On 09/26/2013 06:42 PM, Jason Low wrote:
>>> On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:
>>>> Okay, that would make sense for consistency, because we always
>>>> first set node->locked = 0 at the top of the function.
>>>>
>>>> If we prefer to optimize this a bit though, perhaps we can
>>>> first move the node->locked = 0 so that it gets executed after the
>>>> "if (likely(prev == NULL)) {}" code block and then delete
>>>> "node->locked = 1" inside the code block.
>>>>
>>>> static noinline
>>>> void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node
>>>> *node)
>>>> {
>>>> struct mcs_spin_node *prev;
>>>>
>>>> /* Init node */
>>>> node->next = NULL;
>>>>
>>>> prev = xchg(lock, node);
>>>> if (likely(prev == NULL)) {
>>>> /* Lock acquired */
>>>> return;
>>>> }
>>>> node->locked = 0;
>>
>> You can remove the locked flag setting statement inside if (prev == NULL),
>> but you can't clear the locked flag after xchg(). In the interval between
>> xchg() and locked=0, the previous lock owner may come in and set the flag.
>> Now if you clear it, the thread will loop forever. You have to clear it
>> before xchg().
> Yes, in my most recent version, I left locked = 0 in its original
> place so that the xchg() can act as a barrier for it.
>
> The other option would have been to put another barrier after locked =
> 0. I went with leaving locked = 0 in its original place so that we
> don't need that extra barrier.

I don't think putting another barrier after locked=0 will work.
Chronologically, the flag must be cleared before the node address is
saved in the lock field. There is no way to guarantee that except by
putting the locked=0 before xchg().

-Longman
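
Putting the subthread together, the initialization order everyone converges
on is: clear the flag, publish the node, then queue behind the previous
owner. A consolidated sketch (not the committed code):

    node->locked = 0;               /* must happen before we are visible */
    node->next = NULL;

    prev = xchg(lock, node);        /* implies a full barrier and
                                       publishes the node */
    if (likely(prev == NULL))
        return;                     /* uncontended: locked is never read */

    ACCESS_ONCE(prev->next) = node; /* queue behind the previous owner */
    /* Wait until the lock holder passes the lock down */
    while (!ACCESS_ONCE(node->locked))
        arch_mutex_cpu_relax();
    smp_rmb();                      /* see the barrier discussion above */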