2015-08-21 17:44:57

by Oleg Nesterov

Subject: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

Hello,

Now that sb->s_writers has been changed to use percpu_rw_semaphore, let
me send v2.

Changes:

- whitespace fix in 1/8.

- remove EXPORT_SYMBOL() in 3/8, currently rcu_sync has no
modular users.

- 5/8 is new. This ugly hack pairs with another one:
"shift percpu_counter_destroy() into destroy_super_work()"
https://git.kernel.org/cgit/linux/kernel/git/viro/vfs.git/commit/?h=for-next&id=853b39a7c82826b8413048feec7bf08e98ce7a84
They both will be reverted later.

The problem is that we have 2 series routed via different
trees, RCU and VFS. We need this hack to ensure that this
series won't break alloc_super(), which currently assumes that
destroy_super()->percpu_free_rwsem() is safe after kzalloc().
This way the 2 series do not depend on each other, and we can
test/change/revert/etc them independently.

We will add rcu_sync_dtor() into deactivate_locked_super()
later and revert both hacks.
Oleg.

include/linux/percpu-rwsem.h | 3 +-
include/linux/rcusync.h | 56 +++++++++++++++
kernel/locking/percpu-rwsem.c | 85 ++++++++---------------
kernel/rcu/Makefile | 2 +-
kernel/rcu/sync.c | 151 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 240 insertions(+), 57 deletions(-)


2015-08-21 17:45:11

by Oleg Nesterov

Subject: [PATCH v2 1/8] rcu: Create rcu_sync infrastructure

It is functionally equivalent to

struct rcu_sync_struct {
        atomic_t counter;
};

static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
        return atomic_read(&rss->counter) == 0;
}

static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
{
        atomic_inc(&rss->counter);
        synchronize_sched();
}

static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
{
        synchronize_sched();
        atomic_dec(&rss->counter);
}

except that it records the state, and synchronize_sched() is called
only by rcu_sync_enter(), and only when necessary.
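
For illustration, a minimal usage sketch (hypothetical code, not part of
the patch; the do_*() helpers and the reader's slow path are placeholders
for whatever the caller actually does):

static DEFINE_RCU_SCHED_SYNC(rss);

void reader(void)
{
        rcu_read_lock_sched();
        if (rcu_sync_is_idle(&rss))
                do_the_fast_thing();    /* no writer active or pending */
        else
                do_the_slow_thing();    /* fall back to real synchronization */
        rcu_read_unlock_sched();
}

void writer(void)
{
        rcu_sync_enter(&rss);   /* forces readers onto the slow path */
        do_the_write_side_work();
        rcu_sync_exit(&rss);    /* re-enables the fast path after a GP */
}

Note that only the first rcu_sync_enter() after an idle period pays for a
full synchronize_sched(); rcu_sync_exit() merely queues a callback, so
writers that arrive before that callback fires stay cheap.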

Reviewed-by: Paul E. McKenney <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/rcusync.h | 63 +++++++++++++++++++++++++++
kernel/rcu/Makefile | 2 +-
kernel/rcu/sync.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 172 insertions(+), 1 deletions(-)
create mode 100644 include/linux/rcusync.h
create mode 100644 kernel/rcu/sync.c

diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
new file mode 100644
index 0000000..f13f95c
--- /dev/null
+++ b/include/linux/rcusync.h
@@ -0,0 +1,63 @@
+#ifndef _LINUX_RCUSYNC_H_
+#define _LINUX_RCUSYNC_H_
+
+#include <linux/wait.h>
+#include <linux/rcupdate.h>
+
+struct rcu_sync_struct {
+ int gp_state;
+ int gp_count;
+ wait_queue_head_t gp_wait;
+
+ int cb_state;
+ struct rcu_head cb_head;
+
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+};
+
+#define ___RCU_SYNC_INIT(name) \
+ .gp_state = 0, \
+ .gp_count = 0, \
+ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .cb_state = 0
+
+#define __RCU_SCHED_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_sched, \
+ .call = call_rcu_sched, \
+}
+
+#define __RCU_BH_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_rcu_bh, \
+ .call = call_rcu_bh, \
+}
+
+#define __RCU_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_rcu, \
+ .call = call_rcu, \
+}
+
+#define DEFINE_RCU_SCHED_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
+
+#define DEFINE_RCU_BH_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
+
+#define DEFINE_RCU_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
+
+static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+ return !rss->gp_state; /* GP_IDLE */
+}
+
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
+extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_enter(struct rcu_sync_struct *);
+extern void rcu_sync_exit(struct rcu_sync_struct *);
+
+#endif /* _LINUX_RCUSYNC_H_ */
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a8084..61a1656 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
new file mode 100644
index 0000000..f84176a
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,108 @@
+
+#include <linux/rcusync.h>
+#include <linux/sched.h>
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define rss_lock gp_wait.lock
+
+void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+{
+ memset(rss, 0, sizeof(*rss));
+ init_waitqueue_head(&rss->gp_wait);
+
+ switch (type) {
+ case RCU_SYNC:
+ rss->sync = synchronize_rcu;
+ rss->call = call_rcu;
+ break;
+
+ case RCU_SCHED_SYNC:
+ rss->sync = synchronize_sched;
+ rss->call = call_rcu_sched;
+ break;
+
+ case RCU_BH_SYNC:
+ rss->sync = synchronize_rcu_bh;
+ rss->call = call_rcu_bh;
+ break;
+ }
+}
+
+void rcu_sync_enter(struct rcu_sync_struct *rss)
+{
+ bool need_wait, need_sync;
+
+ spin_lock_irq(&rss->rss_lock);
+ need_wait = rss->gp_count++;
+ need_sync = rss->gp_state == GP_IDLE;
+ if (need_sync)
+ rss->gp_state = GP_PENDING;
+ spin_unlock_irq(&rss->rss_lock);
+
+ BUG_ON(need_wait && need_sync);
+
+ if (need_sync) {
+ rss->sync();
+ rss->gp_state = GP_PASSED;
+ wake_up_all(&rss->gp_wait);
+ } else if (need_wait) {
+ wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ } else {
+ /*
+ * Possible when there's a pending CB from a rcu_sync_exit().
+ * Nobody has yet been allowed the 'fast' path and thus we can
+ * avoid doing any sync(). The callback will get 'dropped'.
+ */
+ BUG_ON(rss->gp_state != GP_PASSED);
+ }
+}
+
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+ struct rcu_sync_struct *rss =
+ container_of(rcu, struct rcu_sync_struct, cb_head);
+ unsigned long flags;
+
+
+ BUG_ON(rss->gp_state != GP_PASSED);
+ BUG_ON(rss->cb_state == CB_IDLE);
+
+ spin_lock_irqsave(&rss->rss_lock, flags);
+ if (rss->gp_count) {
+ /*
+ * A new rcu_sync_enter() has happened; drop the callback.
+ */
+ rss->cb_state = CB_IDLE;
+ } else if (rss->cb_state == CB_REPLAY) {
+ /*
+ * A new rcu_sync_exit() has happened; requeue the callback
+ * to catch a later GP.
+ */
+ rss->cb_state = CB_PENDING;
+ rss->call(&rss->cb_head, rcu_sync_func);
+ } else {
+ /*
+ * We're at least a GP after rcu_sync_exit(); everybody will now
+ * have observed the write side critical section. Let 'em rip!
+ */
+ rss->cb_state = CB_IDLE;
+ rss->gp_state = GP_IDLE;
+ }
+ spin_unlock_irqrestore(&rss->rss_lock, flags);
+}
+
+void rcu_sync_exit(struct rcu_sync_struct *rss)
+{
+ spin_lock_irq(&rss->rss_lock);
+ if (!--rss->gp_count) {
+ if (rss->cb_state == CB_IDLE) {
+ rss->cb_state = CB_PENDING;
+ rss->call(&rss->cb_head, rcu_sync_func);
+ } else if (rss->cb_state == CB_PENDING) {
+ rss->cb_state = CB_REPLAY;
+ }
+ }
+ spin_unlock_irq(&rss->rss_lock);
+}
--
1.5.5.1

2015-08-21 17:45:14

by Oleg Nesterov

Subject: [PATCH v2 2/8] rcusync: Introduce struct rcu_sync_ops

Add the new struct rcu_sync_ops which holds sync/call methods, and
turn the function pointers in rcu_sync_struct into an array of struct
rcu_sync_ops.

This simplifies the "init" helpers, and makes it simpler to add
the new methods we need, especially ifdef'ed ones.

Reviewed-by: Paul E. McKenney <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/rcusync.h | 60 ++++++++++++++++++-----------------------------
kernel/rcu/sync.c | 43 +++++++++++++++++----------------
2 files changed, 45 insertions(+), 58 deletions(-)

diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
index f13f95c..ff86315 100644
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -4,6 +4,8 @@
#include <linux/wait.h>
#include <linux/rcupdate.h>

+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
struct rcu_sync_struct {
int gp_state;
int gp_count;
@@ -12,52 +14,36 @@ struct rcu_sync_struct {
int cb_state;
struct rcu_head cb_head;

- void (*sync)(void);
- void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ enum rcu_sync_type gp_type;
};

-#define ___RCU_SYNC_INIT(name) \
- .gp_state = 0, \
- .gp_count = 0, \
- .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
- .cb_state = 0
-
-#define __RCU_SCHED_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_sched, \
- .call = call_rcu_sched, \
-}
-
-#define __RCU_BH_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_rcu_bh, \
- .call = call_rcu_bh, \
-}
-
-#define __RCU_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_rcu, \
- .call = call_rcu, \
-}
-
-#define DEFINE_RCU_SCHED_SYNC(name) \
- struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
-
-#define DEFINE_RCU_BH_SYNC(name) \
- struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
-
-#define DEFINE_RCU_SYNC(name) \
- struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
-
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
return !rss->gp_state; /* GP_IDLE */
}

-enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
-
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);

+#define __RCU_SYNC_INITIALIZER(name, type) { \
+ .gp_state = 0, \
+ .gp_count = 0, \
+ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .cb_state = 0, \
+ .gp_type = type, \
+ }
+
+#define __DEFINE_RCU_SYNC(name, type) \
+ struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+
+#define DEFINE_RCU_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_SYNC)
+
+#define DEFINE_RCU_SCHED_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+
+#define DEFINE_RCU_BH_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+
#endif /* _LINUX_RCUSYNC_H_ */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index f84176a..99051b7 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,7 +1,24 @@
-
#include <linux/rcusync.h>
#include <linux/sched.h>

+static const struct {
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+} gp_ops[] = {
+ [RCU_SYNC] = {
+ .sync = synchronize_rcu,
+ .call = call_rcu,
+ },
+ [RCU_SCHED_SYNC] = {
+ .sync = synchronize_sched,
+ .call = call_rcu_sched,
+ },
+ [RCU_BH_SYNC] = {
+ .sync = synchronize_rcu_bh,
+ .call = call_rcu_bh,
+ },
+};
+
enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };

@@ -11,23 +28,7 @@ void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
{
memset(rss, 0, sizeof(*rss));
init_waitqueue_head(&rss->gp_wait);
-
- switch (type) {
- case RCU_SYNC:
- rss->sync = synchronize_rcu;
- rss->call = call_rcu;
- break;
-
- case RCU_SCHED_SYNC:
- rss->sync = synchronize_sched;
- rss->call = call_rcu_sched;
- break;
-
- case RCU_BH_SYNC:
- rss->sync = synchronize_rcu_bh;
- rss->call = call_rcu_bh;
- break;
- }
+ rss->gp_type = type;
}

void rcu_sync_enter(struct rcu_sync_struct *rss)
@@ -44,7 +45,7 @@ void rcu_sync_enter(struct rcu_sync_struct *rss)
BUG_ON(need_wait && need_sync);

if (need_sync) {
- rss->sync();
+ gp_ops[rss->gp_type].sync();
rss->gp_state = GP_PASSED;
wake_up_all(&rss->gp_wait);
} else if (need_wait) {
@@ -81,7 +82,7 @@ static void rcu_sync_func(struct rcu_head *rcu)
* to catch a later GP.
*/
rss->cb_state = CB_PENDING;
- rss->call(&rss->cb_head, rcu_sync_func);
+ gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
} else {
/*
* We're at least a GP after rcu_sync_exit(); everybody will now
@@ -99,7 +100,7 @@ void rcu_sync_exit(struct rcu_sync_struct *rss)
if (!--rss->gp_count) {
if (rss->cb_state == CB_IDLE) {
rss->cb_state = CB_PENDING;
- rss->call(&rss->cb_head, rcu_sync_func);
+ gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
} else if (rss->cb_state == CB_PENDING) {
rss->cb_state = CB_REPLAY;
}
--
1.5.5.1

2015-08-21 17:45:19

by Oleg Nesterov

Subject: [PATCH v2 3/8] rcusync: Add the CONFIG_PROVE_RCU checks

It would be nice to validate that the caller of rcu_sync_is_idle()
holds the corresponding type of RCU read-side lock. Add the new
rcu_sync_ops->held() method and change rcu_sync_is_idle() to
WARN() if it returns false.

This obviously penalizes the readers (fast-path), but only if
CONFIG_PROVE_RCU is enabled.

Reviewed-by: Paul E. McKenney <[email protected]>
Suggested-by: "Paul E. McKenney" <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/rcusync.h | 6 ++++++
kernel/rcu/sync.c | 20 ++++++++++++++++++++
2 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
index ff86315..631a842 100644
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -17,9 +17,15 @@ struct rcu_sync_struct {
enum rcu_sync_type gp_type;
};

+extern bool __rcu_sync_is_idle(struct rcu_sync_struct *);
+
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
+#ifdef CONFIG_PROVE_RCU
+ return __rcu_sync_is_idle(rss);
+#else
return !rss->gp_state; /* GP_IDLE */
+#endif
}

extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 99051b7..0ab7cbd 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,21 +1,33 @@
#include <linux/rcusync.h>
#include <linux/sched.h>

+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func) .held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+#ifdef CONFIG_PROVE_RCU
+ int (*held)(void);
+#endif
} gp_ops[] = {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
+ __INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
+ __INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
+ __INIT_HELD(rcu_read_lock_bh_held)
},
};

@@ -24,6 +36,14 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };

#define rss_lock gp_wait.lock

+#ifdef CONFIG_PROVE_RCU
+bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+ WARN_ON(!gp_ops[rss->gp_type].held());
+ return rss->gp_state == GP_IDLE;
+}
+#endif
+
void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
{
memset(rss, 0, sizeof(*rss));
--
1.5.5.1

2015-08-21 17:46:48

by Oleg Nesterov

Subject: [PATCH v2 4/8] rcusync: Introduce rcu_sync_dtor()

Add the new rcu_sync_ops->wait() method and the new helper,
rcu_sync_dtor().

It is needed if you are going to, say, kfree(rcu_sync_object).
It simply calls ops->wait() to "flush" the potentially pending
rcu callback.
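
For example, a hypothetical sketch of the intended use (struct foo and
free_foo() are placeholders, not part of this series):

struct foo {
        struct rcu_sync_struct rss;
        /* ... */
};

void free_foo(struct foo *foo)
{
        /* flush the pending rcu_sync_func() callback, if any */
        rcu_sync_dtor(&foo->rss);
        kfree(foo);
}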

Reviewed-by: Paul E. McKenney <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/rcusync.h | 1 +
kernel/rcu/sync.c | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
index 631a842..50229c0 100644
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -31,6 +31,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);
+extern void rcu_sync_dtor(struct rcu_sync_struct *);

#define __RCU_SYNC_INITIALIZER(name, type) { \
.gp_state = 0, \
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 0ab7cbd..7e2a9aa 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,6 +10,7 @@
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ void (*wait)(void);
#ifdef CONFIG_PROVE_RCU
int (*held)(void);
#endif
@@ -17,16 +18,19 @@ static const struct {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
+ .wait = rcu_barrier,
__INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
+ .wait = rcu_barrier_sched,
__INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
+ .wait = rcu_barrier_bh,
__INIT_HELD(rcu_read_lock_bh_held)
},
};
@@ -127,3 +131,21 @@ void rcu_sync_exit(struct rcu_sync_struct *rss)
}
spin_unlock_irq(&rss->rss_lock);
}
+
+void rcu_sync_dtor(struct rcu_sync_struct *rss)
+{
+ int cb_state;
+
+ BUG_ON(rss->gp_count);
+
+ spin_lock_irq(&rss->rss_lock);
+ if (rss->cb_state == CB_REPLAY)
+ rss->cb_state = CB_PENDING;
+ cb_state = rss->cb_state;
+ spin_unlock_irq(&rss->rss_lock);
+
+ if (cb_state != CB_IDLE) {
+ gp_ops[rss->gp_type].wait();
+ BUG_ON(rss->cb_state != CB_IDLE);
+ }
+}
--
1.5.5.1

2015-08-21 17:45:27

by Oleg Nesterov

Subject: [PATCH v2 5/8] percpu-rwsem: make percpu_free_rwsem() after kzalloc() safe

This is a temporary ugly hack which will be reverted later. We only
need it to ensure that the next patch will not break the "change sb_writers
to use percpu_rw_semaphore" patches routed via the VFS tree.

The alloc_super()->destroy_super() error path assumes that it is safe to
call percpu_free_rwsem() after kzalloc() without percpu_init_rwsem(),
so let's not disappoint it.

Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/locking/percpu-rwsem.c | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f325672..3bb2dc9 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -25,6 +25,13 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,

void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
{
+ /*
+ * XXX: temporary kludge. The error path in alloc_super()
+ * assumes that percpu_free_rwsem() is safe after kzalloc().
+ */
+ if (!brw->fast_read_ctr)
+ return;
+
free_percpu(brw->fast_read_ctr);
brw->fast_read_ctr = NULL; /* catch use after free bugs */
}
--
1.5.5.1

2015-08-21 17:45:25

by Oleg Nesterov

Subject: [PATCH v2 6/8] percpu-rwsem: change it to rely on rcu_sync infrastructure

Currently down_write/up_write calls synchronize_sched_expedited()
twice, which is evil. Change this code to rely on the rcu_sync primitives.
This avoids the _expedited "big hammer", and can be faster in
the contended case, or even when a single thread does
down_write/up_write in a loop.

Of course, a single down_write() will take more time, but on the other
hand it will be much more friendly to the whole system.

To simplify the review, this patch doesn't update the comments; they are
fixed by the next change.

Signed-off-by: Oleg Nesterov <[email protected]>
---
include/linux/percpu-rwsem.h | 3 ++-
kernel/locking/percpu-rwsem.c | 18 +++++++-----------
2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 834c4e5..06af654 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,11 +5,12 @@
#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
+#include <linux/rcusync.h>
#include <linux/lockdep.h>

struct percpu_rw_semaphore {
+ struct rcu_sync_struct rss;
unsigned int __percpu *fast_read_ctr;
- atomic_t write_ctr;
struct rw_semaphore rw_sem;
atomic_t slow_read_ctr;
wait_queue_head_t write_waitq;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 3bb2dc9..d3103da 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,

/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
__init_rwsem(&brw->rw_sem, name, rwsem_key);
- atomic_set(&brw->write_ctr, 0);
+ rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
atomic_set(&brw->slow_read_ctr, 0);
init_waitqueue_head(&brw->write_waitq);
return 0;
@@ -32,6 +32,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
if (!brw->fast_read_ctr)
return;

+ rcu_sync_dtor(&brw->rss);
free_percpu(brw->fast_read_ctr);
brw->fast_read_ctr = NULL; /* catch use after free bugs */
}
@@ -61,13 +62,12 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
*/
static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
{
- bool success = false;
+ bool success;

preempt_disable();
- if (likely(!atomic_read(&brw->write_ctr))) {
+ success = rcu_sync_is_idle(&brw->rss);
+ if (likely(success))
__this_cpu_add(*brw->fast_read_ctr, val);
- success = true;
- }
preempt_enable();

return success;
@@ -146,8 +146,6 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
*/
void percpu_down_write(struct percpu_rw_semaphore *brw)
{
- /* tell update_fast_ctr() there is a pending writer */
- atomic_inc(&brw->write_ctr);
/*
* 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
* so that update_fast_ctr() can't succeed.
@@ -159,7 +157,7 @@ void percpu_down_write(struct percpu_rw_semaphore *brw)
* fast-path, it executes a full memory barrier before we return.
* See R_W case in the comment above update_fast_ctr().
*/
- synchronize_sched_expedited();
+ rcu_sync_enter(&brw->rss);

/* exclude other writers, and block the new readers completely */
down_write(&brw->rw_sem);
@@ -179,7 +177,5 @@ void percpu_up_write(struct percpu_rw_semaphore *brw)
* Insert the barrier before the next fast-path in down_read,
* see W_R case in the comment above update_fast_ctr().
*/
- synchronize_sched_expedited();
- /* the last writer unblocks update_fast_ctr() */
- atomic_dec(&brw->write_ctr);
+ rcu_sync_exit(&brw->rss);
}
--
1.5.5.1

2015-08-21 17:46:08

by Oleg Nesterov

Subject: [PATCH v2 7/8] percpu-rwsem: fix the comments outdated by rcu_sync

Update the comments broken by the previous change.

Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/locking/percpu-rwsem.c | 50 +++++++++--------------------------------
1 files changed, 11 insertions(+), 39 deletions(-)

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index d3103da..69704eb 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -38,27 +38,12 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
}

/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- * R_W: down_write() comes after up_read(), the writer should see all
- * changes done by the reader
- * or
- * W_R: down_read() comes after up_write(), the reader should see all
- * changes done by the writer
+ * This is the fast-path for down_read/up_read. If it succeeds we rely
+ * on the barriers provided by rcu_sync_enter/exit; see the comments in
+ * percpu_down_write() and percpu_up_write().
*
* If this helper fails the callers rely on the normal rw_semaphore and
* atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
*/
static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
{
@@ -133,29 +118,15 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
return sum;
}

-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
void percpu_down_write(struct percpu_rw_semaphore *brw)
{
/*
- * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
- * so that update_fast_ctr() can't succeed.
- *
- * 2. Ensures we see the result of every previous this_cpu_add() in
- * update_fast_ctr().
+ * Make rcu_sync_is_idle() == F and thus disable the fast-path in
+ * percpu_down_read() and percpu_up_read(), and wait for a gp to pass.
*
- * 3. Ensures that if any reader has exited its critical section via
- * fast-path, it executes a full memory barrier before we return.
- * See R_W case in the comment above update_fast_ctr().
+ * The latter synchronises us with the preceding readers which used
+ * the fast-path, so we cannot miss the result of __this_cpu_add()
+ * or anything else inside their critical sections.
*/
rcu_sync_enter(&brw->rss);

@@ -174,8 +145,9 @@ void percpu_up_write(struct percpu_rw_semaphore *brw)
/* release the lock, but the readers can't use the fast-path */
up_write(&brw->rw_sem);
/*
- * Insert the barrier before the next fast-path in down_read,
- * see W_R case in the comment above update_fast_ctr().
+ * Enable the fast-path in percpu_down_read() and percpu_up_read()
+ * but only after another gp pass; this adds the necessary barrier
+ * to ensure the reader can't miss the changes done by us.
*/
rcu_sync_exit(&brw->rss);
}
--
1.5.5.1

2015-08-21 17:45:30

by Oleg Nesterov

Subject: [PATCH v2 8/8] percpu-rwsem: cleanup the lockdep annotations in percpu_down_read()

Stolen from Peter's patch.

Change percpu_down_read() to use __down_read(); this way we can
do rwsem_acquire_read() unconditionally at the start, which makes
this code more symmetric and clean.

Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/locking/percpu-rwsem.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 69704eb..5694849 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -69,14 +69,14 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
void percpu_down_read(struct percpu_rw_semaphore *brw)
{
might_sleep();
- if (likely(update_fast_ctr(brw, +1))) {
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+ if (likely(update_fast_ctr(brw, +1)))
return;
- }

- down_read(&brw->rw_sem);
+ /* Avoid rwsem_acquire_read() and rwsem_release() */
+ __down_read(&brw->rw_sem);
atomic_inc(&brw->slow_read_ctr);
- /* avoid up_read()->rwsem_release() */
__up_read(&brw->rw_sem);
}

--
1.5.5.1

2015-08-22 16:38:18

by Paul E. McKenney

Subject: Re: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

On Fri, Aug 21, 2015 at 07:42:30PM +0200, Oleg Nesterov wrote:
> Hello,
>
> Now that sb->s_writers has been changed to use percpu_rw_semaphore, let
> me send v2.
>
> Changes:
>
> - whitespace fix in 1/8.
>
> - remove EXPORT_SYMBOL() in 3/8, currently rcu_sync has no
> modular users.
>
> - 5/8 is new. This ugly hack pairs with another one:
> "shift percpu_counter_destroy() into destroy_super_work()"
> https://git.kernel.org/cgit/linux/kernel/git/viro/vfs.git/commit/?h=for-next&id=853b39a7c82826b8413048feec7bf08e98ce7a84
> They both will be reverted later.
>
> The problem is that we have 2 series routed via different
> trees, RCU and VFS. We need this hack to ensure that this
> series won't break alloc_super(), which currently assumes that
> destroy_super()->percpu_free_rwsem() is safe after kzalloc().
> This way the 2 series do not depend on each other, and we can
> test/change/revert/etc them independently.
>
> We will add rcu_sync_dtor() into deactivate_locked_super()
> later and revert both hacks.
> Oleg.

Queued for testing, thank you, Oleg!

Right now, this is mostly relying on 0day and -next testing. Any thoughts
for a useful torture test for this? One approach would be to treat it
like a reader-writer lock. Other thoughts?

Thanx, Paul

> include/linux/percpu-rwsem.h | 3 +-
> include/linux/rcusync.h | 56 +++++++++++++++
> kernel/locking/percpu-rwsem.c | 85 ++++++++---------------
> kernel/rcu/Makefile | 2 +-
> kernel/rcu/sync.c | 151 +++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 240 insertions(+), 57 deletions(-)
>

2015-08-24 15:37:22

by Oleg Nesterov

Subject: Re: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

On 08/22, Paul E. McKenney wrote:
>
> Queued for testing, thank you, Oleg!

Thanks Paul!

> Right now, this is mostly relying on 0day and -next testing. Any thoughts
> for a useful torture test for this?

Right now I do not have any idea how to write a meaningful test for
rcu_sync... Perhaps something like

struct rcu_sync_struct rss;
spinlock_t lock;

int A, B;

void read(void)
{
        rcu_read_lock();

        bool need_lock = !rcu_sync_is_idle(&rss);
        if (need_lock)
                spin_lock(&lock);

        BUG_ON(A != B);

        if (need_lock)
                spin_unlock(&lock);

        rcu_read_unlock();
}

void modify(void)
{
        rcu_sync_enter(&rss);

        spin_lock(&lock);
        A++; B++;
        spin_unlock(&lock);

        rcu_sync_exit(&rss);
}

makes sense... I'll try to think.

> One approach would be to treat it
> like a reader-writer lock. Other thoughts?

I booted the kernel with the additional patch below, and nothing bad has
happened, it continues to print

Writes: Total: 2 Max/Min: 0/0 Fail: 0
Reads : Total: 2 Max/Min: 0/0 Fail: 0

However, I do not know what this code actually does, so currently I have
no idea if this test makes any sense for percpu_rw_semaphore.

Oleg.
---

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce2..62561ec 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -55,7 +55,7 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
torture_param(bool, verbose, true,
"Enable verbose debugging printk()s");

-static char *torture_type = "spin_lock";
+static char *torture_type = "rwsem_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
@@ -361,10 +361,12 @@ static struct lock_torture_ops mutex_lock_ops = {
.name = "mutex_lock"
};

-static DECLARE_RWSEM(torture_rwsem);
-static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+#include <linux/percpu-rwsem.h>
+static struct percpu_rw_semaphore pcpu_rwsem;
+
+static int torture_rwsem_down_write(void) __acquires(pcpu_rwsem)
{
- down_write(&torture_rwsem);
+ percpu_down_write(&pcpu_rwsem);
return 0;
}

@@ -384,14 +386,14 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp)
#endif
}

-static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+static void torture_rwsem_up_write(void) __releases(pcpu_rwsem)
{
- up_write(&torture_rwsem);
+ percpu_up_write(&pcpu_rwsem);
}

-static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_read(void) __acquires(pcpu_rwsem)
{
- down_read(&torture_rwsem);
+ percpu_down_read(&pcpu_rwsem);
return 0;
}

@@ -411,9 +413,9 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp)
#endif
}

-static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+static void torture_rwsem_up_read(void) __releases(pcpu_rwsem)
{
- up_read(&torture_rwsem);
+ percpu_up_read(&pcpu_rwsem);
}

static struct lock_torture_ops rwsem_lock_ops = {
@@ -645,6 +647,11 @@ static int __init lock_torture_init(void)
&rwsem_lock_ops,
};

+ /*
+ * TODO: DECLARE_PERCPU_RWSEM(). The patch already exists.
+ */
+ BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
+
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
return -EBUSY;

2015-08-24 18:33:58

by Oleg Nesterov

Subject: parse_args() is too unforgiving?

On 08/24, Oleg Nesterov wrote:
>
> I booted the kernel with the additional patch below, and nothing bad has
> happened,

Until I tried to reboot it once with the "locktorture.verbose=true" parameter.
It didn't boot.

This is because parse_args() just aborts after it hits the error, so other
arguments at the same initcall level are simply ignored.

Fixed by the patch below, but I simply can't believe nobody hit this (imo)
bug before.

Why does parse_args() do this?? I simply can't understand why parse_args()
adds more random and hard-to-understand problems if one of the args ("=true"
in this particular case) is wrong.

Yes, the patch below is probably oversimplified / incomplete but imho the
current behaviour is confusing. At least I was greatly confused ;) At least
(I think) it makes sense to let the user know that the rest of command line
was probably ignored.

Oleg.
---

--- a/kernel/params.c
+++ b/kernel/params.c
@@ -220,19 +220,19 @@ char *parse_args(const char *doing,
doing, param);

switch (ret) {
+ case 0:
+ break;
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
- return ERR_PTR(ret);
+ break;
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
- case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
+ break;
}
}

2015-08-25 01:34:36

by Rusty Russell

Subject: Re: parse_args() is too unforgiving?

Oleg Nesterov <[email protected]> writes:
> On 08/24, Oleg Nesterov wrote:
>>
>> I booted the kernel with the additional patch below, and nothing bad has
>> happened,
>
> Until I tried to reboot it once with the "locktorture.verbose=true" parameter.
> It didn't boot.
>
> This is because parse_args() just aborts after it hits the error, so other
> arguments at the same initcall level are simply ignored.
>
> Fixed by the patch below, but I simply can't believe nobody hit this (imo)
> bug before.
>
> Why does parse_args() do this?? I simply can't understand why parse_args()
> adds more random and hard-to-understand problems if one of the args ("=true"
> in this particular case) is wrong.
>
> Yes, the patch below is probably oversimplified / incomplete but imho the
> current behaviour is confusing. At least I was greatly confused ;) At least
> (I think) it makes sense to let the user know that the rest of command line
> was probably ignored.

This is nice, but please save and return the error properly; modules need
this too.

I think nobody hit this before because they notice that they screwed up
the commandline and it didn't boot.

Thanks,
Rusty.

2015-08-25 15:20:57

by Oleg Nesterov

Subject: [PATCH 0/1] params: don't ignore the rest of cmdline if parse_one() fails

On 08/25, Rusty Russell wrote:
>
> Oleg Nesterov <[email protected]> writes:
> > On 08/24, Oleg Nesterov wrote:
> >>
> >> I booted the kernel with the additional patch below, and nothing bad has
> >> happened,
> >
> > Until I tried to reboot it once with the "locktorture.verbose=true" parameter.
> > It didn't boot.
> >
> > This is because parse_args() just aborts after it hits the error, so other
> > arguments at the same initcall level are simply ignored.
> >
> > Fixed by the patch below, but I simply can't believe nobody hit this (imo)
> > bug before.
> >
> > Why does parse_args() do this?? I simply can't understand why parse_args()
> > adds more random and hard-to-understand problems if one of the args ("=true"
> > in this particular case) is wrong.
> >
> > Yes, the patch below is probably oversimplified / incomplete but imho the
> > current behaviour is confusing. At least I was greatly confused ;) At least
> > (I think) it makes sense to let the user know that the rest of command line
> > was probably ignored.
>
> This is nice, but please save and return the error properly; modules need
> this too.

OK, thanks, please see the patch.

> I think nobody hit this before because they noticed that they screwed up
> the command line and it didn't boot.

Yes, I didn't know parse_args() works this way. Besides, I didn't notice
the "invalid for parameter" error message on the serial console, so I
thought that the kernel panic was somehow caused by the patch I tried
to test.

Oleg.

2015-08-25 15:21:13

by Oleg Nesterov

Subject: [PATCH 1/1] params: don't ignore the rest of cmdline if parse_one() fails

parse_args() just aborts after it hits an error, so other args
at the same initcall level are simply ignored. This can lead to
other hard-to-understand problems; for example, my testing machine
panics during boot if I pass "locktorture.verbose=true".

Change parse_args() to save the err code for the return value and
continue parsing.

Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/params.c | 17 +++++++++--------
1 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/kernel/params.c b/kernel/params.c
index a22d6a7..b21139f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -223,7 +223,7 @@ char *parse_args(const char *doing,
int (*unknown)(char *param, char *val,
const char *doing, void *arg))
{
- char *param, *val;
+ char *param, *val, *err = NULL;

/* Chew leading spaces */
args = skip_spaces(args);
@@ -238,7 +238,7 @@ char *parse_args(const char *doing,
args = next_arg(args, &param, &val);
/* Stop at -- */
if (!val && strcmp(param, "--") == 0)
- return args;
+ return err ?: args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
min_level, max_level, arg, unknown);
@@ -247,24 +247,25 @@ char *parse_args(const char *doing,
doing, param);

switch (ret) {
+ case 0:
+ continue;
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
- return ERR_PTR(ret);
+ break;
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
- case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
+ break;
}
+
+ err = ERR_PTR(ret);
}

- /* All parsed OK. */
- return NULL;
+ return err;
}

/* Lazy bastard, eh? */

2015-08-26 01:12:57

by Rusty Russell

Subject: Re: [PATCH 1/1] params: don't ignore the rest of cmdline if parse_one() fails

Oleg Nesterov <[email protected]> writes:
> parse_args() just aborts after it hits an error, so other args
> at the same initcall level are simply ignored. This can lead to
> other hard-to-understand problems; for example, my testing machine
> panics during boot if I pass "locktorture.verbose=true".
>
> Change parse_args() to save the err code for the return value and
> continue parsing.
>
> Signed-off-by: Oleg Nesterov <[email protected]>

Thanks!

Applied,
Rusty.

> ---
> kernel/params.c | 17 +++++++++--------
> 1 files changed, 9 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/params.c b/kernel/params.c
> index a22d6a7..b21139f 100644
> --- a/kernel/params.c
> +++ b/kernel/params.c
> @@ -223,7 +223,7 @@ char *parse_args(const char *doing,
> int (*unknown)(char *param, char *val,
> const char *doing, void *arg))
> {
> - char *param, *val;
> + char *param, *val, *err = NULL;
>
> /* Chew leading spaces */
> args = skip_spaces(args);
> @@ -238,7 +238,7 @@ char *parse_args(const char *doing,
> args = next_arg(args, &param, &val);
> /* Stop at -- */
> if (!val && strcmp(param, "--") == 0)
> - return args;
> + return err ?: args;
> irq_was_disabled = irqs_disabled();
> ret = parse_one(param, val, doing, params, num,
> min_level, max_level, arg, unknown);
> @@ -247,24 +247,25 @@ char *parse_args(const char *doing,
> doing, param);
>
> switch (ret) {
> + case 0:
> + continue;
> case -ENOENT:
> pr_err("%s: Unknown parameter `%s'\n", doing, param);
> - return ERR_PTR(ret);
> + break;
> case -ENOSPC:
> pr_err("%s: `%s' too large for parameter `%s'\n",
> doing, val ?: "", param);
> - return ERR_PTR(ret);
> - case 0:
> break;
> default:
> pr_err("%s: `%s' invalid for parameter `%s'\n",
> doing, val ?: "", param);
> - return ERR_PTR(ret);
> + break;
> }
> +
> + err = ERR_PTR(ret);
> }
>
> - /* All parsed OK. */
> - return NULL;
> + return err;
> }
>
> /* Lazy bastard, eh? */

2015-08-26 00:22:27

by Paul E. McKenney

Subject: Re: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

On Mon, Aug 24, 2015 at 05:34:31PM +0200, Oleg Nesterov wrote:
> On 08/22, Paul E. McKenney wrote:
> >
> > Queued for testing, thank you, Oleg!
>
> Thanks Paul!

I cannot really test it, but thus far it has at least not broken anything
else.

> > Right now, this is mostly relying on 0day and -next testing. Any thoughts
> > for a useful torture test for this?
>
> Right now I do not have any idea how to write a meaningful test for
> rcu_sync... Perhaps something like
>
> struct rcu_sync_struct rss;
> spinlock_t lock;
>
> int A, B;
>
> void read(void)
> {
>         rcu_read_lock();
>
>         bool need_lock = !rcu_sync_is_idle(&rss);
>         if (need_lock)
>                 spin_lock(&lock);
>
>         BUG_ON(A != B);
>
>         if (need_lock)
>                 spin_unlock(&lock);
>
>         rcu_read_unlock();
> }
>
> void modify(void)
> {
>         rcu_sync_enter(&rss);
>
>         spin_lock(&lock);
>         A++; B++;
>         spin_unlock(&lock);
>
>         rcu_sync_exit(&rss);
> }
>
> makes sense... I'll try to think.

That looks like a promising start. There would need to be some sleep
time in modify() between rcu_sync_exit() and rcu_sync_enter().
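
For example, something like this for the writer thread (a rough sketch;
the delay is arbitrary, just enough to let the readers find the state
idle again and take the fast path between writes):

void modify_loop(void)
{
        while (!torture_must_stop()) {
                modify();
                schedule_timeout_interruptible(HZ / 10);
        }
}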

> > One approach would be to treat it
> > like a reader-writer lock. Other thoughts?
>
> I booted the kernel with the additional patch below, and nothing bad has
> happened, it continues to print
>
> Writes: Total: 2 Max/Min: 0/0 Fail: 0
> Reads : Total: 2 Max/Min: 0/0 Fail: 0
>
> However, I do not know what this code actually does, so currently I have
> no idea if this test makes any sense for percpu_rw_semaphore.

Actually, unless I am really confused, that does not look good...

I would expect something like this, from a run with rwsem_lock:

[ 16.336057] Writes: Total: 473 Max/Min: 0/0 Fail: 0
[ 16.337615] Reads : Total: 219 Max/Min: 0/0 Fail: 0
[ 31.338152] Writes: Total: 959 Max/Min: 0/0 Fail: 0
[ 31.339114] Reads : Total: 437 Max/Min: 0/0 Fail: 0
[ 46.340167] Writes: Total: 1365 Max/Min: 0/0 Fail: 0
[ 46.341952] Reads : Total: 653 Max/Min: 0/0 Fail: 0
[ 61.343027] Writes: Total: 1795 Max/Min: 0/0 Fail: 0
[ 61.343968] Reads : Total: 865 Max/Min: 0/0 Fail: 0
[ 76.344034] Writes: Total: 2220 Max/Min: 0/0 Fail: 0
[ 76.345243] Reads : Total: 1071 Max/Min: 0/0 Fail: 0

The "Total" should increase for writes and for reads -- if you are
just seeing "Total: 2" over and over, that indicates that either
the torture test or rcu_sync got stuck somewhere.

Thanx, Paul

> Oleg.
> ---
>
> diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
> index ec8cce2..62561ec 100644
> --- a/kernel/locking/locktorture.c
> +++ b/kernel/locking/locktorture.c
> @@ -55,7 +55,7 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
> torture_param(bool, verbose, true,
> "Enable verbose debugging printk()s");
>
> -static char *torture_type = "spin_lock";
> +static char *torture_type = "rwsem_lock";
> module_param(torture_type, charp, 0444);
> MODULE_PARM_DESC(torture_type,
> "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
> @@ -361,10 +361,12 @@ static struct lock_torture_ops mutex_lock_ops = {
> .name = "mutex_lock"
> };
>
> -static DECLARE_RWSEM(torture_rwsem);
> -static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
> +#include <linux/percpu-rwsem.h>
> +static struct percpu_rw_semaphore pcpu_rwsem;
> +
> +static int torture_rwsem_down_write(void) __acquires(pcpu_rwsem)
> {
> - down_write(&torture_rwsem);
> + percpu_down_write(&pcpu_rwsem);
> return 0;
> }
>
> @@ -384,14 +386,14 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp)
> #endif
> }
>
> -static void torture_rwsem_up_write(void) __releases(torture_rwsem)
> +static void torture_rwsem_up_write(void) __releases(pcpu_rwsem)
> {
> - up_write(&torture_rwsem);
> + percpu_up_write(&pcpu_rwsem);
> }
>
> -static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
> +static int torture_rwsem_down_read(void) __acquires(pcpu_rwsem)
> {
> - down_read(&torture_rwsem);
> + percpu_down_read(&pcpu_rwsem);
> return 0;
> }
>
> @@ -411,9 +413,9 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp)
> #endif
> }
>
> -static void torture_rwsem_up_read(void) __releases(torture_rwsem)
> +static void torture_rwsem_up_read(void) __releases(pcpu_rwsem)
> {
> - up_read(&torture_rwsem);
> + percpu_up_read(&pcpu_rwsem);
> }
>
> static struct lock_torture_ops rwsem_lock_ops = {
> @@ -645,6 +647,11 @@ static int __init lock_torture_init(void)
> &rwsem_lock_ops,
> };
>
> + /*
> + * TODO: DECLARE_PERCPU_RWSEM(). The patch already exists.
> + */
> + BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
> +
> if (!torture_init_begin(torture_type, verbose, &torture_runnable))
> return -EBUSY;
>
>

2015-08-26 12:19:15

by Oleg Nesterov

Subject: Re: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

On 08/25, Paul E. McKenney wrote:
>
> On Mon, Aug 24, 2015 at 05:34:31PM +0200, Oleg Nesterov wrote:
> >
> > I booted the kernel with the additional patch below, and nothing bad has
> > happened, it continues to print
> >
> > Writes: Total: 2 Max/Min: 0/0 Fail: 0
> > Reads : Total: 2 Max/Min: 0/0 Fail: 0
> >
> > However, I do not know what this code actually does, so currently I have
> > no idea if this test makes any sense for percpu_rw_semaphore.
>
> Actually, unless I am really confused, that does not look good...
>
> I would expect something like this, from a run with rwsem_lock:
>
> [ 16.336057] Writes: Total: 473 Max/Min: 0/0 Fail: 0
> [ 16.337615] Reads : Total: 219 Max/Min: 0/0 Fail: 0
> [ 31.338152] Writes: Total: 959 Max/Min: 0/0 Fail: 0
> [ 31.339114] Reads : Total: 437 Max/Min: 0/0 Fail: 0
> [ 46.340167] Writes: Total: 1365 Max/Min: 0/0 Fail: 0
> [ 46.341952] Reads : Total: 653 Max/Min: 0/0 Fail: 0
> [ 61.343027] Writes: Total: 1795 Max/Min: 0/0 Fail: 0
> [ 61.343968] Reads : Total: 865 Max/Min: 0/0 Fail: 0
> [ 76.344034] Writes: Total: 2220 Max/Min: 0/0 Fail: 0
> [ 76.345243] Reads : Total: 1071 Max/Min: 0/0 Fail: 0
>
> The "Total" should increase for writes and for reads -- if you are
> just seeing "Total: 2" over and over, that indicates that either
> the torture test or rcu_sync got stuck somewhere.

Hmm. I reverted the change in locktorture.c, and I see the same
numbers when I boot the kernel with

locktorture.verbose=1 locktorture.torture_type=rwsem_lock

parameters.

Writes: Total: 2 Max/Min: 0/0 Fail: 0
Reads : Total: 2 Max/Min: 0/0 Fail: 0

"Total" doesn't grow. Looks like something is wrong with locktorture.
I'll try to re-check...

Oleg.

2015-08-26 12:54:46

by Oleg Nesterov

Subject: Re: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

On 08/26, Oleg Nesterov wrote:
>
> On 08/25, Paul E. McKenney wrote:
> >
> > On Mon, Aug 24, 2015 at 05:34:31PM +0200, Oleg Nesterov wrote:
> > >
> > > I booted the kernel with the additional patch below, and nothing bad has
> > > happened, it continues to print
> > >
> > > Writes: Total: 2 Max/Min: 0/0 Fail: 0
> > > Reads : Total: 2 Max/Min: 0/0 Fail: 0
> > >
> > > However, I do not know what this code actually does, so currently I have
> > > no idea if this test makes any sense for percpu_rw_semaphore.
> >
> > Actually, unless I am really confused, that does not look good...
> >
> > I would expect something like this, from a run with rwsem_lock:
> >
> > [ 16.336057] Writes: Total: 473 Max/Min: 0/0 Fail: 0
> > [ 16.337615] Reads : Total: 219 Max/Min: 0/0 Fail: 0
> > [ 31.338152] Writes: Total: 959 Max/Min: 0/0 Fail: 0
> > [ 31.339114] Reads : Total: 437 Max/Min: 0/0 Fail: 0
> > [ 46.340167] Writes: Total: 1365 Max/Min: 0/0 Fail: 0
> > [ 46.341952] Reads : Total: 653 Max/Min: 0/0 Fail: 0
> > [ 61.343027] Writes: Total: 1795 Max/Min: 0/0 Fail: 0
> > [ 61.343968] Reads : Total: 865 Max/Min: 0/0 Fail: 0
> > [ 76.344034] Writes: Total: 2220 Max/Min: 0/0 Fail: 0
> > [ 76.345243] Reads : Total: 1071 Max/Min: 0/0 Fail: 0
> >
> > The "Total" should increase for writes and for reads -- if you are
> > just seeing "Total: 2" over and over, that indicates that either
> > the torture test or rcu_sync got stuck somewhere.
>
> Hmm. I reverted the change in locktorture.c, and I see the same
> numbers when I boot the kernel with
>
> locktorture.verbose=1 locktorture.torture_type=rwsem_lock
>
> parameters.
>
> Writes: Total: 2 Max/Min: 0/0 Fail: 0
> Reads : Total: 2 Max/Min: 0/0 Fail: 0
>
> "Total" doesn't grow. Looks like something is wrong with locktorture.
> I'll try to re-check...

Heh ;) torture threads spin in stutter_wait(). Added another parameter,

locktorture.torture_runnable=1

now I see similar numbers

Writes: Total: 1242 Max/Min: 0/0 Fail: 0
Reads : Total: 892 Max/Min: 0/0 Fail: 0
Writes: Total: 2485 Max/Min: 0/0 Fail: 0
Reads : Total: 1796 Max/Min: 0/0 Fail: 0
Writes: Total: 3786 Max/Min: 0/0 Fail: 0
Reads : Total: 2713 Max/Min: 0/0 Fail: 0
Writes: Total: 5045 Max/Min: 0/0 Fail: 0
Reads : Total: 3636 Max/Min: 0/0 Fail: 0

with or without the s/rw_semaphore/percpu_rw_semaphore/ change in locktorture.c

Oleg.

2015-08-26 14:29:55

by Paul E. McKenney

Subject: Re: [PATCH v2 0/8] Add rcu_sync infrastructure to avoid _expedited() in percpu-rwsem

On Wed, Aug 26, 2015 at 02:52:15PM +0200, Oleg Nesterov wrote:
> On 08/26, Oleg Nesterov wrote:
> >
> > On 08/25, Paul E. McKenney wrote:
> > >
> > > On Mon, Aug 24, 2015 at 05:34:31PM +0200, Oleg Nesterov wrote:
> > > >
> > > > I booted the kernel with the additional patch below, and nothing bad has
> > > > happened, it continues to print
> > > >
> > > > Writes: Total: 2 Max/Min: 0/0 Fail: 0
> > > > Reads : Total: 2 Max/Min: 0/0 Fail: 0
> > > >
> > > > However, I do not know what this code actually does, so currently I have
> > > > no idea if this test makes any sense for percpu_rw_semaphore.
> > >
> > > Actually, unless I am really confused, that does not look good...
> > >
> > > I would expect something like this, from a run with rwsem_lock:
> > >
> > > [ 16.336057] Writes: Total: 473 Max/Min: 0/0 Fail: 0
> > > [ 16.337615] Reads : Total: 219 Max/Min: 0/0 Fail: 0
> > > [ 31.338152] Writes: Total: 959 Max/Min: 0/0 Fail: 0
> > > [ 31.339114] Reads : Total: 437 Max/Min: 0/0 Fail: 0
> > > [ 46.340167] Writes: Total: 1365 Max/Min: 0/0 Fail: 0
> > > [ 46.341952] Reads : Total: 653 Max/Min: 0/0 Fail: 0
> > > [ 61.343027] Writes: Total: 1795 Max/Min: 0/0 Fail: 0
> > > [ 61.343968] Reads : Total: 865 Max/Min: 0/0 Fail: 0
> > > [ 76.344034] Writes: Total: 2220 Max/Min: 0/0 Fail: 0
> > > [ 76.345243] Reads : Total: 1071 Max/Min: 0/0 Fail: 0
> > >
> > > The "Total" should increase for writes and for reads -- if you are
> > > just seeing "Total: 2" over and over, that indicates that either
> > > the torture test or rcu_sync got stuck somewhere.
> >
> > Hmm. I reverted the change in locktorture.c, and I see the same
> > numbers when I boot the kernel with
> >
> > locktorture.verbose=1 locktorture.torture_type=rwsem_lock
> >
> > parameters.
> >
> > Writes: Total: 2 Max/Min: 0/0 Fail: 0
> > Reads : Total: 2 Max/Min: 0/0 Fail: 0
> >
> > "Total" doesn't grow. Looks like something is wrong with locktorture.
> > I'll try to re-check...
>
> Heh ;) torture threads spin in stutter_wait(). Added another parameter,
>
> locktorture.torture_runnable=1
>
> now I see similar numbers
>
> Writes: Total: 1242 Max/Min: 0/0 Fail: 0
> Reads : Total: 892 Max/Min: 0/0 Fail: 0
> Writes: Total: 2485 Max/Min: 0/0 Fail: 0
> Reads : Total: 1796 Max/Min: 0/0 Fail: 0
> Writes: Total: 3786 Max/Min: 0/0 Fail: 0
> Reads : Total: 2713 Max/Min: 0/0 Fail: 0
> Writes: Total: 5045 Max/Min: 0/0 Fail: 0
> Reads : Total: 3636 Max/Min: 0/0 Fail: 0
>
> with or without the s/rw_semaphore/percpu_rw_semaphore/ change in locktorture.c

Whew!!! ;-)

Thanx, Paul