2018-05-21 04:34:59

by Joel Fernandes

Subject: [PATCH v2 0/4] fixes, cleanups for rcu/dev

Hi Paul,

Here are some patches reworked based on comments on a few of the patches
from the previous series: https://lkml.org/lkml/2018/5/13/296

4/4 is a new addition which fixes a potential issue.

Let me know what you think, thanks!

Joel Fernandes (4):
rcu: Add comment documenting how rcu_seq_snap works
rcu: Cleanup the variables used to request a new grace period
rcu: Use better variable names in funnel locking loop
rcu: Unlock non-start node only after accessing its gp_seq_needed

include/trace/events/rcu.h | 15 +++----
kernel/rcu/rcu.h | 33 +++++++++++++++-
kernel/rcu/tree.c | 80 +++++++++++++++++++++-----------------
3 files changed, 85 insertions(+), 43 deletions(-)

--
2.17.0.441.gb46fe60e1d-goog



2018-05-21 04:34:10

by Joel Fernandes

Subject: [PATCH v2 3/4] rcu: Cleanup the variables used to request a new grace period

The 'c' variable was previously used to store the grace period being
requested, but the name has not been very meaningful since the gp_seq
conversions. This patch replaces it with 'gp_seq_req', indicating that
this is the grace period that was requested. Tracing is updated to use
the new name as well.
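
For illustration, with made-up values that satisfy gp_seq_req =
rcu_seq_snap(gp_seq), the payload of a post-patch trace line would read:

  rcu_future_grace_period: rcu_sched 8 12 0 0 7 Startleaf

where 8 is the node's gp_seq, 12 is the requested gp_seq_req (previously
printed under 'c'), followed by the level, grplo, grphi, and the event
string.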

Previous patch discussion is at:
https://patchwork.kernel.org/patch/10396579/

Signed-off-by: Joel Fernandes <[email protected]>
---
include/trace/events/rcu.h | 15 ++++++------
kernel/rcu/tree.c | 50 ++++++++++++++++++++++----------------
2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5373bc61ae08..a8d07feff6a0 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -103,15 +103,16 @@ TRACE_EVENT(rcu_grace_period,
*/
TRACE_EVENT(rcu_future_grace_period,

- TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long c,
- u8 level, int grplo, int grphi, const char *gpevent),
+ TP_PROTO(const char *rcuname, unsigned long gp_seq,
+ unsigned long gp_seq_req, u8 level, int grplo, int grphi,
+ const char *gpevent),

- TP_ARGS(rcuname, gp_seq, c, level, grplo, grphi, gpevent),
+ TP_ARGS(rcuname, gp_seq, gp_seq_req, level, grplo, grphi, gpevent),

TP_STRUCT__entry(
__field(const char *, rcuname)
__field(unsigned long, gp_seq)
- __field(unsigned long, c)
+ __field(unsigned long, gp_seq_req)
__field(u8, level)
__field(int, grplo)
__field(int, grphi)
@@ -121,7 +122,7 @@ TRACE_EVENT(rcu_future_grace_period,
TP_fast_assign(
__entry->rcuname = rcuname;
__entry->gp_seq = gp_seq;
- __entry->c = c;
+ __entry->gp_seq_req = gp_seq_req;
__entry->level = level;
__entry->grplo = grplo;
__entry->grphi = grphi;
@@ -129,7 +130,7 @@ TRACE_EVENT(rcu_future_grace_period,
),

TP_printk("%s %lu %lu %u %d %d %s",
- __entry->rcuname, __entry->gp_seq, __entry->c, __entry->level,
+ __entry->rcuname, __entry->gp_seq, __entry->gp_seq_req, __entry->level,
__entry->grplo, __entry->grphi, __entry->gpevent)
);

@@ -751,7 +752,7 @@ TRACE_EVENT(rcu_barrier,
#else /* #ifdef CONFIG_RCU_TRACE */

#define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0)
-#define trace_rcu_future_grace_period(rcuname, gp_seq, c, \
+#define trace_rcu_future_grace_period(rcuname, gp_seq, gp_seq_req, \
level, grplo, grphi, event) \
do { } while (0)
#define trace_rcu_grace_period_init(rcuname, gp_seq, level, grplo, grphi, \
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2c06e1ab90cd..0ffd41ba304f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1518,13 +1518,18 @@ void rcu_cpu_stall_reset(void)

/* Trace-event wrapper function for trace_rcu_future_grace_period. */
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c, const char *s)
+ unsigned long gp_seq_req, const char *s)
{
- trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c,
+ trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req,
rnp->level, rnp->grplo, rnp->grphi, s);
}

/*
+ * rcu_start_this_gp - Request the start of a particular grace period
+ * @rnp: The leaf node of the CPU from which to start.
+ * @rdp: The rcu_data corresponding to the CPU from which to start.
+ * @gp_seq_req: The gp_seq of the grace period to start.
+ *
* Start the specified grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
* rcu_node structure's ->gp_seq_needed field. Returns true if there
@@ -1532,9 +1537,11 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
*
* The caller must hold the specified rcu_node structure's ->lock, which
* is why the caller is responsible for waking the grace-period kthread.
+ *
+ * Returns true if the GP kthread needs to be awakened, else false.
*/
static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c)
+ unsigned long gp_seq_req)
{
bool ret = false;
struct rcu_state *rsp = rdp->rsp;
@@ -1550,25 +1557,27 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* not be released.
*/
raw_lockdep_assert_held_rcu_node(rnp);
- trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf"));
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf"));
for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
if (rnp_root != rnp)
raw_spin_lock_rcu_node(rnp_root);
- if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) ||
- rcu_seq_started(&rnp_root->gp_seq, c) ||
+ if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) ||
+ rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) ||
(rnp != rnp_root &&
rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) {
- trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted"));
+ trace_rcu_this_gp(rnp_root, rdp, gp_seq_req,
+ TPS("Prestarted"));
goto unlock_out;
}
- rnp_root->gp_seq_needed = c;
+ rnp_root->gp_seq_needed = gp_seq_req;
if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
/*
* We just marked the leaf, and a grace period
* is in progress, which means that rcu_gp_cleanup()
* will see the marking. Bail to reduce contention.
*/
- trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf"));
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req,
+ TPS("Startedleaf"));
goto unlock_out;
}
if (rnp_root != rnp && rnp_root->parent != NULL)
@@ -1579,14 +1588,14 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,

/* If GP already in progress, just leave, otherwise start one. */
if (rcu_gp_in_progress(rsp)) {
- trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot"));
+ trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot"));
goto unlock_out;
}
- trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot"));
+ trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot"));
WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT);
rsp->gp_req_activity = jiffies;
if (!rsp->gp_kthread) {
- trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread"));
+ trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread"));
goto unlock_out;
}
trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq"));
@@ -1595,9 +1604,9 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
if (rnp != rnp_root)
raw_spin_unlock_rcu_node(rnp_root);
/* Push furthest requested GP to leaf node and rcu_data structure. */
- if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c)) {
- rnp->gp_seq_needed = c;
- rdp->gp_seq_needed = c;
+ if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req)) {
+ rnp->gp_seq_needed = gp_seq_req;
+ rdp->gp_seq_needed = gp_seq_req;
}
return ret;
}
@@ -1608,14 +1617,13 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
*/
static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
- unsigned long c = rnp->gp_seq;
bool needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);

needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
if (!needmore)
rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
- trace_rcu_this_gp(rnp, rdp, c,
+ trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
needmore ? TPS("CleanupMore") : TPS("Cleanup"));
return needmore;
}
@@ -1651,7 +1659,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- unsigned long c;
+ unsigned long gp_seq_req;
bool ret = false;

raw_lockdep_assert_held_rcu_node(rnp);
@@ -1670,9 +1678,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* accelerating callback invocation to an earlier grace-period
* number.
*/
- c = rcu_seq_snap(&rsp->gp_seq);
- if (rcu_segcblist_accelerate(&rdp->cblist, c))
- ret = rcu_start_this_gp(rnp, rdp, c);
+ gp_seq_req = rcu_seq_snap(&rsp->gp_seq);
+ if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
+ ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);

/* Trace depending on how much we were able to accelerate. */
if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
--
2.17.0.441.gb46fe60e1d-goog


2018-05-21 04:34:25

by Joel Fernandes

Subject: [PATCH v2 1/4] rcu: Speed up calling of RCU tasks callbacks

From: "Joel Fernandes (Google)" <[email protected]>

RCU tasks callbacks can take at least 1 second before they are executed.
This happens even if the hold-out tasks enter their quiescent states
quickly. I noticed this when I was testing trampoline callback execution.

To test the trampoline freeing, I wrote a simple script:
cd /sys/kernel/debug/tracing/
echo '__schedule_bug:traceon' > set_ftrace_filter;
echo '!__schedule_bug:traceon' > set_ftrace_filter;

In the background, I had a simple bash while loop:
while [ 1 ]; do x=1; done &

Total completion time of the above commands, in seconds:

With this patch:
real 0m0.179s
user 0m0.000s
sys 0m0.054s

Without this patch:
real 0m1.098s
user 0m0.000s
sys 0m0.053s

That's a speedup of more than 6X. To accomplish this, we wait for HZ/10
jiffies before entering the holdout-checking loop. The loop still checks
the held-out tasks every 1 second as before, in case this first check
doesn't succeed.
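
In rough form, the restructured loop looks like this (a simplified
sketch, not the exact kernel code):

  /* Short initial wait: holdouts often drain within a few ticks. */
  schedule_timeout_interruptible(HZ / 10);

  while (!list_empty(&rcu_tasks_holdouts)) {
          /* ... scan the holdout list, removing quiesced tasks ... */

          if (list_empty(&rcu_tasks_holdouts))
                  break;          /* All done; skip the 1-second sleep. */

          schedule_timeout_interruptible(HZ);     /* Re-check every second. */
  }

The 1-second periodicity of the checking is preserved; only the initial
wait is shortened and the final sleep is skipped when the list drains.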

Cc: Steven Rostedt <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: Paul McKenney <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/rcu/update.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5783bdf86e5a..a28698e44b08 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -743,6 +743,12 @@ static int __noreturn rcu_tasks_kthread(void *arg)
*/
synchronize_srcu(&tasks_rcu_exit_srcu);

+ /*
+ * Wait a little bit in case held tasks are released
+ * during their next timer ticks.
+ */
+ schedule_timeout_interruptible(HZ/10);
+
/*
* Each pass through the following loop scans the list
* of holdout tasks, removing any that are no longer
@@ -755,7 +761,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
int rtst;
struct task_struct *t1;

- schedule_timeout_interruptible(HZ);
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 &&
time_after(jiffies, lastreport + rtst);
@@ -768,6 +773,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
check_holdout_task(t, needreport, &firstreport);
cond_resched();
}
+
+ if (list_empty(&rcu_tasks_holdouts))
+ break;
+
+ schedule_timeout_interruptible(HZ);
}

/*
--
2.17.0.441.gb46fe60e1d-goog


2018-05-21 04:34:36

by Joel Fernandes

Subject: [PATCH v2 2/4] rcu: Add comment documenting how rcu_seq_snap works

rcu_seq_snap may be tricky to decipher. Let's document how it works, with
an example, to make it easier.
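
As a sanity check, the arithmetic can be reproduced in standalone C (a
sketch assuming RCU_SEQ_CTR_SHIFT=2 as in kernel/rcu/rcu.h; plain
userspace code, not the kernel implementation):

  #include <stdio.h>

  #define RCU_SEQ_CTR_SHIFT  2
  #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)

  /* Same arithmetic as rcu_seq_snap(), minus the memory barrier. */
  static unsigned long seq_snap(unsigned long seq)
  {
          return (seq + RCU_SEQ_STATE_MASK + (RCU_SEQ_STATE_MASK + 1)) &
                 ~RCU_SEQ_STATE_MASK;
  }

  int main(void)
  {
          /* Idle: seq 12 is GP 3, state 0b00; snap is seq 16, i.e. GP 4. */
          printf("%lu\n", seq_snap(12));  /* prints 16 */
          /* Busy: seq 13 is GP 3, state 0b01; snap is seq 20, i.e. GP 5. */
          printf("%lu\n", seq_snap(13));  /* prints 20 */
          return 0;
  }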

Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/rcu/rcu.h | 33 ++++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0453a7d12b3f..d4396c96f614 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -91,7 +91,38 @@ static inline void rcu_seq_end(unsigned long *sp)
WRITE_ONCE(*sp, rcu_seq_endval(sp));
}

-/* Take a snapshot of the update side's sequence number. */
+/*
+ * rcu_seq_snap - Take a snapshot of the update side's sequence number.
+ *
+ * This function returns the earliest value of the grace-period sequence number
+ * that will indicate that a full grace period has elapsed since the current
+ * time. Once the grace-period sequence number has reached this value, it will
+ * be safe to invoke all callbacks that have been registered prior to the
+ * current time. This value is the current grace-period number plus two to the
+ * power of the number of low-order bits reserved for state, then rounded up to
+ * the next value in which the state bits are all zero.
+ *
+ * For example, since RCU_SEQ_STATE_MASK=3 and the low-order state bits track
+ * whether a GP is in progress, it's sufficient to add 7 (0b11 + 0b100) and
+ * mask with ~3 to get the next GP. Let's see why with an example:
+ *
+ * Say the current seq is 12, which is 0b1100 (GP is 3 and state bits are 0b00).
+ * To get to the next GP number of 4, we have to add 0b100 (0x1 << 2) to this
+ * to account for the shift due to the 2 state bits. Now, if the current seq is
+ * 13 (GP is 3 and state bits are 0b01), then it means the current grace period
+ * is already in progress, so the next GP that a future callback will be queued
+ * to run at is GP+2 = 5, not 4. To account for the extra +1, we just overflow
+ * the 2 lower bits by adding 0b11. In case the lower bit was set, the overflow
+ * will cause the extra +1 to the GP, along with the usual +1 explained before.
+ * This gives us GP+2. Finally, we mask the lower two bits with ~0x3 in case the
+ * overflow didn't occur. This masking is needed because, had RCU been idle
+ * (no GP in progress, so the lower 2 bits are 0b00), the overflow of the lower
+ * 2 state bits would not occur, so we mask to zero out those lower 2 bits.
+ *
+ * In other words, the next seq can be obtained by (seq + 0b11 + 0b100) & ~0b11,
+ * which generalizes to:
+ * (seq + RCU_SEQ_STATE_MASK + (RCU_SEQ_STATE_MASK + 1)) & ~RCU_SEQ_STATE_MASK
+ */
static inline unsigned long rcu_seq_snap(unsigned long *sp)
{
unsigned long s;
--
2.17.0.441.gb46fe60e1d-goog


2018-05-21 04:36:30

by Joel Fernandes

Subject: [PATCH v2 4/4] rcu: Use better variable names in funnel locking loop

The funnel locking loop in rcu_start_this_gp uses rnp_root as a
temporary variable while walking the combining tree. This forces readers
of the code through the tiresome exercise of reminding themselves that
rnp_root may not actually be the root. Let's just call it rnp, and rename
the other variables as well to be more appropriate.

Original patch: https://patchwork.kernel.org/patch/10396577/
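
In condensed form (eliding the early-exit checks), the walk after this
patch reads roughly as follows; rnp_start stays fixed at the starting
leaf, rnp is the node currently being visited, and rnp_root is assigned
only once the root is reached:

  struct rcu_node *rnp, *rnp_root = NULL;

  for (rnp = rnp_start; 1; rnp = rnp->parent) {
          if (rnp != rnp_start)
                  raw_spin_lock_rcu_node(rnp);    /* Lock each non-start node. */
          /* ... record gp_seq_req in rnp; bail if already requested ... */
          if (rnp != rnp_start && rnp->parent != NULL)
                  raw_spin_unlock_rcu_node(rnp);  /* Drop intermediate levels. */
          if (!rnp->parent) {
                  rnp_root = rnp;                 /* At the root; it stays locked. */
                  break;
          }
  }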

Signed-off-by: Joel Fernandes <[email protected]>
---
kernel/rcu/tree.c | 48 ++++++++++++++++++++++++-----------------------
1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0ffd41ba304f..879c67a31116 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1526,7 +1526,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,

/*
* rcu_start_this_gp - Request the start of a particular grace period
- * @rnp: The leaf node of the CPU from which to start.
+ * @rnp_start: The leaf node of the CPU from which to start.
* @rdp: The rcu_data corresponding to the CPU from which to start.
* @gp_seq_req: The gp_seq of the grace period to start.
*
@@ -1540,12 +1540,12 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
*
* Returns true if the GP kthread needs to be awakened, else false.
*/
-static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
unsigned long gp_seq_req)
{
bool ret = false;
struct rcu_state *rsp = rdp->rsp;
- struct rcu_node *rnp_root;
+ struct rcu_node *rnp, *rnp_root = NULL;

/*
* Use funnel locking to either acquire the root rcu_node
@@ -1556,34 +1556,36 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* scan the leaf rcu_node structures. Note that rnp->lock must
* not be released.
*/
- raw_lockdep_assert_held_rcu_node(rnp);
- trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf"));
- for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
- if (rnp_root != rnp)
- raw_spin_lock_rcu_node(rnp_root);
- if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) ||
- rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) ||
- (rnp != rnp_root &&
- rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) {
- trace_rcu_this_gp(rnp_root, rdp, gp_seq_req,
+ raw_lockdep_assert_held_rcu_node(rnp_start);
+ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
+ for (rnp = rnp_start; 1; rnp = rnp->parent) {
+ if (rnp != rnp_start)
+ raw_spin_lock_rcu_node(rnp);
+ if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
+ rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
+ (rnp != rnp_start &&
+ rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req,
TPS("Prestarted"));
goto unlock_out;
}
- rnp_root->gp_seq_needed = gp_seq_req;
- if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
+ rnp->gp_seq_needed = gp_seq_req;
+ if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) {
/*
* We just marked the leaf, and a grace period
* is in progress, which means that rcu_gp_cleanup()
* will see the marking. Bail to reduce contention.
*/
- trace_rcu_this_gp(rnp, rdp, gp_seq_req,
+ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
TPS("Startedleaf"));
goto unlock_out;
}
- if (rnp_root != rnp && rnp_root->parent != NULL)
- raw_spin_unlock_rcu_node(rnp_root);
- if (!rnp_root->parent)
+ if (rnp != rnp_start && rnp->parent != NULL)
+ raw_spin_unlock_rcu_node(rnp);
+ if (!rnp->parent) {
+ rnp_root = rnp;
break; /* At root, and perhaps also leaf. */
+ }
}

/* If GP already in progress, just leave, otherwise start one. */
@@ -1601,11 +1603,11 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq"));
ret = true; /* Caller must wake GP kthread. */
unlock_out:
- if (rnp != rnp_root)
- raw_spin_unlock_rcu_node(rnp_root);
+ if (rnp != rnp_start)
+ raw_spin_unlock_rcu_node(rnp);
/* Push furthest requested GP to leaf node and rcu_data structure. */
- if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req)) {
- rnp->gp_seq_needed = gp_seq_req;
+ if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req)) {
+ rnp_start->gp_seq_needed = gp_seq_req;
rdp->gp_seq_needed = gp_seq_req;
}
return ret;
--
2.17.0.441.gb46fe60e1d-goog


2018-05-21 04:40:17

by Joel Fernandes

Subject: Re: [PATCH v2 0/4] fixes, cleanups for rcu/dev

On Sun, May 20, 2018 at 09:32:47PM -0700, Joel Fernandes wrote:
> Hi Paul,
>
> Here are some patches reworked based on comments on a few of the patches
> from the previous series: https://lkml.org/lkml/2018/5/13/296
>
> 4/4 is a new addition which fixes a potential issue.
>
> Let me know what you think, thanks!
>
> Joel Fernandes (4):
> rcu: Add comment documenting how rcu_seq_snap works
> rcu: Cleanup the variables used to request a new grace period
> rcu: Use better variable names in funnel locking loop
> rcu: Unlock non-start node only after accessing its gp_seq_needed

Please disregard this v2; my infant daughter distracted me while I was
preparing the 'git format-patch' :-) I was not supposed to send the 'Speed
up..' patch, since I already sent that separately. And patch 4/4, which I
believe fixes a potential issue, is also missing. So let's drop/ignore all
of this v2 and I'll just send a v3.

thanks!