2020-10-15 04:01:23

by Joel Fernandes

Subject: [PATCH v7 0/6] Add support for length of each segment in the segcblist

This is required for several identified use cases. One of them is tracing
how the segmented callback list changes. Tracing this has identified issues
in RCU code in the past.

From Paul:
Another use case is of course more accurately determining whether a given CPU's
large pile of callbacks can be best served by making grace periods go faster,
invoking callbacks more vigorously, or both. It should also be possible to
simplify some of the callback handling a bit, given that some of the unnatural
acts are due to there having been no per-batch counts.
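
To make the idea concrete, here is a user-space sketch of how per-segment
lengths answer such questions in O(1) instead of walking the list. This is
simplified and illustrative only: the names, the struct layout, and the
numbers below are made up and are not the kernel's actual struct
rcu_segcblist.

#include <stdio.h>

/* Simplified stand-in for the four segments of a segmented callback list. */
enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

struct seglist {
	long seglen[NSEGS];          /* per-segment callback counts */
	unsigned long gp_seq[NSEGS]; /* grace period each segment waits on */
};

/* Callbacks whose grace period has elapsed and that can be invoked now. */
static long ready_cbs(const struct seglist *sl)
{
	return sl->seglen[SEG_DONE];
}

/* Callbacks still waiting for a grace period to elapse. */
static long pending_cbs(const struct seglist *sl)
{
	return sl->seglen[SEG_WAIT] + sl->seglen[SEG_NEXT_READY] +
	       sl->seglen[SEG_NEXT];
}

int main(void)
{
	struct seglist sl = { .seglen = { 2, 5, 0, 3 } };

	/* A cheap comparison can now guide the policy choice described
	 * above: make grace periods go faster vs. invoke callbacks
	 * more vigorously. */
	if (pending_cbs(&sl) > ready_cbs(&sl))
		printf("grace periods are the bottleneck\n");
	else
		printf("callback invocation is the bottleneck\n");
	return 0;
}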

Revision history:
v7: Cleaned up memory barriers (thanks fweisbec@ for reviewing), made minor
corrections per Neeraj (thanks).

v6: Fixed TREE04, and restored older logic to ensure rcu_barrier works.

v5: Various changes, bug fixes. Discovery of rcu_barrier issue.

v4: Restructured rcu_do_batch() and segcblist merging to avoid issues.
Fixed minor nit from Davidlohr.

v1->v3: minor nits.
(https://lore.kernel.org/lkml/20200719034210.2382053-1-joel@xxxxxxxxxxxxxxxxx/)

Joel Fernandes (Google) (6):
rcu/tree: Make rcu_do_batch count how many callbacks were executed
rcu/segcblist: Add counters to segcblist datastructure
rcu/trace: Add tracing for how segcb list changes
rcu/segcblist: Remove useless rcupdate.h include
rcu/tree: Remove redundant smp_mb() in rcu_do_batch
rcu/segcblist: Add additional comments to explain smp_mb()

 include/linux/rcu_segcblist.h |   2 +
 include/trace/events/rcu.h    |  25 +++++
 kernel/rcu/rcu_segcblist.c    | 205 ++++++++++++++++++++++++++--------
 kernel/rcu/rcu_segcblist.h    |   8 +-
 kernel/rcu/tree.c             |  21 ++--
 5 files changed, 203 insertions(+), 58 deletions(-)

--
2.28.0.1011.ga647a8990f-goog


2020-10-15 09:42:11

by Joel Fernandes

Subject: [PATCH v7 3/6] rcu/trace: Add tracing for how segcb list changes

Track how the segcb list changes before and after acceleration, during
queuing, and during dequeuing.

This has proved useful for discovering an optimization that avoids
unwanted GP requests when no callbacks have been accelerated. The
overhead is minimal, as each segment's length is now stored in the
respective segment.
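
For example, each of these events prints one line in the format below.
The format comes from the TP_printk in this patch; the context string is
one of the TPS() strings added in this series, and the counts and gp_seq
numbers here are made-up, illustrative values:

  rcu_segcb: SegCbDequeued cb_count: (DONE=0, WAIT=12, NEXT_READY=3, NEXT=7) gp_seq: (DONE=4056, WAIT=4060, NEXT_READY=4064, NEXT=4064)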

Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
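For anyone wanting to try this out: assuming a kernel built with
CONFIG_RCU_TRACE=y and tracefs mounted at the conventional location (the
exact mount point may differ on your system), the new event can be
enabled like any other tracepoint:

  # echo 1 > /sys/kernel/tracing/events/rcu/rcu_segcb/enable
  # cat /sys/kernel/tracing/trace_pipe
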
 include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
 kernel/rcu/rcu_segcblist.c | 31 +++++++++++++++++++++++++++++++
 kernel/rcu/rcu_segcblist.h |  5 +++++
 kernel/rcu/tree.c          |  9 +++++++++
 4 files changed, 70 insertions(+)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 155b5cb43cfd..9f2237d9b0c8 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
 		  __entry->qlen)
 );
 
+TRACE_EVENT_RCU(rcu_segcb,
+
+	TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
+
+	TP_ARGS(ctx, cb_count, gp_seq),
+
+	TP_STRUCT__entry(
+		__field(const char *, ctx)
+		__array(int, cb_count, RCU_CBLIST_NSEGS)
+		__array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
+	),
+
+	TP_fast_assign(
+		__entry->ctx = ctx;
+		memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * sizeof(int));
+		memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long));
+	),
+
+	TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, NEXT=%d) "
+		  "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
+		  __entry->cb_count[0], __entry->cb_count[1], __entry->cb_count[2], __entry->cb_count[3],
+		  __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
+
+);
+
 /*
  * Tracepoint for the registration of a single RCU callback of the special
  * kvfree() form. The first argument is the RCU type, the second argument
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index d9fab49dee1b..2dccbd29cd3a 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 
 #include "rcu_segcblist.h"
+#include "rcu.h"
 
 /* Initialize simple callback list. */
 void rcu_cblist_init(struct rcu_cblist *rclp)
@@ -339,6 +340,36 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
 	rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
 }
 
+/*
+ * Return the number of CBs in each segment, along with their gp_seq values.
+ *
+ * This function is O(N) where N is the number of segments. Only used from
+ * tracing code which is usually disabled in production.
+ */
+#ifdef CONFIG_RCU_TRACE
+static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
+				   int cbcount[RCU_CBLIST_NSEGS],
+				   unsigned long gpseq[RCU_CBLIST_NSEGS])
+{
+	int i;
+
+	for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
+		cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
+		gpseq[i] = rsclp->gp_seq[i];
+	}
+}
+
+void trace_rcu_segcb_list(struct rcu_segcblist *rsclp, const char *context)
+{
+	int cbs[RCU_CBLIST_NSEGS];
+	unsigned long gps[RCU_CBLIST_NSEGS];
+
+	rcu_segcblist_countseq(rsclp, cbs, gps);
+
+	trace_rcu_segcb(context, cbs, gps);
+}
+#endif
+
 /*
  * Extract only those callbacks still pending (not yet ready to be
  * invoked) from the specified rcu_segcblist structure and place them in
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index cd35c9faaf51..c2e274ae0912 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -103,3 +103,8 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
+#ifdef CONFIG_RCU_TRACE
+void trace_rcu_segcb_list(struct rcu_segcblist *rsclp, const char *context);
+#else
+#define trace_rcu_segcb_list(...)
+#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 24c00020ab83..346a05506935 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1497,6 +1497,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
 		return false;
 
+	trace_rcu_segcb_list(&rdp->cblist, TPS("SegCbPreAcc"));
+
 	/*
 	 * Callbacks are often registered with incomplete grace-period
 	 * information. Something about the fact that getting exact
@@ -1517,6 +1519,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	else
 		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
 
+	trace_rcu_segcb_list(&rdp->cblist, TPS("SegCbPostAcc"));
+
 	return ret;
 }
 
@@ -2466,11 +2470,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
 	if (offloaded)
 		rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+	trace_rcu_segcb_list(&rdp->cblist, TPS("SegCbDequeued"));
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 
 	/* Invoke callbacks. */
 	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
 	rhp = rcu_cblist_dequeue(&rcl);
+
 	for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
 		rcu_callback_t f;
 
@@ -2983,6 +2990,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 		trace_rcu_callback(rcu_state.name, head,
 				   rcu_segcblist_n_cbs(&rdp->cblist));
 
+	trace_rcu_segcb_list(&rdp->cblist, TPS("SegCbQueued"));
+
 	/* Go handle any RCU core processing required. */
 	if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
 		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
--
2.28.0.1011.ga647a8990f-goog