2020-08-25 02:53:22

by Joel Fernandes

[permalink] [raw]
Subject: [PATCH v4 -rcu 4/4] rcu/trace: Add tracing for how segcb list changes

Track how the segcb list changes before/after acceleration, during
queuing and during dequeuing.

This has proved useful to discover an optimization to avoid unwanted GP
requests when there are no callbacks accelerated. The overhead is minimal as
each segment's length is now stored in the respective segment.

Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
kernel/rcu/rcu_segcblist.c | 27 +++++++++++++++++++++++++++
kernel/rcu/rcu_segcblist.h | 7 +++++++
kernel/rcu/tree.c | 23 +++++++++++++++++++++++
4 files changed, 82 insertions(+)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 155b5cb43cfd..7b84df3c95df 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
__entry->qlen)
);

+TRACE_EVENT_RCU(rcu_segcb,
+
+ TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
+
+ TP_ARGS(ctx, cb_count, gp_seq),
+
+ TP_STRUCT__entry(
+ __field(const char *, ctx)
+ __array(int, cb_count, 4)
+ __array(unsigned long, gp_seq, 4)
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ memcpy(__entry->cb_count, cb_count, 4 * sizeof(int));
+ memcpy(__entry->gp_seq, gp_seq, 4 * sizeof(unsigned long));
+ ),
+
+ TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, NEXT=%d) "
+ "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
+ __entry->cb_count[0], __entry->cb_count[1], __entry->cb_count[2], __entry->cb_count[3],
+ __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
+
+);
+
/*
* Tracepoint for the registration of a single RCU callback of the special
* kvfree() form. The first argument is the RCU type, the second argument
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 73a103464ea4..6419dbbaecde 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -378,6 +378,33 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
}

+/*
+ * Return how many CBs each segment along with their gp_seq values.
+ *
+ * This function is O(N) where N is the number of callbacks. Only used from
+ * tracing code which is usually disabled in production.
+ */
+#ifdef CONFIG_RCU_TRACE
+void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
+ int cbcount[RCU_CBLIST_NSEGS],
+ unsigned long gpseq[RCU_CBLIST_NSEGS])
+{
+ struct rcu_head **cur_tail;
+ int i;
+
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ cbcount[i] = 0;
+
+ cur_tail = &(rsclp->head);
+
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
+ cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
+ gpseq[i] = rsclp->gp_seq[i];
+ cur_tail = rsclp->tails[i];
+ }
+}
+#endif
+
/*
* Extract only those callbacks still pending (not yet ready to be
* invoked) from the specified rcu_segcblist structure and place them in
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index b90725f81d77..5f7cdfed0ba4 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -105,3 +105,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_segcblist *src_rsclp);
+#ifdef CONFIG_RCU_TRACE
+void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
+ int cbcount[RCU_CBLIST_NSEGS],
+ unsigned long gpseq[RCU_CBLIST_NSEGS]);
+#else
+#define rcu_segcblist_countseq(...)
+#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51348144a4ea..16ad99a9ebba 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1476,6 +1476,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
unsigned long gp_seq_req;
bool ret = false;
+ int cbs[RCU_CBLIST_NSEGS];
+ unsigned long gps[RCU_CBLIST_NSEGS];

rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1484,6 +1486,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;

+ /* Count CBs for tracing. */
+ rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
+ trace_rcu_segcb("SegCbPreAcc", cbs, gps);
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
@@ -1504,6 +1510,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));

+ /* Count CBs for tracing. */
+ rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
+ trace_rcu_segcb("SegCbPostAcc", cbs, gps);
+
return ret;
}

@@ -2421,6 +2431,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count = 0;
long pending, tlimit = 0;
+ int cbs[RCU_CBLIST_NSEGS];
+ unsigned long gps[RCU_CBLIST_NSEGS];

/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2461,6 +2473,11 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
+ /* Count CBs for tracing. */
+ rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
+ trace_rcu_segcb("SegCbDequeued", cbs, gps);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;

@@ -2929,6 +2946,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
unsigned long flags;
struct rcu_data *rdp;
bool was_alldone;
+ int cbs[RCU_CBLIST_NSEGS];
+ unsigned long gps[RCU_CBLIST_NSEGS];

/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2974,6 +2993,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));

+ /* Count CBs for tracing. */
+ rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
+ trace_rcu_segcb("SegCBQueued", cbs, gps);
+
/* Go handle any RCU core processing required. */
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
--
2.28.0.297.g1956fa8f8d-goog


2020-08-25 21:59:51

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v4 -rcu 4/4] rcu/trace: Add tracing for how segcb list changes

On Mon, Aug 24, 2020 at 10:48:42PM -0400, Joel Fernandes (Google) wrote:
> Track how the segcb list changes before/after acceleration, during
> queuing and during dequeuing.
>
> This has proved useful to discover an optimization to avoid unwanted GP
> requests when there are no callbacks accelerated. The overhead is minimal as
> each segment's length is now stored in the respective segment.
>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>

Speaking of tracing, diagnostics, and debugging, a few instances of
ASSERT_EXCLUSIVE_WRITER() in the previous patch might save a lot
of debugging. Help KCSAN help us.

> ---
> include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
> kernel/rcu/rcu_segcblist.c | 27 +++++++++++++++++++++++++++
> kernel/rcu/rcu_segcblist.h | 7 +++++++
> kernel/rcu/tree.c | 23 +++++++++++++++++++++++
> 4 files changed, 82 insertions(+)
>
> diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> index 155b5cb43cfd..7b84df3c95df 100644
> --- a/include/trace/events/rcu.h
> +++ b/include/trace/events/rcu.h
> @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> __entry->qlen)
> );
>
> +TRACE_EVENT_RCU(rcu_segcb,
> +
> + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
> +
> + TP_ARGS(ctx, cb_count, gp_seq),
> +
> + TP_STRUCT__entry(
> + __field(const char *, ctx)
> + __array(int, cb_count, 4)
> + __array(unsigned long, gp_seq, 4)
> + ),
> +
> + TP_fast_assign(
> + __entry->ctx = ctx;
> + memcpy(__entry->cb_count, cb_count, 4 * sizeof(int));
> + memcpy(__entry->gp_seq, gp_seq, 4 * sizeof(unsigned long));
> + ),
> +
> + TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, NEXT=%d) "
> + "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
> + __entry->cb_count[0], __entry->cb_count[1], __entry->cb_count[2], __entry->cb_count[3],
> + __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
> +
> +);
> +
> /*
> * Tracepoint for the registration of a single RCU callback of the special
> * kvfree() form. The first argument is the RCU type, the second argument
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index 73a103464ea4..6419dbbaecde 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -378,6 +378,33 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
> rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
> }
>
> +/*
> + * Return how many CBs each segment along with their gp_seq values.
> + *
> + * This function is O(N) where N is the number of callbacks. Only used from
> + * tracing code which is usually disabled in production.
> + */
> +#ifdef CONFIG_RCU_TRACE
> +void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> + int cbcount[RCU_CBLIST_NSEGS],
> + unsigned long gpseq[RCU_CBLIST_NSEGS])
> +{

Why not declare the arrays here and invoke the new trace event from
here also? That would simplify the usage and save a few lines of code.

Thanx, Paul

> + struct rcu_head **cur_tail;
> + int i;
> +
> + for (i = 0; i < RCU_CBLIST_NSEGS; i++)
> + cbcount[i] = 0;
> +
> + cur_tail = &(rsclp->head);
> +
> + for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> + gpseq[i] = rsclp->gp_seq[i];
> + cur_tail = rsclp->tails[i];
> + }
> +}
> +#endif
> +
> /*
> * Extract only those callbacks still pending (not yet ready to be
> * invoked) from the specified rcu_segcblist structure and place them in
> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> index b90725f81d77..5f7cdfed0ba4 100644
> --- a/kernel/rcu/rcu_segcblist.h
> +++ b/kernel/rcu/rcu_segcblist.h
> @@ -105,3 +105,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
> bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
> void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
> struct rcu_segcblist *src_rsclp);
> +#ifdef CONFIG_RCU_TRACE
> +void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> + int cbcount[RCU_CBLIST_NSEGS],
> + unsigned long gpseq[RCU_CBLIST_NSEGS]);
> +#else
> +#define rcu_segcblist_countseq(...)
> +#endif
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 51348144a4ea..16ad99a9ebba 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1476,6 +1476,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
> {
> unsigned long gp_seq_req;
> bool ret = false;
> + int cbs[RCU_CBLIST_NSEGS];
> + unsigned long gps[RCU_CBLIST_NSEGS];
>
> rcu_lockdep_assert_cblist_protected(rdp);
> raw_lockdep_assert_held_rcu_node(rnp);
> @@ -1484,6 +1486,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
> if (!rcu_segcblist_pend_cbs(&rdp->cblist))
> return false;
>
> + /* Count CBs for tracing. */
> + rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
> + trace_rcu_segcb("SegCbPreAcc", cbs, gps);
> +
> /*
> * Callbacks are often registered with incomplete grace-period
> * information. Something about the fact that getting exact
> @@ -1504,6 +1510,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
> else
> trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
>
> + /* Count CBs for tracing. */
> + rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
> + trace_rcu_segcb("SegCbPostAcc", cbs, gps);
> +
> return ret;
> }
>
> @@ -2421,6 +2431,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
> long bl, count = 0;
> long pending, tlimit = 0;
> + int cbs[RCU_CBLIST_NSEGS];
> + unsigned long gps[RCU_CBLIST_NSEGS];
>
> /* If no callbacks are ready, just return. */
> if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
> @@ -2461,6 +2473,11 @@ static void rcu_do_batch(struct rcu_data *rdp)
> /* Invoke callbacks. */
> tick_dep_set_task(current, TICK_DEP_BIT_RCU);
> rhp = rcu_cblist_dequeue(&rcl);
> +
> + /* Count CBs for tracing. */
> + rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
> + trace_rcu_segcb("SegCbDequeued", cbs, gps);
> +
> for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
> rcu_callback_t f;
>
> @@ -2929,6 +2946,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
> unsigned long flags;
> struct rcu_data *rdp;
> bool was_alldone;
> + int cbs[RCU_CBLIST_NSEGS];
> + unsigned long gps[RCU_CBLIST_NSEGS];
>
> /* Misaligned rcu_head! */
> WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
> @@ -2974,6 +2993,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
> trace_rcu_callback(rcu_state.name, head,
> rcu_segcblist_n_cbs(&rdp->cblist));
>
> + /* Count CBs for tracing. */
> + rcu_segcblist_countseq(&rdp->cblist, cbs, gps);
> + trace_rcu_segcb("SegCBQueued", cbs, gps);
> +
> /* Go handle any RCU core processing required. */
> if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
> --
> 2.28.0.297.g1956fa8f8d-goog
>

2020-08-25 22:54:33

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v4 -rcu 4/4] rcu/trace: Add tracing for how segcb list changes

On Tue, Aug 25, 2020 at 02:55:53PM -0700, Paul E. McKenney wrote:
> On Mon, Aug 24, 2020 at 10:48:42PM -0400, Joel Fernandes (Google) wrote:
> > Track how the segcb list changes before/after acceleration, during
> > queuing and during dequeuing.
> >
> > This has proved useful to discover an optimization to avoid unwanted GP
> > requests when there are no callbacks accelerated. The overhead is minimal as
> > each segment's length is now stored in the respective segment.
> >
> > Signed-off-by: Joel Fernandes (Google) <[email protected]>
>
> Speaking of tracing, diagnostics, and debugging, a few instances of
> ASSERT_EXCLUSIVE_WRITER() in the previous patch might save a lot
> of debugging. Help KCSAN help us.

Ok, I will look into it, thanks.

> > include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
> > kernel/rcu/rcu_segcblist.c | 27 +++++++++++++++++++++++++++
> > kernel/rcu/rcu_segcblist.h | 7 +++++++
> > kernel/rcu/tree.c | 23 +++++++++++++++++++++++
> > 4 files changed, 82 insertions(+)
> >
> > diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> > index 155b5cb43cfd..7b84df3c95df 100644
> > --- a/include/trace/events/rcu.h
> > +++ b/include/trace/events/rcu.h
> > @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> > __entry->qlen)
> > );
> >
> > +TRACE_EVENT_RCU(rcu_segcb,
> > +
> > + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
> > +
> > + TP_ARGS(ctx, cb_count, gp_seq),
> > +
> > + TP_STRUCT__entry(
> > + __field(const char *, ctx)
> > + __array(int, cb_count, 4)
> > + __array(unsigned long, gp_seq, 4)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->ctx = ctx;
> > + memcpy(__entry->cb_count, cb_count, 4 * sizeof(int));
> > + memcpy(__entry->gp_seq, gp_seq, 4 * sizeof(unsigned long));
> > + ),
> > +
> > + TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, NEXT=%d) "
> > + "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
> > + __entry->cb_count[0], __entry->cb_count[1], __entry->cb_count[2], __entry->cb_count[3],
> > + __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
> > +
> > +);
> > +
> > /*
> > * Tracepoint for the registration of a single RCU callback of the special
> > * kvfree() form. The first argument is the RCU type, the second argument
> > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > index 73a103464ea4..6419dbbaecde 100644
> > --- a/kernel/rcu/rcu_segcblist.c
> > +++ b/kernel/rcu/rcu_segcblist.c
> > @@ -378,6 +378,33 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
> > rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
> > }
> >
> > +/*
> > + * Return how many CBs each segment along with their gp_seq values.
> > + *
> > + * This function is O(N) where N is the number of callbacks. Only used from
> > + * tracing code which is usually disabled in production.
> > + */
> > +#ifdef CONFIG_RCU_TRACE
> > +void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> > + int cbcount[RCU_CBLIST_NSEGS],
> > + unsigned long gpseq[RCU_CBLIST_NSEGS])
> > +{
>
> Why not declare the arrays here and invoke the new trace event from
> here also? That would simplify the usage and save a few lines of code.

True! I will do it that way.

thanks,

- Joel