Track how the segcb list changes before/after acceleration, during
queuing and during dequeuing.

This has proved useful for discovering an optimization that avoids
unwanted GP requests when no callbacks have been accelerated. The
overhead is minimal because each segment's length is now stored in the
respective segment.
Reviewed-by: Frederic Weisbecker <[email protected]>
Reviewed-by: Neeraj Upadhyay <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
kernel/rcu/rcu_segcblist.c | 34 ++++++++++++++++++++++++++++++++++
kernel/rcu/rcu_segcblist.h | 5 +++++
kernel/rcu/tree.c | 9 +++++++++
4 files changed, 73 insertions(+)
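Note for testers (this sits below the "---" marker, so git-am ignores it):
with CONFIG_RCU_TRACE=y the new event can be enabled via tracefs, e.g.
"echo 1 > /sys/kernel/tracing/events/rcu/rcu_segcb_stats/enable"; the
emitted lines follow the TP_printk() format below, with the context
strings SegCbPreAcc, SegCbPostAcc, SegCbDequeued and SegCbQueued.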
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 155b5cb43cfd..5f8f2ee1a936 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
__entry->qlen)
);
+TRACE_EVENT_RCU(rcu_segcb_stats,
+
+ TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
+
+ TP_ARGS(ctx, cb_count, gp_seq),
+
+ TP_STRUCT__entry(
+ __field(const char *, ctx)
+ __array(int, cb_count, RCU_CBLIST_NSEGS)
+ __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * sizeof(int));
+ memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long));
+ ),
+
+ TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, NEXT=%d) "
+ "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
+ __entry->cb_count[0], __entry->cb_count[1], __entry->cb_count[2], __entry->cb_count[3],
+ __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
+
+);
+
/*
* Tracepoint for the registration of a single RCU callback of the special
* kvfree() form. The first argument is the RCU type, the second argument
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 357c19bbcb00..2a03949d0b82 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include "rcu_segcblist.h"
+#include "rcu.h"
/* Initialize simple callback list. */
void rcu_cblist_init(struct rcu_cblist *rclp)
@@ -328,6 +329,39 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
}
+/*
+ * Return how many CBs each segment has, along with their gp_seq values.
+ *
+ * This function is O(N) where N is the number of segments. Only used from
+ * tracing code which is usually disabled in production.
+ */
+#ifdef CONFIG_RCU_TRACE
+static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
+ int cbcount[RCU_CBLIST_NSEGS],
+ unsigned long gpseq[RCU_CBLIST_NSEGS])
+{
+ int i;
+
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
+ cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
+ gpseq[i] = rsclp->gp_seq[i];
+ }
+}
+
+void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context)
+{
+ int cbs[RCU_CBLIST_NSEGS];
+ unsigned long gps[RCU_CBLIST_NSEGS];
+
+ if (!trace_rcu_segcb_stats_enabled())
+ return;
+
+ rcu_segcblist_countseq(rsclp, cbs, gps);
+
+ trace_rcu_segcb_stats(context, cbs, gps);
+}
+#endif
+
/*
* Extract only those callbacks still pending (not yet ready to be
* invoked) from the specified rcu_segcblist structure and place them in
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index cd35c9faaf51..7750734fa116 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -103,3 +103,8 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_segcblist *src_rsclp);
+#ifdef CONFIG_RCU_TRACE
+void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context);
+#else
+#define __trace_rcu_segcb_stats(...)
+#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 24c00020ab83..f6c6653b3ec2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1497,6 +1497,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
+ __trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
@@ -1517,6 +1519,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+ __trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
return ret;
}
@@ -2466,11 +2470,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
if (offloaded)
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ __trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
@@ -2983,6 +2990,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));
+ __trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbQueued"));
+
/* Go handle any RCU core processing required. */
if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
--
2.29.1.341.ge80a0c044ae-goog
On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote:
> +/*
> + * Return how many CBs each segment has, along with their gp_seq values.
> + *
> + * This function is O(N) where N is the number of segments. Only used from
> + * tracing code which is usually disabled in production.
> + */
> +#ifdef CONFIG_RCU_TRACE
> +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> + int cbcount[RCU_CBLIST_NSEGS],
> + unsigned long gpseq[RCU_CBLIST_NSEGS])
> +{
> + int i;
> +
> + for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> + gpseq[i] = rsclp->gp_seq[i];
> + }
> +}
> +
> +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context)
> +{
> + int cbs[RCU_CBLIST_NSEGS];
> + unsigned long gps[RCU_CBLIST_NSEGS];
> +
> + if (!trace_rcu_segcb_stats_enabled())
> + return;
Yes, very good!
Paul just told me that RCU_TRACE can be used in production, so that
confirms we wanted to avoid this loop of 8 iterations when tracing is
disabled.
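(For the record, my mental model of the generated _enabled() helper is a
static-key test; a rough sketch, not the literal macro from
include/linux/tracepoint.h:

	static inline bool trace_rcu_segcb_stats_enabled(void)
	{
		return static_key_false(&__tracepoint_rcu_segcb_stats.key);
	}

so the early return costs roughly one patched branch while the event is
off.)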
Thanks.
On 11/3/2020 7:56 PM, Joel Fernandes (Google) wrote:
> Track how the segcb list changes before/after acceleration, during
> queuing and during dequeuing.
>
> This has proved useful to discover an optimization to avoid unwanted GP
> requests when there are no callbacks accelerated. The overhead is minimal as
> each segment's length is now stored in the respective segment.
>
> Reviewed-by: Frederic Weisbecker <[email protected]>
> Reviewed-by: Neeraj Upadhyay <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
> kernel/rcu/rcu_segcblist.c | 34 ++++++++++++++++++++++++++++++++++
> kernel/rcu/rcu_segcblist.h | 5 +++++
> kernel/rcu/tree.c | 9 +++++++++
> 4 files changed, 73 insertions(+)
>
> diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> index 155b5cb43cfd..5f8f2ee1a936 100644
> --- a/include/trace/events/rcu.h
> +++ b/include/trace/events/rcu.h
> @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> __entry->qlen)
> );
>
> +TRACE_EVENT_RCU(rcu_segcb_stats,
> +
> + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
I think we need to use long[] instead of int[] for cb_count everywhere
in this patch?
Thanks
Neeraj
On Tue, Nov 03, 2020 at 04:17:31PM +0100, Frederic Weisbecker wrote:
> On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote:
> > +/*
> > + * Return how many CBs each segment has, along with their gp_seq values.
> > + *
> > + * This function is O(N) where N is the number of segments. Only used from
> > + * tracing code which is usually disabled in production.
> > + */
> > +#ifdef CONFIG_RCU_TRACE
> > +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> > + int cbcount[RCU_CBLIST_NSEGS],
> > + unsigned long gpseq[RCU_CBLIST_NSEGS])
> > +{
> > + int i;
> > +
> > + for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> > + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> > + gpseq[i] = rsclp->gp_seq[i];
> > + }
> > +}
> > +
> > +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context)
> > +{
> > + int cbs[RCU_CBLIST_NSEGS];
> > + unsigned long gps[RCU_CBLIST_NSEGS];
> > +
> > + if (!trace_rcu_segcb_stats_enabled())
> > + return;
>
> Yes, very good!
>
> Paul just told me that RCU_TRACE can be used in production so that confirms that we
> wanted to avoid this loop of 8 iterations when tracing is disabled.
RCU's "don't try this in production" Kconfig option is PROVE_RCU.
I would be looking for checks that the sum of the segment lengths
matches the overall ->len, or checks that all of the segment lengths
are zero when ->cblist is empty, to be guarded by something like
IS_ENABLED(CONFIG_PROVE_RCU). Of course, checks of this sort need to
be confined to those portions of rcu_do_batch() that are excluding other
accesses to ->cblist.
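Concretely, something like the following untested sketch is what I have
in mind (the helper name and its placement are made up, and it must be
called while all other access to ->cblist is excluded):

	/* Sketch: complain if the per-segment lengths do not sum to ->len. */
	static void rcu_segcblist_check_seglens(struct rcu_segcblist *rsclp)
	{
		long sum = 0;
		int i;

		if (!IS_ENABLED(CONFIG_PROVE_RCU))
			return;
		for (i = 0; i < RCU_CBLIST_NSEGS; i++)
			sum += rcu_segcblist_get_seglen(rsclp, i);
		WARN_ON_ONCE(sum != rcu_segcblist_n_cbs(rsclp));
	}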
But if rcu_segcblist_countseq() is invoked only when a specific trace
event is enabled, it should be OK to have it guarded only by RCU_TRACE.
Thanx, Paul
On Wed, Nov 04, 2020 at 06:08:07AM -0800, Paul E. McKenney wrote:
> On Tue, Nov 03, 2020 at 04:17:31PM +0100, Frederic Weisbecker wrote:
> > On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote:
> > > [ ... rcu_segcblist_countseq() snipped; quoted in full earlier in the thread ... ]
> > > + if (!trace_rcu_segcb_stats_enabled())
> > > + return;
> >
> > Yes, very good!
> >
> > Paul just told me that RCU_TRACE can be used in production so that confirms that we
> > wanted to avoid this loop of 8 iterations when tracing is disabled.
>
> RCU's "don't try this in production" Kconfig option is PROVE_RCU.
>
> I would be looking for checks that the sum of the segment lengths
> match the overall ->len or checks that all of the segment lengths
> are zero when ->cblist is empty to be guarded by something like
> IS_ENABLED(CONFIG_PROVE_RCU). Of course, checks of this sort need to
> be confined to those portions of rcu_do_batch() that are excluding other
> accesses to ->cblist.
Right.
>
> But if rcu_segcblist_countseq() is invoked only when a specific trace
> event is enabled, it should be OK to have it guarded only by RCU_TRACE.
Indeed, so I think we are good.
Thanks.
On Wed, Nov 04, 2020 at 07:10:17PM +0530, Neeraj Upadhyay wrote:
>
>
> On 11/3/2020 7:56 PM, Joel Fernandes (Google) wrote:
> > Track how the segcb list changes before/after acceleration, during
> > queuing and during dequeuing.
> >
> > This has proved useful to discover an optimization to avoid unwanted GP
> > requests when there are no callbacks accelerated. The overhead is minimal as
> > each segment's length is now stored in the respective segment.
> >
> > Reviewed-by: Frederic Weisbecker <[email protected]>
> > Reviewed-by: Neeraj Upadhyay <[email protected]>
> > Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > ---
> > include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
> > kernel/rcu/rcu_segcblist.c | 34 ++++++++++++++++++++++++++++++++++
> > kernel/rcu/rcu_segcblist.h | 5 +++++
> > kernel/rcu/tree.c | 9 +++++++++
> > 4 files changed, 73 insertions(+)
> >
> > diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> > index 155b5cb43cfd..5f8f2ee1a936 100644
> > --- a/include/trace/events/rcu.h
> > +++ b/include/trace/events/rcu.h
> > @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> > __entry->qlen)
> > );
> > +TRACE_EVENT_RCU(rcu_segcb_stats,
> > +
> > + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
>
> I think we need to use long[] instead of int[] for cb_count everywhere in
> this patch?
More than 4 billion callbacks on a single cblist sounds like a bug,
though, so int should work. Plus, I prefer to keep the tracepoint size
small (on 64-bit systems, long is 64 bits while int is 32 bits).
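(Concretely, with RCU_CBLIST_NSEGS == 4 the cb_count array costs 4 * 4
= 16 bytes per trace record as int versus 4 * 8 = 32 bytes as long on a
64-bit kernel.)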
Thanks for all the reviews on this and other patches!
- Joel
On Wed, Nov 04, 2020 at 03:33:14PM +0100, Frederic Weisbecker wrote:
> On Wed, Nov 04, 2020 at 06:08:07AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 03, 2020 at 04:17:31PM +0100, Frederic Weisbecker wrote:
> > > On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote:
> > > > [ ... patch excerpt snipped; quoted in full earlier in the thread ... ]
> > > > + if (!trace_rcu_segcb_stats_enabled())
> > > > + return;
> > >
> > > Yes, very good!
> > >
> > > Paul just told me that RCU_TRACE can be used in production so that confirms that we
> > > wanted to avoid this loop of 8 iterations when tracing is disabled.
> >
> > RCU's "don't try this in production" Kconfig option is PROVE_RCU.
> >
> > I would be looking for checks that the sum of the segment lengths
> > match the overall ->len or checks that all of the segment lengths
> > are zero when ->cblist is empty to be guarded by something like
> > IS_ENABLED(CONFIG_PROVE_RCU). Of course, checks of this sort need to
> > be confined to those portions of rcu_do_batch() that are excluding other
> > accesses to ->cblist.
>
> Right.
>
> >
> > But if rcu_segcblist_countseq() is invoked only when a specific trace
> > event is enabled, it should be OK to have it guarded only by RCU_TRACE.
>
> Indeed, so I think we are good.
Thanks, so the only changes are to patches 2/7 and 4/7, which I will work on.
1/7 was already taken by Paul. For 7/7, it sounds like I did not understand
Paul's reworks on the comments and we're still discussing it; but some
comments are better than none, so I am OK with Paul's version of it.
thanks,
- Joel
On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote:
> Track how the segcb list changes before/after acceleration, during
> queuing and during dequeuing.
>
> This has proved useful to discover an optimization to avoid unwanted GP
> requests when there are no callbacks accelerated. The overhead is minimal as
> each segment's length is now stored in the respective segment.
>
> Reviewed-by: Frederic Weisbecker <[email protected]>
> Reviewed-by: Neeraj Upadhyay <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
I thought I had responded to this one, but I find no record of having
done so. Please see below for some questions and comments.
> ---
> include/trace/events/rcu.h | 25 +++++++++++++++++++++++++
> kernel/rcu/rcu_segcblist.c | 34 ++++++++++++++++++++++++++++++++++
> kernel/rcu/rcu_segcblist.h | 5 +++++
> kernel/rcu/tree.c | 9 +++++++++
> 4 files changed, 73 insertions(+)
>
> diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> index 155b5cb43cfd..5f8f2ee1a936 100644
> --- a/include/trace/events/rcu.h
> +++ b/include/trace/events/rcu.h
> @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> __entry->qlen)
> );
>
> +TRACE_EVENT_RCU(rcu_segcb_stats,
> +
> + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
> +
> + TP_ARGS(ctx, cb_count, gp_seq),
> +
> + TP_STRUCT__entry(
> + __field(const char *, ctx)
> + __array(int, cb_count, RCU_CBLIST_NSEGS)
> + __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
> + ),
> +
> + TP_fast_assign(
> + __entry->ctx = ctx;
> + memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * sizeof(int));
> + memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long));
OK, I will bite...
Why not include linux/rcu_segcblist.h and then fold all of
rcu_segcblist_countseq() and much of __trace_rcu_segcb_stats() into
this TP_fast_assign()? If you must have "unsigned int" for cb_count,
then you can use "do {} while (0)" to declare the index variable, but
if you can use unsigned long as suggested by Neeraj, you can just use
a pair of memcpy()s.
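Roughly like this, say (an untested sketch that assumes the
rcu_segcblist definitions and the seglen accessor are visible from this
trace header):

	TP_PROTO(const char *ctx, struct rcu_segcblist *rsclp),

	TP_ARGS(ctx, rsclp),

	TP_fast_assign(
		__entry->ctx = ctx;
		do {
			int i;

			for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
				__entry->cb_count[i] = rcu_segcblist_get_seglen(rsclp, i);
				__entry->gp_seq[i] = rsclp->gp_seq[i];
			}
		} while (0);
	),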
> + ),
> +
> + TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, NEXT=%d) "
> + "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
> + __entry->cb_count[0], __entry->cb_count[1], __entry->cb_count[2], __entry->cb_count[3],
> + __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
> +
> +);
> +
> /*
> * Tracepoint for the registration of a single RCU callback of the special
> * kvfree() form. The first argument is the RCU type, the second argument
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index 357c19bbcb00..2a03949d0b82 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -14,6 +14,7 @@
> #include <linux/types.h>
>
> #include "rcu_segcblist.h"
> +#include "rcu.h"
>
> /* Initialize simple callback list. */
> void rcu_cblist_init(struct rcu_cblist *rclp)
> @@ -328,6 +329,39 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
> rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
> }
>
> +/*
> > + * Return how many CBs each segment has, along with their gp_seq values.
> + *
> + * This function is O(N) where N is the number of segments. Only used from
> + * tracing code which is usually disabled in production.
> + */
> +#ifdef CONFIG_RCU_TRACE
> +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> + int cbcount[RCU_CBLIST_NSEGS],
> + unsigned long gpseq[RCU_CBLIST_NSEGS])
> +{
> + int i;
> +
> + for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> + gpseq[i] = rsclp->gp_seq[i];
> + }
> +}
> +
> +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context)
> +{
> + int cbs[RCU_CBLIST_NSEGS];
> + unsigned long gps[RCU_CBLIST_NSEGS];
> +
> + if (!trace_rcu_segcb_stats_enabled())
> + return;
Can't you rely on the trace system to enable and disable this trace
event? If the thought is to save instructions, then moving all this
into TP_fast_assign() enables the trace system to deal with that as well.
> + rcu_segcblist_countseq(rsclp, cbs, gps);
> +
> + trace_rcu_segcb_stats(context, cbs, gps);
> +}
> +#endif
> +
> /*
> * Extract only those callbacks still pending (not yet ready to be
> * invoked) from the specified rcu_segcblist structure and place them in
> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> index cd35c9faaf51..7750734fa116 100644
> --- a/kernel/rcu/rcu_segcblist.h
> +++ b/kernel/rcu/rcu_segcblist.h
> @@ -103,3 +103,8 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
> bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
> void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
> struct rcu_segcblist *src_rsclp);
> +#ifdef CONFIG_RCU_TRACE
> +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context);
> +#else
> +#define __trace_rcu_segcb_stats(...)
> +#endif
And letting TP_fast_assign() do the work allows this to go away...
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 24c00020ab83..f6c6653b3ec2 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1497,6 +1497,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
> if (!rcu_segcblist_pend_cbs(&rdp->cblist))
> return false;
>
> + __trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
...and you don't need the "__" everywhere.
So what am I missing here?
Thanx, Paul
On Tue, 10 Nov 2020 16:35:30 -0800
"Paul E. McKenney" <[email protected]> wrote:
> > +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context)
> > +{
> > + int cbs[RCU_CBLIST_NSEGS];
> > + unsigned long gps[RCU_CBLIST_NSEGS];
> > +
> > + if (!trace_rcu_segcb_stats_enabled())
> > + return;
>
> Can't you rely on the trace system to enable and disable this trace
> event? If the thought is to save instructions, then moving all this
> into TP_fast_assign() enables the trace system to deal with that as well.
>
> > + rcu_segcblist_countseq(rsclp, cbs, gps);
> > +
> > + trace_rcu_segcb_stats(context, cbs, gps);
> > +}
> > +#endif
Yeah, I agree with Paul. I think it is possible to move this all into the
TP_fast_assign. If you have trouble doing so, let me know.
-- Steve
On Wed, Nov 11, 2020 at 08:49:41AM -0500, Steven Rostedt wrote:
> On Tue, 10 Nov 2020 16:35:30 -0800
> "Paul E. McKenney" <[email protected]> wrote:
>
> > > +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char *context)
> > > +{
> > > + int cbs[RCU_CBLIST_NSEGS];
> > > + unsigned long gps[RCU_CBLIST_NSEGS];
> > > +
> > > + if (!trace_rcu_segcb_stats_enabled())
> > > + return;
> >
> > Can't you rely on the trace system to enable and disable this trace
> > event? If the thought is to save instructions, then moving all this
> > into TP_fast_assign() enables the trace system to deal with that as well.
Makes sense.
> > > + rcu_segcblist_countseq(rsclp, cbs, gps);
> > > +
> > > + trace_rcu_segcb_stats(context, cbs, gps);
> > > +}
> > > +#endif
>
> Yeah, I agree with Paul. I think it is possible to move this all into the
> TP_fast_assign. If you have trouble doing so, let me know.
Sure. Last time I tried this for this patch, I ran into an issue. I will
try again and let you know if I need help.
thanks,
- Joel
>
> -- Steve