These patches add trace events support for preempt and irq disable/enable
events.
Changes since v5:
- Use trace_*_rcuidle variants (thanks Steve!)
Here's an example of how Android's systrace will be using it to show critical
sections as a Gantt chart: http://imgur.com/download/TZplEVp
Links to earlier series:
v5: https://lkml.org/lkml/2017/9/12/507
v4: https://lkml.org/lkml/2017/9/6/724
v3: https://lkml.org/lkml/2017/9/3/15
Joel Fernandes (2):
tracing: Prepare to add preempt and irq trace events
tracing: Add support for preempt and irq enable/disable events
include/linux/ftrace.h | 3 +-
include/linux/irqflags.h | 4 +-
include/trace/events/preemptirq.h | 66 ++++++++++++++
kernel/trace/Kconfig | 9 ++
kernel/trace/Makefile | 1 +
kernel/trace/trace_irqsoff.c | 176 +++++++++++++++++++++++++++++++-------
6 files changed, 226 insertions(+), 33 deletions(-)
create mode 100644 include/trace/events/preemptirq.h
Cc: Steven Rostedt <[email protected]>
Cc: Peter Zijlstra <[email protected]>
--
2.14.1.821.g8fa685d3b7-goog
Preempt and irq trace events can be used to mark the start and end of
an atomic section, so that a trace viewer like systrace can graphically
display these sections and correlate them with latencies and scheduling
issues.
This also serves as a prelude to rewriting the preemptoff and irqsoff
tracers on top of synthetic events or probes, bringing the numerous
benefits of the trace event infrastructure to these events.
Cc: Steven Rostedt <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: [email protected]
Signed-off-by: Joel Fernandes <[email protected]>
---
include/linux/ftrace.h | 3 +-
include/linux/irqflags.h | 4 +--
include/trace/events/preemptirq.h | 66 +++++++++++++++++++++++++++++++++++++++
kernel/trace/Kconfig | 9 ++++++
kernel/trace/Makefile | 1 +
kernel/trace/trace_irqsoff.c | 57 ++++++++++++++++++++++++++++++++-
6 files changed, 136 insertions(+), 4 deletions(-)
create mode 100644 include/trace/events/preemptirq.h
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2e028854bac7..768e49b8c80f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -742,7 +742,8 @@ static inline unsigned long get_lock_parent_ip(void)
static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
extern void trace_preempt_on(unsigned long a0, unsigned long a1);
extern void trace_preempt_off(unsigned long a0, unsigned long a1);
#else
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5dd1272d1ab2..6ad6732a4e98 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -44,8 +44,8 @@
# define INIT_TRACE_IRQFLAGS
#endif
-#if defined(CONFIG_IRQSOFF_TRACER) || \
- defined(CONFIG_PREEMPT_TRACER)
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) || \
+ defined(CONFIG_PREEMPTIRQ_EVENTS)
extern void stop_critical_timings(void);
extern void start_critical_timings(void);
#else
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644
index 000000000000..734b7e2a3890
--- /dev/null
+++ b/include/trace/events/preemptirq.h
@@ -0,0 +1,66 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+ TP_ARGS(ip, parent_ip),
+
+ TP_STRUCT__entry(
+ __field(u32, caller_offs)
+ __field(u32, parent_offs)
+ ),
+
+ TP_fast_assign(
+ __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+ __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+ ),
+
+ TP_printk("caller=%pF parent=%pF",
+ (void *)((unsigned long)(_stext) + __entry->caller_offs),
+ (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+DEFINE_EVENT(preemptirq_template, irq_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..420666152de8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,15 @@ config FUNCTION_GRAPH_TRACER
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, CONFIG_DEBUG_PREEMPT must
+ be enabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 90f2701d92a7..9f62eee61f14 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 26c688740742..f3087e7278cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,9 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -777,14 +780,29 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
#endif
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+static DEFINE_PER_CPU(int, tracing_preempt_cpu);
+
void start_critical_timings(void)
{
+ if (this_cpu_read(tracing_preempt_cpu))
+ trace_preempt_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+
+ if (this_cpu_read(tracing_irq_cpu))
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+
start_critical_timings_tracer();
}
EXPORT_SYMBOL_GPL(start_critical_timings);
void stop_critical_timings(void)
{
+ if (this_cpu_read(tracing_preempt_cpu))
+ trace_preempt_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+
+ if (this_cpu_read(tracing_irq_cpu))
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+
stop_critical_timings_tracer();
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
@@ -792,24 +810,48 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
void trace_hardirqs_on(void)
{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
void trace_hardirqs_off(void)
{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_off();
}
EXPORT_SYMBOL(trace_hardirqs_off);
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
tracer_hardirqs_off_caller(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -831,14 +873,27 @@ inline void print_irqtrace_events(struct task_struct *curr)
}
#endif
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
+ if (!this_cpu_read(tracing_preempt_cpu))
+ return;
+
+ trace_preempt_enable_rcuidle(a0, a1);
tracer_preempt_on(a0, a1);
+
+ this_cpu_write(tracing_preempt_cpu, 0);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
+ if (this_cpu_read(tracing_preempt_cpu))
+ return;
+
+ this_cpu_write(tracing_preempt_cpu, 1);
+
+ trace_preempt_disable_rcuidle(a0, a1);
tracer_preempt_off(a0, a1);
}
#endif
--
2.14.1.821.g8fa685d3b7-goog
In preparation for adding irqsoff and preemptoff enable and disable trace
events, move the required functions and code around to make it easier to add
these events in a later patch. This patch is code movement only; there is no
functional change.
Cc: Steven Rostedt <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: [email protected]
Signed-off-by: Joel Fernandes <[email protected]>
---
kernel/trace/trace_irqsoff.c | 121 ++++++++++++++++++++++++++++++++-----------
1 file changed, 91 insertions(+), 30 deletions(-)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..26c688740742 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,7 @@
#include "trace.h"
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -432,19 +433,17 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
}
/* start and stop critical timings used to for stoppage (in idle) */
-void start_critical_timings(void)
+static inline void start_critical_timings_tracer(void)
{
if (preempt_trace() || irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL_GPL(start_critical_timings);
-void stop_critical_timings(void)
+static inline void stop_critical_timings_tracer(void)
{
if (preempt_trace() || irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL_GPL(stop_critical_timings);
#ifdef CONFIG_IRQSOFF_TRACER
#ifdef CONFIG_PROVE_LOCKING
@@ -462,64 +461,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
-/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -781,3 +760,85 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
core_initcall(init_irqsoff_tracer);
+#else /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+static inline void start_critical_timings_tracer(void) { }
+static inline void stop_critical_timings_tracer(void) { }
+#endif
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+void start_critical_timings(void)
+{
+ start_critical_timings_tracer();
+}
+EXPORT_SYMBOL_GPL(start_critical_timings);
+
+void stop_critical_timings(void)
+{
+ stop_critical_timings_tracer();
+}
+EXPORT_SYMBOL_GPL(stop_critical_timings);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+ tracer_hardirqs_on();
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ tracer_hardirqs_on_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ tracer_preempt_off(a0, a1);
+}
+#endif
--
2.14.1.821.g8fa685d3b7-goog
Peter,
Do you have any qualms with this patch set?
-- Steve
On Thu, 21 Sep 2017 18:50:22 -0700
Joel Fernandes <[email protected]> wrote:
> These patches add trace events support for preempt and irq disable/enable
> events.
>
> Changes since v5:
> - Use trace_*_rcuidle variants (thanks Steve!)
>
> Here's an example of how Android's systrace will be using it to show critical
> sections as a gantt chart: http://imgur.com/download/TZplEVp
>
> Links to earlier series:
> v5: https://lkml.org/lkml/2017/9/12/507
> v4: https://lkml.org/lkml/2017/9/6/724
> v3: https://lkml.org/lkml/2017/9/3/15
>
> Joel Fernandes (2):
> tracing: Prepare to add preempt and irq trace events
> tracing: Add support for preempt and irq enable/disable events
>
> include/linux/ftrace.h | 3 +-
> include/linux/irqflags.h | 4 +-
> include/trace/events/preemptirq.h | 66 ++++++++++++++
> kernel/trace/Kconfig | 9 ++
> kernel/trace/Makefile | 1 +
> kernel/trace/trace_irqsoff.c | 176 +++++++++++++++++++++++++++++++-------
> 6 files changed, 226 insertions(+), 33 deletions(-)
> create mode 100644 include/trace/events/preemptirq.h
>
> Cc: Steven Rostedt <[email protected]>
> Cc: Peter Zilstra <[email protected]>
On Thu, Sep 21, 2017 at 06:50:24PM -0700, Joel Fernandes wrote:
> +DECLARE_EVENT_CLASS(preemptirq_template,
> +
> + TP_PROTO(unsigned long ip, unsigned long parent_ip),
> +
> + TP_ARGS(ip, parent_ip),
> +
> + TP_STRUCT__entry(
> + __field(u32, caller_offs)
> + __field(u32, parent_offs)
> + ),
> +
> + TP_fast_assign(
> + __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
> + __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
> + ),
> +
> + TP_printk("caller=%pF parent=%pF",
> + (void *)((unsigned long)(_stext) + __entry->caller_offs),
> + (void *)((unsigned long)(_stext) + __entry->parent_offs))
> +);
So I don't get that function prototype. Why do we need two IPs?
> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
> index 26c688740742..f3087e7278cb 100644
> --- a/kernel/trace/trace_irqsoff.c
> +++ b/kernel/trace/trace_irqsoff.c
> @@ -16,6 +16,9 @@
>
> #include "trace.h"
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/preemptirq.h>
> +
> #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
> static struct trace_array *irqsoff_trace __read_mostly;
> static int tracer_enabled __read_mostly;
> @@ -777,14 +780,29 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
> static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
> #endif
>
> +static DEFINE_PER_CPU(int, tracing_irq_cpu);
> +static DEFINE_PER_CPU(int, tracing_preempt_cpu);
> +
> void start_critical_timings(void)
> {
> + if (this_cpu_read(tracing_preempt_cpu))
> + trace_preempt_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> +
> + if (this_cpu_read(tracing_irq_cpu))
> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> +
> start_critical_timings_tracer();
> }
> EXPORT_SYMBOL_GPL(start_critical_timings);
>
> void stop_critical_timings(void)
> {
> + if (this_cpu_read(tracing_preempt_cpu))
> + trace_preempt_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> +
> + if (this_cpu_read(tracing_irq_cpu))
> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> +
> stop_critical_timings_tracer();
> }
> EXPORT_SYMBOL_GPL(stop_critical_timings);
And I feel these yield somewhat odd semantics, does that need explaining
somewhere?
> @@ -792,24 +810,48 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
> #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
> void trace_hardirqs_on(void)
> {
> + if (!this_cpu_read(tracing_irq_cpu))
> + return;
> +
> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> tracer_hardirqs_on();
> +
> + this_cpu_write(tracing_irq_cpu, 0);
> }
> EXPORT_SYMBOL(trace_hardirqs_on);
>
> void trace_hardirqs_off(void)
> {
> + if (this_cpu_read(tracing_irq_cpu))
> + return;
> +
> + this_cpu_write(tracing_irq_cpu, 1);
> +
> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> tracer_hardirqs_off();
> }
> EXPORT_SYMBOL(trace_hardirqs_off);
>
> __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
> {
> + if (!this_cpu_read(tracing_irq_cpu))
> + return;
> +
> + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
> tracer_hardirqs_on_caller(caller_addr);
> +
> + this_cpu_write(tracing_irq_cpu, 0);
> }
> EXPORT_SYMBOL(trace_hardirqs_on_caller);
>
> __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
> {
> + if (this_cpu_read(tracing_irq_cpu))
> + return;
> +
> + this_cpu_write(tracing_irq_cpu, 1);
> +
> + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
> tracer_hardirqs_off_caller(caller_addr);
> }
> EXPORT_SYMBOL(trace_hardirqs_off_caller);
> @@ -831,14 +873,27 @@ inline void print_irqtrace_events(struct task_struct *curr)
> }
> #endif
>
> -#ifdef CONFIG_PREEMPT_TRACER
> +#if defined(CONFIG_PREEMPT_TRACER) || \
> + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
> void trace_preempt_on(unsigned long a0, unsigned long a1)
> {
> + if (!this_cpu_read(tracing_preempt_cpu))
> + return;
> +
> + trace_preempt_enable_rcuidle(a0, a1);
> tracer_preempt_on(a0, a1);
> +
> + this_cpu_write(tracing_preempt_cpu, 0);
> }
>
> void trace_preempt_off(unsigned long a0, unsigned long a1)
> {
> + if (this_cpu_read(tracing_preempt_cpu))
> + return;
> +
> + this_cpu_write(tracing_preempt_cpu, 1);
> +
> + trace_preempt_disable_rcuidle(a0, a1);
> tracer_preempt_off(a0, a1);
> }
> #endif
And here you assume things like double on / double off don't happen,
which might well be so, but does seem somewhat fragile.
So all in all I don't see anything particularly wrong, but I see lots
that could do with explaining.
Hi Peter,
On Fri, Sep 22, 2017 at 2:02 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Sep 21, 2017 at 06:50:24PM -0700, Joel Fernandes wrote:
>> +DECLARE_EVENT_CLASS(preemptirq_template,
>> +
>> + TP_PROTO(unsigned long ip, unsigned long parent_ip),
>> +
>> + TP_ARGS(ip, parent_ip),
>> +
>> + TP_STRUCT__entry(
>> + __field(u32, caller_offs)
>> + __field(u32, parent_offs)
>> + ),
>> +
>> + TP_fast_assign(
>> + __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
>> + __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
>> + ),
>> +
>> + TP_printk("caller=%pF parent=%pF",
>> + (void *)((unsigned long)(_stext) + __entry->caller_offs),
>> + (void *)((unsigned long)(_stext) + __entry->parent_offs))
>> +);
>
> So I don't get that function prototype. Why do we need two IPs?
This follows the exact same "model" used by the preempt/irqsoff
tracer. The rationale, IIUC, is that a lot of the time the actual
preempt-disable is done by, say, a lock primitive, but the caller of
that lock function is what really caused the preempt-disable event and
is also of interest, so the second IP gives that extra context without
needing a full stack trace. For example, the spinlock acquired in
add_wait_queue disabled preemption here (see the sketch after the trace
line):
grep-1041 [002] d..1 80.363455: preempt_disable:
caller=_raw_spin_lock_irqsave+0x1d/0x40
parent=add_wait_queue+0x15/0x50
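For reference, a rough, illustrative sketch of the call chain behind that
trace line (not part of the patch; example_two_ips() is made up, the rest
are existing kernel symbols):

#include <linux/wait.h>

/*
 * add_wait_queue() takes the wait-queue lock; the lock primitive disables
 * preemption on the 0 -> 1 transition, and the tracepoint then records
 * both the lock primitive (caller=) and its caller (parent=).
 */
static void example_two_ips(struct wait_queue_head *wq,
			    struct wait_queue_entry *wait)
{
	add_wait_queue(wq, wait);
		/* -> _raw_spin_lock_irqsave()              (caller=)       */
		/*    -> preempt_disable(), count 0 -> 1                    */
		/*       -> trace_preempt_off(CALLER_ADDR0, parent ip),     */
		/*          the parent ip being add_wait_queue() (parent=)  */
}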
>
>> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
>> index 26c688740742..f3087e7278cb 100644
>> --- a/kernel/trace/trace_irqsoff.c
>> +++ b/kernel/trace/trace_irqsoff.c
>> @@ -16,6 +16,9 @@
>>
>> #include "trace.h"
>>
>> +#define CREATE_TRACE_POINTS
>> +#include <trace/events/preemptirq.h>
>> +
>> #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
>> static struct trace_array *irqsoff_trace __read_mostly;
>> static int tracer_enabled __read_mostly;
>> @@ -777,14 +780,29 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
>> static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
>> #endif
>>
>> +static DEFINE_PER_CPU(int, tracing_irq_cpu);
>> +static DEFINE_PER_CPU(int, tracing_preempt_cpu);
>> +
>> void start_critical_timings(void)
>> {
>> + if (this_cpu_read(tracing_preempt_cpu))
>> + trace_preempt_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> +
>> + if (this_cpu_read(tracing_irq_cpu))
>> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> +
>> start_critical_timings_tracer();
>> }
>> EXPORT_SYMBOL_GPL(start_critical_timings);
>>
>> void stop_critical_timings(void)
>> {
>> + if (this_cpu_read(tracing_preempt_cpu))
>> + trace_preempt_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> +
>> + if (this_cpu_read(tracing_irq_cpu))
>> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> +
>> stop_critical_timings_tracer();
>> }
>> EXPORT_SYMBOL_GPL(stop_critical_timings);
>
> And I feel these yield somewhat odd semantics, does that need explaining
> somewhere?
Maybe I can add a comment here, if you prefer that. By 'semantics', do
you mean the 'critical' vs. 'atomic' distinction, or the context in
which this function is supposed to be used?
>
>> @@ -792,24 +810,48 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
>> #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
>> void trace_hardirqs_on(void)
>> {
>> + if (!this_cpu_read(tracing_irq_cpu))
>> + return;
>> +
>> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> tracer_hardirqs_on();
>> +
>> + this_cpu_write(tracing_irq_cpu, 0);
>> }
>> EXPORT_SYMBOL(trace_hardirqs_on);
>>
>> void trace_hardirqs_off(void)
>> {
>> + if (this_cpu_read(tracing_irq_cpu))
>> + return;
>> +
>> + this_cpu_write(tracing_irq_cpu, 1);
>> +
>> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> tracer_hardirqs_off();
>> }
>> EXPORT_SYMBOL(trace_hardirqs_off);
>>
>> __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
>> {
>> + if (!this_cpu_read(tracing_irq_cpu))
>> + return;
>> +
>> + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
>> tracer_hardirqs_on_caller(caller_addr);
>> +
>> + this_cpu_write(tracing_irq_cpu, 0);
>> }
>> EXPORT_SYMBOL(trace_hardirqs_on_caller);
Hopefully I addressed your concern properly here?
>>
>> __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
>> {
>> + if (this_cpu_read(tracing_irq_cpu))
>> + return;
>> +
>> + this_cpu_write(tracing_irq_cpu, 1);
>> +
>> + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
>> tracer_hardirqs_off_caller(caller_addr);
>> }
>> EXPORT_SYMBOL(trace_hardirqs_off_caller);
>> @@ -831,14 +873,27 @@ inline void print_irqtrace_events(struct task_struct *curr)
>> }
>> #endif
>>
>> -#ifdef CONFIG_PREEMPT_TRACER
>> +#if defined(CONFIG_PREEMPT_TRACER) || \
>> + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
>> void trace_preempt_on(unsigned long a0, unsigned long a1)
>> {
>> + if (!this_cpu_read(tracing_preempt_cpu))
>> + return;
>> +
>> + trace_preempt_enable_rcuidle(a0, a1);
>> tracer_preempt_on(a0, a1);
>> +
>> + this_cpu_write(tracing_preempt_cpu, 0);
>> }
>>
>> void trace_preempt_off(unsigned long a0, unsigned long a1)
>> {
>> + if (this_cpu_read(tracing_preempt_cpu))
>> + return;
>> +
>> + this_cpu_write(tracing_preempt_cpu, 1);
>> +
>> + trace_preempt_disable_rcuidle(a0, a1);
>> tracer_preempt_off(a0, a1);
>> }
>> #endif
>
> And here you assume things like double on / double off don't happen,
> which might well be so, but does seem somewhat fragile.
>
We are handling the cases where these functions might be called twice,
but we are only interested in the first time they're called. I hit a
deadlock when I didn't add such protection to trace_hardirqs_off, so I
added it to the trace_hardirqs* and trace_preempt* functions as well,
just to be extra safe and to keep things consistent. Hope I understood
your concern correctly; if not, please let me know, thanks.
>
> So all in all I don't see anything particularly wrong, but I see lots
> that could do with explaining.
Sure, thanks for taking a look :) Let me know if you had any other
comments about this patch :)
regards,
- Joel
On Fri, Sep 22, 2017 at 11:28:30AM -0700, Joel Fernandes wrote:
> >> + TP_printk("caller=%pF parent=%pF",
> >> + (void *)((unsigned long)(_stext) + __entry->caller_offs),
> >> + (void *)((unsigned long)(_stext) + __entry->parent_offs))
> >> +);
> >
> > So I don't get that function prototype. Why do we need two IPs?
>
> This follows the exact same "model" used by the preempt/irqsoff
> tracer. The rationale IIUC is a lot of times the actual preempt event
> is done by say a lock, but the actual caller of the lock function
> causing the preempt disable event is also of interest so this gives
> some more information of interest without needing the full stack, for
> example the spinlock acquiring in add_wait_queue disabled preemption
> here:
>
> grep-1041 [002] d..1 80.363455: preempt_disable:
> caller=_raw_spin_lock_irqsave+0x1d/0x40
> parent=add_wait_queue+0x15/0x50
I suppose that sort-of makes sense for the preempt-tracer, but it's a
weird thing for a generic tracepoint.
> >> void start_critical_timings(void)
> >> {
> >> + if (this_cpu_read(tracing_preempt_cpu))
> >> + trace_preempt_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >> +
> >> + if (this_cpu_read(tracing_irq_cpu))
> >> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >> +
> >> start_critical_timings_tracer();
> >> }
> >> EXPORT_SYMBOL_GPL(start_critical_timings);
> >>
> >> void stop_critical_timings(void)
> >> {
> >> + if (this_cpu_read(tracing_preempt_cpu))
> >> + trace_preempt_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >> +
> >> + if (this_cpu_read(tracing_irq_cpu))
> >> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >> +
> >> stop_critical_timings_tracer();
> >> }
> >> EXPORT_SYMBOL_GPL(stop_critical_timings);
> >
> > And I feel these yield somewhat odd semantics, does that need explaining
> > somewhere?
>
> Maybe I can add a comment here, if you prefer that. When you meant
> semantics, do you mean 'critical' vs 'atomic' thing or do you mean the
> semantics/context of how this function is supposed to be used?
I would add the comment to the tracepoint definition.
On semantics, the whole stop/start excludes a fair bunch of walltime
from our measurement, I feel that needs to be called out and enumerated
(when time goes missing and why).
Given that the idle thread runs with preempt-off I understand its
purpose from the POV of the preempt-tracer, but it's 'weird' behaviour
if you're looking at it from a pure tracepoint pov.
> >> void trace_preempt_off(unsigned long a0, unsigned long a1)
> >> {
> >> + if (this_cpu_read(tracing_preempt_cpu))
> >> + return;
> >> +
> >> + this_cpu_write(tracing_preempt_cpu, 1);
> >> +
> >> + trace_preempt_disable_rcuidle(a0, a1);
> >> tracer_preempt_off(a0, a1);
> >> }
> >> #endif
> >
> > And here you assume things like double on / double off don't happen,
> > which might well be so, but does seem somewhat fragile.
> >
>
> We are handling the cases where these functions might be called twice,
> but we are only interested in the first time they're called. I caught
> a dead lock happen when I didn't add such protection to
> trace_hardirqs_off so I added to these to the trace_hardirqs* and
> trace_preempt* ones as well to just to be extra safe and keep it
> consistent. Hope I understood your concern correctly, if not please
> let me know, thanks.
Under what conditions were they called twice? That seems like something
that should not happen to begin with. Esp the one I left quoted above,
disabling when its already disabled sounds like fail. So please provide
more details on the scenario you're working around.
On Mon, 25 Sep 2017 10:34:32 +0200
Peter Zijlstra <[email protected]> wrote:
> >
> > grep-1041 [002] d..1 80.363455: preempt_disable:
> > caller=_raw_spin_lock_irqsave+0x1d/0x40
> > parent=add_wait_queue+0x15/0x50
>
> I suppose that sort-of makes sense for the preempt-tracer, but its a
> weird thing for a generic tracepoint.
I still find it very useful, even as a tracepoint.
>
> > >> void start_critical_timings(void)
> > >> {
> > >> + if (this_cpu_read(tracing_preempt_cpu))
> > >> + trace_preempt_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > >> +
> > >> + if (this_cpu_read(tracing_irq_cpu))
> > >> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > >> +
> > >> start_critical_timings_tracer();
> > >> }
> > >> EXPORT_SYMBOL_GPL(start_critical_timings);
> > >>
> > >> void stop_critical_timings(void)
> > >> {
> > >> + if (this_cpu_read(tracing_preempt_cpu))
> > >> + trace_preempt_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > >> +
> > >> + if (this_cpu_read(tracing_irq_cpu))
> > >> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > >> +
> > >> stop_critical_timings_tracer();
> > >> }
> > >> EXPORT_SYMBOL_GPL(stop_critical_timings);
> > >
> > > And I feel these yield somewhat odd semantics, does that need explaining
> > > somewhere?
> >
> > Maybe I can add a comment here, if you prefer that. When you meant
> > semantics, do you mean 'critical' vs 'atomic' thing or do you mean the
> > semantics/context of how this function is supposed to be used?
>
> I would add the comment to the tracepoint definition.
>
> On semantics, the whole stop/start excludes a fair bunch of walltime
> from our measurement, I feel that needs to be called out and enumerated
> (when time goes missing and why).
>
> Given that the idle thread runs with preempt-off I understand its
> purpose from the POV from the preempt-tracer, but its 'weird' behaviour
> if you're looking at it from a pure tracepoint pov.
You mean you want to trace all calls to preempt and irq off even if
preempt and irqs are already off?
>
> > >> void trace_preempt_off(unsigned long a0, unsigned long a1)
> > >> {
> > >> + if (this_cpu_read(tracing_preempt_cpu))
> > >> + return;
> > >> +
> > >> + this_cpu_write(tracing_preempt_cpu, 1);
> > >> +
> > >> + trace_preempt_disable_rcuidle(a0, a1);
> > >> tracer_preempt_off(a0, a1);
> > >> }
> > >> #endif
> > >
> > > And here you assume things like double on / double off don't happen,
> > > which might well be so, but does seem somewhat fragile.
> > >
> >
> > We are handling the cases where these functions might be called twice,
> > but we are only interested in the first time they're called. I caught
> > a dead lock happen when I didn't add such protection to
> > trace_hardirqs_off so I added to these to the trace_hardirqs* and
> > trace_preempt* ones as well to just to be extra safe and keep it
> > consistent. Hope I understood your concern correctly, if not please
> > let me know, thanks.
>
> Under what conditions where they called twice? That seems like something
> that should not happen to begin with. Esp the one I left quoted above,
> disabling when its already disabled sounds like fail. So please provide
> more details on the scenario you're working around.
I'm a little confused by this too.
-- Steve
On Mon, Sep 25, 2017 at 06:15:28AM -0400, Steven Rostedt wrote:
> On Mon, 25 Sep 2017 10:34:32 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > >
> > > grep-1041 [002] d..1 80.363455: preempt_disable:
> > > caller=_raw_spin_lock_irqsave+0x1d/0x40
> > > parent=add_wait_queue+0x15/0x50
> >
> > I suppose that sort-of makes sense for the preempt-tracer, but its a
> > weird thing for a generic tracepoint.
>
> I still find it very useful, even as a tracepoint.
Didn't say anything about useful, just that it might not be expected.
> > I would add the comment to the tracepoint definition.
> >
> > On semantics, the whole stop/start excludes a fair bunch of walltime
> > from our measurement, I feel that needs to be called out and enumerated
> > (when time goes missing and why).
> >
> > Given that the idle thread runs with preempt-off I understand its
> > purpose from the POV from the preempt-tracer, but its 'weird' behaviour
> > if you're looking at it from a pure tracepoint pov.
>
> You mean you want to trace all calls to preempt and irq off even if
> preempt and irqs are already off?
Sure, why not? This stuff naturally nests, and who is to say it's not a
useful thing to trace all of them?
By also tracing the nested sections you can, for instance, see how much
you'd really win by getting rid of the outer one. If, for instance, the
outer only accounts for 1% of the time, while the inner ones are
interlinked and span the other 99%, there's more work to do than if that
were not the case.
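To illustrate, a small sketch of that scenario (the example_* functions are
made up; the event names are the ones this patch set adds):

#include <linux/preempt.h>

/*
 * With the current series only the outermost transitions generate events;
 * the nested disable/enable pair below is invisible even if the inner
 * section accounts for most of the preempt-disabled time.
 */
static void example_preempt_inner(void)
{
	preempt_disable();		/* count 1 -> 2: no event */
	/* ... bulk of the work ... */
	preempt_enable();		/* count 2 -> 1: no event */
}

static void example_preempt_outer(void)
{
	preempt_disable();		/* count 0 -> 1: preempt_disable event emitted */
	example_preempt_inner();
	preempt_enable();		/* count 1 -> 0: preempt_enable event emitted  */
}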
On Mon, 25 Sep 2017 12:32:23 +0200
Peter Zijlstra <[email protected]> wrote:
> > You mean you want to trace all calls to preempt and irq off even if
> > preempt and irqs are already off?
>
> Sure, why not? This stuff naturally nests, and who is to say its not a
> useful thing to trace all of them?
>
> By also tracing the nested sections you can, for instance, see how much
> you'd really win by getting rid of the outer one. If, for instance, the
> outer only accounts for 1% of the time, while the inner ones are
> interlinked and span the other 99%, there's more work to do than if that
> were not the case.
If we do this we need a field to record whether preemption or irqs were
actually toggled by that call. That way, filters could easily be added
to show only what this patch set currently provides.
-- Steve
On Mon, Sep 25, 2017 at 3:52 AM, Steven Rostedt <[email protected]> wrote:
> On Mon, 25 Sep 2017 12:32:23 +0200
> Peter Zijlstra <[email protected]> wrote:
>
>
>> > You mean you want to trace all calls to preempt and irq off even if
>> > preempt and irqs are already off?
>>
>> Sure, why not? This stuff naturally nests, and who is to say its not a
>> useful thing to trace all of them?
>>
>> By also tracing the nested sections you can, for instance, see how much
>> you'd really win by getting rid of the outer one. If, for instance, the
>> outer only accounts for 1% of the time, while the inner ones are
>> interlinked and span the other 99%, there's more work to do than if that
>> were not the case.
>
> If we do this we need a field to record if the preemption or irqs were
> toggled by that call. Something that filters could easily be added to
> only show what this patch set has.
I request that we please not do this for my patchset, there are a
number of reasons in my mind:
1. trace_preempt_off in the existing code is only called the first time
preemption is turned off. This is the definition of "preempt off": it's
the first time preemption is actually turned off, and it has little to
do with going to a deeper preempt count. Whether the count increases
further or not, preemption is already off, and that is confirmed by the
first preempt-off event (see the sketch after this list).
This is how I read it in the comments in sched/core.c as well:
"
* If the value passed in is equal to the current preempt count
* then we just disabled preemption."
This is how I based this patchset as well; again, it's not my use case
and it can be a future patch if it's useful to track this.
2. This stuff is already a bit trace-heavy; I prefer not to generate an
event every time the preempt count increases. Of course filters could
help, but then we still have the filtering overhead for a use case that
I am not really targeting with this patchset.
3. It will complicate the patch more: apart from adding the filters
Steven suggested, it would also mean changing how
preempt_latency_start in sched/core.c works.
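(For reference, a paraphrased sketch of that existing check in
kernel/sched/core.c; the real function is preempt_latency_start(), this
copy only illustrates point 1 above:)

#include <linux/ftrace.h>
#include <linux/preempt.h>

/* The tracepoint is reached only when the count just became equal to the
 * value being added, i.e. on the first, outermost preempt-disable. */
static inline void example_preempt_latency_start(int val)
{
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
}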
Do you mind if we please keep this as a 'future' use case for a future
patch? It's not my use case at all for this patchset and not what I was
intending.
I will reply to Peter's other comments on the other email shortly.
thanks!
- Joel
Hi Peter,
On Mon, Sep 25, 2017 at 1:34 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, Sep 22, 2017 at 11:28:30AM -0700, Joel Fernandes wrote:
[...]
>> >> void start_critical_timings(void)
>> >> {
>> >> + if (this_cpu_read(tracing_preempt_cpu))
>> >> + trace_preempt_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> >> +
>> >> + if (this_cpu_read(tracing_irq_cpu))
>> >> + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> >> +
>> >> start_critical_timings_tracer();
>> >> }
>> >> EXPORT_SYMBOL_GPL(start_critical_timings);
>> >>
>> >> void stop_critical_timings(void)
>> >> {
>> >> + if (this_cpu_read(tracing_preempt_cpu))
>> >> + trace_preempt_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> >> +
>> >> + if (this_cpu_read(tracing_irq_cpu))
>> >> + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>> >> +
>> >> stop_critical_timings_tracer();
>> >> }
>> >> EXPORT_SYMBOL_GPL(stop_critical_timings);
>> >
>> > And I feel these yield somewhat odd semantics, does that need explaining
>> > somewhere?
>>
>> Maybe I can add a comment here, if you prefer that. When you meant
>> semantics, do you mean 'critical' vs 'atomic' thing or do you mean the
>> semantics/context of how this function is supposed to be used?
>
> I would add the comment to the tracepoint definition.
>
> On semantics, the whole stop/start excludes a fair bunch of walltime
> from our measurement, I feel that needs to be called out and enumerated
> (when time goes missing and why).
>
> Given that the idle thread runs with preempt-off I understand its
> purpose from the POV from the preempt-tracer, but its 'weird' behaviour
> if you're looking at it from a pure tracepoint pov.
I see your point: basically you have a problem with the "fake preempt
on" event that I'm generating when start_critical_timings is called
from the idle code path, which isn't really a preempt-on per se but is
an effect of entering the idle path.
Since we already have cpuidle events, one thing I could do is just drop
this part of the patch set and not generate any trace events in
start_critical_timings; one could then use the preempt_disable trace
event (introduced in this patch) together with the cpuidle trace event
to figure out that part of the preempt-disabled section is actually the
CPU idle path. Would that alleviate your concern here?
As a next step, Steven discussed with me in the hallway at LPC that we
could possibly add a start_critical_timings event as well; so probably,
as a step after this patch, those new events can be added, which would
give more visibility into walltime that shouldn't be counted. Or I
could add it in this series as a separate patch; let me know if that
makes sense and what you'd like to do, thanks.
>> >> void trace_preempt_off(unsigned long a0, unsigned long a1)
>> >> {
>> >> + if (this_cpu_read(tracing_preempt_cpu))
>> >> + return;
>> >> +
>> >> + this_cpu_write(tracing_preempt_cpu, 1);
>> >> +
>> >> + trace_preempt_disable_rcuidle(a0, a1);
>> >> tracer_preempt_off(a0, a1);
>> >> }
>> >> #endif
>> >
>> > And here you assume things like double on / double off don't happen,
>> > which might well be so, but does seem somewhat fragile.
>> >
>>
>> We are handling the cases where these functions might be called twice,
>> but we are only interested in the first time they're called. I caught
>> a dead lock happen when I didn't add such protection to
>> trace_hardirqs_off so I added to these to the trace_hardirqs* and
>> trace_preempt* ones as well to just to be extra safe and keep it
>> consistent. Hope I understood your concern correctly, if not please
>> let me know, thanks.
>
> Under what conditions where they called twice? That seems like something
> that should not happen to begin with. Esp the one I left quoted above,
OK, so AIUI the double call cannot happen with trace_preempt_on and
trace_preempt_off; I agree, and I'll drop the unnecessary per-CPU
variable check, but...
> disabling when its already disabled sounds like fail. So please provide
> more details on the scenario you're working around.
It can still happen with local_irq_save called while interrupts are
already disabled:
The trace_hardirqs_off API can be called even when IRQs are already
off. This is unlike trace_hardirqs_on, which checks whether IRQs are off
(at least from some callsites); here are the definitions just for
reference [1]. I guess we could modify local_irq_disable and
local_irq_save to check what the HW flags were before calling
raw_local_irq_save, and only then call trace_hardirqs_off if they were
indeed on and are now being turned off, but that adds complexity - we
would also then have to modify all the callsites in assembly code to
conditionally call trace_hardirqs_on/off :(.
Instead my patch uses a per-CPU variable in the trace_hardirqs_*
functions to track that interrupts were already off when they were
called, thus avoiding uselessly emitting a trace event (see also the
sketch at the end of this mail):
void trace_hardirqs_off(void)
{
if (this_cpu_read(tracing_irq_cpu))
return;
this_cpu_write(tracing_irq_cpu, 1);
trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_off();
}
I feel this is the cleanest path forward for the IRQ disable/enable
trace events.
Are you Ok with this?
thanks,
- Joel
[1] http://elixir.free-electrons.com/linux/latest/source/include/linux/irqflags.h#L89
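To make the double-call scenario concrete, a hedged sketch (the example_*
functions are made up; the local_irq_* behaviour is the
CONFIG_TRACE_IRQFLAGS variant from [1]):

#include <linux/irqflags.h>

/*
 * local_irq_save() calls trace_hardirqs_off() unconditionally, even when
 * IRQs are already off, and local_irq_restore() of a "disabled" flags value
 * calls it again.  The per-CPU tracing_irq_cpu flag above collapses these
 * into a single irq_disable event per off-section.
 */
static void example_irq_inner(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* IRQs already off: trace_hardirqs_off() again */
	/* ... */
	local_irq_restore(flags);	/* flags show "disabled": trace_hardirqs_off() again */
}

static void example_irq_outer(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* IRQs on -> off: first trace_hardirqs_off() */
	example_irq_inner();
	local_irq_restore(flags);	/* IRQs off -> on: trace_hardirqs_on() */
}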
On Mon, 25 Sep 2017 15:57:30 -0700
Joel Fernandes <[email protected]> wrote:
> 2. This stuff is already a bit trace heavy, I prefer not to generate
> event every time the preempt count increases. Ofcourse filters but
> still then we have the filtering overhead for a usecase that I am not
> really targetting with this patchset.
This is actually a key reason I would agree with Joel. Just tracing the
toggle is extremely heavy. Adding all the other calls will probably
make it useless.
-- Steve
Hi Steven, Peter,
I'm planning to make the following changes for the next rev, could you
let me know if you're Ok with it?
1. Drop the stop_critical_timings changes - the previous patch was
generating preempt_enable/disable events from there, but they aren't
"real" events. Instead, since we already have cpuidle trace events, we
can just rely on those for now to understand how much time was spent in
idle. A future patch could do something smarter.
2. Drop the recursion protection from trace_preempt_enable/disable.
The trace_preempt_enable/disable calls don't nest, so there's no need
to protect them with a per-CPU variable (see the sketch after this
list).
3. trace_irq_enable/disable, on the other hand, are called in a nested
way, so I'll add some comments about why the per-CPU variable is
needed there.
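For item 2, a hedged sketch of the shape this would take (not the actual
next revision, just the patch's trace_preempt_off() with the per-CPU check
dropped):

void trace_preempt_off(unsigned long a0, unsigned long a1)
{
	/* Plain pass-through to the tracepoint and the latency tracer. */
	trace_preempt_disable_rcuidle(a0, a1);
	tracer_preempt_off(a0, a1);
}

trace_preempt_on() would lose its check the same way.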
thanks a lot,
- Joel
On Mon, Sep 25, 2017 at 3:57 PM, Joel Fernandes <[email protected]> wrote:
> On Mon, Sep 25, 2017 at 3:52 AM, Steven Rostedt <[email protected]> wrote:
>> On Mon, 25 Sep 2017 12:32:23 +0200
>> Peter Zijlstra <[email protected]> wrote:
>>
>>
>>> > You mean you want to trace all calls to preempt and irq off even if
>>> > preempt and irqs are already off?
>>>
>>> Sure, why not? This stuff naturally nests, and who is to say its not a
>>> useful thing to trace all of them?
>>>
>>> By also tracing the nested sections you can, for instance, see how much
>>> you'd really win by getting rid of the outer one. If, for instance, the
>>> outer only accounts for 1% of the time, while the inner ones are
>>> interlinked and span the other 99%, there's more work to do than if that
>>> were not the case.
>>
>> If we do this we need a field to record if the preemption or irqs were
>> toggled by that call. Something that filters could easily be added to
>> only show what this patch set has.
>
> I request that we please not do this for my patchset, there are a
> number of reasons in my mind:
>
> 1. trace_preempt_off in existing code is only called the first time
> preempt is turned off. This is the definition of "preempt off", its
> the first time Preemption is actually turned off, and has nothing much
> to do with going into a deeper preempt count. Whether the count
> increases or not, preempt is already off and that's confirmed by the
> first preempt off event.
>
> This is how I read it in the comments in sched/core.c as well:
> "
> * If the value passed in is equal to the current preempt count
> * then we just disabled preemption."
>
> This is how I based this patchset as well, againt its not my usecase
> and it can be a future patch if its useful to track this.
>
> 2. This stuff is already a bit trace heavy, I prefer not to generate
> event every time the preempt count increases. Ofcourse filters but
> still then we have the filtering overhead for a usecase that I am not
> really targetting with this patchset.
>
> 3. It will complicate the patch more, apart from adding filters as
> Steven suggested, it would also mean we change how
> preempt_latency_start in sched/core.c works.
>
> Do you mind if we please keep this as a 'future' usecase for a future
> patch? Its not my usecase at all for this patchset and not what I was
> intending.
>
> I will reply to Peter's other comments on the other email shortly.
>
> thanks!
>
> - Joel
On Mon, 25 Sep 2017 17:23:00 -0700
Joel Fernandes <[email protected]> wrote:
> The trace_hardirqs_off API can be called even when IRQs are already
> off. This is unlike the trace_hardirqs_on which checks if IRQs are off
> (atleast from some callsites), here are the definitions just for
> reference [1]. I guess we could modify local_irq_disable and
> local_irq_save to check what the HW flags was before calling
> raw_local_irq_save and only then call trace_hardirqs_off if they were
> indeed on and now being turned off, but that adds complexity to it -
> also we have to then modify all the callsites from assembly code to
> conditionally call trace_hardirqs_on/off :(.
>
> [1] http://elixir.free-electrons.com/linux/latest/source/include/linux/irqflags.h#L89
Yeah, I think the issue for the recursion is basically this:
#define local_irq_restore(flags) \
do { \
if (raw_irqs_disabled_flags(flags)) { \
raw_local_irq_restore(flags); \
trace_hardirqs_off(); \
} else { \
trace_hardirqs_on(); \
raw_local_irq_restore(flags); \
} \
} while (0)
Peter,
Is there any place where we would call local_irq_restore() when
interrupts are enabled?
-- Steve