When invoked from system call enter/exit instrumentation, accessing
user-space data is a common use-case for tracers. However, tracepoints
currently disable preemption around iteration on the registered
tracepoint probes and invocation of the probe callbacks, which prevents
tracers from handling page faults.
Extend the tracepoint and trace event APIs to allow defining a faultable
tracepoint which invokes its callback with preemption enabled.
Also extend the tracepoint API to allow tracers to request specific
probes to be connected to those faultable tracepoints. When the
TRACEPOINT_MAY_FAULT flag is provided on registration, the probe
callback will be called with preemption enabled, and is allowed to take
page faults. Faultable probes can only be registered on faultable
tracepoints and non-faultable probes on non-faultable tracepoints.
The Tasks Trace RCU mechanism is used to synchronize read-side
marshalling of the registered probes with respect to faultable probe
unregistration and teardown.
Co-developed-by: Michael Jeanson <[email protected]>
Signed-off-by: Mathieu Desnoyers <[email protected]>
Signed-off-by: Michael Jeanson <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Yonghong Song <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: [email protected]
Cc: Joel Fernandes <[email protected]>
---
Changes since v1:
- Cleanup __DO_TRACE() implementation.
- Rename "sleepable tracepoints" to "faultable tracepoints", MAYSLEEP to
MAYFAULT, and use might_fault() rather than might_sleep(), to properly
convey that the tracepoints are meant to be able to take a page fault,
which requires being able to sleep *and* to hold the mmap_sem.
Changes since v2:
- Rename MAYFAULT to MAY_FAULT.
- Rebased on 6.5.5.
- Introduce MAY_EXIST tracepoint flag.
---
include/linux/tracepoint-defs.h | 14 ++++++
include/linux/tracepoint.h | 88 +++++++++++++++++++++++----------
include/trace/define_trace.h | 7 +++
include/trace/trace_events.h | 6 +++
init/Kconfig | 1 +
kernel/trace/bpf_trace.c | 5 +-
kernel/trace/trace_fprobe.c | 5 +-
kernel/tracepoint.c | 58 ++++++++++++----------
8 files changed, 129 insertions(+), 55 deletions(-)
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 4dc4955f0fbf..67bacfaa8fd0 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -29,6 +29,19 @@ struct tracepoint_func {
int prio;
};
+/**
+ * enum tracepoint_flags - Tracepoint flags
+ * @TRACEPOINT_MAY_EXIST: Don't warn upon registration if the probe is
+ * already registered on the tracepoint.
+ * @TRACEPOINT_MAY_FAULT: The tracepoint probe callback will be called with
+ * preemption enabled, and is allowed to take page
+ * faults.
+ */
+enum tracepoint_flags {
+ TRACEPOINT_MAY_EXIST = (1 << 0),
+ TRACEPOINT_MAY_FAULT = (1 << 1),
+};
+
struct tracepoint {
const char *name; /* Tracepoint name */
struct static_key key;
@@ -39,6 +52,7 @@ struct tracepoint {
int (*regfunc)(void);
void (*unregfunc)(void);
struct tracepoint_func __rcu *funcs;
+ unsigned int flags;
};
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 88c0ba623ee6..8a6b58a2bf3b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -18,6 +18,7 @@
#include <linux/types.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
#include <linux/tracepoint-defs.h>
#include <linux/static_call.h>
@@ -41,17 +42,10 @@ extern int
tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data,
int prio);
extern int
-tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data,
- int prio);
+tracepoint_probe_register_prio_flags(struct tracepoint *tp, void *probe, void *data,
+ int prio, unsigned int flags);
extern int
tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data);
-static inline int
-tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe,
- void *data)
-{
- return tracepoint_probe_register_prio_may_exist(tp, probe, data,
- TRACEPOINT_DEFAULT_PRIO);
-}
extern void
for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
void *priv);
@@ -90,6 +84,7 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb)
#ifdef CONFIG_TRACEPOINTS
static inline void tracepoint_synchronize_unregister(void)
{
+ synchronize_rcu_tasks_trace();
synchronize_srcu(&tracepoint_srcu);
synchronize_rcu();
}
@@ -192,9 +187,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* it_func[0] is never NULL because there is at least one element in the array
* when the array itself is non NULL.
*/
-#define __DO_TRACE(name, args, cond, rcuidle) \
+#define __DO_TRACE(name, args, cond, rcuidle, tp_flags) \
do { \
int __maybe_unused __idx = 0; \
+ bool mayfault = (tp_flags) & TRACEPOINT_MAY_FAULT; \
\
if (!(cond)) \
return; \
@@ -202,8 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
return; \
\
- /* keep srcu and sched-rcu usage consistent */ \
- preempt_disable_notrace(); \
+ if (mayfault) { \
+ rcu_read_lock_trace(); \
+ } else { \
+ /* keep srcu and sched-rcu usage consistent */ \
+ preempt_disable_notrace(); \
+ } \
\
/* \
* For rcuidle callers, use srcu since sched-rcu \
@@ -221,20 +221,23 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
} \
\
- preempt_enable_notrace(); \
+ if (mayfault) \
+ rcu_read_unlock_trace(); \
+ else \
+ preempt_enable_notrace(); \
} while (0)
#ifndef MODULE
-#define __DECLARE_TRACE_RCU(name, proto, args, cond) \
+#define __DECLARE_TRACE_RCU(name, proto, args, cond, tp_flags) \
static inline void trace_##name##_rcuidle(proto) \
{ \
if (static_key_false(&__tracepoint_##name.key)) \
__DO_TRACE(name, \
TP_ARGS(args), \
- TP_CONDITION(cond), 1); \
+ TP_CONDITION(cond), 1, tp_flags); \
}
#else
-#define __DECLARE_TRACE_RCU(name, proto, args, cond)
+#define __DECLARE_TRACE_RCU(name, proto, args, cond, tp_flags)
#endif
/*
@@ -248,7 +251,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* site if it is not watching, as it will need to be active when the
* tracepoint is enabled.
*/
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, tp_flags) \
extern int __traceiter_##name(data_proto); \
DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \
extern struct tracepoint __tracepoint_##name; \
@@ -257,13 +260,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
if (static_key_false(&__tracepoint_##name.key)) \
__DO_TRACE(name, \
TP_ARGS(args), \
- TP_CONDITION(cond), 0); \
+ TP_CONDITION(cond), 0, tp_flags); \
if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
WARN_ON_ONCE(!rcu_is_watching()); \
} \
+ if ((tp_flags) & TRACEPOINT_MAY_FAULT) \
+ might_fault(); \
} \
__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
- PARAMS(cond)) \
+ PARAMS(cond), tp_flags) \
static inline int \
register_trace_##name(void (*probe)(data_proto), void *data) \
{ \
@@ -278,6 +283,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
(void *)probe, data, prio); \
} \
static inline int \
+ register_trace_prio_flags_##name(void (*probe)(data_proto), void *data, \
+ int prio, unsigned int flags) \
+ { \
+ return tracepoint_probe_register_prio_flags(&__tracepoint_##name, \
+ (void *)probe, data, prio, flags); \
+ } \
+ static inline int \
unregister_trace_##name(void (*probe)(data_proto), void *data) \
{ \
return tracepoint_probe_unregister(&__tracepoint_##name,\
@@ -298,7 +310,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* structures, so we create an array of pointers that will be used for iteration
* on the tracepoints.
*/
-#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \
+#define DEFINE_TRACE_FN_FLAGS(_name, _reg, _unreg, proto, args, tp_flags) \
static const char __tpstrtab_##_name[] \
__section("__tracepoints_strings") = #_name; \
extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \
@@ -314,7 +326,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
.probestub = &__probestub_##_name, \
.regfunc = _reg, \
.unregfunc = _unreg, \
- .funcs = NULL }; \
+ .funcs = NULL, \
+ .flags = (tp_flags), \
+ }; \
__TRACEPOINT_ENTRY(_name); \
int __traceiter_##_name(void *__data, proto) \
{ \
@@ -337,8 +351,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
} \
DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
+#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \
+ DEFINE_TRACE_FN_FLAGS(_name, _reg, _unreg, PARAMS(proto), PARAMS(args), 0)
+
#define DEFINE_TRACE(name, proto, args) \
- DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
+ DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args))
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \
EXPORT_SYMBOL_GPL(__tracepoint_##name); \
@@ -351,7 +368,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#else /* !TRACEPOINTS_ENABLED */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, tp_flags) \
static inline void trace_##name(proto) \
{ } \
static inline void trace_##name##_rcuidle(proto) \
@@ -363,6 +380,18 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
return -ENOSYS; \
} \
static inline int \
+ register_trace_prio_##name(void (*probe)(data_proto), \
+ void *data, int prio) \
+ { \
+ return -ENOSYS; \
+ } \
+ static inline int \
+ register_trace_prio_flags_##name(void (*probe)(data_proto), \
+ void *data, int prio, unsigned int flags) \
+ { \
+ return -ENOSYS; \
+ } \
+ static inline int \
unregister_trace_##name(void (*probe)(data_proto), \
void *data) \
{ \
@@ -377,6 +406,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
return false; \
}
+#define DEFINE_TRACE_FN_FLAGS(name, reg, unreg, proto, args, tp_flags)
#define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
#define DEFINE_TRACE(name, proto, args)
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
@@ -431,12 +461,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#define DECLARE_TRACE(name, proto, args) \
__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
cpu_online(raw_smp_processor_id()), \
- PARAMS(void *__data, proto))
+ PARAMS(void *__data, proto), 0)
+
+#define DECLARE_TRACE_MAY_FAULT(name, proto, args) \
+ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()), \
+ PARAMS(void *__data, proto), TRACEPOINT_MAY_FAULT)
#define DECLARE_TRACE_CONDITION(name, proto, args, cond) \
__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
- PARAMS(void *__data, proto))
+ PARAMS(void *__data, proto), 0)
#define TRACE_EVENT_FLAGS(event, flag)
@@ -567,6 +602,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#define TRACE_EVENT_FN(name, proto, args, struct, \
assign, print, reg, unreg) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT_FN_MAY_FAULT(name, proto, args, struct, \
+ assign, print, reg, unreg) \
+ DECLARE_TRACE_MAY_FAULT(name, PARAMS(proto), PARAMS(args))
#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \
assign, print, reg, unreg) \
DECLARE_TRACE_CONDITION(name, PARAMS(proto), \
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 00723935dcc7..1b8ca143724a 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -41,6 +41,12 @@
assign, print, reg, unreg) \
DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
+#undef TRACE_EVENT_FN_MAY_FAULT
+#define TRACE_EVENT_FN_MAY_FAULT(name, proto, args, tstruct, \
+ assign, print, reg, unreg) \
+ DEFINE_TRACE_FN_FLAGS(name, reg, unreg, PARAMS(proto), \
+ PARAMS(args), TRACEPOINT_MAY_FAULT)
+
#undef TRACE_EVENT_FN_COND
#define TRACE_EVENT_FN_COND(name, proto, args, cond, tstruct, \
assign, print, reg, unreg) \
@@ -106,6 +112,7 @@
#undef TRACE_EVENT
#undef TRACE_EVENT_FN
+#undef TRACE_EVENT_FN_MAY_FAULT
#undef TRACE_EVENT_FN_COND
#undef TRACE_EVENT_CONDITION
#undef TRACE_EVENT_NOP
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index c2f9cabf154d..df590eea8ae4 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -77,6 +77,12 @@
TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \
PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \
+#undef TRACE_EVENT_FN_MAY_FAULT
+#define TRACE_EVENT_FN_MAY_FAULT(name, proto, args, tstruct, \
+ assign, print, reg, unreg) \
+ TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \
+ PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \
+
#undef TRACE_EVENT_FN_COND
#define TRACE_EVENT_FN_COND(name, proto, args, cond, tstruct, \
assign, print, reg, unreg) \
diff --git a/init/Kconfig b/init/Kconfig
index 5e7d4885d1bf..05841191395b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1927,6 +1927,7 @@ config BINDGEN_VERSION_TEXT
#
config TRACEPOINTS
bool
+ select TASKS_TRACE_RCU
endmenu # General setup
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index abf287b2678a..4accf2f138b8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2327,8 +2327,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
- prog);
+ return tracepoint_probe_register_prio_flags(tp, (void *)btp->bpf_func,
+ prog, TRACEPOINT_DEFAULT_PRIO,
+ TRACEPOINT_MAY_EXIST);
}
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index dfe2e546acdc..e653199aa0b7 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -687,8 +687,9 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
* At first, put __probestub_##TP function on the tracepoint
* and put a fprobe on the stub function.
*/
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
+ ret = tracepoint_probe_register_prio_flags(tpoint,
+ tpoint->probestub, NULL, 0,
+ TRACEPOINT_MAY_EXIST);
if (ret < 0)
return ret;
return register_fprobe_ips(&tf->fp, &ip, 1);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 8d1507dd0724..1f137163bdc5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -111,11 +111,16 @@ static inline void *allocate_probes(int count)
return p == NULL ? NULL : p->probes;
}
-static void srcu_free_old_probes(struct rcu_head *head)
+static void rcu_tasks_trace_free_old_probes(struct rcu_head *head)
{
kfree(container_of(head, struct tp_probes, rcu));
}
+static void srcu_free_old_probes(struct rcu_head *head)
+{
+ call_rcu_tasks_trace(head, rcu_tasks_trace_free_old_probes);
+}
+
static void rcu_free_old_probes(struct rcu_head *head)
{
call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
@@ -136,7 +141,7 @@ static __init int release_early_probes(void)
return 0;
}
-/* SRCU is initialized at core_initcall */
+/* SRCU and Tasks Trace RCU are initialized at core_initcall */
postcore_initcall(release_early_probes);
static inline void release_probes(struct tracepoint_func *old)
@@ -146,8 +151,9 @@ static inline void release_probes(struct tracepoint_func *old)
struct tp_probes, probes[0]);
/*
- * We can't free probes if SRCU is not initialized yet.
- * Postpone the freeing till after SRCU is initialized.
+ * We can't free probes if SRCU and Tasks Trace RCU are not
+ * initialized yet. Postpone the freeing till after both are
+ * initialized.
*/
if (unlikely(!ok_to_free_tracepoints)) {
tp_probes->rcu.next = early_probes;
@@ -156,10 +162,9 @@ static inline void release_probes(struct tracepoint_func *old)
}
/*
- * Tracepoint probes are protected by both sched RCU and SRCU,
- * by calling the SRCU callback in the sched RCU callback we
- * cover both cases. So let us chain the SRCU and sched RCU
- * callbacks to wait for both grace periods.
+ * Tracepoint probes are protected by sched RCU, SRCU and
+ * Tasks Trace RCU. By chaining the callbacks, we cover all three
+ * cases and wait for all three grace periods.
*/
call_rcu(&tp_probes->rcu, rcu_free_old_probes);
}
@@ -460,30 +465,38 @@ static int tracepoint_remove_func(struct tracepoint *tp,
}
/**
- * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority
+ * tracepoint_probe_register_prio_flags - Connect a probe to a tracepoint with priority and flags
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
* @prio: priority of this function over other registered functions
+ * @flags: enum tracepoint_flags bits (TRACEPOINT_MAY_EXIST: don't warn if already registered)
*
- * Same as tracepoint_probe_register_prio() except that it will not warn
- * if the tracepoint is already registered.
+ * Returns 0 if ok, error value on error (-EINVAL on a TRACEPOINT_MAY_FAULT mismatch with @tp).
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
*/
-int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe,
- void *data, int prio)
+int tracepoint_probe_register_prio_flags(struct tracepoint *tp, void *probe,
+ void *data, int prio, unsigned int flags)
{
struct tracepoint_func tp_func;
int ret;
+ if (((tp->flags & TRACEPOINT_MAY_FAULT) && !(flags & TRACEPOINT_MAY_FAULT)) ||
+ (!(tp->flags & TRACEPOINT_MAY_FAULT) && (flags & TRACEPOINT_MAY_FAULT)))
+ return -EINVAL;
+
mutex_lock(&tracepoints_mutex);
tp_func.func = probe;
tp_func.data = data;
tp_func.prio = prio;
- ret = tracepoint_add_func(tp, &tp_func, prio, false);
+ ret = tracepoint_add_func(tp, &tp_func, prio, !(flags & TRACEPOINT_MAY_EXIST));
mutex_unlock(&tracepoints_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist);
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_flags);
/**
* tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority
@@ -501,16 +514,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist);
int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
void *data, int prio)
{
- struct tracepoint_func tp_func;
- int ret;
-
- mutex_lock(&tracepoints_mutex);
- tp_func.func = probe;
- tp_func.data = data;
- tp_func.prio = prio;
- ret = tracepoint_add_func(tp, &tp_func, prio, true);
- mutex_unlock(&tracepoints_mutex);
- return ret;
+ return tracepoint_probe_register_prio_flags(tp, probe, data, prio, 0);
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
@@ -520,6 +524,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
* @probe: probe handler
* @data: tracepoint data
*
+ * Non-faultable probes can only be registered on non-faultable tracepoints.
+ *
* Returns 0 if ok, error value on error.
* Note: if @tp is within a module, the caller is responsible for
* unregistering the probe before the module is gone. This can be
@@ -528,7 +534,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
*/
int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
- return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO);
+ return tracepoint_probe_register_prio_flags(tp, probe, data, TRACEPOINT_DEFAULT_PRIO, 0);
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
--
2.25.1
On Mon, 2 Oct 2023 16:25:27 -0400
Mathieu Desnoyers <[email protected]> wrote:
> @@ -202,8 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
> if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
> return; \
> \
> - /* keep srcu and sched-rcu usage consistent */ \
> - preempt_disable_notrace(); \
> + if (mayfault) { \
> + rcu_read_lock_trace(); \
I thought rcu_trace was for the case that a task can not voluntarily call
schedule. If this tracepoint tries to read user space memory that isn't
paged in, and faults, can't the faulting logic call schedule and break this
requirement?
-- Steve
> + } else { \
> + /* keep srcu and sched-rcu usage consistent */ \
> + preempt_disable_notrace(); \
> + } \
> \
> /* \
> * For rcuidle callers, use srcu since sched-rcu \
On Mon, Oct 02, 2023 at 07:10:23PM -0400, Steven Rostedt wrote:
> On Mon, 2 Oct 2023 16:25:27 -0400
> Mathieu Desnoyers <[email protected]> wrote:
>
> > @@ -202,8 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
> > if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
> > return; \
> > \
> > - /* keep srcu and sched-rcu usage consistent */ \
> > - preempt_disable_notrace(); \
> > + if (mayfault) { \
> > + rcu_read_lock_trace(); \
>
> I thought rcu_trace was for the case that a task can not voluntarily call
> schedule. If this tracepoint tries to read user space memory that isn't
> paged in, and faults, can't the faulting logic call schedule and break this
> requirement?
Well, additional new uses of rcu_read_lock_trace() do bear close scrutiny,
but RCU Tasks Trace readers are permitted to block for page faults.
The BPF folks already use it for this purpose, so this should be OK.
(If for some unknown-to-me reason it isn't, I am sure that Alexei,
who is on CC, will not suffer in silence.)
One way of thinking of RCU Tasks Trace is as a form of SRCU with
lightweight readers. Except that, unlike SRCU, there is only one global
RCU Tasks Trace. This means that all RCU Tasks Trace users need to keep
each other informed, because one user's unruly readers will affect all
RCU Tasks Trace users.
But given that the BPF folks already have page faults in RCU Tasks Trace
readers, this one should be OK.
Thanx, Paul
> -- Steve
>
>
> > + } else { \
> > + /* keep srcu and sched-rcu usage consistent */ \
> > + preempt_disable_notrace(); \
> > + } \
> > \
> > /* \
> > * For rcuidle callers, use srcu since sched-rcu \
On Mon, 2 Oct 2023 17:14:39 -0700
"Paul E. McKenney" <[email protected]> wrote:
> On Mon, Oct 02, 2023 at 07:10:23PM -0400, Steven Rostedt wrote:
> > On Mon, 2 Oct 2023 16:25:27 -0400
> > Mathieu Desnoyers <[email protected]> wrote:
> >
> > > @@ -202,8 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
> > > if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
> > > return; \
> > > \
> > > - /* keep srcu and sched-rcu usage consistent */ \
> > > - preempt_disable_notrace(); \
> > > + if (mayfault) { \
> > > + rcu_read_lock_trace(); \
> >
> > I thought rcu_trace was for the case that a task can not voluntarily call
> > schedule. If this tracepoint tries to read user space memory that isn't
> > paged in, and faults, can't the faulting logic call schedule and break this
> > requirement?
>
> Well, additional new uses of rcu_read_lock_trace() do bear close scrutiny,
> but RCU Tasks Trace readers are permitted to block for page faults.
> The BPF folks already use it for this purpose, so this should be OK.
> (If for some unknown-to-me reason it isn't, I am sure that Alexei,
> who is on CC, will not suffer in silence.)
>
> One way of thinking of RCU Tasks Trace is as a form of SRCU with
> lightweight readers. Except that, unlike SRCU, there is only one global
> RCU Tasks Trace. This means that all RCU Tasks Trace users need to keep
> each other informed, because one user's unruly readers will affect all
> RCU Tasks Trace users.
>
> But given that the BPF folks already have page faults in RCU Tasks Trace
> readers, this one should be OK.
Then I think we should update the documentation.
From: Documentation/RCU/checklist.rst:
If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
then the readers must refrain from executing voluntary
context switches, that is, from blocking. If the updater uses
call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
the corresponding readers must use rcu_read_lock_trace() and
rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
or synchronize_rcu_tasks_rude(), then the corresponding readers
must use anything that disables preemption, for example,
preempt_disable() and preempt_enable().
Because it's all one paragraph it's a bit confusing to know what uses what.
Perhaps it should be broken up a bit more?
If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
then the readers must refrain from executing voluntary
context switches, that is, from blocking.
If the updater uses call_rcu_tasks_trace() or
synchronize_rcu_tasks_trace(), then the corresponding readers must
use rcu_read_lock_trace() and rcu_read_unlock_trace().
If an updater uses call_rcu_tasks_rude() or synchronize_rcu_tasks_rude(),
then the corresponding readers must use anything that disables
preemption, for example, preempt_disable() and preempt_enable().
That way it is clear what uses what, as I read the original paragraph a
couple of times and could have sworn that rcu_read_lock_trace() required
tasks to not block.
-- Steve
On Mon, Oct 02, 2023 at 09:19:36PM -0400, Steven Rostedt wrote:
> On Mon, 2 Oct 2023 17:14:39 -0700
> "Paul E. McKenney" <[email protected]> wrote:
>
> > On Mon, Oct 02, 2023 at 07:10:23PM -0400, Steven Rostedt wrote:
> > > On Mon, 2 Oct 2023 16:25:27 -0400
> > > Mathieu Desnoyers <[email protected]> wrote:
> > >
> > > > @@ -202,8 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
> > > > if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
> > > > return; \
> > > > \
> > > > - /* keep srcu and sched-rcu usage consistent */ \
> > > > - preempt_disable_notrace(); \
> > > > + if (mayfault) { \
> > > > + rcu_read_lock_trace(); \
> > >
> > > I thought rcu_trace was for the case that a task can not voluntarily call
> > > schedule. If this tracepoint tries to read user space memory that isn't
> > > paged in, and faults, can't the faulting logic call schedule and break this
> > > requirement?
> >
> > Well, additional new uses of rcu_read_lock_trace() do bear close scrutiny,
> > but RCU Tasks Trace readers are permitted to block for page faults.
> > The BPF folks already use it for this purpose, so this should be OK.
> > (If for some unknown-to-me reason it isn't, I am sure that Alexei,
> > who is on CC, will not suffer in silence.)
> >
> > One way of thinking of RCU Tasks Trace is as a form of SRCU with
> > lightweight readers. Except that, unlike SRCU, there is only one global
> > RCU Tasks Trace. This means that all RCU Tasks Trace users need to keep
> > each other informed, because one user's unruly readers will affect all
> > RCU Tasks Trace users.
> >
> > But given that the BPF folks already have page faults in RCU Tasks Trace
> > readers, this one should be OK.
>
> Then I think we should update the documentation.
>
> From: Documentation/RCU/checklist.rst:
>
> If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
> then the readers must refrain from executing voluntary
> context switches, that is, from blocking. If the updater uses
> call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
> the corresponding readers must use rcu_read_lock_trace() and
> rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
> or synchronize_rcu_tasks_rude(), then the corresponding readers
> must use anything that disables preemption, for example,
> preempt_disable() and preempt_enable().
>
> Because it's all one paragraph it's a bit confusing to know what uses what.
> Perhaps it should be broken up a bit more?
>
> If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
> then the readers must refrain from executing voluntary
> context switches, that is, from blocking.
>
> If the updater uses call_rcu_tasks_trace() or
> synchronize_rcu_tasks_trace(), then the corresponding readers must
> use rcu_read_lock_trace() and rcu_read_unlock_trace().
>
> If an updater uses call_rcu_tasks_rude() or synchronize_rcu_tasks_rude(),
> then the corresponding readers must use anything that disables
> preemption, for example, preempt_disable() and preempt_enable().
>
> That way it is clear what uses what, as I read the original paragraph a
> couple of times and could have sworn that rcu_read_lock_trace() required
> tasks to not block.
That would work for me. Would you like to send a patch, or would you
rather we made the adjustments?
Thanx, Paul
On Tue, 3 Oct 2023 06:44:50 -0700
"Paul E. McKenney" <[email protected]> wrote:
> > That way it is clear what uses what, as I read the original paragraph a
> > couple of times and could have sworn that rcu_read_lock_trace() required
> > tasks to not block.
>
> That would work for me. Would you like to send a patch, or would you
> rather we made the adjustments?
Whichever.
-- Steve
On Tue, Oct 03, 2023 at 10:08:54AM -0400, Steven Rostedt wrote:
> On Tue, 3 Oct 2023 06:44:50 -0700
> "Paul E. McKenney" <[email protected]> wrote:
>
> > > That way it is clear what uses what, as I read the original paragraph a
> > > couple of times and could have sworn that rcu_read_lock_trace() required
> > > tasks to not block.
> >
> > That would work for me. Would you like to send a patch, or would you
> > rather we made the adjustments?
>
> Whichever.
OK, how about like this?
Thanx, Paul
------------------------------------------------------------------------
commit 973eb79ec46c16f13bb5b47ad14d44a1f1c79dc9
Author: Paul E. McKenney <[email protected]>
Date: Tue Oct 3 10:30:01 2023 -0700
doc: Clarify RCU Tasks reader/updater checklist
Currently, the reader/updater compatibility rules for the three RCU
Tasks flavors are squished together in a single paragraph, which can
result in confusion. This commit therefore splits them out into a list,
clearly showing the distinction between these flavors.
Reported-by: Steven Rostedt <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index bd3c58c44bef..c432899aff22 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -241,15 +241,22 @@ over a rather long period of time, but improvements are always welcome!
srcu_struct. The rules for the expedited RCU grace-period-wait
primitives are the same as for their non-expedited counterparts.
- If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
- then the readers must refrain from executing voluntary
- context switches, that is, from blocking. If the updater uses
- call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
- the corresponding readers must use rcu_read_lock_trace() and
- rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
- or synchronize_rcu_tasks_rude(), then the corresponding readers
- must use anything that disables preemption, for example,
- preempt_disable() and preempt_enable().
+ Similarly, it is necessary to correctly use the RCU Tasks flavors:
+
+ a. If the updater uses synchronize_rcu_tasks() or
+ call_rcu_tasks(), then the readers must refrain from
+ executing voluntary context switches, that is, from
+ blocking.
+
+ b. If the updater uses call_rcu_tasks_trace()
+ or synchronize_rcu_tasks_trace(), then the
+ corresponding readers must use rcu_read_lock_trace()
+ and rcu_read_unlock_trace().
+
+ c. If an updater uses call_rcu_tasks_rude() or
+ synchronize_rcu_tasks_rude(), then the corresponding
+ readers must use anything that disables preemption,
+ for example, preempt_disable() and preempt_enable().
Mixing things up will result in confusion and broken kernels, and
has even resulted in an exploitable security issue. Therefore,
On 10/3/23 13:33, Paul E. McKenney wrote:
> On Tue, Oct 03, 2023 at 10:08:54AM -0400, Steven Rostedt wrote:
>> On Tue, 3 Oct 2023 06:44:50 -0700
>> "Paul E. McKenney" <[email protected]> wrote:
>>
>>>> That way it is clear what uses what, as I read the original paragraph a
>>>> couple of times and could have sworn that rcu_read_lock_trace() required
>>>> tasks to not block.
>>>
>>> That would work for me. Would you like to send a patch, or would you
>>> rather we made the adjustments?
>>
>> Which ever.
>
> OK, how about like this?
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 973eb79ec46c16f13bb5b47ad14d44a1f1c79dc9
> Author: Paul E. McKenney <[email protected]>
> Date: Tue Oct 3 10:30:01 2023 -0700
>
> doc: Clarify RCU Tasks reader/updater checklist
>
> Currently, the reader/updater compatibility rules for the three RCU
> Tasks flavors are squished together in a single paragraph, which can
> result in confusion. This commit therefore splits them out into a list,
> clearly showing the distinction between these flavors.
>
Reviewed-by: Mathieu Desnoyers <[email protected]>
Thanks!
Mathieu
> Reported-by: Steven Rostedt <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
> index bd3c58c44bef..c432899aff22 100644
> --- a/Documentation/RCU/checklist.rst
> +++ b/Documentation/RCU/checklist.rst
> @@ -241,15 +241,22 @@ over a rather long period of time, but improvements are always welcome!
> srcu_struct. The rules for the expedited RCU grace-period-wait
> primitives are the same as for their non-expedited counterparts.
>
> - If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
> - then the readers must refrain from executing voluntary
> - context switches, that is, from blocking. If the updater uses
> - call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
> - the corresponding readers must use rcu_read_lock_trace() and
> - rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
> - or synchronize_rcu_tasks_rude(), then the corresponding readers
> - must use anything that disables preemption, for example,
> - preempt_disable() and preempt_enable().
> + Similarly, it is necessary to correctly use the RCU Tasks flavors:
> +
> + a. If the updater uses synchronize_rcu_tasks() or
> + call_rcu_tasks(), then the readers must refrain from
> + executing voluntary context switches, that is, from
> + blocking.
> +
> + b. If the updater uses call_rcu_tasks_trace()
> + or synchronize_rcu_tasks_trace(), then the
> + corresponding readers must use rcu_read_lock_trace()
> + and rcu_read_unlock_trace().
> +
> + c. If an updater uses call_rcu_tasks_rude() or
> + synchronize_rcu_tasks_rude(), then the corresponding
> + readers must use anything that disables preemption,
> + for example, preempt_disable() and preempt_enable().
>
> Mixing things up will result in confusion and broken kernels, and
> has even resulted in an exploitable security issue. Therefore,
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Tue, 3 Oct 2023 10:33:33 -0700
"Paul E. McKenney" <[email protected]> wrote:
> On Tue, Oct 03, 2023 at 10:08:54AM -0400, Steven Rostedt wrote:
> > On Tue, 3 Oct 2023 06:44:50 -0700
> > "Paul E. McKenney" <[email protected]> wrote:
> >
> > > > That way it is clear what uses what, as I read the original paragraph a
> > > > couple of times and could have sworn that rcu_read_lock_trace() required
> > > > tasks to not block.
> > >
> > > That would work for me. Would you like to send a patch, or would you
> > > rather we made the adjustments?
> >
> > Which ever.
>
> OK, how about like this?
Reviewed-by: Steven Rostedt (Google) <[email protected]>
Link: https://lore.kernel.org/all/[email protected]/
-- Steve
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 973eb79ec46c16f13bb5b47ad14d44a1f1c79dc9
> Author: Paul E. McKenney <[email protected]>
> Date: Tue Oct 3 10:30:01 2023 -0700
>
> doc: Clarify RCU Tasks reader/updater checklist
>
> Currently, the reader/updater compatibility rules for the three RCU
> Tasks flavors are squished together in a single paragraph, which can
> result in confusion. This commit therefore splits them out into a list,
> clearly showing the distinction between these flavors.
>
> Reported-by: Steven Rostedt <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
> index bd3c58c44bef..c432899aff22 100644
> --- a/Documentation/RCU/checklist.rst
> +++ b/Documentation/RCU/checklist.rst
> @@ -241,15 +241,22 @@ over a rather long period of time, but improvements are always welcome!
> srcu_struct. The rules for the expedited RCU grace-period-wait
> primitives are the same as for their non-expedited counterparts.
>
> - If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
> - then the readers must refrain from executing voluntary
> - context switches, that is, from blocking. If the updater uses
> - call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
> - the corresponding readers must use rcu_read_lock_trace() and
> - rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
> - or synchronize_rcu_tasks_rude(), then the corresponding readers
> - must use anything that disables preemption, for example,
> - preempt_disable() and preempt_enable().
> + Similarly, it is necessary to correctly use the RCU Tasks flavors:
> +
> + a. If the updater uses synchronize_rcu_tasks() or
> + call_rcu_tasks(), then the readers must refrain from
> + executing voluntary context switches, that is, from
> + blocking.
> +
> + b. If the updater uses call_rcu_tasks_trace()
> + or synchronize_rcu_tasks_trace(), then the
> + corresponding readers must use rcu_read_lock_trace()
> + and rcu_read_unlock_trace().
> +
> + c. If an updater uses call_rcu_tasks_rude() or
> + synchronize_rcu_tasks_rude(), then the corresponding
> + readers must use anything that disables preemption,
> + for example, preempt_disable() and preempt_enable().
>
> Mixing things up will result in confusion and broken kernels, and
> has even resulted in an exploitable security issue. Therefore,
On Tue, Oct 03, 2023 at 01:38:56PM -0400, Steven Rostedt wrote:
> On Tue, 3 Oct 2023 10:33:33 -0700
> "Paul E. McKenney" <[email protected]> wrote:
>
> > On Tue, Oct 03, 2023 at 10:08:54AM -0400, Steven Rostedt wrote:
> > > On Tue, 3 Oct 2023 06:44:50 -0700
> > > "Paul E. McKenney" <[email protected]> wrote:
> > >
> > > > > That way it is clear what uses what, as I read the original paragraph a
> > > > > couple of times and could have sworn that rcu_read_lock_trace() required
> > > > > tasks to not block.
> > > >
> > > > That would work for me. Would you like to send a patch, or would you
> > > > rather we made the adjustments?
> > >
> > > Which ever.
> >
> > OK, how about like this?
>
> Reviewed-by: Steven Rostedt (Google) <[email protected]>
>
> Link: https://lore.kernel.org/all/[email protected]/
Thank you both! Updated as shown below.
Thanx, Paul
------------------------------------------------------------------------
commit 4d2115e8919760c690e30f48cae2f017c1581546
Author: Paul E. McKenney <[email protected]>
Date: Tue Oct 3 10:30:01 2023 -0700
doc: Clarify RCU Tasks reader/updater checklist
Currently, the reader/updater compatibility rules for the three RCU
Tasks flavors are squished together in a single paragraph, which can
result in confusion. This commit therefore splits them out into a list,
clearly showing the distinction between these flavors.
Link: https://lore.kernel.org/all/[email protected]/
Reported-by: Steven Rostedt <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Reviewed-by: Mathieu Desnoyers <[email protected]>
Reviewed-by: Steven Rostedt (Google) <[email protected]>
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index bd3c58c44bef..c432899aff22 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -241,15 +241,22 @@ over a rather long period of time, but improvements are always welcome!
srcu_struct. The rules for the expedited RCU grace-period-wait
primitives are the same as for their non-expedited counterparts.
- If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
- then the readers must refrain from executing voluntary
- context switches, that is, from blocking. If the updater uses
- call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
- the corresponding readers must use rcu_read_lock_trace() and
- rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
- or synchronize_rcu_tasks_rude(), then the corresponding readers
- must use anything that disables preemption, for example,
- preempt_disable() and preempt_enable().
+ Similarly, it is necessary to correctly use the RCU Tasks flavors:
+
+ a. If the updater uses synchronize_rcu_tasks() or
+ call_rcu_tasks(), then the readers must refrain from
+ executing voluntary context switches, that is, from
+ blocking.
+
+ b. If the updater uses call_rcu_tasks_trace()
+ or synchronize_rcu_tasks_trace(), then the
+ corresponding readers must use rcu_read_lock_trace()
+ and rcu_read_unlock_trace().
+
+ c. If an updater uses call_rcu_tasks_rude() or
+ synchronize_rcu_tasks_rude(), then the corresponding
+ readers must use anything that disables preemption,
+ for example, preempt_disable() and preempt_enable().
Mixing things up will result in confusion and broken kernels, and
has even resulted in an exploitable security issue. Therefore,