Hi Jiri and Alexei,
Here is the 2nd version of fprobe. This version uses
ftrace_set_filter_ips() to reduce the registration overhead.
Note that this also drops the per-probe-point private data, which
is not used anyway.
This series introduces fprobe, a function entry/exit probe with
multiple probe point support. It also introduces rethook, which
hooks function returns in the same way kretprobe does. This
abstraction will help us generalize the fgraph tracer, because
fprobe can simply switch between rethook and the fgraph tracer
depending on the kernel configuration.
Patches [1/8] and [7/8] are from your series [1]. The other libbpf
patches will not be affected by this change.
[1] https://lore.kernel.org/all/[email protected]/T/#u
I also added an out-of-tree (just for testing) patch at the
end of this series ([8/8]) that adds wildcard support to
the sample program. With that patch, it shows how long the
registration takes:
# time insmod fprobe_example.ko symbol='btrfs_*'
[ 36.130947] fprobe_init: 1028 symbols found
[ 36.177901] fprobe_init: Planted fprobe at btrfs_*
real 0m 0.08s
user 0m 0.00s
sys 0m 0.07s
Thank you,
---
Jiri Olsa (2):
ftrace: Add ftrace_set_filter_ips function
bpf: Add kprobe link for attaching raw kprobes
Masami Hiramatsu (6):
fprobe: Add ftrace based probe APIs
rethook: Add a generic return hook
rethook: x86: Add rethook x86 implementation
fprobe: Add exit_handler support
fprobe: Add sample program for fprobe
[DO NOT MERGE] Out-of-tree: Support wildcard symbol option to sample
arch/x86/Kconfig | 1
arch/x86/kernel/Makefile | 1
arch/x86/kernel/rethook.c | 115 ++++++++++++++++++++
include/linux/bpf_types.h | 1
include/linux/fprobe.h | 57 ++++++++++
include/linux/ftrace.h | 3 +
include/linux/rethook.h | 74 +++++++++++++
include/linux/sched.h | 3 +
include/uapi/linux/bpf.h | 12 ++
kernel/bpf/syscall.c | 195 +++++++++++++++++++++++++++++++++-
kernel/exit.c | 2
kernel/fork.c | 3 +
kernel/kallsyms.c | 1
kernel/trace/Kconfig | 22 ++++
kernel/trace/Makefile | 2
kernel/trace/fprobe.c | 168 +++++++++++++++++++++++++++++
kernel/trace/ftrace.c | 54 ++++++++-
kernel/trace/rethook.c | 226 +++++++++++++++++++++++++++++++++++++++
samples/Kconfig | 7 +
samples/Makefile | 1
samples/fprobe/Makefile | 3 +
samples/fprobe/fprobe_example.c | 154 +++++++++++++++++++++++++++
tools/include/uapi/linux/bpf.h | 12 ++
23 files changed, 1103 insertions(+), 14 deletions(-)
create mode 100644 arch/x86/kernel/rethook.c
create mode 100644 include/linux/fprobe.h
create mode 100644 include/linux/rethook.h
create mode 100644 kernel/trace/fprobe.c
create mode 100644 kernel/trace/rethook.c
create mode 100644 samples/fprobe/Makefile
create mode 100644 samples/fprobe/fprobe_example.c
--
Masami Hiramatsu (Linaro) <[email protected]>
From: Jiri Olsa <[email protected]>
Adding an ftrace_set_filter_ips() function to be able to set a filter
on multiple ip addresses at once.
With the kprobe multi attach interface we have cases where we need to
initialize an ftrace_ops object with thousands of functions, so having
a single function dive into ftrace_hash_move_and_update_ops() under
ftrace_lock is faster.
The function ips are passed as an unsigned long array together with a
count.
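For reference, a minimal caller sketch (a sketch only: my_callback,
my_ops and attach_many() are hypothetical names not in this series,
and error handling is elided):

    static void my_callback(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *ops, struct ftrace_regs *fregs)
    {
            /* hypothetical per-function handler */
    }

    static struct ftrace_ops my_ops = {
            .func   = my_callback,
            .flags  = FTRACE_OPS_FL_SAVE_REGS,
    };

    static int attach_many(unsigned long *ips, unsigned int cnt)
    {
            /* One dive into ftrace_hash_move_and_update_ops() under
             * ftrace_lock, instead of cnt separate hash updates. */
            int ret = ftrace_set_filter_ips(&my_ops, ips, cnt, 0, 0);

            return ret ? ret : register_ftrace_function(&my_ops);
    }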
Signed-off-by: Jiri Olsa <[email protected]>
---
include/linux/ftrace.h | 3 +++
kernel/trace/ftrace.c | 53 ++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 47 insertions(+), 9 deletions(-)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9999e29187de..60847cbce0da 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -512,6 +512,8 @@ struct dyn_ftrace {
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset);
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset);
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset);
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
@@ -802,6 +804,7 @@ static inline unsigned long ftrace_location(unsigned long ip)
#define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; })
#define ftrace_set_early_filter(ops, buf, enable) do { } while (0)
#define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; })
+#define ftrace_set_filter_ips(ops, ips, cnt, remove, reset) ({ -ENODEV; })
#define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; })
#define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; })
#define ftrace_free_filter(ops) do { } while (0)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 30bc880c3849..d38ae5063be3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4958,7 +4958,7 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
{
struct ftrace_func_entry *entry;
@@ -4976,9 +4976,25 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return add_hash_entry(hash, ip);
}
+static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+ unsigned int cnt, int remove)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < cnt; i++) {
+ err = __ftrace_match_addr(hash, ips[i], remove);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
- unsigned long ip, int remove, int reset, int enable)
+ unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5008,8 +5024,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
- if (ip) {
- ret = ftrace_match_addr(hash, ip, remove);
+ if (ips) {
+ ret = ftrace_match_addr(hash, ips, cnt, remove);
if (ret < 0)
goto out_regex_unlock;
}
@@ -5026,10 +5042,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
- int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5628,10 +5644,29 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
ftrace_ops_init(ops);
- return ftrace_set_addr(ops, ip, remove, reset, 1);
+ return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
+/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by their addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If the @ips array or any ip within it is NULL, the filter update fails.
+ */
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
/**
* ftrace_ops_set_global_filter - setup ops to use global filters
* @ops - the ops which will use the global filters
@@ -5653,7 +5688,7 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+ return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
}
/**
The fprobe is a wrapper API for the ftrace function tracer.
Unlike kprobes, this probe only supports function entry, but
it can probe multiple functions with one fprobe. The usage is
almost the same as kprobes: the user specifies the function
names via fprobe::syms, the number of syms via fprobe::nentry,
and the user handler via fprobe::entry_handler.
struct fprobe fp = { 0 };
const char *targets[] = { "func1", "func2", "func3" };
fp.entry_handler = user_handler;
fp.nentry = ARRAY_SIZE(targets);
fp.syms = targets;
ret = register_fprobe(&fp);
Signed-off-by: Masami Hiramatsu <[email protected]>
---
Changes in v2:
- Remove fprobe_entry.
- Do not sort the address array since there is no user private data.
- Since there is only "fprobe", make filename and config name "fprobe".
- Use ftrace_set_filter_ips()
- Fix style warnings.
---
include/linux/fprobe.h | 53 +++++++++++++++++++++++
kernel/trace/Kconfig | 10 ++++
kernel/trace/Makefile | 1
kernel/trace/fprobe.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 177 insertions(+)
create mode 100644 include/linux/fprobe.h
create mode 100644 kernel/trace/fprobe.c
diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
new file mode 100644
index 000000000000..614d28d5828b
--- /dev/null
+++ b/include/linux/fprobe.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Simple ftrace probe wrapper */
+#ifndef _LINUX_FPROBE_H
+#define _LINUX_FPROBE_H
+
+#include <linux/compiler.h>
+#include <linux/ftrace.h>
+
+struct fprobe {
+ const char **syms;
+ unsigned long *addrs;
+ unsigned int nentry;
+
+ struct ftrace_ops ftrace;
+ unsigned long nmissed;
+ unsigned int flags;
+ void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
+};
+
+#define FPROBE_FL_DISABLED 1
+
+static inline bool fprobe_disabled(struct fprobe *fp)
+{
+ return (fp) ? fp->flags & FPROBE_FL_DISABLED : false;
+}
+
+#ifdef CONFIG_FPROBE
+int register_fprobe(struct fprobe *fp);
+int unregister_fprobe(struct fprobe *fp);
+#else
+static inline int register_fprobe(struct fprobe *fp)
+{
+ return -EOPNOTSUPP;
+}
+static inline int unregister_fprobe(struct fprobe *fp)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static inline void disable_fprobe(struct fprobe *fp)
+{
+ if (fp)
+ fp->flags |= FPROBE_FL_DISABLED;
+}
+
+static inline void enable_fprobe(struct fprobe *fp)
+{
+ if (fp)
+ fp->flags &= ~FPROBE_FL_DISABLED;
+}
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 420ff4bc67fd..6834b0272798 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -223,6 +223,16 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config FPROBE
+ bool "Kernel Function Probe (fprobe)"
+ depends on FUNCTION_TRACER
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ default n
+ help
+ This option enables the kernel function probe (fprobe) feature,
+ which is similar to kprobes but probes only kernel function
+ entries, and it can probe multiple functions with one fprobe.
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..79255f9de9a4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..0247fc7d75e2
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ftrace);
+ if (fprobe_disabled(fp))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ if (fp->entry_handler)
+ fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+static int convert_func_addresses(struct fprobe *fp)
+{
+ unsigned int i;
+
+ if (!fp->syms)
+ return 0;
+
+ fp->addrs = kcalloc(fp->nentry, sizeof(*fp->addrs), GFP_KERNEL);
+ if (!fp->addrs)
+ return -ENOMEM;
+
+ for (i = 0; i < fp->nentry; i++) {
+
+ fp->addrs[i] = kallsyms_lookup_name(fp->syms[i]);
+ if (!fp->addrs[i])
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/**
+ * register_fprobe - Register fprobe to ftrace
+ * @fp: A fprobe data structure to be registered.
+ *
+ * This expects the user to set @fp::entry_handler, @fp::syms or @fp::addrs,
+ * and @fp::nentry.
+ * Note that you must not set both @fp::addrs and @fp::syms.
+ */
+int register_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fp || !fp->nentry || (!fp->syms && !fp->addrs) ||
+ (fp->syms && fp->addrs))
+ return -EINVAL;
+
+ ret = convert_func_addresses(fp);
+ if (ret < 0)
+ return ret;
+
+ fp->nmissed = 0;
+ fp->ftrace.func = fprobe_handler;
+ fp->ftrace.flags = FTRACE_OPS_FL_SAVE_REGS;
+
+ ret = ftrace_set_filter_ips(&fp->ftrace, fp->addrs, fp->nentry, 0, 0);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ftrace);
+
+ if (ret < 0 && fp->syms) {
+ kfree(fp->addrs);
+ fp->addrs = NULL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * unregister_fprobe - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fp || !fp->nentry || !fp->addrs)
+ return -EINVAL;
+
+ ret = unregister_ftrace_function(&fp->ftrace);
+
+ if (!ret && fp->syms) {
+ kfree(fp->addrs);
+ fp->addrs = NULL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
Add a return hook framework which hooks the function
return. Most of the idea came from kretprobe, but
this is independent of kretprobe.
Note that this is expected to be used with other
function entry hooking features, like ftrace, fprobe,
and kprobes. Eventually this will replace the
kretprobe (e.g. kprobe + rethook = kretprobe), but
at this moment, this is just an additional hook.
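A minimal usage sketch of the API from a function-entry hook
(my_data, my_ret_handler and the pool size nr_nodes are hypothetical,
and allocation failures are not handled):

    static void my_ret_handler(struct rethook_node *node, void *data,
                               struct pt_regs *regs)
    {
            /* Called when the hooked function returns. */
    }

    /* Setup: allocate the hook object and pre-fill its node pool. */
    rh = rethook_alloc(my_data, my_ret_handler);
    for (i = 0; i < nr_nodes; i++)
            rethook_add_node(rh, kzalloc(sizeof(struct rethook_node),
                                         GFP_KERNEL));

    /* From a function-entry hook (e.g. an ftrace callback): */
    node = rethook_try_get(rh);
    if (node)       /* NULL: pool empty or the rethook is being freed */
            rethook_hook_current(node, regs);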
Signed-off-by: Masami Hiramatsu <[email protected]>
---
include/linux/rethook.h | 74 +++++++++++++++
include/linux/sched.h | 3 +
kernel/exit.c | 2
kernel/fork.c | 3 +
kernel/trace/Kconfig | 11 ++
kernel/trace/Makefile | 1
kernel/trace/rethook.c | 226 +++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 320 insertions(+)
create mode 100644 include/linux/rethook.h
create mode 100644 kernel/trace/rethook.c
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
new file mode 100644
index 000000000000..2622bcd5213a
--- /dev/null
+++ b/include/linux/rethook.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Return hooking with list-based shadow stack.
+ */
+#ifndef _LINUX_RETHOOK_H
+#define _LINUX_RETHOOK_H
+
+#include <linux/compiler.h>
+#include <linux/freelist.h>
+#include <linux/llist.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+
+struct rethook_node;
+
+typedef void (*rethook_handler_t) (struct rethook_node *, void *, struct pt_regs *);
+
+struct rethook {
+ void *data;
+ rethook_handler_t handler;
+ struct freelist_head pool;
+ refcount_t ref;
+ struct rcu_head rcu;
+};
+
+struct rethook_node {
+ union {
+ struct freelist_node freelist;
+ struct rcu_head rcu;
+ };
+ struct llist_node llist;
+ struct rethook *rethook;
+ unsigned long ret_addr;
+ unsigned long frame;
+};
+
+int rethook_node_init(struct rethook_node *node);
+
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+void rethook_free(struct rethook *rh);
+void rethook_add_node(struct rethook *rh, struct rethook_node *node);
+
+struct rethook_node *rethook_try_get(struct rethook *rh);
+void rethook_node_recycle(struct rethook_node *node);
+void rethook_hook_current(struct rethook_node *node, struct pt_regs *regs);
+
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur);
+
+/* Arch-dependent code must implement this and the trampoline code */
+void arch_rethook_prepare(struct rethook_node *node, struct pt_regs *regs);
+void arch_rethook_trampoline(void);
+
+static inline bool is_rethook_trampoline(unsigned long addr)
+{
+ return addr == (unsigned long)arch_rethook_trampoline;
+}
+
+/* If the architecture needs to fix up the return address, implement it. */
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr);
+
+/* Generic trampoline handler, arch code must prepare asm stub */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame);
+
+#ifdef CONFIG_RETHOOK
+void rethook_flush_task(struct task_struct *tk);
+#else
+#define rethook_flush_task(tsk) do { } while (0)
+#endif
+
+#endif
+
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..2bfabf5355b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1473,6 +1473,9 @@ struct task_struct {
#ifdef CONFIG_KRETPROBES
struct llist_head kretprobe_instances;
#endif
+#ifdef CONFIG_RETHOOK
+ struct llist_head rethooks;
+#endif
#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index f702a6a63686..a39a321c1f37 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,6 +64,7 @@
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
+#include <linux/rethook.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -169,6 +170,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 3244cc56b697..ffae38be64c4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2282,6 +2282,9 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6834b0272798..44c473ad9021 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_RETHOOK
+ bool
+
+config RETHOOK
+ bool
+ depends on HAVE_RETHOOK
+ help
+ Enable the generic return hooking feature. This is an internal
+ API, which will be used by other function-entry hooking
+ features like fprobe and kprobes.
+
config HAVE_FUNCTION_TRACER
bool
help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 79255f9de9a4..c6f11a139eac 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..80c0584e8497
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+/* Return hook list (shadow stack by list) */
+
+void rethook_flush_task(struct task_struct *tk)
+{
+ struct rethook_node *rhn;
+ struct llist_node *node;
+
+ preempt_disable();
+
+ node = __llist_del_all(&tk->rethooks);
+ while (node) {
+ rhn = container_of(node, struct rethook_node, llist);
+ node = node->next;
+ rethook_node_recycle(rhn);
+ }
+
+ preempt_enable();
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+ struct rethook *rh = container_of(head, struct rethook, rcu);
+ struct rethook_node *rhn;
+ struct freelist_node *node;
+ int count = 1;
+
+ node = rh->pool.head;
+ while (node) {
+ rhn = container_of(node, struct rethook_node, freelist);
+ node = node->next;
+ kfree(rhn);
+ count++;
+ }
+
+ /* The rh->ref is the number of pooled nodes + 1 */
+ if (refcount_sub_and_test(count, &rh->ref))
+ kfree(rh);
+}
+
+void rethook_free(struct rethook *rh)
+{
+ rh->handler = NULL;
+ rh->data = NULL;
+
+ call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+/*
+ * @handler must not be NULL. @handler == NULL means this rethook is
+ * going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+{
+ struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+
+ if (!rh || !handler)
+ return NULL;
+
+ rh->data = data;
+ rh->handler = handler;
+ rh->pool.head = NULL;
+ refcount_set(&rh->ref, 1);
+
+ return rh;
+}
+
+void rethook_add_node(struct rethook *rh, struct rethook_node *node)
+{
+ node->rethook = rh;
+ freelist_add(&node->freelist, &rh->pool);
+ refcount_inc(&rh->ref);
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+ struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+
+ if (refcount_dec_and_test(&node->rethook->ref))
+ kfree(node->rethook);
+ kfree(node);
+}
+
+void rethook_node_recycle(struct rethook_node *node)
+{
+ if (likely(READ_ONCE(node->rethook->handler)))
+ freelist_add(&node->freelist, &node->rethook->pool);
+ else
+ call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+ struct freelist_node *fn;
+
+ /* Check whether @rh is going to be freed. */
+ if (unlikely(!READ_ONCE(rh->handler)))
+ return NULL;
+
+ fn = freelist_try_get(&rh->pool);
+ if (!fn)
+ return NULL;
+
+ return container_of(fn, struct rethook_node, freelist);
+}
+
+void rethook_hook_current(struct rethook_node *node, struct pt_regs *regs)
+{
+ arch_rethook_prepare(node, regs);
+ __llist_add(&node->llist, &current->rethooks);
+}
+
+/* This assumes 'tsk' is the current task or a task that is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct rethook_node *rh = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->rethooks.first;
+ else
+ node = node->next;
+
+ while (node) {
+ rh = container_of(node, struct rethook_node, llist);
+ if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+ *cur = node;
+ return rh->ret_addr;
+ }
+ node = node->next;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk, as an
+ * unsigned long value. If it finds the return address, this returns that
+ * address value; otherwise it returns 0.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the correct return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * rethook return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT be NULL.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur)
+{
+ struct rethook_node *rhn = NULL;
+ unsigned long ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ do {
+ ret = __rethook_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ rhn = container_of(*cur, struct rethook_node, llist);
+ } while (rhn->frame != frame);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /*
+ * Do nothing by default. If an architecture uses a frame pointer
+ * to record the real return address on the stack, it should fill
+ * in this function to fix up the return address so that stacktrace
+ * works from the rethook handler.
+ */
+}
+
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame)
+{
+ struct rethook_node *rhn;
+ struct llist_node *first, *node = NULL;
+ unsigned long correct_ret_addr = __rethook_find_ret_addr(current, &node);
+
+ if (!correct_ret_addr) {
+ pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+ BUG_ON(1);
+ }
+
+ instruction_pointer_set(regs, correct_ret_addr);
+ arch_rethook_fixup_return(regs, correct_ret_addr);
+
+ first = current->rethooks.first;
+ current->rethooks.first = node->next;
+ node->next = NULL;
+
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ if (WARN_ON_ONCE(rhn->frame != frame))
+ break;
+ if (rhn->rethook->handler)
+ rhn->rethook->handler(rhn, rhn->rethook->data, regs);
+
+ first = first->next;
+ rethook_node_recycle(rhn);
+ }
+
+ return correct_ret_addr;
+}
+
Add an x86 implementation of rethook. Most of the code
has been copied from kretprobes on x86.
Signed-off-by: Masami Hiramatsu <[email protected]>
---
arch/x86/Kconfig | 1
arch/x86/kernel/Makefile | 1
arch/x86/kernel/rethook.c | 115 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 117 insertions(+)
create mode 100644 arch/x86/kernel/rethook.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..939c4c897e63 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -219,6 +219,7 @@ config X86
select HAVE_KPROBES_ON_FTRACE
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
+ select HAVE_RETHOOK
select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
select HAVE_MIXED_BREAKPOINTS_REGS
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ff3e600f426..66593d8c4d74 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
new file mode 100644
index 000000000000..f2f3b9526e43
--- /dev/null
+++ b/arch/x86/kernel/rethook.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c.
+ */
+#include <linux/bug.h>
+#include <linux/rethook.h>
+#include <linux/kprobes.h>
+
+#include "kprobes/common.h"
+
+/*
+ * Called from arch_rethook_trampoline
+ */
+__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs)
+{
+ unsigned long *frame_pointer;
+
+ /* fixup registers */
+ regs->cs = __KERNEL_CS;
+#ifdef CONFIG_X86_32
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)&arch_rethook_trampoline;
+ regs->orig_ax = ~0UL;
+ regs->sp += sizeof(long);
+ frame_pointer = &regs->sp + 1;
+
+ /*
+ * The return address at 'frame_pointer' is recovered by
+ * arch_rethook_fixup_return(), which is called from
+ * rethook_trampoline_handler().
+ */
+ rethook_trampoline_handler(regs, (unsigned long)frame_pointer);
+
+ /*
+ * Copy FLAGS to 'pt_regs::sp' so that arch_rethook_trampoline()
+ * can do RET right after POPF.
+ */
+ regs->sp = regs->flags;
+}
+NOKPROBE_SYMBOL(arch_rethook_trampoline_callback);
+
+/*
+ * When a target function returns, this code saves registers and calls
+ * arch_rethook_trampoline_callback(), which calls the rethook handler.
+ */
+asm(
+ ".text\n"
+ ".global arch_rethook_trampoline\n"
+ ".type arch_rethook_trampoline, @function\n"
+ "arch_rethook_trampoline:\n"
+#ifdef CONFIG_X86_64
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushq $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ /* Save the 'sp - 8', this will be fixed later. */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rdi\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushl $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ /* Save the 'sp - 4', this will be fixed later. */
+ " pushl %esp\n"
+ " pushfl\n"
+ SAVE_REGS_STRING
+ " movl %esp, %eax\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->sp'. */
+ " addl $4, %esp\n"
+ " popfl\n"
+#endif
+ " ret\n"
+ ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n"
+);
+NOKPROBE_SYMBOL(arch_rethook_trampoline);
+/*
+ * arch_rethook_trampoline() skips updating frame pointer. The frame pointer
+ * saved in arch_rethook_trampoline_callback() points to the real caller
+ * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have
+ * a standard stack frame with CONFIG_FRAME_POINTER=y.
+ * Let's mark it as a non-standard function. Anyway, the FP unwinder
+ * can correctly unwind without the hint.
+ */
+STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline);
+
+/* This is called from rethook_trampoline_handler(). */
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ unsigned long *frame_pointer = &regs->sp + 1;
+
+ /* Replace fake return address with real one. */
+ *frame_pointer = correct_ret_addr;
+}
+
+void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs)
+{
+ unsigned long *stack = (unsigned long *)regs->sp;
+
+ rh->ret_addr = stack[0];
+ rh->frame = regs->sp;
+
+ /* Replace the return addr with trampoline addr */
+ stack[0] = (unsigned long) arch_rethook_trampoline;
+}
+NOKPROBE_SYMBOL(arch_rethook_prepare);
Add an exit_handler to fprobe. fprobe + rethook allows us
to hook the kernel function return without the fgraph tracer.
Eventually, the fgraph tracer will provide generic array-based
return hooking, and fprobe may use it if the user requests.
Since the array-based approach and the list-based approach
each have pros and cons (e.g. memory consumption vs. fewer
missed events), it is better to keep both, and fprobe
will provide the same exit-handler interface over either.
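A minimal sketch of the resulting interface (my_entry/my_exit are
hypothetical handlers; error handling is elided):

    struct fprobe fp = { 0 };
    const char *syms[] = { "func1", "func2" };

    fp.entry_handler = my_entry;    /* optional */
    fp.exit_handler = my_exit;      /* invoked via rethook on return */
    fp.syms = syms;
    fp.nentry = ARRAY_SIZE(syms);
    ret = register_fprobe(&fp);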
Signed-off-by: Masami Hiramatsu <[email protected]>
---
include/linux/fprobe.h | 4 +++
kernel/trace/Kconfig | 1 +
kernel/trace/fprobe.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 614d28d5828b..f566f59af760 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -5,6 +5,7 @@
#include <linux/compiler.h>
#include <linux/ftrace.h>
+#include <linux/rethook.h>
struct fprobe {
const char **syms;
@@ -14,7 +15,10 @@ struct fprobe {
struct ftrace_ops ftrace;
unsigned long nmissed;
unsigned int flags;
+ struct rethook *rethook;
+
void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
+ void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
};
#define FPROBE_FL_DISABLED 1
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 44c473ad9021..00bdd2a2f417 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -238,6 +238,7 @@ config FPROBE
bool "Kernel Function Probe (fprobe)"
depends on FUNCTION_TRACER
depends on DYNAMIC_FTRACE_WITH_REGS
+ select RETHOOK
default n
help
This option enables kernel function probe feature, which is
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 0247fc7d75e2..3333893e5217 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -7,12 +7,20 @@
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
+#include <linux/rethook.h>
#include <linux/slab.h>
#include <linux/sort.h>
+struct fprobe_rethook_node {
+ struct rethook_node node;
+ unsigned long entry_ip;
+};
+
static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
+ struct fprobe_rethook_node *fpr;
+ struct rethook_node *rh;
struct fprobe *fp;
int bit;
@@ -29,10 +37,34 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
if (fp->entry_handler)
fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+ if (fp->exit_handler) {
+ rh = rethook_try_get(fp->rethook);
+ if (!rh) {
+ fp->nmissed++;
+ goto out;
+ }
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+ fpr->entry_ip = ip;
+ rethook_hook_current(rh, ftrace_get_regs(fregs));
+ }
+
+out:
ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(fprobe_handler);
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
+{
+ struct fprobe *fp = (struct fprobe *)data;
+ struct fprobe_rethook_node *fpr;
+
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+ fp->exit_handler(fp, fpr->entry_ip, regs);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
static int convert_func_addresses(struct fprobe *fp)
{
unsigned int i;
@@ -64,6 +96,7 @@ static int convert_func_addresses(struct fprobe *fp)
*/
int register_fprobe(struct fprobe *fp)
{
+ unsigned int i, size;
int ret;
if (!fp || !fp->nentry || (!fp->syms && !fp->addrs) ||
@@ -78,10 +111,29 @@ int register_fprobe(struct fprobe *fp)
fp->ftrace.func = fprobe_handler;
fp->ftrace.flags = FTRACE_OPS_FL_SAVE_REGS;
+ /* Initialize rethook if needed */
+ if (fp->exit_handler) {
+ size = fp->nentry * num_possible_cpus() * 2;
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ for (i = 0; i < size; i++) {
+ struct rethook_node *node;
+
+ node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL);
+ if (!node) {
+ rethook_free(fp->rethook);
+ ret = -ENOMEM;
+ goto out;
+ }
+ rethook_add_node(fp->rethook, node);
+ }
+ } else
+ fp->rethook = NULL;
+
ret = ftrace_set_filter_ips(&fp->ftrace, fp->addrs, fp->nentry, 0, 0);
if (!ret)
ret = register_ftrace_function(&fp->ftrace);
+out:
if (ret < 0 && fp->syms) {
kfree(fp->addrs);
fp->addrs = NULL;
@@ -104,9 +156,12 @@ int unregister_fprobe(struct fprobe *fp)
ret = unregister_ftrace_function(&fp->ftrace);
- if (!ret && fp->syms) {
- kfree(fp->addrs);
- fp->addrs = NULL;
+ if (!ret) {
+ rethook_free(fp->rethook);
+ if (fp->syms) {
+ kfree(fp->addrs);
+ fp->addrs = NULL;
+ }
}
return ret;
}
Add a sample program for the fprobe.
Signed-off-by: Masami Hiramatsu <[email protected]>
---
Changes in v2:
- Fix infinite loop for multiple symbols.
- Fix memory leaks for copied string and entry array.
- Update for new fprobe APIs.
- Fix style issues.
---
samples/Kconfig | 7 +++
samples/Makefile | 1
samples/fprobe/Makefile | 3 +
samples/fprobe/fprobe_example.c | 103 +++++++++++++++++++++++++++++++++++++++
4 files changed, 114 insertions(+)
create mode 100644 samples/fprobe/Makefile
create mode 100644 samples/fprobe/fprobe_example.c
diff --git a/samples/Kconfig b/samples/Kconfig
index 43d2e9aa557f..e010c2c1256c 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -73,6 +73,13 @@ config SAMPLE_HW_BREAKPOINT
help
This builds kernel hardware breakpoint example modules.
+config SAMPLE_FPROBE
+ tristate "Build fprobe examples -- loadable modules only"
+ depends on FPROBE && m
+ help
+ This builds an fprobe example module. This module has an option 'symbol'.
+ You can specify a probed symbol or symbols separated with ','.
+
config SAMPLE_KFIFO
tristate "Build kfifo examples -- loadable modules only"
depends on m
diff --git a/samples/Makefile b/samples/Makefile
index 4bcd6b93bffa..4f73fe7aa473 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/
+obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/
diff --git a/samples/fprobe/Makefile b/samples/fprobe/Makefile
new file mode 100644
index 000000000000..ecccbfa6e99b
--- /dev/null
+++ b/samples/fprobe/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_FPROBE) += fprobe_example.o
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
new file mode 100644
index 000000000000..c28320537f98
--- /dev/null
+++ b/samples/fprobe/fprobe_example.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Here's a sample kernel module showing the use of fprobe to log the
+ * entry and exit of kernel_clone().
+ *
+ * For more information on theory of operation of kprobes, see
+ * Documentation/trace/kprobes.rst
+ *
+ * You will see the trace data in /var/log/messages and on the console
+ * whenever kernel_clone() is invoked to create a new process.
+ */
+
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fprobe.h>
+#include <linux/slab.h>
+
+#define MAX_SYMBOL_LEN 4096
+struct fprobe sample_probe;
+static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
+module_param_string(symbol, symbol, sizeof(symbol), 0644);
+
+static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+{
+ pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip);
+}
+
+static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+{
+ unsigned long rip = instruction_pointer(regs);
+
+ pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n",
+ (void *)ip, (void *)ip, (void *)rip, (void *)rip);
+}
+
+static char *symbuf;
+
+static int __init fprobe_init(void)
+{
+ const char **syms;
+ char *p;
+ int ret, count, i;
+
+ sample_probe.entry_handler = sample_entry_handler;
+ sample_probe.exit_handler = sample_exit_handler;
+
+ if (strchr(symbol, ',')) {
+ symbuf = kstrdup(symbol, GFP_KERNEL);
+ if (!symbuf)
+ return -ENOMEM;
+ p = symbuf;
+ count = 1;
+ while ((p = strchr(++p, ',')) != NULL)
+ count++;
+ } else {
+ count = 1;
+ symbuf = symbol;
+ }
+ pr_info("%d symbols found\n", count);
+
+ syms = kcalloc(count, sizeof(char *), GFP_KERNEL);
+ if (!syms) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ p = symbuf;
+ for (i = 0; i < count; i++)
+ syms[i] = strsep(&p, ",");
+
+ sample_probe.syms = syms;
+ sample_probe.nentry = count;
+
+ ret = register_fprobe(&sample_probe);
+ if (ret < 0) {
+ pr_err("register_fprobe failed, returned %d\n", ret);
+ goto error;
+ }
+ pr_info("Planted fprobe at %s\n", symbol);
+ return 0;
+
+error:
+ if (symbuf != symbol)
+ kfree(symbuf);
+ return ret;
+}
+
+static void __exit fprobe_exit(void)
+{
+ unregister_fprobe(&sample_probe);
+
+ kfree(sample_probe.syms);
+ if (symbuf != symbol)
+ kfree(symbuf);
+
+ pr_info("fprobe at %s unregistered\n", symbol);
+}
+
+module_init(fprobe_init)
+module_exit(fprobe_exit)
+MODULE_LICENSE("GPL");
From: Jiri Olsa <[email protected]>
Adding a new link type BPF_LINK_TYPE_KPROBE to attach so-called
"kprobes" directly through the fprobe API. Note that since
using kprobes with the same handler on many probe points is not
efficient, this uses fprobe, which natively supports multiple
probe points for one handler, but is limited to function entry
and exit.
Adding a new attach type BPF_TRACE_RAW_KPROBE that enables
such a link for kprobe programs.
The new link allows creating a multiple-kprobes link by using
the new link_create interface:
struct {
__aligned_u64 addrs;
__u32 cnt;
__u64 bpf_cookie;
} kprobe;
Plus a new flag BPF_F_KPROBE_RETURN for link_create.flags to
create a return probe.
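For illustration, a minimal userspace sketch using the raw syscall
(prog_fd and the addrs values are assumed to be prepared elsewhere;
error handling is elided):

    union bpf_attr attr = {};
    __u64 addrs[] = { addr1, addr2 };   /* resolved kernel text addresses */

    attr.link_create.prog_fd = prog_fd;
    attr.link_create.attach_type = BPF_TRACE_RAW_KPROBE;
    attr.link_create.flags = BPF_F_KPROBE_RETURN;   /* omit for entry probes */
    attr.link_create.kprobe.addrs = (__u64)(unsigned long)addrs;
    attr.link_create.kprobe.cnt = 2;

    link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));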
Signed-off-by: Jiri Olsa <[email protected]>
Signed-off-by: Masami Hiramatsu <[email protected]>
---
include/linux/bpf_types.h | 1
include/uapi/linux/bpf.h | 12 ++
kernel/bpf/syscall.c | 195 +++++++++++++++++++++++++++++++++++++++-
tools/include/uapi/linux/bpf.h | 12 ++
4 files changed, 215 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 48a91c51c015..a9000feab34e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -140,3 +140,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp)
#ifdef CONFIG_PERF_EVENTS
BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
#endif
+BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE, kprobe)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ba5af15e25f5..10e9b56a074e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -995,6 +995,7 @@ enum bpf_attach_type {
BPF_SK_REUSEPORT_SELECT,
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
BPF_PERF_EVENT,
+ BPF_TRACE_RAW_KPROBE,
__MAX_BPF_ATTACH_TYPE
};
@@ -1009,6 +1010,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_NETNS = 5,
BPF_LINK_TYPE_XDP = 6,
BPF_LINK_TYPE_PERF_EVENT = 7,
+ BPF_LINK_TYPE_KPROBE = 8,
MAX_BPF_LINK_TYPE,
};
@@ -1111,6 +1113,11 @@ enum bpf_link_type {
*/
#define BPF_F_SLEEPABLE (1U << 4)
+/* link_create flags used in LINK_CREATE command for BPF_TRACE_RAW_KPROBE
+ * attach type.
+ */
+#define BPF_F_KPROBE_RETURN (1U << 0)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* the following extensions:
*
@@ -1463,6 +1470,11 @@ union bpf_attr {
*/
__u64 bpf_cookie;
} perf_event;
+ struct {
+ __aligned_u64 addrs;
+ __u32 cnt;
+ __u64 bpf_cookie;
+ } kprobe;
};
} link_create;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1033ee8c0caf..066fac3cfaa5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,7 @@
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
+#include <linux/fprobe.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -3013,8 +3014,182 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
fput(perf_file);
return err;
}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -ENOTSUPP;
+}
#endif /* CONFIG_PERF_EVENTS */
+#ifdef CONFIG_FPROBE
+
+/* Note that this is called 'kprobe_link' but uses fprobe inside */
+struct bpf_kprobe_link {
+ struct bpf_link link;
+ struct fprobe fp;
+ bool is_return;
+ unsigned long *addrs;
+ u32 cnt;
+ u64 bpf_cookie;
+};
+
+static void bpf_kprobe_link_release(struct bpf_link *link)
+{
+ struct bpf_kprobe_link *kprobe_link;
+
+ kprobe_link = container_of(link, struct bpf_kprobe_link, link);
+
+ unregister_fprobe(&kprobe_link->fp);
+}
+
+static void bpf_kprobe_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_kprobe_link *kprobe_link;
+
+ kprobe_link = container_of(link, struct bpf_kprobe_link, link);
+ kfree(kprobe_link->addrs);
+ kfree(kprobe_link);
+}
+
+static const struct bpf_link_ops bpf_kprobe_link_lops = {
+ .release = bpf_kprobe_link_release,
+ .dealloc = bpf_kprobe_link_dealloc,
+};
+
+static int kprobe_link_prog_run(struct bpf_kprobe_link *kprobe_link,
+ struct pt_regs *regs)
+{
+ struct bpf_trace_run_ctx run_ctx;
+ struct bpf_run_ctx *old_run_ctx;
+ int err;
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ err = 0;
+ goto out;
+ }
+
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ run_ctx.bpf_cookie = kprobe_link->bpf_cookie;
+
+ rcu_read_lock();
+ migrate_disable();
+ err = bpf_prog_run(kprobe_link->link.prog, regs);
+ migrate_enable();
+ rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ return err;
+}
+
+static void kprobe_link_entry_handler(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_kprobe_link *kprobe_link;
+
+ /*
+ * Because fprobe's regs->ip is set to the instruction following the
+ * dynamic-ftrace call, the correct entry ip must be set so that
+ * the bpf program can access the entry address via regs, the same
+ * as kprobes.
+ */
+ instruction_pointer_set(regs, entry_ip);
+ kprobe_link = container_of(fp, struct bpf_kprobe_link, fp);
+ kprobe_link_prog_run(kprobe_link, regs);
+}
+
+static void kprobe_link_exit_handler(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_kprobe_link *kprobe_link;
+
+ kprobe_link = container_of(fp, struct bpf_kprobe_link, fp);
+ kprobe_link_prog_run(kprobe_link, regs);
+}
+
+static int bpf_kprobe_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_kprobe_link *link = NULL;
+ unsigned long *addrs;
+ u32 flags, cnt, size;
+ void __user *uaddrs;
+ u64 **tmp;
+ int err;
+
+ flags = attr->link_create.flags;
+ if (flags & ~BPF_F_KPROBE_RETURN)
+ return -EINVAL;
+
+ uaddrs = u64_to_user_ptr(attr->link_create.kprobe.addrs);
+ cnt = attr->link_create.kprobe.cnt;
+ size = cnt * sizeof(*tmp);
+
+ tmp = kzalloc(size, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ if (copy_from_user(tmp, uaddrs, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ /* TODO add extra copy for 32bit archs */
+ if (sizeof(u64) != sizeof(void *)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ addrs = (unsigned long *) tmp;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE, &bpf_kprobe_link_lops, prog);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ link->is_return = flags & BPF_F_KPROBE_RETURN;
+ link->addrs = addrs;
+ link->cnt = cnt;
+ link->bpf_cookie = attr->link_create.kprobe.bpf_cookie;
+
+ link->fp.addrs = addrs;
+ link->fp.nentry = cnt;
+
+ if (link->is_return)
+ link->fp.exit_handler = kprobe_link_exit_handler;
+ else
+ link->fp.entry_handler = kprobe_link_entry_handler;
+
+ err = register_fprobe(&link->fp);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto error;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+error:
+ kfree(link);
+ kfree(tmp);
+
+ return err;
+}
+#else /* !CONFIG_FPROBE */
+static int bpf_kprobe_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -ENOTSUPP;
+}
+#endif
+
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
@@ -4241,7 +4416,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe.bpf_cookie
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
@@ -4265,7 +4440,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
if (attr->link_create.attach_type != BPF_PERF_EVENT) {
ret = -EINVAL;
@@ -4273,6 +4447,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
}
ptype = prog->type;
break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT &&
+ attr->link_create.attach_type != BPF_TRACE_RAW_KPROBE) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ptype = prog->type;
+ break;
default:
ptype = attach_type_to_prog_type(attr->link_create.attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
@@ -4304,13 +4486,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
-#ifdef CONFIG_PERF_EVENTS
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
- case BPF_PROG_TYPE_KPROBE:
ret = bpf_perf_link_attach(attr, prog);
break;
-#endif
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else
+ ret = bpf_kprobe_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ba5af15e25f5..10e9b56a074e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -995,6 +995,7 @@ enum bpf_attach_type {
BPF_SK_REUSEPORT_SELECT,
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
BPF_PERF_EVENT,
+ BPF_TRACE_RAW_KPROBE,
__MAX_BPF_ATTACH_TYPE
};
@@ -1009,6 +1010,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_NETNS = 5,
BPF_LINK_TYPE_XDP = 6,
BPF_LINK_TYPE_PERF_EVENT = 7,
+ BPF_LINK_TYPE_KPROBE = 8,
MAX_BPF_LINK_TYPE,
};
@@ -1111,6 +1113,11 @@ enum bpf_link_type {
*/
#define BPF_F_SLEEPABLE (1U << 4)
+/* link_create flags used in LINK_CREATE command for BPF_TRACE_RAW_KPROBE
+ * attach type.
+ */
+#define BPF_F_KPROBE_RETURN (1U << 0)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* the following extensions:
*
@@ -1463,6 +1470,11 @@ union bpf_attr {
*/
__u64 bpf_cookie;
} perf_event;
+ struct {
+ __aligned_u64 addrs;
+ __u32 cnt;
+ __u64 bpf_cookie;
+ } kprobe;
};
} link_create;
This is not intended to be merged into the upstream code (since it
exposes some kernel-internal functions just for an example),
but it is good for showing how time-efficient the fprobe is
when registering a probe on thousands of functions.
# time insmod fprobe_example.ko symbol='btrfs_*'
[ 36.130947] fprobe_init: 1028 symbols found
[ 36.177901] fprobe_init: Planted fprobe at btrfs_*
real 0m 0.08s
user 0m 0.00s
sys 0m 0.07s
Signed-off-by: Masami Hiramatsu <[email protected]>
---
kernel/kallsyms.c | 1 +
kernel/trace/ftrace.c | 1 +
samples/fprobe/fprobe_example.c | 69 ++++++++++++++++++++++++++++++++++-----
3 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3011bc33a5ba..d0c4073acbfd 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -246,6 +246,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
}
return 0;
}
+EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
#endif /* CONFIG_LIVEPATCH */
static unsigned long get_symbol_pos(unsigned long addr,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d38ae5063be3..feb69ecc5d2c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1580,6 +1580,7 @@ unsigned long ftrace_location(unsigned long ip)
{
return ftrace_location_range(ip, ip);
}
+EXPORT_SYMBOL_GPL(ftrace_location);
/**
* ftrace_text_reserved - return true if range contains an ftrace location
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index c28320537f98..df034e00661e 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "%s: " fmt, __func__
+#include <linux/glob.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fprobe.h>
@@ -37,16 +38,51 @@ static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_r
static char *symbuf;
+struct sym_search_param {
+ unsigned long *addrs;
+ const char *pat;
+ int cnt;
+};
+
+#define MAX_FPROBE_ENTS (16 * 1024)
+
+static int wildcard_match(void *data, const char *symbol, struct module *mod,
+ unsigned long addr)
+{
+ struct sym_search_param *param = (struct sym_search_param *)data;
+
+ if (glob_match(param->pat, symbol)) {
+ if (!ftrace_location(addr))
+ return 0;
+
+ if (param->addrs)
+ param->addrs[param->cnt] = addr;
+ param->cnt++;
+ if (param->cnt >= MAX_FPROBE_ENTS)
+ return -E2BIG;
+ }
+ return 0;
+}
+
static int __init fprobe_init(void)
{
- const char **syms;
+ struct sym_search_param param = {.pat = symbol, .addrs = NULL, .cnt = 0};
+ unsigned long *addrs = NULL;
+ const char **syms = NULL;
char *p;
int ret, count, i;
+ bool wildcard = false;
sample_probe.entry_handler = sample_entry_handler;
sample_probe.exit_handler = sample_exit_handler;
- if (strchr(symbol, ',')) {
+ if (strchr(symbol, '*')) {
+ kallsyms_on_each_symbol(wildcard_match, &param);
+ count = param.cnt;
+ if (!count)
+ return -ENOENT;
+ wildcard = true;
+ } else if (strchr(symbol, ',')) {
symbuf = kstrdup(symbol, GFP_KERNEL);
if (!symbuf)
return -ENOMEM;
@@ -58,19 +94,31 @@ static int __init fprobe_init(void)
count = 1;
symbuf = symbol;
}
- pr_info("%d symbols found\n", count);
- syms = kcalloc(count, sizeof(char *), GFP_KERNEL);
- if (!syms) {
+ if (wildcard)
+ addrs = kcalloc(count, sizeof(unsigned long), GFP_KERNEL);
+ else
+ syms = kcalloc(count, sizeof(char *), GFP_KERNEL);
+ if (!syms && !addrs) {
ret = -ENOMEM;
goto error;
}
- p = symbuf;
- for (i = 0; i < count; i++)
- syms[i] = strsep(&p, ",");
+ if (wildcard) {
+ param.addrs = addrs;
+ param.cnt = 0;
+
+ kallsyms_on_each_symbol(wildcard_match, &param);
+ count = param.cnt;
+ sample_probe.addrs = addrs;
+ } else {
+ p = symbuf;
+ for (i = 0; i < count; i++)
+ syms[i] = strsep(&p, ",");
+ sample_probe.syms = syms;
+ }
+ pr_info("%d symbols found\n", count);
- sample_probe.syms = syms;
sample_probe.nentry = count;
ret = register_fprobe(&sample_probe);
@@ -82,6 +130,8 @@ static int __init fprobe_init(void)
return 0;
error:
+ kfree(addrs);
+ kfree(syms);
if (symbuf != symbol)
kfree(symbuf);
return ret;
@@ -92,6 +142,7 @@ static void __exit fprobe_exit(void)
unregister_fprobe(&sample_probe);
kfree(sample_probe.syms);
+ kfree(sample_probe.addrs);
if (symbuf != symbol)
kfree(symbuf);
On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> Hi Jiri and Alexei,
>
> Here is the 2nd version of fprobe. This version uses the
> ftrace_set_filter_ips() for reducing the registering overhead.
> Note that this also drops per-probe point private data, which
> is not used anyway.
>
> This introduces the fprobe, the function entry/exit probe with
> multiple probe point support. This also introduces the rethook
> for hooking function return as same as kretprobe does. This
nice, I was going through the multi-user-graph support
and was wondering whether this might be a better way
> abstraction will help us to generalize the fgraph tracer,
> because we can just switch it from rethook in fprobe, depending
> on the kernel configuration.
>
> The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> patches will not be affected by this change.
I'll try the bpf selftests on top of this
>
> [1] https://lore.kernel.org/all/[email protected]/T/#u
>
> I also added an out-of-tree (just for testing) patch at the
> end of this series ([8/8]) for adding a wildcard support to
> the sample program. With that patch, it shows how long the
> registration will take;
>
> # time insmod fprobe_example.ko symbol='btrfs_*'
> [ 36.130947] fprobe_init: 1028 symbols found
> [ 36.177901] fprobe_init: Planted fprobe at btrfs_*
> real 0m 0.08s
> user 0m 0.00s
> sys 0m 0.07s
I'll run my bpftrace tests on top of that
thanks,
jirka
>
> Thank you,
>
> ---
>
> Jiri Olsa (2):
> ftrace: Add ftrace_set_filter_ips function
> bpf: Add kprobe link for attaching raw kprobes
>
> Masami Hiramatsu (6):
> fprobe: Add ftrace based probe APIs
> rethook: Add a generic return hook
> rethook: x86: Add rethook x86 implementation
> fprobe: Add exit_handler support
> fprobe: Add sample program for fprobe
> [DO NOT MERGE] Out-of-tree: Support wildcard symbol option to sample
>
>
> arch/x86/Kconfig | 1
> arch/x86/kernel/Makefile | 1
> arch/x86/kernel/rethook.c | 115 ++++++++++++++++++++
> include/linux/bpf_types.h | 1
> include/linux/fprobe.h | 57 ++++++++++
> include/linux/ftrace.h | 3 +
> include/linux/rethook.h | 74 +++++++++++++
> include/linux/sched.h | 3 +
> include/uapi/linux/bpf.h | 12 ++
> kernel/bpf/syscall.c | 195 +++++++++++++++++++++++++++++++++-
> kernel/exit.c | 2
> kernel/fork.c | 3 +
> kernel/kallsyms.c | 1
> kernel/trace/Kconfig | 22 ++++
> kernel/trace/Makefile | 2
> kernel/trace/fprobe.c | 168 +++++++++++++++++++++++++++++
> kernel/trace/ftrace.c | 54 ++++++++-
> kernel/trace/rethook.c | 226 +++++++++++++++++++++++++++++++++++++++
> samples/Kconfig | 7 +
> samples/Makefile | 1
> samples/fprobe/Makefile | 3 +
> samples/fprobe/fprobe_example.c | 154 +++++++++++++++++++++++++++
> tools/include/uapi/linux/bpf.h | 12 ++
> 23 files changed, 1103 insertions(+), 14 deletions(-)
> create mode 100644 arch/x86/kernel/rethook.c
> create mode 100644 include/linux/fprobe.h
> create mode 100644 include/linux/rethook.h
> create mode 100644 kernel/trace/fprobe.c
> create mode 100644 kernel/trace/rethook.c
> create mode 100644 samples/fprobe/Makefile
> create mode 100644 samples/fprobe/fprobe_example.c
>
> --
> Masami Hiramatsu (Linaro) <[email protected]>
>
On Wed, Jan 12, 2022 at 11:03:22PM +0900, Masami Hiramatsu wrote:
> Add a return hook framework which hooks the function
> return. Most of the idea came from the kretprobe, but
> this is independent from kretprobe.
> Note that this is expected to be used with other
> function entry hooking feature, like ftrace, fprobe,
> adn kprobes. Eventually this will replace the
> kretprobe (e.g. kprobe + rethook = kretprobe), but
> at this moment, this is just a additional hook.
this looks similar to the code kretprobe is using now
would it make sense to incrementally change the current code to provide
this rethook interface, instead of a big switch of the current kretprobe
to kprobe + the new rethook interface in the future?
thanks,
jirka
>
> Signed-off-by: Masami Hiramatsu <[email protected]>
> ---
> include/linux/rethook.h | 74 +++++++++++++++
> include/linux/sched.h | 3 +
> kernel/exit.c | 2
> kernel/fork.c | 3 +
> kernel/trace/Kconfig | 11 ++
> kernel/trace/Makefile | 1
> kernel/trace/rethook.c | 226 +++++++++++++++++++++++++++++++++++++++++++++++
> 7 files changed, 320 insertions(+)
> create mode 100644 include/linux/rethook.h
> create mode 100644 kernel/trace/rethook.c
>
> diff --git a/include/linux/rethook.h b/include/linux/rethook.h
> new file mode 100644
> index 000000000000..2622bcd5213a
> --- /dev/null
> +++ b/include/linux/rethook.h
> @@ -0,0 +1,74 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Return hooking with list-based shadow stack.
> + */
> +#ifndef _LINUX_RETHOOK_H
> +#define _LINUX_RETHOOK_H
> +
> +#include <linux/compiler.h>
> +#include <linux/freelist.h>
> +#include <linux/llist.h>
> +#include <linux/rcupdate.h>
> +#include <linux/refcount.h>
> +
> +struct rethook_node;
> +
> +typedef void (*rethook_handler_t) (struct rethook_node *, void *, struct pt_regs *);
> +
> +struct rethook {
> + void *data;
> + rethook_handler_t handler;
> + struct freelist_head pool;
> + refcount_t ref;
> + struct rcu_head rcu;
> +};
> +
> +struct rethook_node {
> + union {
> + struct freelist_node freelist;
> + struct rcu_head rcu;
> + };
> + struct llist_node llist;
> + struct rethook *rethook;
> + unsigned long ret_addr;
> + unsigned long frame;
> +};
> +
> +int rethook_node_init(struct rethook_node *node);
> +
> +struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
> +void rethook_free(struct rethook *rh);
> +void rethook_add_node(struct rethook *rh, struct rethook_node *node);
> +
> +struct rethook_node *rethook_try_get(struct rethook *rh);
> +void rethook_node_recycle(struct rethook_node *node);
> +void rethook_hook_current(struct rethook_node *node, struct pt_regs *regs);
> +
> +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
> + struct llist_node **cur);
> +
> +/* Arch dependent code must implement this and trampoline code */
> +void arch_rethook_prepare(struct rethook_node *node, struct pt_regs *regs);
> +void arch_rethook_trampoline(void);
> +
> +static inline bool is_rethook_trampoline(unsigned long addr)
> +{
> + return addr == (unsigned long)arch_rethook_trampoline;
> +}
> +
> +/* If the architecture needs to fix up the return address, implement this. */
> +void arch_rethook_fixup_return(struct pt_regs *regs,
> + unsigned long correct_ret_addr);
> +
> +/* Generic trampoline handler, arch code must prepare asm stub */
> +unsigned long rethook_trampoline_handler(struct pt_regs *regs,
> + unsigned long frame);
> +
> +#ifdef CONFIG_RETHOOK
> +void rethook_flush_task(struct task_struct *tk);
> +#else
> +#define rethook_flush_task(tsk) do { } while (0)
> +#endif
> +
> +#endif
> +
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 78c351e35fec..2bfabf5355b7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1473,6 +1473,9 @@ struct task_struct {
> #ifdef CONFIG_KRETPROBES
> struct llist_head kretprobe_instances;
> #endif
> +#ifdef CONFIG_RETHOOK
> + struct llist_head rethooks;
> +#endif
>
> #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
> /*
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f702a6a63686..a39a321c1f37 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -64,6 +64,7 @@
> #include <linux/compat.h>
> #include <linux/io_uring.h>
> #include <linux/kprobes.h>
> +#include <linux/rethook.h>
>
> #include <linux/uaccess.h>
> #include <asm/unistd.h>
> @@ -169,6 +170,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
> struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
>
> kprobe_flush_task(tsk);
> + rethook_flush_task(tsk);
> perf_event_delayed_put(tsk);
> trace_sched_process_free(tsk);
> put_task_struct(tsk);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3244cc56b697..ffae38be64c4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2282,6 +2282,9 @@ static __latent_entropy struct task_struct *copy_process(
> #ifdef CONFIG_KRETPROBES
> p->kretprobe_instances.first = NULL;
> #endif
> +#ifdef CONFIG_RETHOOK
> + p->rethooks.first = NULL;
> +#endif
>
> /*
> * Ensure that the cgroup subsystem policies allow the new process to be
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index 6834b0272798..44c473ad9021 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
> config NOP_TRACER
> bool
>
> +config HAVE_RETHOOK
> + bool
> +
> +config RETHOOK
> + bool
> + depends on HAVE_RETHOOK
> + help
> + Enable generic return hooking feature. This is an internal
> + API, which will be used by other function-entry hooking
> + features like fprobe and kprobes.
> +
> config HAVE_FUNCTION_TRACER
> bool
> help
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index 79255f9de9a4..c6f11a139eac 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -98,6 +98,7 @@ obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
> obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
> obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
> obj-$(CONFIG_FPROBE) += fprobe.o
> +obj-$(CONFIG_RETHOOK) += rethook.o
>
> obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
>
> diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
> new file mode 100644
> index 000000000000..80c0584e8497
> --- /dev/null
> +++ b/kernel/trace/rethook.c
> @@ -0,0 +1,226 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#define pr_fmt(fmt) "rethook: " fmt
> +
> +#include <linux/bug.h>
> +#include <linux/kallsyms.h>
> +#include <linux/kprobes.h>
> +#include <linux/preempt.h>
> +#include <linux/rethook.h>
> +#include <linux/slab.h>
> +#include <linux/sort.h>
> +
> +/* Return hook list (shadow stack by list) */
> +
> +void rethook_flush_task(struct task_struct *tk)
> +{
> + struct rethook_node *rhn;
> + struct llist_node *node;
> +
> + preempt_disable();
> +
> + node = __llist_del_all(&tk->rethooks);
> + while (node) {
> + rhn = container_of(node, struct rethook_node, llist);
> + node = node->next;
> + rethook_node_recycle(rhn);
> + }
> +
> + preempt_enable();
> +}
> +
> +static void rethook_free_rcu(struct rcu_head *head)
> +{
> + struct rethook *rh = container_of(head, struct rethook, rcu);
> + struct rethook_node *rhn;
> + struct freelist_node *node;
> + int count = 1;
> +
> + node = rh->pool.head;
> + while (node) {
> + rhn = container_of(node, struct rethook_node, freelist);
> + node = node->next;
> + kfree(rhn);
> + count++;
> + }
> +
> + /* The rh->ref is the number of pooled nodes + 1 */
> + if (refcount_sub_and_test(count, &rh->ref))
> + kfree(rh);
> +}
> +
> +void rethook_free(struct rethook *rh)
> +{
> + rh->handler = NULL;
> + rh->data = NULL;
> +
> + call_rcu(&rh->rcu, rethook_free_rcu);
> +}
> +
> +/*
> + * @handler must not be NULL. @handler == NULL means this rethook is
> + * going to be freed.
> + */
> +struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
> +{
> + struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
> +
> + if (!rh || !handler)
> + return NULL;
> +
> + rh->data = data;
> + rh->handler = handler;
> + rh->pool.head = NULL;
> + refcount_set(&rh->ref, 1);
> +
> + return rh;
> +}
> +
> +void rethook_add_node(struct rethook *rh, struct rethook_node *node)
> +{
> + node->rethook = rh;
> + freelist_add(&node->freelist, &rh->pool);
> + refcount_inc(&rh->ref);
> +}
> +
> +static void free_rethook_node_rcu(struct rcu_head *head)
> +{
> + struct rethook_node *node = container_of(head, struct rethook_node, rcu);
> +
> + if (refcount_dec_and_test(&node->rethook->ref))
> + kfree(node->rethook);
> + kfree(node);
> +}
> +
> +void rethook_node_recycle(struct rethook_node *node)
> +{
> + if (likely(READ_ONCE(node->rethook->handler)))
> + freelist_add(&node->freelist, &node->rethook->pool);
> + else
> + call_rcu(&node->rcu, free_rethook_node_rcu);
> +}
> +
> +struct rethook_node *rethook_try_get(struct rethook *rh)
> +{
> + struct freelist_node *fn;
> +
> + /* Check whether @rh is going to be freed. */
> + if (unlikely(!READ_ONCE(rh->handler)))
> + return NULL;
> +
> + fn = freelist_try_get(&rh->pool);
> + if (!fn)
> + return NULL;
> +
> + return container_of(fn, struct rethook_node, freelist);
> +}
> +
> +void rethook_hook_current(struct rethook_node *node, struct pt_regs *regs)
> +{
> + arch_rethook_prepare(node, regs);
> + __llist_add(&node->llist, ¤t->rethooks);
> +}
> +
> +/* This assumes the 'tsk' is the current task or is not running. */
> +static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
> + struct llist_node **cur)
> +{
> + struct rethook_node *rh = NULL;
> + struct llist_node *node = *cur;
> +
> + if (!node)
> + node = tsk->rethooks.first;
> + else
> + node = node->next;
> +
> + while (node) {
> + rh = container_of(node, struct rethook_node, llist);
> + if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
> + *cur = node;
> + return rh->ret_addr;
> + }
> + node = node->next;
> + }
> + return 0;
> +}
> +NOKPROBE_SYMBOL(__rethook_find_ret_addr);
> +
> +/**
> + * rethook_find_ret_addr -- Find correct return address modified by rethook
> + * @tsk: Target task
> + * @frame: A frame pointer
> + * @cur: a storage of the loop cursor llist_node pointer for next call
> + *
> + * Find the correct return address modified by a rethook on @tsk in unsigned
> + * long type. If it finds the return address, this returns that address value;
> + * otherwise it returns 0.
> + * The @tsk must be 'current' or a task which is not running. @frame is a hint
> + * to get the correct return address - which is compared with the
> + * rethook::frame field. The @cur is a loop cursor for searching the
> + * rethook return addresses on the @tsk. The '*@cur' should be NULL at the
> + * first call, but '@cur' itself must not be NULL.
> + */
> +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
> + struct llist_node **cur)
> +{
> + struct rethook_node *rhn = NULL;
> + unsigned long ret;
> +
> + if (WARN_ON_ONCE(!cur))
> + return 0;
> +
> + do {
> + ret = __rethook_find_ret_addr(tsk, cur);
> + if (!ret)
> + break;
> + rhn = container_of(*cur, struct rethook_node, llist);
> + } while (rhn->frame != frame);
> +
> + return ret;
> +}
> +NOKPROBE_SYMBOL(rethook_find_ret_addr);
> +
> +void __weak arch_rethook_fixup_return(struct pt_regs *regs,
> + unsigned long correct_ret_addr)
> +{
> + /*
> + * Do nothing by default. If the architecture uses a frame
> + * pointer to record the real return address on the stack,
> + * it should implement this function to fix up the return address
> + * so that stacktrace works from the rethook handler.
> + */
> +}
> +
> +unsigned long rethook_trampoline_handler(struct pt_regs *regs,
> + unsigned long frame)
> +{
> + struct rethook_node *rhn;
> + struct llist_node *first, *node = NULL;
> + unsigned long correct_ret_addr = __rethook_find_ret_addr(current, &node);
> +
> + if (!correct_ret_addr) {
> + pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
> + BUG_ON(1);
> + }
> +
> + instruction_pointer_set(regs, correct_ret_addr);
> + arch_rethook_fixup_return(regs, correct_ret_addr);
> +
> + first = current->rethooks.first;
> + current->rethooks.first = node->next;
> + node->next = NULL;
> +
> + while (first) {
> + rhn = container_of(first, struct rethook_node, llist);
> + if (WARN_ON_ONCE(rhn->frame != frame))
> + break;
> + if (rhn->rethook->handler)
> + rhn->rethook->handler(rhn, rhn->rethook->data, regs);
> +
> + first = first->next;
> + rethook_node_recycle(rhn);
> + }
> +
> + return correct_ret_addr;
> +}
> +
>
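For illustration, the expected usage of this API from a function-entry hook
looks roughly like the following minimal sketch (not code from this series;
the my_* names are made up and cleanup/error handling is omitted):

/* Return handler: runs in the return trampoline after the probed
 * function returns; regs reflects the state at function exit. */
static void my_return_handler(struct rethook_node *node, void *data,
			      struct pt_regs *regs)
{
	/* inspect regs, node->ret_addr, etc. */
}

static struct rethook *my_rethook;

/* Setup: allocate a rethook and pre-fill its node pool. */
static int my_init(int num)
{
	struct rethook_node *node;
	int i;

	my_rethook = rethook_alloc(NULL /* or a pointer to the owner */,
				   my_return_handler);
	if (!my_rethook)
		return -ENOMEM;

	for (i = 0; i < num; i++) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;
		rethook_add_node(my_rethook, node);
	}
	return 0;
}

/* Entry side (e.g. called from an ftrace callback): take a node from
 * the pool and hook the return of the current function call. The arch
 * code in arch_rethook_prepare() saves the real return address in
 * node->ret_addr and replaces it with arch_rethook_trampoline, so
 * my_return_handler() is invoked when the probed function returns. */
static void my_entry(struct pt_regs *regs)
{
	struct rethook_node *node = rethook_try_get(my_rethook);

	if (!node)
		return;		/* pool exhausted; treat as a missed event */

	rethook_hook_current(node, regs);
}

On teardown, rethook_free() clears the handler so that in-flight nodes are
recycled through RCU, as the patch above implements.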
On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > Hi Jiri and Alexei,
> >
> > Here is the 2nd version of fprobe. This version uses the
> > ftrace_set_filter_ips() for reducing the registering overhead.
> > Note that this also drops per-probe point private data, which
> > is not used anyway.
> >
> > This introduces the fprobe, the function entry/exit probe with
> > multiple probe point support. This also introduces the rethook
> > for hooking function return as same as kretprobe does. This
>
> nice, I was going through the multi-user-graph support
> and was wondering that this might be a better way
>
> > abstraction will help us to generalize the fgraph tracer,
> > because we can just switch it from rethook in fprobe, depending
> > on the kernel configuration.
> >
> > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > patches will not be affected by this change.
>
> I'll try the bpf selftests on top of this
I'm getting crash and stall when running bpf selftests,
the fprobe sample module works fine, I'll check on that
jirka
On Thu, 13 Jan 2022 13:25:52 +0100
Jiri Olsa <[email protected]> wrote:
> On Wed, Jan 12, 2022 at 11:03:22PM +0900, Masami Hiramatsu wrote:
> > Add a return hook framework which hooks the function
> > return. Most of the idea came from the kretprobe, but
> > this is independent from kretprobe.
> > Note that this is expected to be used with other
> > function entry hooking features, like ftrace, fprobe,
> > and kprobes. Eventually this will replace the
> > kretprobe (e.g. kprobe + rethook = kretprobe), but
> > at this moment, this is just an additional hook.
>
> this looks similar to the code kretprobe is using now
Yes, I've mostly re-typed the code :)
> would it make sense to incrementally change the current code to provide
> this rethook interface, instead of a big switch of the current kretprobe
> to kprobe + new rethook interface in the future?
Would you mean modifying the kretprobe instance code to provide
a similar one, and rename it at some point?
My original idea is to keep the current kretprobe code and build
up a similar one, and switch to it at some point. Actually,
I don't want to change the current kretprobe interface itself,
but the backend will be changed. For example, the current kretprobe
has the interface below.
struct kretprobe {
struct kprobe kp;
kretprobe_handler_t handler;
kretprobe_handler_t entry_handler;
int maxactive;
int nmissed;
size_t data_size;
struct freelist_head freelist;
struct kretprobe_holder *rph;
};
My idea is to switch it to the following:
struct kretprobe {
struct kprobe kp;
kretprobe_handler_t handler;
kretprobe_handler_t entry_handler;
int maxactive;
int nmissed;
size_t data_size;
struct rethook *rethook;
};
Of course 'kretprobe_instance' may need to be changed...
struct kretprobe_instance {
struct rethook_node;
char data[];
};
Even so, since there is the 'get_kretprobe(ri)' wrapper, users
will be able to access the 'struct kretprobe' from the kretprobe_instance
transparently.
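For example, the wrapper could look something like this (just a rough
sketch, assuming the rethook_node is embedded as the first member and
rethook::data points back to the owning kretprobe):

struct kretprobe_instance {
	struct rethook_node node;
	char data[];
};

static inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
{
	/* rethook::data was set to the kretprobe at rethook_alloc() time */
	return (struct kretprobe *)READ_ONCE(ri->node.rethook->data);
}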
Thank you,
--
Masami Hiramatsu <[email protected]>
On Thu, 13 Jan 2022 13:27:34 +0100
Jiri Olsa <[email protected]> wrote:
> On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > Hi Jiri and Alexei,
> > >
> > > Here is the 2nd version of fprobe. This version uses the
> > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > Note that this also drops per-probe point private data, which
> > > is not used anyway.
> > >
> > > This introduces the fprobe, the function entry/exit probe with
> > > multiple probe point support. This also introduces the rethook
> > > for hooking function return as same as kretprobe does. This
> >
> > nice, I was going through the multi-user-graph support
> > and was wondering that this might be a better way
> >
> > > abstraction will help us to generalize the fgraph tracer,
> > > because we can just switch it from rethook in fprobe, depending
> > > on the kernel configuration.
> > >
> > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > patches will not be affected by this change.
> >
> > I'll try the bpf selftests on top of this
>
> I'm getting crash and stall when running bpf selftests,
> the fprobe sample module works fine, I'll check on that
Hmm, would you mean tools/testing/selftests/bpf/prog_tests/ ?
Let me check it too.
Thank you,
--
Masami Hiramatsu <[email protected]>
Hi Jiri and Alexei,
On Thu, 13 Jan 2022 13:27:34 +0100
Jiri Olsa <[email protected]> wrote:
> On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > Hi Jiri and Alexei,
> > >
> > > Here is the 2nd version of fprobe. This version uses the
> > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > Note that this also drops per-probe point private data, which
> > > is not used anyway.
> > >
> > > This introduces the fprobe, the function entry/exit probe with
> > > multiple probe point support. This also introduces the rethook
> > > for hooking function return as same as kretprobe does. This
> >
> > nice, I was going through the multi-user-graph support
> > and was wondering that this might be a better way
> >
> > > abstraction will help us to generalize the fgraph tracer,
> > > because we can just switch it from rethook in fprobe, depending
> > > on the kernel configuration.
> > >
> > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > patches will not be affected by this change.
> >
> > I'll try the bpf selftests on top of this
>
> I'm getting crash and stall when running bpf selftests,
> the fprobe sample module works fine, I'll check on that
I've tried to build tools/testing/selftests/bpf on my machine,
but I got the errors below. Would you know how I can set up to build
the bpf selftests correctly? (I tried "make M=samples/bpf", but got the same result)
~/ksrc/linux/tools/testing/selftests/bpf$ make
[...]
CLANG /home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool/pid_iter.bpf.o
skeleton/pid_iter.bpf.c:35:10: error: incomplete definition of type 'struct bpf_link'
return BPF_CORE_READ((struct bpf_link *)ent, id);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:403:2: note: expanded from macro 'BPF_CORE_READ'
___type((src), a, ##__VA_ARGS__) __r; \
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:274:29: note: expanded from macro '___type'
#define ___type(...) typeof(___arrow(__VA_ARGS__))
^~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:272:23: note: expanded from macro '___arrow'
#define ___arrow(...) ___apply(___arrow, ___narg(__VA_ARGS__))(__VA_ARGS__)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:223:25: note: expanded from macro '___concat'
#define ___concat(a, b) a ## b
^
<scratch space>:16:1: note: expanded from here
___arrow2
^
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:263:26: note: expanded from macro '___arrow2'
#define ___arrow2(a, b) a->b
~^
skeleton/pid_iter.bpf.c:35:32: note: forward declaration of 'struct bpf_link'
return BPF_CORE_READ((struct bpf_link *)ent, id);
^
skeleton/pid_iter.bpf.c:35:10: error: incomplete definition of type 'struct bpf_link'
return BPF_CORE_READ((struct bpf_link *)ent, id);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:404:2: note: expanded from macro 'BPF_CORE_READ'
BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:311:2: note: expanded from macro 'BPF_CORE_READ_INTO'
___core_read(bpf_core_read, bpf_core_read, \
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:302:2: note: expanded from macro '___core_read'
___apply(___core_read, ___empty(__VA_ARGS__))(fn, fn_ptr, dst, \
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
note: (skipping 3 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:296:2: note: expanded from macro '___core_read0'
___read(fn, dst, ___type(src), src, a);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:277:59: note: expanded from macro '___read'
read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:206:79: note: expanded from macro 'bpf_core_read'
bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src))
^~~
skeleton/pid_iter.bpf.c:35:32: note: forward declaration of 'struct bpf_link'
return BPF_CORE_READ((struct bpf_link *)ent, id);
^
skeleton/pid_iter.bpf.c:35:10: error: returning 'void' from a function with incompatible result type '__u32' (aka 'unsigned int')
return BPF_CORE_READ((struct bpf_link *)ent, id);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool//bootstrap/libbpf//include/bpf/bpf_core_read.h:402:36: note: expanded from macro 'BPF_CORE_READ'
#define BPF_CORE_READ(src, a, ...) ({ \
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
skeleton/pid_iter.bpf.c:42:17: warning: declaration of 'struct bpf_iter__task_file' will not be visible outside of this function [-Wvisibility]
int iter(struct bpf_iter__task_file *ctx)
^
skeleton/pid_iter.bpf.c:44:25: error: incomplete definition of type 'struct bpf_iter__task_file'
struct file *file = ctx->file;
~~~^
skeleton/pid_iter.bpf.c:42:17: note: forward declaration of 'struct bpf_iter__task_file'
int iter(struct bpf_iter__task_file *ctx)
^
skeleton/pid_iter.bpf.c:45:32: error: incomplete definition of type 'struct bpf_iter__task_file'
struct task_struct *task = ctx->task;
~~~^
skeleton/pid_iter.bpf.c:42:17: note: forward declaration of 'struct bpf_iter__task_file'
int iter(struct bpf_iter__task_file *ctx)
^
skeleton/pid_iter.bpf.c:76:19: error: incomplete definition of type 'struct bpf_iter__task_file'
bpf_seq_write(ctx->meta->seq, &e, sizeof(e));
~~~^
skeleton/pid_iter.bpf.c:42:17: note: forward declaration of 'struct bpf_iter__task_file'
int iter(struct bpf_iter__task_file *ctx)
^
1 warning and 6 errors generated.
make[1]: *** [Makefile:188: /home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/build/bpftool/pid_iter.bpf.o] Error 1
make: *** [Makefile:219: /home/mhiramat/ksrc/linux/tools/testing/selftests/bpf/tools/sbin/bpftool] Error 2
Thank you,
--
Masami Hiramatsu <[email protected]>
On Fri, Jan 14, 2022 at 11:47:04PM +0900, Masami Hiramatsu wrote:
> Hi Jiri and Alexei,
>
> On Thu, 13 Jan 2022 13:27:34 +0100
> Jiri Olsa <[email protected]> wrote:
>
> > On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > > Hi Jiri and Alexei,
> > > >
> > > > Here is the 2nd version of fprobe. This version uses the
> > > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > > Note that this also drops per-probe point private data, which
> > > > is not used anyway.
> > > >
> > > > This introduces the fprobe, the function entry/exit probe with
> > > > multiple probe point support. This also introduces the rethook
> > > > for hooking function return as same as kretprobe does. This
> > >
> > > nice, I was going through the multi-user-graph support
> > > and was wondering that this might be a better way
> > >
> > > > abstraction will help us to generalize the fgraph tracer,
> > > > because we can just switch it from rethook in fprobe, depending
> > > > on the kernel configuration.
> > > >
> > > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > > patches will not be affected by this change.
> > >
> > > I'll try the bpf selftests on top of this
> >
> > I'm getting crash and stall when running bpf selftests,
> > the fprobe sample module works fine, I'll check on that
>
> I've tried to build tools/testing/selftests/bpf on my machine,
> but I got the errors below. Would you know how I can set up to build
> the bpf selftests correctly? (I tried "make M=samples/bpf", but got the same result)
what's your clang version? your distro might be behind,
I'm using clang 14 compiled from sources:
$ /opt/clang/bin/clang --version
clang version 14.0.0 (https://github.com/llvm/llvm-project.git 9f8ffaaa0bddcefeec15a3df9858fd50b05fcbae)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/clang/bin
and compiling bpf selftests with:
$ CLANG=/opt/clang/bin/clang make
jirka
> [...]
On Thu, Jan 13, 2022 at 10:15:32PM +0900, Masami Hiramatsu wrote:
> On Thu, 13 Jan 2022 13:25:52 +0100
> Jiri Olsa <[email protected]> wrote:
>
> > On Wed, Jan 12, 2022 at 11:03:22PM +0900, Masami Hiramatsu wrote:
> > > Add a return hook framework which hooks the function
> > > return. Most of the idea came from the kretprobe, but
> > > this is independent from kretprobe.
> > > Note that this is expected to be used with other
> > > function entry hooking features, like ftrace, fprobe,
> > > and kprobes. Eventually this will replace the
> > > kretprobe (e.g. kprobe + rethook = kretprobe), but
> > > at this moment, this is just an additional hook.
> >
> > this looks similar to the code kretprobe is using now
>
> Yes, I've mostly re-typed the code :)
>
> > would it make sense to incrementally change the current code to provide
> > this rethook interface, instead of a big switch of the current kretprobe
> > to kprobe + new rethook interface in the future?
>
> Would you mean modifying the kretprobe instance code to provide
> a similar one, and rename it at some point?
> My original idea is to keep the current kretprobe code and build
> up a similar one, and switch to it at some point. Actually,
> I don't want to change the current kretprobe interface itself,
> but the backend will be changed. For example, the current kretprobe
> has the interface below.
>
> struct kretprobe {
> struct kprobe kp;
> kretprobe_handler_t handler;
> kretprobe_handler_t entry_handler;
> int maxactive;
> int nmissed;
> size_t data_size;
> struct freelist_head freelist;
> struct kretprobe_holder *rph;
> };
>
> My idea is to switch it to the following:
>
> struct kretprobe {
> struct kprobe kp;
> kretprobe_handler_t handler;
> kretprobe_handler_t entry_handler;
> int maxactive;
> int nmissed;
> size_t data_size;
> struct rethook *rethook;
> };
looks good, will this be a lot of changes?
could you include it in the patchset?
thanks,
jirka
> [...]
On Fri, Jan 14, 2022 at 7:10 AM Jiri Olsa <[email protected]> wrote:
>
> On Fri, Jan 14, 2022 at 11:47:04PM +0900, Masami Hiramatsu wrote:
> > Hi Jiri and Alexei,
> >
> > On Thu, 13 Jan 2022 13:27:34 +0100
> > Jiri Olsa <[email protected]> wrote:
> >
> > > On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > > > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > > > Hi Jiri and Alexei,
> > > > >
> > > > > Here is the 2nd version of fprobe. This version uses the
> > > > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > > > Note that this also drops per-probe point private data, which
> > > > > is not used anyway.
> > > > >
> > > > > This introduces the fprobe, the function entry/exit probe with
> > > > > multiple probe point support. This also introduces the rethook
> > > > > for hooking function return as same as kretprobe does. This
> > > >
> > > > nice, I was going through the multi-user-graph support
> > > > and was wondering that this might be a better way
> > > >
> > > > > abstraction will help us to generalize the fgraph tracer,
> > > > > because we can just switch it from rethook in fprobe, depending
> > > > > on the kernel configuration.
> > > > >
> > > > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > > > patches will not be affected by this change.
> > > >
> > > > I'll try the bpf selftests on top of this
> > >
> > > I'm getting crash and stall when running bpf selftests,
> > > the fprobe sample module works fine, I'll check on that
> >
> > I've tried to build tools/testing/selftests/bpf on my machine,
> > but I got the errors below. Would you know how I can set up to build
> > the bpf selftests correctly? (I tried "make M=samples/bpf", but got the same result)
>
> what's your clang version? your distro might be behind,
If you have a very recent Clang, a decently recent pahole, and qemu, try
using vmtest.sh. That should build the kernel with all the necessary
kernel config options, start a qemu instance with that freshly built
image, and build the selftests. And even run the selftests automatically.
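For reference, a typical invocation from the kernel tree looks roughly like
this (the exact script options may differ between kernel versions, so treat
it as an example):

$ cd tools/testing/selftests/bpf
$ ./vmtest.sh                                 # build kernel + selftests, boot qemu, run ./test_progs
$ ./vmtest.sh -- ./test_progs -t fexit_test   # run only the matching tests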
> I'm using clang 14 compiled from sources:
>
> $ /opt/clang/bin/clang --version
> clang version 14.0.0 (https://github.com/llvm/llvm-project.git 9f8ffaaa0bddcefeec15a3df9858fd50b05fcbae)
> Target: x86_64-unknown-linux-gnu
> Thread model: posix
> InstalledDir: /opt/clang/bin
>
> and compiling bpf selftests with:
>
> $ CLANG=/opt/clang/bin/clang make
>
> jirka
>
>
> >
[...]
> >
> > Thank you,
> >
> > --
> > Masami Hiramatsu <[email protected]>
> >
>
On Wed, Jan 12, 2022 at 6:02 AM Masami Hiramatsu <[email protected]> wrote:
>
> Hi Jiri and Alexei,
>
> Here is the 2nd version of fprobe. This version uses the
> ftrace_set_filter_ips() for reducing the registering overhead.
> Note that this also drops per-probe point private data, which
> is not used anyway.
This per-probe private data is necessary for the feature called BPF
cookie, in which each attachment has a unique user-provided u64 value
associated with it, accessible at runtime through the
bpf_get_attach_cookie() helper. One way or another we'll need to
support this to make these multi-attach BPF programs really useful for
generic tracing applications.
Jiri,
We've discussed with Alexei this week how cookies can be supported for
multi-attach fentry (where it seems even more challenging than in
kprobe case), and agreed on rather simple solution, which roughly goes
like this. When multi-attaching either an fentry or fexit program, save
a sorted array of IP addresses and a list of u64 cookies sorted in the
same order as the IPs. At runtime, the bpf_get_attach_cookie() helper
should somehow get access to these two arrays and the function's IP
(which we already have with bpf_get_func_ip()), perform a binary search,
and locate the necessary cookie. This offloads the overhead of finding
the cookie to the actual call site of bpf_get_attach_cookie() (and it's
O(log(N)), so not bad at all, especially if the BPF program can be
optimized to call this helper just once).
I think something like that should be doable for Masami's fprobe-based
multi-attach kprobes, right? That would allow us to have super-fast
attachment, but still support a BPF cookie for each individual IP/kernel
function attachment. I haven't looked at the code thoroughly, though, so
please let me know if I'm missing something fundamental.
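To make the idea concrete, the runtime lookup could be as simple as the
sketch below (the name is made up, not an existing API; ips[] and cookies[]
are the two parallel arrays, both sorted by IP at attach time):

static u64 find_attach_cookie(const unsigned long *ips, const u64 *cookies,
			      u32 cnt, unsigned long ip)
{
	u32 lo = 0, hi = cnt;

	/* plain binary search over the sorted ips[] array */
	while (lo < hi) {
		u32 mid = lo + (hi - lo) / 2;

		if (ips[mid] == ip)
			return cookies[mid];
		if (ips[mid] < ip)
			lo = mid + 1;
		else
			hi = mid;
	}
	return 0;	/* no cookie registered for this IP */
}

Design-wise, attach pays the one-time O(N log N) sort, while each helper
call pays only the O(log N) lookup.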
> [...]
Hi Andrii,
On Fri, 14 Jan 2022 17:02:31 -0800
Andrii Nakryiko <[email protected]> wrote:
> On Fri, Jan 14, 2022 at 7:10 AM Jiri Olsa <[email protected]> wrote:
> >
> > On Fri, Jan 14, 2022 at 11:47:04PM +0900, Masami Hiramatsu wrote:
> > > Hi Jiri and Alexei,
> > >
> > > On Thu, 13 Jan 2022 13:27:34 +0100
> > > Jiri Olsa <[email protected]> wrote:
> > >
> > > > On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > > > > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > > > > Hi Jiri and Alexei,
> > > > > >
> > > > > > Here is the 2nd version of fprobe. This version uses the
> > > > > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > > > > Note that this also drops per-probe point private data, which
> > > > > > is not used anyway.
> > > > > >
> > > > > > This introduces the fprobe, the function entry/exit probe with
> > > > > > multiple probe point support. This also introduces the rethook
> > > > > > for hooking function return as same as kretprobe does. This
> > > > >
> > > > > nice, I was going through the multi-user-graph support
> > > > > and was wondering that this might be a better way
> > > > >
> > > > > > abstraction will help us to generalize the fgraph tracer,
> > > > > > because we can just switch it from rethook in fprobe, depending
> > > > > > on the kernel configuration.
> > > > > >
> > > > > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > > > > patches will not be affected by this change.
> > > > >
> > > > > I'll try the bpf selftests on top of this
> > > >
> > > > I'm getting crash and stall when running bpf selftests,
> > > > the fprobe sample module works fine, I'll check on that
> > >
> > > I've tried to build tools/testing/selftests/bpf on my machine,
> > > but I got the errors below. Would you know how I can set up to build
> > > the bpf selftests correctly? (I tried "make M=samples/bpf", but got the same result)
> >
> > what's your clang version? your distro might be behind,
I'm using clang 13.0.0.
$ clang -v
clang version 13.0.0 (https://github.com/llvm/llvm-project.git d7b669b3a30345cfcdb2fde2af6f48aa4b94845d)
Target: x86_64-unknown-linux-gnu
>
> If you have very recent Clang, decently recent pahole, and qemu, try
> using vmtest.sh. That should build the kernel with all the necessary
> kernel config options and start qemu image with that latest image and
> build selftests. And even run selftests automatically.
OK, vmtest.sh works! :)
So the vmtest.sh run finished with some failures. Jiri, were you talking
about these failures, or a real crash?
Summary: 212/1033 PASSED, 12 SKIPPED, 14 FAILED
Thanks!
> [...]
--
Masami Hiramatsu <[email protected]>
On Fri, 14 Jan 2022 16:18:13 +0100
Jiri Olsa <[email protected]> wrote:
> On Thu, Jan 13, 2022 at 10:15:32PM +0900, Masami Hiramatsu wrote:
> > On Thu, 13 Jan 2022 13:25:52 +0100
> > Jiri Olsa <[email protected]> wrote:
> >
> > > On Wed, Jan 12, 2022 at 11:03:22PM +0900, Masami Hiramatsu wrote:
> > > > Add a return hook framework which hooks the function
> > > > return. Most of the idea came from the kretprobe, but
> > > > this is independent from kretprobe.
> > > > Note that this is expected to be used with other
> > > > function entry hooking features, like ftrace, fprobe,
> > > > and kprobes. Eventually this will replace the
> > > > kretprobe (e.g. kprobe + rethook = kretprobe), but
> > > > at this moment, this is just an additional hook.
> > >
> > > this looks similar to the code kretprobe is using now
> >
> > Yes, I've mostly re-typed the code :)
> >
> > > would it make sense to incrementally change the current code to provide
> > > this rethook interface, instead of a big switch of the current kretprobe
> > > to kprobe + new rethook interface in the future?
> >
> > Would you mean modifying the kretprobe instance code to provide
> > a similar one, and rename it at some point?
> > My original idea is to keep the current kretprobe code and build
> > up a similar one, and switch to it at some point. Actually,
> > I don't want to change the current kretprobe interface itself,
> > but the backend will be changed. For example, the current kretprobe
> > has the interface below.
> >
> > struct kretprobe {
> > struct kprobe kp;
> > kretprobe_handler_t handler;
> > kretprobe_handler_t entry_handler;
> > int maxactive;
> > int nmissed;
> > size_t data_size;
> > struct freelist_head freelist;
> > struct kretprobe_holder *rph;
> > };
> >
> > My idea is to switch it to the following:
> >
> > struct kretprobe {
> > struct kprobe kp;
> > kretprobe_handler_t handler;
> > kretprobe_handler_t entry_handler;
> > int maxactive;
> > int nmissed;
> > size_t data_size;
> > struct rethook *rethook;
> > };
>
> looks good, will this be a lot of changes?
Yes and no, we can easily replace the generic kretprobe trampoline
callback (since it is almost the same and has the same features), but we
also need to update the per-arch kretprobe trampolines to the rethook
trampoline.
> could you include it in the patchset?
Let me try, but since it involves many archs (which support kretprobes),
it may take time to be merged.
Thank you,
> [...]
--
Masami Hiramatsu <[email protected]>
On Thu, 13 Jan 2022 13:27:34 +0100
Jiri Olsa <[email protected]> wrote:
> On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > Hi Jiri and Alexei,
> > >
> > > Here is the 2nd version of fprobe. This version uses the
> > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > Note that this also drops per-probe point private data, which
> > > is not used anyway.
> > >
> > > This introduces the fprobe, the function entry/exit probe with
> > > multiple probe point support. This also introduces the rethook
> > > for hooking function return as same as kretprobe does. This
> >
> > nice, I was going through the multi-user-graph support
> > and was wondering that this might be a better way
> >
> > > abstraction will help us to generalize the fgraph tracer,
> > > because we can just switch it from rethook in fprobe, depending
> > > on the kernel configuration.
> > >
> > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > patches will not be affected by this change.
> >
> > I'll try the bpf selftests on top of this
>
> I'm getting crash and stall when running bpf selftests,
> the fprobe sample module works fine, I'll check on that
OK, I got a kernel stall. I had forgotten to enable CONFIG_FPROBE.
I think vmtest.sh should support a menuconfig option.
#6 bind_perm:OK
#7 bloom_filter_map:OK
[ 107.282403] clocksource: timekeeping watchdog on CPU0: Marking clocksource 'tsc' as unstable because the skew is too large:
[ 107.283240] clocksource: 'hpet' wd_nsec: 496216090 wd_now: 7ddc7120 wd_last: 7ae746b7 mask: ffffffff
[ 107.284045] clocksource: 'tsc' cs_nsec: 495996979 cs_now: 31fdb69b39 cs_last: 31c2d29219 mask: ffffffffffffffff
[ 107.284926] clocksource: 'tsc' is current clocksource.
[ 107.285487] tsc: Marking TSC unstable due to clocksource watchdog
[ 107.285973] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
[ 107.286616] sched_clock: Marking unstable (107240582544, 45390230)<-(107291410145, -5437339)
[ 107.290408] clocksource: Not enough CPUs to check clocksource 'tsc'.
[ 107.290879] clocksource: Switched to clocksource hpet
[ 604.210415] INFO: rcu_tasks detected stalls on tasks:
[ 604.210830] (____ptrval____): .. nvcsw: 86/86 holdout: 1 idle_cpu: -1/0
[ 604.211314] task:test_progs state:R running task stack: 0 pid: 87 ppid: 85 flags:0x00004000
[ 604.212058] Call Trace:
[ 604.212246] <TASK>
[ 604.212452] __schedule+0x362/0xbb0
[ 604.212723] ? preempt_schedule_notrace_thunk+0x16/0x18
[ 604.213107] preempt_schedule_notrace+0x48/0x80
[ 604.217403] ? asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 604.217790] ? ftrace_regs_call+0xd/0x52
[ 604.218087] ? bpf_test_finish.isra.0+0x190/0x190
[ 604.218461] ? bpf_fentry_test1+0x5/0x10
[ 604.218750] ? trace_clock_x86_tsc+0x10/0x10
[ 604.219064] ? __sys_bpf+0x8b1/0x2970
[ 604.219337] ? lock_is_held_type+0xd7/0x130
[ 604.219680] ? __x64_sys_bpf+0x1c/0x20
[ 604.219957] ? do_syscall_64+0x35/0x80
[ 604.220237] ? entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 604.220653] </TASK>
Jiri, is that what you had seen?
Thank you,
--
Masami Hiramatsu <[email protected]>
On Sat, Jan 15, 2022 at 01:52:19PM +0900, Masami Hiramatsu wrote:
> On Thu, 13 Jan 2022 13:27:34 +0100
> Jiri Olsa <[email protected]> wrote:
>
> > On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > > Hi Jiri and Alexei,
> > > >
> > > > Here is the 2nd version of fprobe. This version uses the
> > > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > > Note that this also drops per-probe point private data, which
> > > > is not used anyway.
> > > >
> > > > This introduces the fprobe, the function entry/exit probe with
> > > > multiple probe point support. This also introduces the rethook
> > > > for hooking function return as same as kretprobe does. This
> > >
> > > nice, I was going through the multi-user-graph support
> > > and was wondering that this might be a better way
> > >
> > > > abstraction will help us to generalize the fgraph tracer,
> > > > because we can just switch it from rethook in fprobe, depending
> > > > on the kernel configuration.
> > > >
> > > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > > patches will not be affected by this change.
> > >
> > > I'll try the bpf selftests on top of this
> >
> > I'm getting crash and stall when running bpf selftests,
> > the fprobe sample module works fine, I'll check on that
>
> OK, I got a kernel stall. I had forgotten to enable CONFIG_FPROBE.
> I think vmtest.sh should support a menuconfig option.
>
> [...]
>
> Jiri, is that what you had seen?
hi,
sorry for the late response
I did not get any backtrace for the stall; debugging showed
that the first probed function was called over and over
for some reason
as for the crash, I used the small fix below
do you have any newer version I could play with?
jirka
---
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 3333893e5217..883151275892 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -157,7 +157,8 @@ int unregister_fprobe(struct fprobe *fp)
ret = unregister_ftrace_function(&fp->ftrace);
if (!ret) {
- rethook_free(fp->rethook);
+ if (fp->rethook)
+ rethook_free(fp->rethook);
if (fp->syms) {
kfree(fp->addrs);
fp->addrs = NULL;
On Fri, Jan 14, 2022 at 05:08:32PM -0800, Andrii Nakryiko wrote:
> On Wed, Jan 12, 2022 at 6:02 AM Masami Hiramatsu <[email protected]> wrote:
> >
> > Hi Jiri and Alexei,
> >
> > Here is the 2nd version of fprobe. This version uses the
> > ftrace_set_filter_ips() for reducing the registering overhead.
> > Note that this also drops per-probe point private data, which
> > is not used anyway.
>
> This per-probe private data is necessary for the feature called BPF
> cookie, in which each attachment has a unique user-provided u64 value
> associated to it, accessible at runtime through
> bpf_get_attach_cookie() helper. One way or another we'll need to
> support this to make these multi-attach BPF programs really useful for
> generic tracing applications.
>
> Jiri,
>
> We've discussed with Alexei this week how cookies can be supported for
> multi-attach fentry (where it seems even more challenging than in
> kprobe case), and agreed on rather simple solution, which roughly goes
> like this. When multi-attaching either fentry/fexit program, save
> sorted array of IP addresses and then sorted in the same order as IPs
> list of u64 cookies. At runtime, bpf_get_attach_cookie() helper should
> somehow get access to these two arrays and functions IP (that we
> already have with bpf_get_func_ip()), perform binary search and locate
> necessary cookie. This offloads the overhead of finding this cookie to
> actual call site of bpf_get_attach_cookie() (and it's a log(N), so not
> bad at all, especially if BPF program can be optimized to call this
> helper just once).
>
> I think something like that should be doable for Masami's fprobe-based
> multi-attach kprobes, right? That would allow to have super-fast
> attachment, but still support BPF cookie per each individual IP/kernel
> function attachment. I haven't looked at code thoroughly, though,
> please let me know if I'm missing something fundamental.
ok, that seems doable, we should be able to get the link struct
in bpf_get_attach_cookie_trace and reach both ips and cookies
jirka
> [...]
On Tue, 18 Jan 2022 15:25:25 +0100
Jiri Olsa <[email protected]> wrote:
> On Sat, Jan 15, 2022 at 01:52:19PM +0900, Masami Hiramatsu wrote:
> > On Thu, 13 Jan 2022 13:27:34 +0100
> > Jiri Olsa <[email protected]> wrote:
> >
> > > On Wed, Jan 12, 2022 at 05:01:15PM +0100, Jiri Olsa wrote:
> > > > On Wed, Jan 12, 2022 at 11:02:46PM +0900, Masami Hiramatsu wrote:
> > > > > Hi Jiri and Alexei,
> > > > >
> > > > > Here is the 2nd version of fprobe. This version uses the
> > > > > ftrace_set_filter_ips() for reducing the registering overhead.
> > > > > Note that this also drops per-probe point private data, which
> > > > > is not used anyway.
> > > > >
> > > > > This introduces the fprobe, the function entry/exit probe with
> > > > > multiple probe point support. This also introduces the rethook
> > > > > for hooking function return as same as kretprobe does. This
> > > >
> > > > nice, I was going through the multi-user-graph support
> > > > and was wondering that this might be a better way
> > > >
> > > > > abstraction will help us to generalize the fgraph tracer,
> > > > > because we can just switch it from rethook in fprobe, depending
> > > > > on the kernel configuration.
> > > > >
> > > > > The patch [1/8] and [7/8] are from your series[1]. Other libbpf
> > > > > patches will not be affected by this change.
> > > >
> > > > I'll try the bpf selftests on top of this
> > >
> > > I'm getting crash and stall when running bpf selftests,
> > > the fprobe sample module works fine, I'll check on that
> >
> > OK, I got a kernel stall. I had missed enabling CONFIG_FPROBE.
> > I think vmtest.sh should support a menuconfig option.
> >
> > #6 bind_perm:OK
> > #7 bloom_filter_map:OK
> > [ 107.282403] clocksource: timekeeping watchdog on CPU0: Marking clocksource 'tsc' as unstable because the skew is too large:
> > [ 107.283240] clocksource: 'hpet' wd_nsec: 496216090 wd_now: 7ddc7120 wd_last: 7ae746b7 mask: ffffffff
> > [ 107.284045] clocksource: 'tsc' cs_nsec: 495996979 cs_now: 31fdb69b39 cs_last: 31c2d29219 mask: ffffffffffffffff
> > [ 107.284926] clocksource: 'tsc' is current clocksource.
> > [ 107.285487] tsc: Marking TSC unstable due to clocksource watchdog
> > [ 107.285973] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
> > [ 107.286616] sched_clock: Marking unstable (107240582544, 45390230)<-(107291410145, -5437339)
> > [ 107.290408] clocksource: Not enough CPUs to check clocksource 'tsc'.
> > [ 107.290879] clocksource: Switched to clocksource hpet
> > [ 604.210415] INFO: rcu_tasks detected stalls on tasks:
> > [ 604.210830] (____ptrval____): .. nvcsw: 86/86 holdout: 1 idle_cpu: -1/0
> > [ 604.211314] task:test_progs state:R running task stack: 0 pid: 87 ppid: 85 flags:0x00004000
> > [ 604.212058] Call Trace:
> > [ 604.212246] <TASK>
> > [ 604.212452] __schedule+0x362/0xbb0
> > [ 604.212723] ? preempt_schedule_notrace_thunk+0x16/0x18
> > [ 604.213107] preempt_schedule_notrace+0x48/0x80
> > [ 604.217403] ? asm_sysvec_apic_timer_interrupt+0x12/0x20
> > [ 604.217790] ? ftrace_regs_call+0xd/0x52
> > [ 604.218087] ? bpf_test_finish.isra.0+0x190/0x190
> > [ 604.218461] ? bpf_fentry_test1+0x5/0x10
> > [ 604.218750] ? trace_clock_x86_tsc+0x10/0x10
> > [ 604.219064] ? __sys_bpf+0x8b1/0x2970
> > [ 604.219337] ? lock_is_held_type+0xd7/0x130
> > [ 604.219680] ? __x64_sys_bpf+0x1c/0x20
> > [ 604.219957] ? do_syscall_64+0x35/0x80
> > [ 604.220237] ? entry_SYSCALL_64_after_hwframe+0x44/0xae
> > [ 604.220653] </TASK>
> >
> > Jiri, is that what you had seen?
>
> hi,
> sorry for the late response
>
> I did not get any backtrace for the stall; debugging showed
> that the first probed function was called over and over for
> some reason
>
> as for the crash I used the small fix below
Oops, good catch!
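Right, the rethook is only set up when an exit_handler is supplied,
so fp->rethook stays NULL for entry-only probes. A minimal sketch of
that registration logic, simplified from this series;
fprobe_setup_rethook() is a hypothetical helper name, not the real
API:

#include <linux/errno.h>
#include <linux/fprobe.h>

struct rethook *fprobe_setup_rethook(struct fprobe *fp);	/* hypothetical */

/*
 * Entry-only probes never allocate a rethook, so fp->rethook stays
 * NULL and unregister_fprobe() must check it before rethook_free().
 */
static int fprobe_init_rethook_sketch(struct fprobe *fp)
{
	if (!fp->exit_handler) {
		fp->rethook = NULL;	/* entry-only probe */
		return 0;
	}

	fp->rethook = fprobe_setup_rethook(fp);
	return fp->rethook ? 0 : -ENOMEM;
}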
>
> do you have any newer version I could play with?
Let me update fprobe and rethook. I'm now trying to integrate
rethook with kretprobes and have found some issues.
Thank you!
>
> jirka
>
>
> ---
> diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> index 3333893e5217..883151275892 100644
> --- a/kernel/trace/fprobe.c
> +++ b/kernel/trace/fprobe.c
> @@ -157,7 +157,8 @@ int unregister_fprobe(struct fprobe *fp)
> ret = unregister_ftrace_function(&fp->ftrace);
>
> if (!ret) {
> - rethook_free(fp->rethook);
> + if (fp->rethook)
> + rethook_free(fp->rethook);
> if (fp->syms) {
> kfree(fp->addrs);
> fp->addrs = NULL;
>
--
Masami Hiramatsu <[email protected]>