Tracing with kprobes while running a monolithic kernel is currently
impossible due the kernel module allocator dependency.
Address the issue by allowing architectures to implement module_alloc()
and module_memfree() independent of the module subsystem. An arch tree
can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
Realize the feature on RISC-V by separating allocator to module_alloc.c
and implementing module_memfree().
Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
Link: https://lore.kernel.org/all/[email protected]/ # continuation
Signed-off-by: Jarkko Sakkinen <[email protected]>
---
v2:
- Better late than never right? :-)
- Focus only to RISC-V for now to make the patch more digestable. This
is the arch where I use the patch on a daily basis to help with QA.
- Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
---
arch/Kconfig | 8 +++++++-
arch/riscv/Kconfig | 1 +
arch/riscv/kernel/Makefile | 5 +++++
arch/riscv/kernel/module.c | 11 -----------
arch/riscv/kernel/module_alloc.c | 28 ++++++++++++++++++++++++++++
kernel/kprobes.c | 10 ++++++++++
kernel/trace/trace_kprobe.c | 18 ++++++++++++++++--
7 files changed, 67 insertions(+), 14 deletions(-)
create mode 100644 arch/riscv/kernel/module_alloc.c
diff --git a/arch/Kconfig b/arch/Kconfig
index a5af0edd3eb8..c931f1de98a7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -52,7 +52,7 @@ config GENERIC_ENTRY
config KPROBES
bool "Kprobes"
- depends on MODULES
+ depends on MODULES || HAVE_KPROBES_ALLOC
depends on HAVE_KPROBES
select KALLSYMS
select TASKS_RCU if PREEMPTION
@@ -215,6 +215,12 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool
+config HAVE_KPROBES_ALLOC
+ bool
+ help
+ Architectures that select this option are capable of allocating memory
+ for kprobes withou the kernel module allocator.
+
config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
bool
help
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e3142ce531a0..4f1b925e83d8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -132,6 +132,7 @@ config RISCV
select HAVE_KPROBES if !XIP_KERNEL
select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
select HAVE_KRETPROBES if !XIP_KERNEL
+ select HAVE_KPROBES_ALLOC if !XIP_KERNEL
# https://github.com/ClangBuiltLinux/linux/issues/1881
select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
select HAVE_MOVE_PMD
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 604d6bf7e476..46318194bce1 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -73,6 +73,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o
obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 5e5a82644451..cc324b450f2e 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -905,17 +905,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
return 0;
}
-#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-void *module_alloc(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, MODULES_VADDR,
- MODULES_END, GFP_KERNEL,
- PAGE_KERNEL, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-#endif
-
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
diff --git a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c
new file mode 100644
index 000000000000..3d9aa8dbca8a
--- /dev/null
+++ b/arch/riscv/kernel/module_alloc.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2017 Zihao Yu
+ * Copyright (c) 2024 Jarkko Sakkinen
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+#include <asm/sections.h>
+
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+void *module_alloc(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, MODULES_VADDR,
+ MODULES_END, GFP_KERNEL,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+
+void module_memfree(void *module_region)
+{
+ if (in_interrupt())
+ pr_warn("In interrupt context: vmalloc may not work.\n");
+
+ vfree(module_region);
+}
+#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9d9095e81792..2c583ab6efc4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1580,6 +1580,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
+#ifdef CONFIG_MODULES
/* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
@@ -1603,6 +1604,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
ret = -ENOENT;
}
}
+#endif
+
out:
preempt_enable();
jump_label_unlock();
@@ -2482,6 +2485,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
return 0;
}
+#ifdef CONFIG_MODULES
/* Remove all symbols in given area from kprobe blacklist */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
@@ -2499,6 +2503,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
kprobe_remove_area_blacklist(entry, entry + 1);
}
+#endif /* CONFIG_MODULES */
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
char *type, char *sym)
@@ -2564,6 +2569,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret ? : arch_populate_kprobe_blacklist();
}
+#ifdef CONFIG_MODULES
static void add_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
@@ -2665,6 +2671,7 @@ static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
.priority = 0
};
+#endif /* CONFIG_MODULES */
void kprobe_free_init_mem(void)
{
@@ -2724,8 +2731,11 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
+
+#ifdef CONFIG_MODULES
if (!err)
err = register_module_notifier(&kprobe_module_nb);
+#endif
kprobes_initialized = (err == 0);
kprobe_sysctls_init();
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c4c6e0e0068b..f8fbd5e76dda 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}
+#ifdef CONFIG_MODULES
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
char *p;
@@ -129,6 +130,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
return ret;
}
+#endif /* CONFIG_MODULES */
static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
@@ -608,7 +610,11 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
/* Register k*probe */
ret = __register_trace_kprobe(tk);
- if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
+#ifdef CONFIG_MODULES
+ if (ret == -ENOENT && trace_kprobe_module_exist(tk))
+ ret = 0;
+#endif /* CONFIG_MODULES */
+ if (ret == -ENOENT) {
pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
ret = 0;
}
@@ -655,7 +661,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register k*probe */
ret = __register_trace_kprobe(tk);
- if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
+#ifdef CONFIG_MODULES
+ if (ret == -ENOENT && trace_kprobe_module_exist(tk))
+ ret = 0;
+#endif /* CONFIG_MODULES */
+ if (ret == -ENOENT) {
pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
ret = 0;
}
@@ -670,6 +680,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
return ret;
}
+#ifdef CONFIG_MODULES
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -704,6 +715,7 @@ static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
.priority = 1 /* Invoked after kprobe module callback */
};
+#endif /* CONFIG_MODULES */
static int count_symbols(void *data, unsigned long unused)
{
@@ -1897,8 +1909,10 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;
+#ifdef CONFIG_MODULES
if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
+#endif /* CONFIG_MODULES */
return 0;
}
--
2.44.0
On Sun Mar 24, 2024 at 1:29 AM EET, Jarkko Sakkinen wrote:
> Tracing with kprobes while running a monolithic kernel is currently
> impossible due the kernel module allocator dependency.
>
> Address the issue by allowing architectures to implement module_alloc()
> and module_memfree() independent of the module subsystem. An arch tree
> can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
>
> Realize the feature on RISC-V by separating allocator to module_alloc.c
> and implementing module_memfree().
>
> Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
> Link: https://lore.kernel.org/all/20220608000014.3054333-1-jarkko@profiancom/ # continuation
> Signed-off-by: Jarkko Sakkinen <[email protected]>
As for testing I tried the kprobes example for boottime tracing
dcoumentation:
https://www.kernel.org/doc/html/v5.7/trace/boottime-trace.html
I.e.
ftrace.event {
kprobes.vfs_read {
probes = "vfs_read $arg1 $arg2"
filter = "common_pid < 100"
enable
}
}
kernel {
console = hvc0
earlycon = sbi
trace_options = sym-addr
trace_event = "initcall:*"
tp_printk
dump_on_oops = 2
trace_buf_size = 1M
}
BR, Jarkko
On 3/23/24 16:29, Jarkko Sakkinen wrote:
> +config HAVE_KPROBES_ALLOC
> + bool
> + help
> + Architectures that select this option are capable of allocating memory
> + for kprobes withou the kernel module allocator.
without
--
#Randy
On Sun Mar 24, 2024 at 2:37 AM EET, Randy Dunlap wrote:
>
>
> On 3/23/24 16:29, Jarkko Sakkinen wrote:
> > +config HAVE_KPROBES_ALLOC
> > + bool
> > + help
> > + Architectures that select this option are capable of allocating memory
> > + for kprobes withou the kernel module allocator.
>
> without
Thanks, will fix.
BR, Jarkko
Hi Jarkko,
On Sun, 24 Mar 2024 01:29:08 +0200
Jarkko Sakkinen <[email protected]> wrote:
> Tracing with kprobes while running a monolithic kernel is currently
> impossible due the kernel module allocator dependency.
>
> Address the issue by allowing architectures to implement module_alloc()
> and module_memfree() independent of the module subsystem. An arch tree
> can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
>
> Realize the feature on RISC-V by separating allocator to module_alloc.c
> and implementing module_memfree().
Even though, this involves changes in arch-independent part. So it should
be solved by generic way. Did you checked Calvin's thread?
https://lore.kernel.org/all/[email protected]/
I think, we'd better to introduce `alloc_execmem()`,
CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
config HAVE_ALLOC_EXECMEM
bool
config ALLOC_EXECMEM
bool "Executable trampline memory allocation"
depends on MODULES || HAVE_ALLOC_EXECMEM
And define fallback macro to module_alloc() like this.
#ifndef CONFIG_HAVE_ALLOC_EXECMEM
#define alloc_execmem(size, gfp) module_alloc(size)
#endif
Then, introduce a new dependency to kprobes
config KPROBES
bool "Kprobes"
select ALLOC_EXECMEM
and update kprobes to use alloc_execmem and remove module related
code from it.
You also should consider using IS_ENABLED(CONFIG_MODULE) in the code to
avoid using #ifdefs.
Finally, you can add RISCV implementation patch of HAVE_ALLOC_EXECMEM in the
next patch.
Thank you,
>
> Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
> Link: https://lore.kernel.org/all/[email protected]/ # continuation
> Signed-off-by: Jarkko Sakkinen <[email protected]>
> ---
> v2:
> - Better late than never right? :-)
> - Focus only to RISC-V for now to make the patch more digestable. This
> is the arch where I use the patch on a daily basis to help with QA.
> - Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
> ---
> arch/Kconfig | 8 +++++++-
> arch/riscv/Kconfig | 1 +
> arch/riscv/kernel/Makefile | 5 +++++
> arch/riscv/kernel/module.c | 11 -----------
> arch/riscv/kernel/module_alloc.c | 28 ++++++++++++++++++++++++++++
> kernel/kprobes.c | 10 ++++++++++
> kernel/trace/trace_kprobe.c | 18 ++++++++++++++++--
> 7 files changed, 67 insertions(+), 14 deletions(-)
> create mode 100644 arch/riscv/kernel/module_alloc.c
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index a5af0edd3eb8..c931f1de98a7 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -52,7 +52,7 @@ config GENERIC_ENTRY
>
> config KPROBES
> bool "Kprobes"
> - depends on MODULES
> + depends on MODULES || HAVE_KPROBES_ALLOC
> depends on HAVE_KPROBES
> select KALLSYMS
> select TASKS_RCU if PREEMPTION
> @@ -215,6 +215,12 @@ config HAVE_OPTPROBES
> config HAVE_KPROBES_ON_FTRACE
> bool
>
> +config HAVE_KPROBES_ALLOC
> + bool
> + help
> + Architectures that select this option are capable of allocating memory
> + for kprobes withou the kernel module allocator.
> +
> config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
> bool
> help
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e3142ce531a0..4f1b925e83d8 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -132,6 +132,7 @@ config RISCV
> select HAVE_KPROBES if !XIP_KERNEL
> select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
> select HAVE_KRETPROBES if !XIP_KERNEL
> + select HAVE_KPROBES_ALLOC if !XIP_KERNEL
> # https://github.com/ClangBuiltLinux/linux/issues/1881
> select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
> select HAVE_MOVE_PMD
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index 604d6bf7e476..46318194bce1 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -73,6 +73,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o
>
> obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
>
> obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> index 5e5a82644451..cc324b450f2e 100644
> --- a/arch/riscv/kernel/module.c
> +++ b/arch/riscv/kernel/module.c
> @@ -905,17 +905,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
> return 0;
> }
>
> -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> -void *module_alloc(unsigned long size)
> -{
> - return __vmalloc_node_range(size, 1, MODULES_VADDR,
> - MODULES_END, GFP_KERNEL,
> - PAGE_KERNEL, VM_FLUSH_RESET_PERMS,
> - NUMA_NO_NODE,
> - __builtin_return_address(0));
> -}
> -#endif
> -
> int module_finalize(const Elf_Ehdr *hdr,
> const Elf_Shdr *sechdrs,
> struct module *me)
> diff --git a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..3d9aa8dbca8a
> --- /dev/null
> +++ b/arch/riscv/kernel/module_alloc.c
> @@ -0,0 +1,28 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (c) 2017 Zihao Yu
> + * Copyright (c) 2024 Jarkko Sakkinen
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +#include <asm/sections.h>
> +
> +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> +void *module_alloc(unsigned long size)
> +{
> + return __vmalloc_node_range(size, 1, MODULES_VADDR,
> + MODULES_END, GFP_KERNEL,
> + PAGE_KERNEL, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}
> +
> +void module_memfree(void *module_region)
> +{
> + if (in_interrupt())
> + pr_warn("In interrupt context: vmalloc may not work.\n");
> +
> + vfree(module_region);
> +}
> +#endif
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 9d9095e81792..2c583ab6efc4 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -1580,6 +1580,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> goto out;
> }
>
> +#ifdef CONFIG_MODULES
> /* Check if 'p' is probing a module. */
> *probed_mod = __module_text_address((unsigned long) p->addr);
> if (*probed_mod) {
> @@ -1603,6 +1604,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
> ret = -ENOENT;
> }
> }
> +#endif
> +
> out:
> preempt_enable();
> jump_label_unlock();
> @@ -2482,6 +2485,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> return 0;
> }
>
> +#ifdef CONFIG_MODULES
> /* Remove all symbols in given area from kprobe blacklist */
> static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> {
> @@ -2499,6 +2503,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> {
> kprobe_remove_area_blacklist(entry, entry + 1);
> }
> +#endif /* CONFIG_MODULES */
>
> int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
> char *type, char *sym)
> @@ -2564,6 +2569,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> return ret ? : arch_populate_kprobe_blacklist();
> }
>
> +#ifdef CONFIG_MODULES
> static void add_module_kprobe_blacklist(struct module *mod)
> {
> unsigned long start, end;
> @@ -2665,6 +2671,7 @@ static struct notifier_block kprobe_module_nb = {
> .notifier_call = kprobes_module_callback,
> .priority = 0
> };
> +#endif /* CONFIG_MODULES */
>
> void kprobe_free_init_mem(void)
> {
> @@ -2724,8 +2731,11 @@ static int __init init_kprobes(void)
> err = arch_init_kprobes();
> if (!err)
> err = register_die_notifier(&kprobe_exceptions_nb);
> +
> +#ifdef CONFIG_MODULES
> if (!err)
> err = register_module_notifier(&kprobe_module_nb);
> +#endif
>
> kprobes_initialized = (err == 0);
> kprobe_sysctls_init();
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index c4c6e0e0068b..f8fbd5e76dda 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
> }
>
> +#ifdef CONFIG_MODULES
> static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> {
> char *p;
> @@ -129,6 +130,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
>
> return ret;
> }
> +#endif /* CONFIG_MODULES */
>
> static bool trace_kprobe_is_busy(struct dyn_event *ev)
> {
> @@ -608,7 +610,11 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
>
> /* Register k*probe */
> ret = __register_trace_kprobe(tk);
> - if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> +#ifdef CONFIG_MODULES
> + if (ret == -ENOENT && trace_kprobe_module_exist(tk))
> + ret = 0;
> +#endif /* CONFIG_MODULES */
> + if (ret == -ENOENT) {
> pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> ret = 0;
> }
> @@ -655,7 +661,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
>
> /* Register k*probe */
> ret = __register_trace_kprobe(tk);
> - if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> +#ifdef CONFIG_MODULES
> + if (ret == -ENOENT && trace_kprobe_module_exist(tk))
> + ret = 0;
> +#endif /* CONFIG_MODULES */
> + if (ret == -ENOENT) {
> pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> ret = 0;
> }
> @@ -670,6 +680,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> return ret;
> }
>
> +#ifdef CONFIG_MODULES
> /* Module notifier call back, checking event on the module */
> static int trace_kprobe_module_callback(struct notifier_block *nb,
> unsigned long val, void *data)
> @@ -704,6 +715,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> .notifier_call = trace_kprobe_module_callback,
> .priority = 1 /* Invoked after kprobe module callback */
> };
> +#endif /* CONFIG_MODULES */
>
> static int count_symbols(void *data, unsigned long unused)
> {
> @@ -1897,8 +1909,10 @@ static __init int init_kprobe_trace_early(void)
> if (ret)
> return ret;
>
> +#ifdef CONFIG_MODULES
> if (register_module_notifier(&trace_kprobe_module_nb))
> return -EINVAL;
> +#endif /* CONFIG_MODULES */
>
> return 0;
> }
> --
> 2.44.0
>
--
Masami Hiramatsu (Google) <[email protected]>
On Monday 03/25 at 11:56 +0900, Masami Hiramatsu wrote:
> Hi Jarkko,
>
> On Sun, 24 Mar 2024 01:29:08 +0200
> Jarkko Sakkinen <[email protected]> wrote:
>
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible due the kernel module allocator dependency.
> >
> > Address the issue by allowing architectures to implement module_alloc()
> > and module_memfree() independent of the module subsystem. An arch tree
> > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> >
> > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > and implementing module_memfree().
>
> Even though, this involves changes in arch-independent part. So it should
> be solved by generic way. Did you checked Calvin's thread?
>
> https://lore.kernel.org/all/[email protected]/
FYI, I should have v2 of that series out later this week.
Thanks,
Calvin
> I think, we'd better to introduce `alloc_execmem()`,
> CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
>
> config HAVE_ALLOC_EXECMEM
> bool
>
> config ALLOC_EXECMEM
> bool "Executable trampline memory allocation"
> depends on MODULES || HAVE_ALLOC_EXECMEM
>
> And define fallback macro to module_alloc() like this.
>
> #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> #define alloc_execmem(size, gfp) module_alloc(size)
> #endif
>
> Then, introduce a new dependency to kprobes
>
> config KPROBES
> bool "Kprobes"
> select ALLOC_EXECMEM
>
> and update kprobes to use alloc_execmem and remove module related
> code from it.
>
> You also should consider using IS_ENABLED(CONFIG_MODULE) in the code to
> avoid using #ifdefs.
>
> Finally, you can add RISCV implementation patch of HAVE_ALLOC_EXECMEM in the
> next patch.
>
> Thank you,
>
>
> >
> > Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
> > Link: https://lore.kernel.org/all/[email protected]/ # continuation
> > Signed-off-by: Jarkko Sakkinen <[email protected]>
> > ---
> > v2:
> > - Better late than never right? :-)
> > - Focus only to RISC-V for now to make the patch more digestable. This
> > is the arch where I use the patch on a daily basis to help with QA.
> > - Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
> > ---
> > arch/Kconfig | 8 +++++++-
> > arch/riscv/Kconfig | 1 +
> > arch/riscv/kernel/Makefile | 5 +++++
> > arch/riscv/kernel/module.c | 11 -----------
> > arch/riscv/kernel/module_alloc.c | 28 ++++++++++++++++++++++++++++
> > kernel/kprobes.c | 10 ++++++++++
> > kernel/trace/trace_kprobe.c | 18 ++++++++++++++++--
> > 7 files changed, 67 insertions(+), 14 deletions(-)
> > create mode 100644 arch/riscv/kernel/module_alloc.c
> >
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index a5af0edd3eb8..c931f1de98a7 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -52,7 +52,7 @@ config GENERIC_ENTRY
> >
> > config KPROBES
> > bool "Kprobes"
> > - depends on MODULES
> > + depends on MODULES || HAVE_KPROBES_ALLOC
> > depends on HAVE_KPROBES
> > select KALLSYMS
> > select TASKS_RCU if PREEMPTION
> > @@ -215,6 +215,12 @@ config HAVE_OPTPROBES
> > config HAVE_KPROBES_ON_FTRACE
> > bool
> >
> > +config HAVE_KPROBES_ALLOC
> > + bool
> > + help
> > + Architectures that select this option are capable of allocating memory
> > + for kprobes withou the kernel module allocator.
> > +
> > config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
> > bool
> > help
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e3142ce531a0..4f1b925e83d8 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -132,6 +132,7 @@ config RISCV
> > select HAVE_KPROBES if !XIP_KERNEL
> > select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
> > select HAVE_KRETPROBES if !XIP_KERNEL
> > + select HAVE_KPROBES_ALLOC if !XIP_KERNEL
> > # https://github.com/ClangBuiltLinux/linux/issues/1881
> > select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
> > select HAVE_MOVE_PMD
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index 604d6bf7e476..46318194bce1 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -73,6 +73,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o
> >
> > obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
> > obj-$(CONFIG_MODULES) += module.o
> > +ifeq ($(CONFIG_MODULES),y)
> > +obj-y += module_alloc.o
> > +else
> > +obj-$(CONFIG_KPROBES) += module_alloc.o
> > +endif
> > obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
> >
> > obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
> > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> > index 5e5a82644451..cc324b450f2e 100644
> > --- a/arch/riscv/kernel/module.c
> > +++ b/arch/riscv/kernel/module.c
> > @@ -905,17 +905,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
> > return 0;
> > }
> >
> > -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> > -void *module_alloc(unsigned long size)
> > -{
> > - return __vmalloc_node_range(size, 1, MODULES_VADDR,
> > - MODULES_END, GFP_KERNEL,
> > - PAGE_KERNEL, VM_FLUSH_RESET_PERMS,
> > - NUMA_NO_NODE,
> > - __builtin_return_address(0));
> > -}
> > -#endif
> > -
> > int module_finalize(const Elf_Ehdr *hdr,
> > const Elf_Shdr *sechdrs,
> > struct module *me)
> > diff --git a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c
> > new file mode 100644
> > index 000000000000..3d9aa8dbca8a
> > --- /dev/null
> > +++ b/arch/riscv/kernel/module_alloc.c
> > @@ -0,0 +1,28 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Copyright (c) 2017 Zihao Yu
> > + * Copyright (c) 2024 Jarkko Sakkinen
> > + */
> > +
> > +#include <linux/mm.h>
> > +#include <linux/moduleloader.h>
> > +#include <linux/vmalloc.h>
> > +#include <asm/sections.h>
> > +
> > +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> > +void *module_alloc(unsigned long size)
> > +{
> > + return __vmalloc_node_range(size, 1, MODULES_VADDR,
> > + MODULES_END, GFP_KERNEL,
> > + PAGE_KERNEL, 0, NUMA_NO_NODE,
> > + __builtin_return_address(0));
> > +}
> > +
> > +void module_memfree(void *module_region)
> > +{
> > + if (in_interrupt())
> > + pr_warn("In interrupt context: vmalloc may not work.\n");
> > +
> > + vfree(module_region);
> > +}
> > +#endif
> > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > index 9d9095e81792..2c583ab6efc4 100644
> > --- a/kernel/kprobes.c
> > +++ b/kernel/kprobes.c
> > @@ -1580,6 +1580,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > goto out;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Check if 'p' is probing a module. */
> > *probed_mod = __module_text_address((unsigned long) p->addr);
> > if (*probed_mod) {
> > @@ -1603,6 +1604,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > ret = -ENOENT;
> > }
> > }
> > +#endif
> > +
> > out:
> > preempt_enable();
> > jump_label_unlock();
> > @@ -2482,6 +2485,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> > return 0;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Remove all symbols in given area from kprobe blacklist */
> > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> > {
> > @@ -2499,6 +2503,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> > {
> > kprobe_remove_area_blacklist(entry, entry + 1);
> > }
> > +#endif /* CONFIG_MODULES */
> >
> > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
> > char *type, char *sym)
> > @@ -2564,6 +2569,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> > return ret ? : arch_populate_kprobe_blacklist();
> > }
> >
> > +#ifdef CONFIG_MODULES
> > static void add_module_kprobe_blacklist(struct module *mod)
> > {
> > unsigned long start, end;
> > @@ -2665,6 +2671,7 @@ static struct notifier_block kprobe_module_nb = {
> > .notifier_call = kprobes_module_callback,
> > .priority = 0
> > };
> > +#endif /* CONFIG_MODULES */
> >
> > void kprobe_free_init_mem(void)
> > {
> > @@ -2724,8 +2731,11 @@ static int __init init_kprobes(void)
> > err = arch_init_kprobes();
> > if (!err)
> > err = register_die_notifier(&kprobe_exceptions_nb);
> > +
> > +#ifdef CONFIG_MODULES
> > if (!err)
> > err = register_module_notifier(&kprobe_module_nb);
> > +#endif
> >
> > kprobes_initialized = (err == 0);
> > kprobe_sysctls_init();
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index c4c6e0e0068b..f8fbd5e76dda 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> > return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
> > }
> >
> > +#ifdef CONFIG_MODULES
> > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> > {
> > char *p;
> > @@ -129,6 +130,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> >
> > return ret;
> > }
> > +#endif /* CONFIG_MODULES */
> >
> > static bool trace_kprobe_is_busy(struct dyn_event *ev)
> > {
> > @@ -608,7 +610,11 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
> >
> > /* Register k*probe */
> > ret = __register_trace_kprobe(tk);
> > - if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> > +#ifdef CONFIG_MODULES
> > + if (ret == -ENOENT && trace_kprobe_module_exist(tk))
> > + ret = 0;
> > +#endif /* CONFIG_MODULES */
> > + if (ret == -ENOENT) {
> > pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> > ret = 0;
> > }
> > @@ -655,7 +661,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> >
> > /* Register k*probe */
> > ret = __register_trace_kprobe(tk);
> > - if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> > +#ifdef CONFIG_MODULES
> > + if (ret == -ENOENT && trace_kprobe_module_exist(tk))
> > + ret = 0;
> > +#endif /* CONFIG_MODULES */
> > + if (ret == -ENOENT) {
> > pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> > ret = 0;
> > }
> > @@ -670,6 +680,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> > return ret;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Module notifier call back, checking event on the module */
> > static int trace_kprobe_module_callback(struct notifier_block *nb,
> > unsigned long val, void *data)
> > @@ -704,6 +715,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> > .notifier_call = trace_kprobe_module_callback,
> > .priority = 1 /* Invoked after kprobe module callback */
> > };
> > +#endif /* CONFIG_MODULES */
> >
> > static int count_symbols(void *data, unsigned long unused)
> > {
> > @@ -1897,8 +1909,10 @@ static __init int init_kprobe_trace_early(void)
> > if (ret)
> > return ret;
> >
> > +#ifdef CONFIG_MODULES
> > if (register_module_notifier(&trace_kprobe_module_nb))
> > return -EINVAL;
> > +#endif /* CONFIG_MODULES */
> >
> > return 0;
> > }
> > --
> > 2.44.0
> >
>
>
> --
> Masami Hiramatsu (Google) <[email protected]>
On Mon Mar 25, 2024 at 4:56 AM EET, Masami Hiramatsu (Google) wrote:
> Hi Jarkko,
>
> On Sun, 24 Mar 2024 01:29:08 +0200
> Jarkko Sakkinen <[email protected]> wrote:
>
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible due the kernel module allocator dependency.
> >
> > Address the issue by allowing architectures to implement module_alloc()
> > and module_memfree() independent of the module subsystem. An arch tree
> > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> >
> > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > and implementing module_memfree().
>
> Even though, this involves changes in arch-independent part. So it should
> be solved by generic way. Did you checked Calvin's thread?
>
> https://lore.kernel.org/all/[email protected]/
Nope, has not been in my radar but for sure will check it.
I don't mind making this more generic. The point of this version was to
put focus on single architecture and do as little as possible how the
code works right now so that it is easier to give feedback on direction.
> I think, we'd better to introduce `alloc_execmem()`,
> CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
>
> config HAVE_ALLOC_EXECMEM
> bool
>
> config ALLOC_EXECMEM
> bool "Executable trampline memory allocation"
> depends on MODULES || HAVE_ALLOC_EXECMEM
Right so this is logically the same as I have just with ALLOC_EXECMEM
added to cover both MODULES and HAVE_ALLOC_EXECMEM (which is essentially
the same as HAVE_ALLOC_KPROBES just with a different name).
Not at all against this. I think this factor more understandable
structuring, just "peer checking" that I understand what I'm reading :-)
> And define fallback macro to module_alloc() like this.
>
> #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> #define alloc_execmem(size, gfp) module_alloc(size)
> #endif
>
> Then, introduce a new dependency to kprobes
>
> config KPROBES
> bool "Kprobes"
> select ALLOC_EXECMEM
OK I think this is good but now I see actually two logical chunks of
work becuse this select changes how KPROBES kconfig option works. It
has previously required to manually select MODULES first.
So I'll "select MODULES" as separate patch to keep all logical changes
transparent...
>
> and update kprobes to use alloc_execmem and remove module related
> code from it.
>
> You also should consider using IS_ENABLED(CONFIG_MODULE) in the code to
> avoid using #ifdefs.
>
> Finally, you can add RISCV implementation patch of HAVE_ALLOC_EXECMEM in the
> next patch.
OK, I think the suggestions are sane and not that much drift what I have
now so works for me.
> Thank you,
BR, Jarkko
On Mon Mar 25, 2024 at 9:11 PM EET, Jarkko Sakkinen wrote:
> On Mon Mar 25, 2024 at 8:37 PM EET, Jarkko Sakkinen wrote:
> > > You also should consider using IS_ENABLED(CONFIG_MODULE) in the code to
> > > avoid using #ifdefs.
>
> Hmm... I need make a couple of remarks but open for feedback ofc.
>
> First, trace_kprobe_module_exist depends on find_module()
>
> Second, there is a notifier callback that heavily binds to the module
> subsystem.
>
> In both cases using IS_ENABLED would emit a lot of compilation errors.
Also I think adding 'gfp' makes sense exactly at the point as it has
a use case, i.e. two call sites with differing flags. It makes sense
but should be IMHO added exactly at that time.
Leaving it from my patch set does not do any measurable harm but please
correct if I'm missing something.
BR, Jarkko
On Mon Mar 25, 2024 at 8:37 PM EET, Jarkko Sakkinen wrote:
> > You also should consider using IS_ENABLED(CONFIG_MODULE) in the code to
> > avoid using #ifdefs.
Hmm... I need make a couple of remarks but open for feedback ofc.
First, trace_kprobe_module_exist depends on find_module()
Second, there is a notifier callback that heavily binds to the module
subsystem.
In both cases using IS_ENABLED would emit a lot of compilation errors.
BR, Jarkko
Hi Masami,
On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> Hi Jarkko,
>
> On Sun, 24 Mar 2024 01:29:08 +0200
> Jarkko Sakkinen <[email protected]> wrote:
>
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible due the kernel module allocator dependency.
> >
> > Address the issue by allowing architectures to implement module_alloc()
> > and module_memfree() independent of the module subsystem. An arch tree
> > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> >
> > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > and implementing module_memfree().
>
> Even though, this involves changes in arch-independent part. So it should
> be solved by generic way. Did you checked Calvin's thread?
>
> https://lore.kernel.org/all/[email protected]/
>
> I think, we'd better to introduce `alloc_execmem()`,
> CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
>
> config HAVE_ALLOC_EXECMEM
> bool
>
> config ALLOC_EXECMEM
> bool "Executable trampline memory allocation"
> depends on MODULES || HAVE_ALLOC_EXECMEM
>
> And define fallback macro to module_alloc() like this.
>
> #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> #define alloc_execmem(size, gfp) module_alloc(size)
> #endif
Please can we *not* do this? I think this is abstracting at the wrong level (as
I mentioned on the prior execmem proposals).
Different exectuable allocations can have different requirements. For example,
on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
areas can be anywhere in the kernel VA space.
Forcing those behind the same interface makes things *harder* for architectures
and/or makes the common code more complicated (if that ends up having to track
all those different requirements). From my PoV it'd be much better to have
separate kprobes_alloc_*() functions for kprobes which an architecture can then
choose to implement using a common library if it wants to.
I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
and it looks pretty clean to me (and works in testing on arm64):
https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
Could we please start with that approach, with kprobe-specific alloc/free code
provided by the architecture?
Thanks,
Mark.
On Tue, 26 Mar 2024 14:46:10 +0000
Mark Rutland <[email protected]> wrote:
> Hi Masami,
>
> On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > Hi Jarkko,
> >
> > On Sun, 24 Mar 2024 01:29:08 +0200
> > Jarkko Sakkinen <[email protected]> wrote:
> >
> > > Tracing with kprobes while running a monolithic kernel is currently
> > > impossible due the kernel module allocator dependency.
> > >
> > > Address the issue by allowing architectures to implement module_alloc()
> > > and module_memfree() independent of the module subsystem. An arch tree
> > > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> > >
> > > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > > and implementing module_memfree().
> >
> > Even though, this involves changes in arch-independent part. So it should
> > be solved by generic way. Did you checked Calvin's thread?
> >
> > https://lore.kernel.org/all/[email protected]/
> >
> > I think, we'd better to introduce `alloc_execmem()`,
> > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> >
> > config HAVE_ALLOC_EXECMEM
> > bool
> >
> > config ALLOC_EXECMEM
> > bool "Executable trampline memory allocation"
> > depends on MODULES || HAVE_ALLOC_EXECMEM
> >
> > And define fallback macro to module_alloc() like this.
> >
> > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > #define alloc_execmem(size, gfp) module_alloc(size)
> > #endif
>
> Please can we *not* do this? I think this is abstracting at the wrong level (as
> I mentioned on the prior execmem proposals).
>
> Different exectuable allocations can have different requirements. For example,
> on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> areas can be anywhere in the kernel VA space.
>
> Forcing those behind the same interface makes things *harder* for architectures
> and/or makes the common code more complicated (if that ends up having to track
> all those different requirements). From my PoV it'd be much better to have
> separate kprobes_alloc_*() functions for kprobes which an architecture can then
> choose to implement using a common library if it wants to.
>
> I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> and it looks pretty clean to me (and works in testing on arm64):
>
> https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
>
> Could we please start with that approach, with kprobe-specific alloc/free code
> provided by the architecture?
OK, as far as I can read the code, this method also works and neat!
(and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
to user does not help, it should be an internal change. So hiding this change
from user is better choice. Then there is no reason to introduce the new
alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
Mark, can you send this series here, so that others can review/test it?
Thank you!
>
> Thanks,
> Mark.
--
Masami Hiramatsu (Google) <[email protected]>
On Wednesday 03/27 at 00:24 +0900, Masami Hiramatsu wrote:
> On Tue, 26 Mar 2024 14:46:10 +0000
> Mark Rutland <[email protected]> wrote:
>
> > Hi Masami,
> >
> > On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > > Hi Jarkko,
> > >
> > > On Sun, 24 Mar 2024 01:29:08 +0200
> > > Jarkko Sakkinen <[email protected]> wrote:
> > >
> > > > Tracing with kprobes while running a monolithic kernel is currently
> > > > impossible due the kernel module allocator dependency.
> > > >
> > > > Address the issue by allowing architectures to implement module_alloc()
> > > > and module_memfree() independent of the module subsystem. An arch tree
> > > > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> > > >
> > > > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > > > and implementing module_memfree().
> > >
> > > Even though, this involves changes in arch-independent part. So it should
> > > be solved by generic way. Did you checked Calvin's thread?
> > >
> > > https://lore.kernel.org/all/[email protected]/
> > >
> > > I think, we'd better to introduce `alloc_execmem()`,
> > > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> > >
> > > config HAVE_ALLOC_EXECMEM
> > > bool
> > >
> > > config ALLOC_EXECMEM
> > > bool "Executable trampline memory allocation"
> > > depends on MODULES || HAVE_ALLOC_EXECMEM
> > >
> > > And define fallback macro to module_alloc() like this.
> > >
> > > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > > #define alloc_execmem(size, gfp) module_alloc(size)
> > > #endif
> >
> > Please can we *not* do this? I think this is abstracting at the wrong level (as
> > I mentioned on the prior execmem proposals).
> >
> > Different exectuable allocations can have different requirements. For example,
> > on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> > areas can be anywhere in the kernel VA space.
> >
> > Forcing those behind the same interface makes things *harder* for architectures
> > and/or makes the common code more complicated (if that ends up having to track
> > all those different requirements). From my PoV it'd be much better to have
> > separate kprobes_alloc_*() functions for kprobes which an architecture can then
> > choose to implement using a common library if it wants to.
> >
> > I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> > and it looks pretty clean to me (and works in testing on arm64):
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
> >
> > Could we please start with that approach, with kprobe-specific alloc/free code
> > provided by the architecture?
Heh, I also noticed that dead !RWX branch in arm64 patch_map(), I was
about to send a patch to remove it.
> OK, as far as I can read the code, this method also works and neat!
> (and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
> to user does not help, it should be an internal change. So hiding this change
> from user is better choice. Then there is no reason to introduce the new
> alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
I'm happy with this, it solves the first half of my problem. But I want
eBPF to work in the !MODULES case too.
I think Mark's approach can work for bpf as well, without needing to
touch module_alloc() at all? So I might be able to drop that first patch
entirely.
https://lore.kernel.org/all/a6b162aed1e6fea7f565ef9dd0204d6f2284bcce.1709676663.git.jcalvinowens@gmail.com/
Thanks,
Calvin
> Mark, can you send this series here, so that others can review/test it?
>
> Thank you!
>
>
> >
> > Thanks,
> > Mark.
>
>
> --
> Masami Hiramatsu (Google) <[email protected]>
On Tue, Mar 26, 2024 at 09:15:14AM -0700, Calvin Owens wrote:
> On Wednesday 03/27 at 00:24 +0900, Masami Hiramatsu wrote:
> > On Tue, 26 Mar 2024 14:46:10 +0000
> > Mark Rutland <[email protected]> wrote:
> > > Different exectuable allocations can have different requirements. For example,
> > > on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> > > areas can be anywhere in the kernel VA space.
> > >
> > > Forcing those behind the same interface makes things *harder* for architectures
> > > and/or makes the common code more complicated (if that ends up having to track
> > > all those different requirements). From my PoV it'd be much better to have
> > > separate kprobes_alloc_*() functions for kprobes which an architecture can then
> > > choose to implement using a common library if it wants to.
> > >
> > > I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> > > and it looks pretty clean to me (and works in testing on arm64):
> > >
> > > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
> > >
> > > Could we please start with that approach, with kprobe-specific alloc/free code
> > > provided by the architecture?
>
> Heh, I also noticed that dead !RWX branch in arm64 patch_map(), I was
> about to send a patch to remove it.
>
> > OK, as far as I can read the code, this method also works and neat!
> > (and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
> > to user does not help, it should be an internal change. So hiding this change
> > from user is better choice. Then there is no reason to introduce the new
> > alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
>
> I'm happy with this, it solves the first half of my problem. But I want
> eBPF to work in the !MODULES case too.
>
> I think Mark's approach can work for bpf as well, without needing to
> touch module_alloc() at all? So I might be able to drop that first patch
> entirely.
I'd be very happy with eBPF following the same approach, with BPF-specific
alloc/free functions that we can implement in arch code.
IIUC eBPF code *does* want to be within range of the core kernel image, so for
arm64 we'd want to factor some common logic out of module_alloc() and into
something that module_alloc() and "bpf_alloc()" (or whatever it would be
called) could use. So I don't think we'd necessarily save on touching
module_alloc(), but I think the resulting split would be better.
Thanks,
Mark.
On Tue Mar 26, 2024 at 4:46 PM EET, Mark Rutland wrote:
> Hi Masami,
>
> On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > Hi Jarkko,
> >
> > On Sun, 24 Mar 2024 01:29:08 +0200
> > Jarkko Sakkinen <[email protected]> wrote:
> >
> > > Tracing with kprobes while running a monolithic kernel is currently
> > > impossible due the kernel module allocator dependency.
> > >
> > > Address the issue by allowing architectures to implement module_alloc()
> > > and module_memfree() independent of the module subsystem. An arch tree
> > > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> > >
> > > Realize the feature on RISC-V by separating allocator to module_allocc
> > > and implementing module_memfree().
> >
> > Even though, this involves changes in arch-independent part. So it should
> > be solved by generic way. Did you checked Calvin's thread?
> >
> > https://lore.kernel.org/all/[email protected]/
> >
> > I think, we'd better to introduce `alloc_execmem()`,
> > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> >
> > config HAVE_ALLOC_EXECMEM
> > bool
> >
> > config ALLOC_EXECMEM
> > bool "Executable trampline memory allocation"
> > depends on MODULES || HAVE_ALLOC_EXECMEM
> >
> > And define fallback macro to module_alloc() like this.
> >
> > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > #define alloc_execmem(size, gfp) module_alloc(size)
> > #endif
>
> Please can we *not* do this? I think this is abstracting at the wrong level (as
> I mentioned on the prior execmem proposals).
>
> Different exectuable allocations can have different requirements. For example,
> on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> areas can be anywhere in the kernel VA space.
>
> Forcing those behind the same interface makes things *harder* for architectures
> and/or makes the common code more complicated (if that ends up having to track
> all those different requirements). From my PoV it'd be much better to have
> separate kprobes_alloc_*() functions for kprobes which an architecture can then
> choose to implement using a common library if it wants to.
>
> I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> and it looks pretty clean to me (and works in testing on arm64):
>
> https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
>
> Could we please start with that approach, with kprobe-specific alloc/free code
> provided by the architecture?
How should we move forward?
I'm fine with someone picking the pieces of my work as long as also the
riscv side is included. Can also continue rotating this, whatever works.
>
> Thanks,
> Mark.
BR, Jarkko
On Wed, Mar 27, 2024 at 12:24:03AM +0900, Masami Hiramatsu wrote:
> On Tue, 26 Mar 2024 14:46:10 +0000
> Mark Rutland <[email protected]> wrote:
> >
> > On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > > I think, we'd better to introduce `alloc_execmem()`,
> > > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> > >
> > > config HAVE_ALLOC_EXECMEM
> > > bool
> > >
> > > config ALLOC_EXECMEM
> > > bool "Executable trampline memory allocation"
> > > depends on MODULES || HAVE_ALLOC_EXECMEM
> > >
> > > And define fallback macro to module_alloc() like this.
> > >
> > > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > > #define alloc_execmem(size, gfp) module_alloc(size)
> > > #endif
> >
> > Please can we *not* do this? I think this is abstracting at the wrong level (as
> > I mentioned on the prior execmem proposals).
> >
> > Different exectuable allocations can have different requirements. For example,
> > on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> > areas can be anywhere in the kernel VA space.
> >
> > Forcing those behind the same interface makes things *harder* for architectures
> > and/or makes the common code more complicated (if that ends up having to track
> > all those different requirements). From my PoV it'd be much better to have
> > separate kprobes_alloc_*() functions for kprobes which an architecture can then
> > choose to implement using a common library if it wants to.
> >
> > I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> > and it looks pretty clean to me (and works in testing on arm64):
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
> >
> > Could we please start with that approach, with kprobe-specific alloc/free code
> > provided by the architecture?
>
> OK, as far as I can read the code, this method also works and neat!
> (and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
> to user does not help, it should be an internal change. So hiding this change
> from user is better choice. Then there is no reason to introduce the new
> alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
>
> Mark, can you send this series here, so that others can review/test it?
I've written up a cover letter and sent that out:
https://lore.kernel.org/lkml/[email protected]/
Mark.
On Tue Mar 26, 2024 at 5:24 PM EET, Masami Hiramatsu (Google) wrote:
> On Tue, 26 Mar 2024 14:46:10 +0000
> Mark Rutland <[email protected]> wrote:
>
> > Hi Masami,
> >
> > On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > > Hi Jarkko,
> > >
> > > On Sun, 24 Mar 2024 01:29:08 +0200
> > > Jarkko Sakkinen <[email protected]> wrote:
> > >
> > > > Tracing with kprobes while running a monolithic kernel is currently
> > > > impossible due the kernel module allocator dependency.
> > > >
> > > > Address the issue by allowing architectures to implement module_alloc()
> > > > and module_memfree() independent of the module subsystem. An arch tree
> > > > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> > > >
> > > > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > > > and implementing module_memfree().
> > >
> > > Even though, this involves changes in arch-independent part. So it should
> > > be solved by generic way. Did you checked Calvin's thread?
> > >
> > > https://lore.kernel.org/all/[email protected]/
> > >
> > > I think, we'd better to introduce `alloc_execmem()`,
> > > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> > >
> > > config HAVE_ALLOC_EXECMEM
> > > bool
> > >
> > > config ALLOC_EXECMEM
> > > bool "Executable trampline memory allocation"
> > > depends on MODULES || HAVE_ALLOC_EXECMEM
> > >
> > > And define fallback macro to module_alloc() like this.
> > >
> > > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > > #define alloc_execmem(size, gfp) module_alloc(size)
> > > #endif
> >
> > Please can we *not* do this? I think this is abstracting at the wrong level (as
> > I mentioned on the prior execmem proposals).
> >
> > Different exectuable allocations can have different requirements. For example,
> > on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> > areas can be anywhere in the kernel VA space.
> >
> > Forcing those behind the same interface makes things *harder* for architectures
> > and/or makes the common code more complicated (if that ends up having to track
> > all those different requirements). From my PoV it'd be much better to have
> > separate kprobes_alloc_*() functions for kprobes which an architecture can then
> > choose to implement using a common library if it wants to.
> >
> > I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> > and it looks pretty clean to me (and works in testing on arm64):
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
> >
> > Could we please start with that approach, with kprobe-specific alloc/free code
> > provided by the architecture?
>
> OK, as far as I can read the code, this method also works and neat!
> (and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
> to user does not help, it should be an internal change. So hiding this change
> from user is better choice. Then there is no reason to introduce the new
> alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
>
> Mark, can you send this series here, so that others can review/test it?
I'm totally fine with this but yeah best would be if it could carry
the riscv part. Mark, even if you have only possibility to compile
test that part I can check that it works.
BR, Jarkko
On Tue Mar 26, 2024 at 6:15 PM EET, Calvin Owens wrote:
> On Wednesday 03/27 at 00:24 +0900, Masami Hiramatsu wrote:
> > On Tue, 26 Mar 2024 14:46:10 +0000
> > Mark Rutland <[email protected]> wrote:
> >
> > > Hi Masami,
> > >
> > > On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > > > Hi Jarkko,
> > > >
> > > > On Sun, 24 Mar 2024 01:29:08 +0200
> > > > Jarkko Sakkinen <[email protected]> wrote:
> > > >
> > > > > Tracing with kprobes while running a monolithic kernel is currently
> > > > > impossible due the kernel module allocator dependency.
> > > > >
> > > > > Address the issue by allowing architectures to implement module_alloc()
> > > > > and module_memfree() independent of the module subsystem. An arch tree
> > > > > can signal this by setting HAVE_KPROBES_ALLOC in its Kconfig file.
> > > > >
> > > > > Realize the feature on RISC-V by separating allocator to module_alloc.c
> > > > > and implementing module_memfree().
> > > >
> > > > Even though, this involves changes in arch-independent part. So it should
> > > > be solved by generic way. Did you checked Calvin's thread?
> > > >
> > > > https://lore.kernel.org/all/cover.1709676663.git.jcalvinowens@gmailcom/
> > > >
> > > > I think, we'd better to introduce `alloc_execmem()`,
> > > > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> > > >
> > > > config HAVE_ALLOC_EXECMEM
> > > > bool
> > > >
> > > > config ALLOC_EXECMEM
> > > > bool "Executable trampline memory allocation"
> > > > depends on MODULES || HAVE_ALLOC_EXECMEM
> > > >
> > > > And define fallback macro to module_alloc() like this.
> > > >
> > > > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > > > #define alloc_execmem(size, gfp) module_alloc(size)
> > > > #endif
> > >
> > > Please can we *not* do this? I think this is abstracting at the wrong level (as
> > > I mentioned on the prior execmem proposals).
> > >
> > > Different exectuable allocations can have different requirements. For example,
> > > on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> > > areas can be anywhere in the kernel VA space.
> > >
> > > Forcing those behind the same interface makes things *harder* for architectures
> > > and/or makes the common code more complicated (if that ends up having to track
> > > all those different requirements). From my PoV it'd be much better to have
> > > separate kprobes_alloc_*() functions for kprobes which an architecture can then
> > > choose to implement using a common library if it wants to.
> > >
> > > I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> > > and it looks pretty clean to me (and works in testing on arm64):
> > >
> > > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
> > >
> > > Could we please start with that approach, with kprobe-specific alloc/free code
> > > provided by the architecture?
>
> Heh, I also noticed that dead !RWX branch in arm64 patch_map(), I was
> about to send a patch to remove it.
>
> > OK, as far as I can read the code, this method also works and neat!
> > (and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
> > to user does not help, it should be an internal change. So hiding this change
> > from user is better choice. Then there is no reason to introduce the new
> > alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
>
> I'm happy with this, it solves the first half of my problem. But I want
> eBPF to work in the !MODULES case too.
>
> I think Mark's approach can work for bpf as well, without needing to
> touch module_alloc() at all? So I might be able to drop that first patch
> entirely.
Yeah, I think we're aligned. Later on, if/when you send the bpf series
please also cc me and I might possibly test those patches too.
BR, Jarkko
On Tue Mar 26, 2024 at 6:38 PM EET, Mark Rutland wrote:
> On Wed, Mar 27, 2024 at 12:24:03AM +0900, Masami Hiramatsu wrote:
> > On Tue, 26 Mar 2024 14:46:10 +0000
> > Mark Rutland <[email protected]> wrote:
> > >
> > > On Mon, Mar 25, 2024 at 11:56:32AM +0900, Masami Hiramatsu wrote:
> > > > I think, we'd better to introduce `alloc_execmem()`,
> > > > CONFIG_HAVE_ALLOC_EXECMEM and CONFIG_ALLOC_EXECMEM at first
> > > >
> > > > config HAVE_ALLOC_EXECMEM
> > > > bool
> > > >
> > > > config ALLOC_EXECMEM
> > > > bool "Executable trampline memory allocation"
> > > > depends on MODULES || HAVE_ALLOC_EXECMEM
> > > >
> > > > And define fallback macro to module_alloc() like this.
> > > >
> > > > #ifndef CONFIG_HAVE_ALLOC_EXECMEM
> > > > #define alloc_execmem(size, gfp) module_alloc(size)
> > > > #endif
> > >
> > > Please can we *not* do this? I think this is abstracting at the wrong level (as
> > > I mentioned on the prior execmem proposals).
> > >
> > > Different exectuable allocations can have different requirements. For example,
> > > on arm64 modules need to be within 2G of the kernel image, but the kprobes XOL
> > > areas can be anywhere in the kernel VA space.
> > >
> > > Forcing those behind the same interface makes things *harder* for architectures
> > > and/or makes the common code more complicated (if that ends up having to track
> > > all those different requirements). From my PoV it'd be much better to have
> > > separate kprobes_alloc_*() functions for kprobes which an architecture can then
> > > choose to implement using a common library if it wants to.
> > >
> > > I took a look at doing that using the core ifdeffery fixups from Jarkko's v6,
> > > and it looks pretty clean to me (and works in testing on arm64):
> > >
> > > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=kprobes/without-modules
> > >
> > > Could we please start with that approach, with kprobe-specific alloc/free code
> > > provided by the architecture?
> >
> > OK, as far as I can read the code, this method also works and neat!
> > (and minimum intrusion). I actually found that exposing CONFIG_ALLOC_EXECMEM
> > to user does not help, it should be an internal change. So hiding this change
> > from user is better choice. Then there is no reason to introduce the new
> > alloc_execmem, but just expand kprobe_alloc_insn_page() is reasonable.
> >
> > Mark, can you send this series here, so that others can review/test it?
>
> I've written up a cover letter and sent that out:
>
> https://lore.kernel.org/lkml/[email protected]/
>
> Mark.
Ya, saw it thanks!
BR, Jarkko