2024-03-25 21:55:18

by Jarkko Sakkinen

[permalink] [raw]
Subject: [PATCH v5 1/2] kprobes: textmem API

Tracing with kprobes while running a monolithic kernel is currently
impossible because CONFIG_KPROBES depends on CONFIG_MODULES because it uses
the kernel module allocator.

Introduce alloc_textmem() and free_textmem() for allocating executable
memory. If an arch implements these functions, it can mark this up with
the HAVE_ALLOC_EXECMEM kconfig flag.

At first this feature will be used for enabling kprobes without
modules support for arch/riscv.

Link: https://lore.kernel.org/all/[email protected]/
Suggested-by: Masami Hiramatsu <[email protected]>
Signed-off-by: Jarkko Sakkinen <[email protected]>
---
v5:
- alloc_execmem() was missing GFP_KERNEL parameter. The patch set did
compile because 2/2 had the fixup (leaked there when rebasing the
patch set).
v4:
- Squashed a couple of unrequired CONFIG_MODULES checks.
- See https://lore.kernel.org/all/[email protected]/
v3:
- A new patch added.
- For IS_DEFINED() I need advice as I could not really find that many
locations where it would be applicable.
---
arch/Kconfig | 16 +++++++++++++++-
include/linux/execmem.h | 13 +++++++++++++
kernel/kprobes.c | 17 ++++++++++++++---
kernel/trace/trace_kprobe.c | 8 ++++++++
4 files changed, 50 insertions(+), 4 deletions(-)
create mode 100644 include/linux/execmem.h

diff --git a/arch/Kconfig b/arch/Kconfig
index a5af0edd3eb8..33ba68b7168f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -52,8 +52,8 @@ config GENERIC_ENTRY

config KPROBES
bool "Kprobes"
- depends on MODULES
depends on HAVE_KPROBES
+ select ALLOC_EXECMEM
select KALLSYMS
select TASKS_RCU if PREEMPTION
help
@@ -215,6 +215,20 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool

+config HAVE_ALLOC_EXECMEM
+ bool
+ help
+ Architectures that select this option are capable of allocating executable
+ memory, which can be used by subsystems but is not dependent of any of its
+ clients.
+
+config ALLOC_EXECMEM
+ bool "Executable (trampoline) memory allocation"
+ depends on MODULES || HAVE_ALLOC_EXECMEM
+ help
+ Select this for executable (trampoline) memory. Can be enabled when either
+ module allocator or arch-specific allocator is available.
+
config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
bool
help
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
new file mode 100644
index 000000000000..ae2ff151523a
--- /dev/null
+++ b/include/linux/execmem.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_EXECMEM_H
+#define _LINUX_EXECMEM_H
+
+#ifdef CONFIG_HAVE_ALLOC_EXECMEM
+void *alloc_execmem(unsigned long size, gfp_t gfp);
+void free_execmem(void *region);
+#else
+#define alloc_execmem(size, gfp) module_alloc(size)
+#define free_execmem(region) module_memfree(region)
+#endif
+
+#endif /* _LINUX_EXECMEM_H */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9d9095e81792..87fd8c14a938 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>
+#include <linux/execmem.h>

#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -113,17 +114,17 @@ enum kprobe_slot_state {
void __weak *alloc_insn_page(void)
{
/*
- * Use module_alloc() so this page is within +/- 2GB of where the
+ * Use alloc_execmem() so this page is within +/- 2GB of where the
* kernel image and loaded module images reside. This is required
* for most of the architectures.
* (e.g. x86-64 needs this to handle the %rip-relative fixups.)
*/
- return module_alloc(PAGE_SIZE);
+ return alloc_execmem(PAGE_SIZE, GFP_KERNEL);
}

static void free_insn_page(void *page)
{
- module_memfree(page);
+ free_execmem(page);
}

struct kprobe_insn_cache kprobe_insn_slots = {
@@ -1580,6 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}

+#ifdef CONFIG_MODULES
/* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
@@ -1603,6 +1605,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
ret = -ENOENT;
}
}
+#endif
+
out:
preempt_enable();
jump_label_unlock();
@@ -2482,6 +2486,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
return 0;
}

+#ifdef CONFIG_MODULES
/* Remove all symbols in given area from kprobe blacklist */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
@@ -2499,6 +2504,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
kprobe_remove_area_blacklist(entry, entry + 1);
}
+#endif /* CONFIG_MODULES */

int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
char *type, char *sym)
@@ -2564,6 +2570,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret ? : arch_populate_kprobe_blacklist();
}

+#ifdef CONFIG_MODULES
static void add_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
@@ -2665,6 +2672,7 @@ static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
.priority = 0
};
+#endif /* CONFIG_MODULES */

void kprobe_free_init_mem(void)
{
@@ -2724,8 +2732,11 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
+
+#ifdef CONFIG_MODULES
if (!err)
err = register_module_notifier(&kprobe_module_nb);
+#endif

kprobes_initialized = (err == 0);
kprobe_sysctls_init();
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c4c6e0e0068b..af70bb1e9c3a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}

+#ifdef CONFIG_MODULES
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
char *p;
@@ -129,6 +130,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)

return ret;
}
+#else
+#define trace_kprobe_module_exist(tk) false /* aka a module never exists */
+#endif /* CONFIG_MODULES */

static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
@@ -670,6 +674,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
return ret;
}

+#ifdef CONFIG_MODULES
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -704,6 +709,7 @@ static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
.priority = 1 /* Invoked after kprobe module callback */
};
+#endif /* CONFIG_MODULES */

static int count_symbols(void *data, unsigned long unused)
{
@@ -1897,8 +1903,10 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;

+#ifdef CONFIG_MODULES
if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
+#endif /* CONFIG_MODULES */

return 0;
}
--
2.44.0



2024-03-25 21:55:30

by Jarkko Sakkinen

[permalink] [raw]
Subject: [PATCH v5 2/2] arch/riscv: Enable kprobes when CONFIG_MODULES=n

Tacing with kprobes while running a monolithic kernel is currently
impossible due the kernel module allocator dependency.

Address the issue by implementing textmem API for RISC-V.

Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
Link: https://lore.kernel.org/all/[email protected]/ # continuation
Signed-off-by: Jarkko Sakkinen <[email protected]>
---
v5:
- No changes, expect removing alloc_execmem() call which should have
been part of the previous patch.
v4:
- Include linux/execmem.h.
v3:
- Architecture independent parts have been split to separate patches.
- Do not change arch/riscv/kernel/module.c as it is out of scope for
this patch set now.
v2:
- Better late than never right? :-)
- Focus only to RISC-V for now to make the patch more digestable. This
is the arch where I use the patch on a daily basis to help with QA.
- Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
---
arch/riscv/Kconfig | 1 +
arch/riscv/kernel/Makefile | 3 +++
arch/riscv/kernel/execmem.c | 22 ++++++++++++++++++++++
3 files changed, 26 insertions(+)
create mode 100644 arch/riscv/kernel/execmem.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e3142ce531a0..499512fb17ff 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -132,6 +132,7 @@ config RISCV
select HAVE_KPROBES if !XIP_KERNEL
select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
select HAVE_KRETPROBES if !XIP_KERNEL
+ select HAVE_ALLOC_EXECMEM if !XIP_KERNEL
# https://github.com/ClangBuiltLinux/linux/issues/1881
select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
select HAVE_MOVE_PMD
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 604d6bf7e476..337797f10d3e 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -73,6 +73,9 @@ obj-$(CONFIG_SMP) += cpu_ops.o

obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_ALLOC_EXECMEM),y)
+obj-y += execmem.o
+endif
obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o

obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
diff --git a/arch/riscv/kernel/execmem.c b/arch/riscv/kernel/execmem.c
new file mode 100644
index 000000000000..3e52522ead32
--- /dev/null
+++ b/arch/riscv/kernel/execmem.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/mm.h>
+#include <linux/execmem.h>
+#include <linux/vmalloc.h>
+#include <asm/sections.h>
+
+void *alloc_execmem(unsigned long size, gfp_t /* gfp */)
+{
+ return __vmalloc_node_range(size, 1, MODULES_VADDR,
+ MODULES_END, GFP_KERNEL,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+
+void free_execmem(void *region)
+{
+ if (in_interrupt())
+ pr_warn("In interrupt context: vmalloc may not work.\n");
+
+ vfree(region);
+}
--
2.44.0


2024-03-25 22:06:56

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

s/textmem/execmem/ (also in long description)

will hold sending a new version as not a functional issue, will fix
after review.

BR, Jarkko

2024-03-25 22:09:56

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Mon Mar 25, 2024 at 11:55 PM EET, Jarkko Sakkinen wrote:
> +#ifdef CONFIG_MODULES
> if (register_module_notifier(&trace_kprobe_module_nb))
> return -EINVAL;
> +#endif /* CONFIG_MODULES */

register_module_notifier() does have "dummy" version but what
would I pass to it. It makes more mess than it cleans to declare
also a "dummy" version of trace_kprobe_module_nb.

The callback itself has too tight module subsystem bindings so
that they could be simply flagged with IS_DEFINED() (or correct
if I'm mistaken, this the conclusion I've ended up with).

BR, Jarkko

2024-03-25 22:17:14

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue Mar 26, 2024 at 12:09 AM EET, Jarkko Sakkinen wrote:
> On Mon Mar 25, 2024 at 11:55 PM EET, Jarkko Sakkinen wrote:
> > +#ifdef CONFIG_MODULES
> > if (register_module_notifier(&trace_kprobe_module_nb))
> > return -EINVAL;
> > +#endif /* CONFIG_MODULES */
>
> register_module_notifier() does have "dummy" version but what
> would I pass to it. It makes more mess than it cleans to declare
> also a "dummy" version of trace_kprobe_module_nb.
>
> The callback itself has too tight module subsystem bindings so
> that they could be simply flagged with IS_DEFINED() (or correct
> if I'm mistaken, this the conclusion I've ended up with).

One way to clean that up would be to create trace_kprobe_module.c and
move kernel module specific code over there and then change
kernel/trace/Makefile as follows:

ifeq ($(CONFIG_PERF_EVENTS),y)
obj-y += trace_kprobe.o
obj-$(CONFIG_MODULES) += trace_kprobe_module.o
endif

and define trace_kprobe_module_init() or similar to do all the dance
with notifiers etc.

This crossed my mind but did not want to do it without feedback.

BR, Jarkko

2024-03-26 00:36:21

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue Mar 26, 2024 at 1:50 AM EET, Masami Hiramatsu (Google) wrote:
> On Tue, 26 Mar 2024 00:09:42 +0200
> "Jarkko Sakkinen" <[email protected]> wrote:
>
> > On Mon Mar 25, 2024 at 11:55 PM EET, Jarkko Sakkinen wrote:
> > > +#ifdef CONFIG_MODULES
> > > if (register_module_notifier(&trace_kprobe_module_nb))
> > > return -EINVAL;
> > > +#endif /* CONFIG_MODULES */
> >
> > register_module_notifier() does have "dummy" version but what
> > would I pass to it. It makes more mess than it cleans to declare
> > also a "dummy" version of trace_kprobe_module_nb.
>
> That is better than having #ifdef in the function.
>
> >
> > The callback itself has too tight module subsystem bindings so
> > that they could be simply flagged with IS_DEFINED() (or correct
> > if I'm mistaken, this the conclusion I've ended up with).
>
> Please try this.
>
> -----
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 70dc6179086e..bc98db14927f 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -2625,6 +2625,7 @@ static void remove_module_kprobe_blacklist(struct module *mod)
> }
> }
>
> +#ifdef CONFIG_MODULES
> /* Module notifier call back, checking kprobes on the module */
> static int kprobes_module_callback(struct notifier_block *nb,
> unsigned long val, void *data)
> @@ -2675,6 +2676,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
> mutex_unlock(&kprobe_mutex);
> return NOTIFY_DONE;
> }
> +#else
> +#define kprobes_module_callback (NULL)
> +#endif
>
> static struct notifier_block kprobe_module_nb = {
> .notifier_call = kprobes_module_callback,
> @@ -2739,7 +2743,7 @@ static int __init init_kprobes(void)
> err = arch_init_kprobes();
> if (!err)
> err = register_die_notifier(&kprobe_exceptions_nb);
> - if (!err)
> + if (!err && IS_ENABLED(CONFIG_MODULES))
> err = register_module_notifier(&kprobe_module_nb);
>
> kprobes_initialized = (err == 0);

OK, thanks for the suggestion WFM.

I'll give this also a spin with VisionFive2 RISC-V SBC before sending
v6.

BR, Jarkko

2024-03-26 00:58:51

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Mon, 25 Mar 2024 23:55:01 +0200
Jarkko Sakkinen <[email protected]> wrote:

> Tracing with kprobes while running a monolithic kernel is currently
> impossible because CONFIG_KPROBES depends on CONFIG_MODULES because it uses
> the kernel module allocator.
>
> Introduce alloc_textmem() and free_textmem() for allocating executable
> memory. If an arch implements these functions, it can mark this up with
> the HAVE_ALLOC_EXECMEM kconfig flag.
>
> At first this feature will be used for enabling kprobes without
> modules support for arch/riscv.
>
> Link: https://lore.kernel.org/all/[email protected]/
> Suggested-by: Masami Hiramatsu <[email protected]>
> Signed-off-by: Jarkko Sakkinen <[email protected]>
> ---
> v5:
> - alloc_execmem() was missing GFP_KERNEL parameter. The patch set did
> compile because 2/2 had the fixup (leaked there when rebasing the
> patch set).
> v4:
> - Squashed a couple of unrequired CONFIG_MODULES checks.
> - See https://lore.kernel.org/all/[email protected]/
> v3:
> - A new patch added.
> - For IS_DEFINED() I need advice as I could not really find that many
> locations where it would be applicable.
> ---
> arch/Kconfig | 16 +++++++++++++++-
> include/linux/execmem.h | 13 +++++++++++++
> kernel/kprobes.c | 17 ++++++++++++++---
> kernel/trace/trace_kprobe.c | 8 ++++++++
> 4 files changed, 50 insertions(+), 4 deletions(-)
> create mode 100644 include/linux/execmem.h
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index a5af0edd3eb8..33ba68b7168f 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -52,8 +52,8 @@ config GENERIC_ENTRY
>
> config KPROBES
> bool "Kprobes"
> - depends on MODULES
> depends on HAVE_KPROBES
> + select ALLOC_EXECMEM
> select KALLSYMS
> select TASKS_RCU if PREEMPTION
> help
> @@ -215,6 +215,20 @@ config HAVE_OPTPROBES
> config HAVE_KPROBES_ON_FTRACE
> bool
>
> +config HAVE_ALLOC_EXECMEM
> + bool
> + help
> + Architectures that select this option are capable of allocating executable
> + memory, which can be used by subsystems but is not dependent of any of its
> + clients.
> +
> +config ALLOC_EXECMEM
> + bool "Executable (trampoline) memory allocation"
> + depends on MODULES || HAVE_ALLOC_EXECMEM
> + help
> + Select this for executable (trampoline) memory. Can be enabled when either
> + module allocator or arch-specific allocator is available.
> +
> config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
> bool
> help
> diff --git a/include/linux/execmem.h b/include/linux/execmem.h
> new file mode 100644
> index 000000000000..ae2ff151523a
> --- /dev/null
> +++ b/include/linux/execmem.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_EXECMEM_H
> +#define _LINUX_EXECMEM_H
> +
> +#ifdef CONFIG_HAVE_ALLOC_EXECMEM
> +void *alloc_execmem(unsigned long size, gfp_t gfp);
> +void free_execmem(void *region);
> +#else
> +#define alloc_execmem(size, gfp) module_alloc(size)
> +#define free_execmem(region) module_memfree(region)
> +#endif
> +
> +#endif /* _LINUX_EXECMEM_H */
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 9d9095e81792..87fd8c14a938 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -44,6 +44,7 @@
> #include <asm/cacheflush.h>
> #include <asm/errno.h>
> #include <linux/uaccess.h>
> +#include <linux/execmem.h>
>
> #define KPROBE_HASH_BITS 6
> #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
> @@ -113,17 +114,17 @@ enum kprobe_slot_state {
> void __weak *alloc_insn_page(void)
> {
> /*
> - * Use module_alloc() so this page is within +/- 2GB of where the
> + * Use alloc_execmem() so this page is within +/- 2GB of where the
> * kernel image and loaded module images reside. This is required
> * for most of the architectures.
> * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
> */
> - return module_alloc(PAGE_SIZE);
> + return alloc_execmem(PAGE_SIZE, GFP_KERNEL);
> }
>
> static void free_insn_page(void *page)
> {
> - module_memfree(page);
> + free_execmem(page);
> }
>
> struct kprobe_insn_cache kprobe_insn_slots = {
> @@ -1580,6 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> goto out;
> }
>
> +#ifdef CONFIG_MODULES

You don't need this block, because these APIs have dummy functions.

> /* Check if 'p' is probing a module. */
> *probed_mod = __module_text_address((unsigned long) p->addr);
> if (*probed_mod) {

So this block never be true if !CONFIG_MODULES automatically, and it should be
optimized out by compiler.

> @@ -1603,6 +1605,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
> ret = -ENOENT;
> }
> }
> +#endif
> +
> out:
> preempt_enable();
> jump_label_unlock();
> @@ -2482,6 +2486,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> return 0;
> }
>
> +#ifdef CONFIG_MODULES
> /* Remove all symbols in given area from kprobe blacklist */
> static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> {
> @@ -2499,6 +2504,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> {
> kprobe_remove_area_blacklist(entry, entry + 1);
> }
> +#endif /* CONFIG_MODULES */

I think this block should be moved right before remove_module_kprobe_blacklist().
Then we can gather the code depending on modules in one place.

>
> int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
> char *type, char *sym)
> @@ -2564,6 +2570,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> return ret ? : arch_populate_kprobe_blacklist();
> }
>
> +#ifdef CONFIG_MODULES
> static void add_module_kprobe_blacklist(struct module *mod)
> {
> unsigned long start, end;
> @@ -2665,6 +2672,7 @@ static struct notifier_block kprobe_module_nb = {
> .notifier_call = kprobes_module_callback,
> .priority = 0
> };
> +#endif /* CONFIG_MODULES */

So, keep the kprobe_module_nb outside of this #ifdef as I sed.


>
> void kprobe_free_init_mem(void)
> {
> @@ -2724,8 +2732,11 @@ static int __init init_kprobes(void)
> err = arch_init_kprobes();
> if (!err)
> err = register_die_notifier(&kprobe_exceptions_nb);
> +
> +#ifdef CONFIG_MODULES
> if (!err)
> err = register_module_notifier(&kprobe_module_nb);
> +#endif

Then we don't need this #ifdef.

>
> kprobes_initialized = (err == 0);
> kprobe_sysctls_init();
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index c4c6e0e0068b..af70bb1e9c3a 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
> }
>
> +#ifdef CONFIG_MODULES
> static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> {
> char *p;
> @@ -129,6 +130,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
>
> return ret;
> }
> +#else
> +#define trace_kprobe_module_exist(tk) false /* aka a module never exists */
> +#endif /* CONFIG_MODULES */
>
> static bool trace_kprobe_is_busy(struct dyn_event *ev)
> {
> @@ -670,6 +674,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> return ret;
> }
>
> +#ifdef CONFIG_MODULES
> /* Module notifier call back, checking event on the module */
> static int trace_kprobe_module_callback(struct notifier_block *nb,
> unsigned long val, void *data)
> @@ -704,6 +709,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> .notifier_call = trace_kprobe_module_callback,
> .priority = 1 /* Invoked after kprobe module callback */
> };
> +#endif /* CONFIG_MODULES */

As I similar to the above, let's move trace_kprobe_module_nb outside
of #ifdef.

>
> static int count_symbols(void *data, unsigned long unused)
> {
> @@ -1897,8 +1903,10 @@ static __init int init_kprobe_trace_early(void)
> if (ret)
> return ret;
>
> +#ifdef CONFIG_MODULES
> if (register_module_notifier(&trace_kprobe_module_nb))
> return -EINVAL;
> +#endif /* CONFIG_MODULES */

And remove this #ifdef.

Thank you!

>
> return 0;
> }
> --
> 2.44.0
>


--
Masami Hiramatsu (Google) <[email protected]>

2024-03-26 01:31:51

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue Mar 26, 2024 at 2:58 AM EET, Masami Hiramatsu (Google) wrote:
> On Mon, 25 Mar 2024 23:55:01 +0200
> Jarkko Sakkinen <[email protected]> wrote:
>
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible because CONFIG_KPROBES depends on CONFIG_MODULES because it uses
> > the kernel module allocator.
> >
> > Introduce alloc_textmem() and free_textmem() for allocating executable
> > memory. If an arch implements these functions, it can mark this up with
> > the HAVE_ALLOC_EXECMEM kconfig flag.
> >
> > At first this feature will be used for enabling kprobes without
> > modules support for arch/riscv.
> >
> > Link: https://lore.kernel.org/all/[email protected]/
> > Suggested-by: Masami Hiramatsu <[email protected]>
> > Signed-off-by: Jarkko Sakkinen <[email protected]>
> > ---
> > v5:
> > - alloc_execmem() was missing GFP_KERNEL parameter. The patch set did
> > compile because 2/2 had the fixup (leaked there when rebasing the
> > patch set).
> > v4:
> > - Squashed a couple of unrequired CONFIG_MODULES checks.
> > - See https://lore.kernel.org/all/[email protected]/
> > v3:
> > - A new patch added.
> > - For IS_DEFINED() I need advice as I could not really find that many
> > locations where it would be applicable.
> > ---
> > arch/Kconfig | 16 +++++++++++++++-
> > include/linux/execmem.h | 13 +++++++++++++
> > kernel/kprobes.c | 17 ++++++++++++++---
> > kernel/trace/trace_kprobe.c | 8 ++++++++
> > 4 files changed, 50 insertions(+), 4 deletions(-)
> > create mode 100644 include/linux/execmem.h
> >
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index a5af0edd3eb8..33ba68b7168f 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -52,8 +52,8 @@ config GENERIC_ENTRY
> >
> > config KPROBES
> > bool "Kprobes"
> > - depends on MODULES
> > depends on HAVE_KPROBES
> > + select ALLOC_EXECMEM
> > select KALLSYMS
> > select TASKS_RCU if PREEMPTION
> > help
> > @@ -215,6 +215,20 @@ config HAVE_OPTPROBES
> > config HAVE_KPROBES_ON_FTRACE
> > bool
> >
> > +config HAVE_ALLOC_EXECMEM
> > + bool
> > + help
> > + Architectures that select this option are capable of allocating executable
> > + memory, which can be used by subsystems but is not dependent of any of its
> > + clients.
> > +
> > +config ALLOC_EXECMEM
> > + bool "Executable (trampoline) memory allocation"
> > + depends on MODULES || HAVE_ALLOC_EXECMEM
> > + help
> > + Select this for executable (trampoline) memory. Can be enabled when either
> > + module allocator or arch-specific allocator is available.
> > +
> > config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
> > bool
> > help
> > diff --git a/include/linux/execmem.h b/include/linux/execmem.h
> > new file mode 100644
> > index 000000000000..ae2ff151523a
> > --- /dev/null
> > +++ b/include/linux/execmem.h
> > @@ -0,0 +1,13 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _LINUX_EXECMEM_H
> > +#define _LINUX_EXECMEM_H
> > +
> > +#ifdef CONFIG_HAVE_ALLOC_EXECMEM
> > +void *alloc_execmem(unsigned long size, gfp_t gfp);
> > +void free_execmem(void *region);
> > +#else
> > +#define alloc_execmem(size, gfp) module_alloc(size)
> > +#define free_execmem(region) module_memfree(region)
> > +#endif
> > +
> > +#endif /* _LINUX_EXECMEM_H */
> > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > index 9d9095e81792..87fd8c14a938 100644
> > --- a/kernel/kprobes.c
> > +++ b/kernel/kprobes.c
> > @@ -44,6 +44,7 @@
> > #include <asm/cacheflush.h>
> > #include <asm/errno.h>
> > #include <linux/uaccess.h>
> > +#include <linux/execmem.h>
> >
> > #define KPROBE_HASH_BITS 6
> > #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
> > @@ -113,17 +114,17 @@ enum kprobe_slot_state {
> > void __weak *alloc_insn_page(void)
> > {
> > /*
> > - * Use module_alloc() so this page is within +/- 2GB of where the
> > + * Use alloc_execmem() so this page is within +/- 2GB of where the
> > * kernel image and loaded module images reside. This is required
> > * for most of the architectures.
> > * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
> > */
> > - return module_alloc(PAGE_SIZE);
> > + return alloc_execmem(PAGE_SIZE, GFP_KERNEL);
> > }
> >
> > static void free_insn_page(void *page)
> > {
> > - module_memfree(page);
> > + free_execmem(page);
> > }
> >
> > struct kprobe_insn_cache kprobe_insn_slots = {
> > @@ -1580,6 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > goto out;
> > }
> >
> > +#ifdef CONFIG_MODULES
>
> You don't need this block, because these APIs have dummy functions.

Hmm... I'll verify this tomorrow.

>
> > /* Check if 'p' is probing a module. */
> > *probed_mod = __module_text_address((unsigned long) p->addr);
> > if (*probed_mod) {
>
> So this block never be true if !CONFIG_MODULES automatically, and it should be
> optimized out by compiler.

Yeah sure, was not done for saving cycles. Just wanted to make sure that
my stuff compiles with different config flag combinations related but
I'll check tomorrow morning if I can relax this further.

>
> > @@ -1603,6 +1605,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > ret = -ENOENT;
> > }
> > }
> > +#endif
> > +
> > out:
> > preempt_enable();
> > jump_label_unlock();
> > @@ -2482,6 +2486,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> > return 0;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Remove all symbols in given area from kprobe blacklist */
> > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> > {
> > @@ -2499,6 +2504,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> > {
> > kprobe_remove_area_blacklist(entry, entry + 1);
> > }
> > +#endif /* CONFIG_MODULES */
>
> I think this block should be moved right before remove_module_kprobe_blacklist().
> Then we can gather the code depending on modules in one place.

Agree with this without verification :-)


>
> >
> > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
> > char *type, char *sym)
> > @@ -2564,6 +2570,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> > return ret ? : arch_populate_kprobe_blacklist();
> > }
> >
> > +#ifdef CONFIG_MODULES
> > static void add_module_kprobe_blacklist(struct module *mod)
> > {
> > unsigned long start, end;
> > @@ -2665,6 +2672,7 @@ static struct notifier_block kprobe_module_nb = {
> > .notifier_call = kprobes_module_callback,
> > .priority = 0
> > };
> > +#endif /* CONFIG_MODULES */
>
> So, keep the kprobe_module_nb outside of this #ifdef as I sed.

Yup, already done in v6.

>
>
> >
> > void kprobe_free_init_mem(void)
> > {
> > @@ -2724,8 +2732,11 @@ static int __init init_kprobes(void)
> > err = arch_init_kprobes();
> > if (!err)
> > err = register_die_notifier(&kprobe_exceptions_nb);
> > +
> > +#ifdef CONFIG_MODULES
> > if (!err)
> > err = register_module_notifier(&kprobe_module_nb);
> > +#endif
>
> Then we don't need this #ifdef.

Addressed in v6.

>
> >
> > kprobes_initialized = (err == 0);
> > kprobe_sysctls_init();
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index c4c6e0e0068b..af70bb1e9c3a 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> > return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
> > }
> >
> > +#ifdef CONFIG_MODULES
> > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> > {
> > char *p;
> > @@ -129,6 +130,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> >
> > return ret;
> > }
> > +#else
> > +#define trace_kprobe_module_exist(tk) false /* aka a module never exists */
> > +#endif /* CONFIG_MODULES */
> >
> > static bool trace_kprobe_is_busy(struct dyn_event *ev)
> > {
> > @@ -670,6 +674,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> > return ret;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Module notifier call back, checking event on the module */
> > static int trace_kprobe_module_callback(struct notifier_block *nb,
> > unsigned long val, void *data)
> > @@ -704,6 +709,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> > .notifier_call = trace_kprobe_module_callback,
> > .priority = 1 /* Invoked after kprobe module callback */
> > };
> > +#endif /* CONFIG_MODULES */
>
> As I similar to the above, let's move trace_kprobe_module_nb outside
> of #ifdef.

Ditto (also in v6).

>
> >
> > static int count_symbols(void *data, unsigned long unused)
> > {
> > @@ -1897,8 +1903,10 @@ static __init int init_kprobe_trace_early(void)
> > if (ret)
> > return ret;
> >
> > +#ifdef CONFIG_MODULES
> > if (register_module_notifier(&trace_kprobe_module_nb))
> > return -EINVAL;
> > +#endif /* CONFIG_MODULES */
>
> And remove this #ifdef.
>
> Thank you!

Thanks for quick reviews and sorry for the spam. I just try to move this
forward with small pushes because with compilation flags it is easy to
blow up compilation.

I'll do the aforementioned suggestions that were missing from v6 to v7.

>
> >
> > return 0;
> > }
> > --
> > 2.44.0
> >


BR, Jarkko

2024-03-26 01:39:00

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue, 26 Mar 2024 00:09:42 +0200
"Jarkko Sakkinen" <[email protected]> wrote:

> On Mon Mar 25, 2024 at 11:55 PM EET, Jarkko Sakkinen wrote:
> > +#ifdef CONFIG_MODULES
> > if (register_module_notifier(&trace_kprobe_module_nb))
> > return -EINVAL;
> > +#endif /* CONFIG_MODULES */
>
> register_module_notifier() does have "dummy" version but what
> would I pass to it. It makes more mess than it cleans to declare
> also a "dummy" version of trace_kprobe_module_nb.

That is better than having #ifdef in the function.

>
> The callback itself has too tight module subsystem bindings so
> that they could be simply flagged with IS_DEFINED() (or correct
> if I'm mistaken, this the conclusion I've ended up with).

Please try this.

-----
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 70dc6179086e..bc98db14927f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2625,6 +2625,7 @@ static void remove_module_kprobe_blacklist(struct module *mod)
}
}

+#ifdef CONFIG_MODULES
/* Module notifier call back, checking kprobes on the module */
static int kprobes_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -2675,6 +2676,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_unlock(&kprobe_mutex);
return NOTIFY_DONE;
}
+#else
+#define kprobes_module_callback (NULL)
+#endif

static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
@@ -2739,7 +2743,7 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
- if (!err)
+ if (!err && IS_ENABLED(CONFIG_MODULES))
err = register_module_notifier(&kprobe_module_nb);

kprobes_initialized = (err == 0);

-----

Thank you,

--
Masami Hiramatsu (Google) <[email protected]>

2024-03-26 02:01:39

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue Mar 26, 2024 at 3:31 AM EET, Jarkko Sakkinen wrote:
> > > +#endif /* _LINUX_EXECMEM_H */
> > > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > > index 9d9095e81792..87fd8c14a938 100644
> > > --- a/kernel/kprobes.c
> > > +++ b/kernel/kprobes.c
> > > @@ -44,6 +44,7 @@
> > > #include <asm/cacheflush.h>
> > > #include <asm/errno.h>
> > > #include <linux/uaccess.h>
> > > +#include <linux/execmem.h>
> > >
> > > #define KPROBE_HASH_BITS 6
> > > #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
> > > @@ -113,17 +114,17 @@ enum kprobe_slot_state {
> > > void __weak *alloc_insn_page(void)
> > > {
> > > /*
> > > - * Use module_alloc() so this page is within +/- 2GB of where the
> > > + * Use alloc_execmem() so this page is within +/- 2GB of where the
> > > * kernel image and loaded module images reside. This is required
> > > * for most of the architectures.
> > > * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
> > > */
> > > - return module_alloc(PAGE_SIZE);
> > > + return alloc_execmem(PAGE_SIZE, GFP_KERNEL);
> > > }
> > >
> > > static void free_insn_page(void *page)
> > > {
> > > - module_memfree(page);
> > > + free_execmem(page);
> > > }
> > >
> > > struct kprobe_insn_cache kprobe_insn_slots = {
> > > @@ -1580,6 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > > goto out;
> > > }
> > >
> > > +#ifdef CONFIG_MODULES
> >
> > You don't need this block, because these APIs have dummy functions.
>
> Hmm... I'll verify this tomorrow.

It depends on having struct module available given "(*probed_mod)->state".

It is non-existent unless CONFIG_MODULES is set given how things are
flagged in include/linux/module.h.

BR, Jarkko

2024-03-26 13:24:46

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue Mar 26, 2024 at 4:01 AM EET, Jarkko Sakkinen wrote:
> On Tue Mar 26, 2024 at 3:31 AM EET, Jarkko Sakkinen wrote:
> > > > +#endif /* _LINUX_EXECMEM_H */
> > > > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > > > index 9d9095e81792..87fd8c14a938 100644
> > > > --- a/kernel/kprobes.c
> > > > +++ b/kernel/kprobes.c
> > > > @@ -44,6 +44,7 @@
> > > > #include <asm/cacheflush.h>
> > > > #include <asm/errno.h>
> > > > #include <linux/uaccess.h>
> > > > +#include <linux/execmem.h>
> > > >
> > > > #define KPROBE_HASH_BITS 6
> > > > #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
> > > > @@ -113,17 +114,17 @@ enum kprobe_slot_state {
> > > > void __weak *alloc_insn_page(void)
> > > > {
> > > > /*
> > > > - * Use module_alloc() so this page is within +/- 2GB of where the
> > > > + * Use alloc_execmem() so this page is within +/- 2GB of where the
> > > > * kernel image and loaded module images reside. This is required
> > > > * for most of the architectures.
> > > > * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
> > > > */
> > > > - return module_alloc(PAGE_SIZE);
> > > > + return alloc_execmem(PAGE_SIZE, GFP_KERNEL);
> > > > }
> > > >
> > > > static void free_insn_page(void *page)
> > > > {
> > > > - module_memfree(page);
> > > > + free_execmem(page);
> > > > }
> > > >
> > > > struct kprobe_insn_cache kprobe_insn_slots = {
> > > > @@ -1580,6 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > > > goto out;
> > > > }
> > > >
> > > > +#ifdef CONFIG_MODULES
> > >
> > > You don't need this block, because these APIs have dummy functions.
> >
> > Hmm... I'll verify this tomorrow.
>
> It depends on having struct module available given "(*probed_mod)->state".
>
> It is non-existent unless CONFIG_MODULES is set given how things are
> flagged in include/linux/module.h.

Hey, noticed kconfig issue.

According to kconfig-language.txt:

"select should be used with care. select will force a symbol to a value
without visiting the dependencies."

So the problem here lies in KPROBES config entry using select statement
to pick ALLOC_EXECMEM. It will not take the depends on statement into
account and thus will allow to select kprobes without any allocator in
place.

So to address this I'd suggest to use depends on statement also for
describing relation between KPROBES and ALLOC_EXECMEM. It does not make
life worse than before for anyone because even with the current kernel
you have to select MODULES before you can move forward with kprobes.

BR, Jarkko

2024-03-26 13:58:03

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH v5 2/2] arch/riscv: Enable kprobes when CONFIG_MODULES=n

Hi Jarkko,

On 25/03/2024 22:55, Jarkko Sakkinen wrote:
> Tacing with kprobes while running a monolithic kernel is currently
> impossible due the kernel module allocator dependency.
>
> Address the issue by implementing textmem API for RISC-V.
>
> Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
> Link: https://lore.kernel.org/all/[email protected]/ # continuation
> Signed-off-by: Jarkko Sakkinen <[email protected]>
> ---
> v5:
> - No changes, expect removing alloc_execmem() call which should have
> been part of the previous patch.
> v4:
> - Include linux/execmem.h.
> v3:
> - Architecture independent parts have been split to separate patches.
> - Do not change arch/riscv/kernel/module.c as it is out of scope for
> this patch set now.
> v2:
> - Better late than never right? :-)
> - Focus only to RISC-V for now to make the patch more digestable. This
> is the arch where I use the patch on a daily basis to help with QA.
> - Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
> ---
> arch/riscv/Kconfig | 1 +
> arch/riscv/kernel/Makefile | 3 +++
> arch/riscv/kernel/execmem.c | 22 ++++++++++++++++++++++
> 3 files changed, 26 insertions(+)
> create mode 100644 arch/riscv/kernel/execmem.c
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e3142ce531a0..499512fb17ff 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -132,6 +132,7 @@ config RISCV
> select HAVE_KPROBES if !XIP_KERNEL
> select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
> select HAVE_KRETPROBES if !XIP_KERNEL
> + select HAVE_ALLOC_EXECMEM if !XIP_KERNEL
> # https://github.com/ClangBuiltLinux/linux/issues/1881
> select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
> select HAVE_MOVE_PMD
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index 604d6bf7e476..337797f10d3e 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -73,6 +73,9 @@ obj-$(CONFIG_SMP) += cpu_ops.o
>
> obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_ALLOC_EXECMEM),y)
> +obj-y += execmem.o
> +endif
> obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
>
> obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
> diff --git a/arch/riscv/kernel/execmem.c b/arch/riscv/kernel/execmem.c
> new file mode 100644
> index 000000000000..3e52522ead32
> --- /dev/null
> +++ b/arch/riscv/kernel/execmem.c
> @@ -0,0 +1,22 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#include <linux/mm.h>
> +#include <linux/execmem.h>
> +#include <linux/vmalloc.h>
> +#include <asm/sections.h>
> +
> +void *alloc_execmem(unsigned long size, gfp_t /* gfp */)
> +{
> + return __vmalloc_node_range(size, 1, MODULES_VADDR,
> + MODULES_END, GFP_KERNEL,
> + PAGE_KERNEL, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}


The __vmalloc_node_range() line ^^ must be from an old kernel since we
added VM_FLUSH_RESET_PERMS in 6.8, see 749b94b08005 ("riscv: Fix
module_alloc() that did not reset the linear mapping permissions").

In addition, I guess module_alloc() should now use alloc_execmem() right?


> +
> +void free_execmem(void *region)
> +{
> + if (in_interrupt())
> + pr_warn("In interrupt context: vmalloc may not work.\n");
> +
> + vfree(region);
> +}


I remember Mike Rapoport sent a patchset to introduce an API for
executable memory allocation
(https://lore.kernel.org/linux-mm/[email protected]/),
how does this intersect with your work? I don't know the status of his
patchset though.

Thanks,

Alex


2024-03-26 15:05:34

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue, 26 Mar 2024 15:18:21 +0200
"Jarkko Sakkinen" <[email protected]> wrote:

> On Tue Mar 26, 2024 at 4:01 AM EET, Jarkko Sakkinen wrote:
> > On Tue Mar 26, 2024 at 3:31 AM EET, Jarkko Sakkinen wrote:
> > > > > +#endif /* _LINUX_EXECMEM_H */
> > > > > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > > > > index 9d9095e81792..87fd8c14a938 100644
> > > > > --- a/kernel/kprobes.c
> > > > > +++ b/kernel/kprobes.c
> > > > > @@ -44,6 +44,7 @@
> > > > > #include <asm/cacheflush.h>
> > > > > #include <asm/errno.h>
> > > > > #include <linux/uaccess.h>
> > > > > +#include <linux/execmem.h>
> > > > >
> > > > > #define KPROBE_HASH_BITS 6
> > > > > #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
> > > > > @@ -113,17 +114,17 @@ enum kprobe_slot_state {
> > > > > void __weak *alloc_insn_page(void)
> > > > > {
> > > > > /*
> > > > > - * Use module_alloc() so this page is within +/- 2GB of where the
> > > > > + * Use alloc_execmem() so this page is within +/- 2GB of where the
> > > > > * kernel image and loaded module images reside. This is required
> > > > > * for most of the architectures.
> > > > > * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
> > > > > */
> > > > > - return module_alloc(PAGE_SIZE);
> > > > > + return alloc_execmem(PAGE_SIZE, GFP_KERNEL);
> > > > > }
> > > > >
> > > > > static void free_insn_page(void *page)
> > > > > {
> > > > > - module_memfree(page);
> > > > > + free_execmem(page);
> > > > > }
> > > > >
> > > > > struct kprobe_insn_cache kprobe_insn_slots = {
> > > > > @@ -1580,6 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > > > > goto out;
> > > > > }
> > > > >
> > > > > +#ifdef CONFIG_MODULES
> > > >
> > > > You don't need this block, because these APIs have dummy functions.
> > >
> > > Hmm... I'll verify this tomorrow.
> >
> > It depends on having struct module available given "(*probed_mod)->state".

Ah, indeed. We need module_state() function to avoid it.

> >
> > It is non-existent unless CONFIG_MODULES is set given how things are
> > flagged in include/linux/module.h.
>
> Hey, noticed kconfig issue.
>
> According to kconfig-language.txt:
>
> "select should be used with care. select will force a symbol to a value
> without visiting the dependencies."
>
> So the problem here lies in KPROBES config entry using select statement
> to pick ALLOC_EXECMEM. It will not take the depends on statement into
> account and thus will allow to select kprobes without any allocator in
> place.

OK, in that case "depend on" is good.

>
> So to address this I'd suggest to use depends on statement also for
> describing relation between KPROBES and ALLOC_EXECMEM. It does not make
> life worse than before for anyone because even with the current kernel
> you have to select MODULES before you can move forward with kprobes.

Yeah, since ALLOC_EXECMEM is enabled by default.

Thank you!

>
> BR, Jarkko


--
Masami Hiramatsu (Google) <[email protected]>

2024-03-26 16:50:04

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 2/2] arch/riscv: Enable kprobes when CONFIG_MODULES=n

On Tue Mar 26, 2024 at 3:57 PM EET, Alexandre Ghiti wrote:
> Hi Jarkko,
>
> On 25/03/2024 22:55, Jarkko Sakkinen wrote:
> > Tacing with kprobes while running a monolithic kernel is currently
> > impossible due the kernel module allocator dependency.
> >
> > Address the issue by implementing textmem API for RISC-V.
> >
> > Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
> > Link: https://lore.kernel.org/all/[email protected]/ # continuation
> > Signed-off-by: Jarkko Sakkinen <[email protected]>
> > ---
> > v5:
> > - No changes, expect removing alloc_execmem() call which should have
> > been part of the previous patch.
> > v4:
> > - Include linux/execmem.h.
> > v3:
> > - Architecture independent parts have been split to separate patches.
> > - Do not change arch/riscv/kernel/module.c as it is out of scope for
> > this patch set now.
> > v2:
> > - Better late than never right? :-)
> > - Focus only to RISC-V for now to make the patch more digestable. This
> > is the arch where I use the patch on a daily basis to help with QA.
> > - Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
> > ---
> > arch/riscv/Kconfig | 1 +
> > arch/riscv/kernel/Makefile | 3 +++
> > arch/riscv/kernel/execmem.c | 22 ++++++++++++++++++++++
> > 3 files changed, 26 insertions(+)
> > create mode 100644 arch/riscv/kernel/execmem.c
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e3142ce531a0..499512fb17ff 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -132,6 +132,7 @@ config RISCV
> > select HAVE_KPROBES if !XIP_KERNEL
> > select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
> > select HAVE_KRETPROBES if !XIP_KERNEL
> > + select HAVE_ALLOC_EXECMEM if !XIP_KERNEL
> > # https://github.com/ClangBuiltLinux/linux/issues/1881
> > select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
> > select HAVE_MOVE_PMD
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index 604d6bf7e476..337797f10d3e 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -73,6 +73,9 @@ obj-$(CONFIG_SMP) += cpu_ops.o
> >
> > obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
> > obj-$(CONFIG_MODULES) += module.o
> > +ifeq ($(CONFIG_ALLOC_EXECMEM),y)
> > +obj-y += execmem.o
> > +endif
> > obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
> >
> > obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
> > diff --git a/arch/riscv/kernel/execmem.c b/arch/riscv/kernel/execmem.c
> > new file mode 100644
> > index 000000000000..3e52522ead32
> > --- /dev/null
> > +++ b/arch/riscv/kernel/execmem.c
> > @@ -0,0 +1,22 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +
> > +#include <linux/mm.h>
> > +#include <linux/execmem.h>
> > +#include <linux/vmalloc.h>
> > +#include <asm/sections.h>
> > +
> > +void *alloc_execmem(unsigned long size, gfp_t /* gfp */)

Need to have the parameter name here. I guess this could just as well
pass through gfp to vmalloc from the caller as kprobes does call
module_alloc() with GFP_KERNEL set in RISC-V.

> > +{
> > + return __vmalloc_node_range(size, 1, MODULES_VADDR,
> > + MODULES_END, GFP_KERNEL,
> > + PAGE_KERNEL, 0, NUMA_NO_NODE,
> > + __builtin_return_address(0));
> > +}
>
>
> The __vmalloc_node_range() line ^^ must be from an old kernel since we
> added VM_FLUSH_RESET_PERMS in 6.8, see 749b94b08005 ("riscv: Fix
> module_alloc() that did not reset the linear mapping permissions").
>
> In addition, I guess module_alloc() should now use alloc_execmem() right?

Ack for the first comment. For the 2nd it is up to arch/<arch> to choose
whether to have shared or separate allocators.

So if you want I can change it that way but did not want to make the
call myself.

>
>
> > +
> > +void free_execmem(void *region)
> > +{
> > + if (in_interrupt())
> > + pr_warn("In interrupt context: vmalloc may not work.\n");
> > +
> > + vfree(region);
> > +}
>
>
> I remember Mike Rapoport sent a patchset to introduce an API for
> executable memory allocation
> (https://lore.kernel.org/linux-mm/[email protected]/),
> how does this intersect with your work? I don't know the status of his
> patchset though.
>
> Thanks,
>
> Alex

I have also made a patch set for kprobes in the 2022:

https://lore.kernel.org/all/[email protected]/

I think this Calvin's, Mike's and my early patch set have the same
problem: they try to choke all architectures at once. And further,
Calvin's and Mike's work also try to cover also tracing subsystems
at once.

I feel that my relatively small patch set which deals only with
trivial kprobe (which is more in the leaf than e.g. bpf which
is more like orchestrator tool) and implements one arch of which
dog food I actually eat is a better starting point.

Arch code is always something where you need to have genuine
understanding so full architecture coverage from day one is
just too risky for stability. Linux is better off if people who
work on a specific arch proactively will "fill the holes".

So the way I see my patch set is "lowest common denominator"
in both architecture axis and tracing subsystem axist. It should
not interfere that much with the other work (like bpf).

BR, Jarkko

2024-03-26 17:03:09

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 2/2] arch/riscv: Enable kprobes when CONFIG_MODULES=n

On Tue Mar 26, 2024 at 6:49 PM EET, Jarkko Sakkinen wrote:
> On Tue Mar 26, 2024 at 3:57 PM EET, Alexandre Ghiti wrote:
> > Hi Jarkko,
> >
> > On 25/03/2024 22:55, Jarkko Sakkinen wrote:
> > > Tacing with kprobes while running a monolithic kernel is currently
> > > impossible due the kernel module allocator dependency.
> > >
> > > Address the issue by implementing textmem API for RISC-V.
> > >
> > > Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
> > > Link: https://lore.kernel.org/all/[email protected]/ # continuation
> > > Signed-off-by: Jarkko Sakkinen <[email protected]>
> > > ---
> > > v5:
> > > - No changes, expect removing alloc_execmem() call which should have
> > > been part of the previous patch.
> > > v4:
> > > - Include linux/execmem.h.
> > > v3:
> > > - Architecture independent parts have been split to separate patches.
> > > - Do not change arch/riscv/kernel/module.c as it is out of scope for
> > > this patch set now.
> > > v2:
> > > - Better late than never right? :-)
> > > - Focus only to RISC-V for now to make the patch more digestable. This
> > > is the arch where I use the patch on a daily basis to help with QA.
> > > - Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
> > > ---
> > > arch/riscv/Kconfig | 1 +
> > > arch/riscv/kernel/Makefile | 3 +++
> > > arch/riscv/kernel/execmem.c | 22 ++++++++++++++++++++++
> > > 3 files changed, 26 insertions(+)
> > > create mode 100644 arch/riscv/kernel/execmem.c
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index e3142ce531a0..499512fb17ff 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -132,6 +132,7 @@ config RISCV
> > > select HAVE_KPROBES if !XIP_KERNEL
> > > select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
> > > select HAVE_KRETPROBES if !XIP_KERNEL
> > > + select HAVE_ALLOC_EXECMEM if !XIP_KERNEL
> > > # https://github.com/ClangBuiltLinux/linux/issues/1881
> > > select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
> > > select HAVE_MOVE_PMD
> > > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > > index 604d6bf7e476..337797f10d3e 100644
> > > --- a/arch/riscv/kernel/Makefile
> > > +++ b/arch/riscv/kernel/Makefile
> > > @@ -73,6 +73,9 @@ obj-$(CONFIG_SMP) += cpu_ops.o
> > >
> > > obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
> > > obj-$(CONFIG_MODULES) += module.o
> > > +ifeq ($(CONFIG_ALLOC_EXECMEM),y)
> > > +obj-y += execmem.o
> > > +endif
> > > obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
> > >
> > > obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
> > > diff --git a/arch/riscv/kernel/execmem.c b/arch/riscv/kernel/execmem.c
> > > new file mode 100644
> > > index 000000000000..3e52522ead32
> > > --- /dev/null
> > > +++ b/arch/riscv/kernel/execmem.c
> > > @@ -0,0 +1,22 @@
> > > +// SPDX-License-Identifier: GPL-2.0-or-later
> > > +
> > > +#include <linux/mm.h>
> > > +#include <linux/execmem.h>
> > > +#include <linux/vmalloc.h>
> > > +#include <asm/sections.h>
> > > +
> > > +void *alloc_execmem(unsigned long size, gfp_t /* gfp */)
>
> Need to have the parameter name here. I guess this could just as well
> pass through gfp to vmalloc from the caller as kprobes does call
> module_alloc() with GFP_KERNEL set in RISC-V.
>
> > > +{
> > > + return __vmalloc_node_range(size, 1, MODULES_VADDR,
> > > + MODULES_END, GFP_KERNEL,
> > > + PAGE_KERNEL, 0, NUMA_NO_NODE,
> > > + __builtin_return_address(0));
> > > +}
> >
> >
> > The __vmalloc_node_range() line ^^ must be from an old kernel since we
> > added VM_FLUSH_RESET_PERMS in 6.8, see 749b94b08005 ("riscv: Fix
> > module_alloc() that did not reset the linear mapping permissions").
> >
> > In addition, I guess module_alloc() should now use alloc_execmem() right?
>
> Ack for the first comment. For the 2nd it is up to arch/<arch> to choose
> whether to have shared or separate allocators.
>
> So if you want I can change it that way but did not want to make the
> call myself.
>
> >
> >
> > > +
> > > +void free_execmem(void *region)
> > > +{
> > > + if (in_interrupt())
> > > + pr_warn("In interrupt context: vmalloc may not work.\n");
> > > +
> > > + vfree(region);
> > > +}
> >
> >
> > I remember Mike Rapoport sent a patchset to introduce an API for
> > executable memory allocation
> > (https://lore.kernel.org/linux-mm/[email protected]/),
> > how does this intersect with your work? I don't know the status of his
> > patchset though.
> >
> > Thanks,
> >
> > Alex
>
> I have also made a patch set for kprobes in the 2022:
>
> https://lore.kernel.org/all/[email protected]/
>
> I think this Calvin's, Mike's and my early patch set have the same
> problem: they try to choke all architectures at once. And further,
> Calvin's and Mike's work also try to cover also tracing subsystems
> at once.
>
> I feel that my relatively small patch set which deals only with
> trivial kprobe (which is more in the leaf than e.g. bpf which
> is more like orchestrator tool) and implements one arch of which
> dog food I actually eat is a better starting point.
>
> Arch code is always something where you need to have genuine
> understanding so full architecture coverage from day one is
> just too risky for stability. Linux is better off if people who
> work on a specific arch proactively will "fill the holes".
>
> So the way I see my patch set is "lowest common denominator"
> in both architecture axis and tracing subsystem axist. It should
> not interfere that much with the other work (like bpf).

Here also there is a lot of kconfig flag logic changes. I've verified
them but still I think we are better off if this stuff is put in the
wild first in small scale rather than spraying kernel with code changes
in the first run.

BR, Jarkko

2024-03-26 17:05:35

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] kprobes: textmem API

On Tue Mar 26, 2024 at 5:05 PM EET, Masami Hiramatsu (Google) wrote:
> > According to kconfig-language.txt:
> >
> > "select should be used with care. select will force a symbol to a value
> > without visiting the dependencies."
> >
> > So the problem here lies in KPROBES config entry using select statement
> > to pick ALLOC_EXECMEM. It will not take the depends on statement into
> > account and thus will allow to select kprobes without any allocator in
> > place.
>
> OK, in that case "depend on" is good.

Yeah, did not remember this at all. Only recalled when I started to
get linking errors when compiling just the first patch... It's a bit
uninituitive twist in kconfig :-)

BR, Jarkko

2024-03-26 19:03:52

by Alexandre Ghiti

[permalink] [raw]
Subject: Re: [PATCH v5 2/2] arch/riscv: Enable kprobes when CONFIG_MODULES=n

On 26/03/2024 17:49, Jarkko Sakkinen wrote:
> On Tue Mar 26, 2024 at 3:57 PM EET, Alexandre Ghiti wrote:
>> Hi Jarkko,
>>
>> On 25/03/2024 22:55, Jarkko Sakkinen wrote:
>>> Tacing with kprobes while running a monolithic kernel is currently
>>> impossible due the kernel module allocator dependency.
>>>
>>> Address the issue by implementing textmem API for RISC-V.
>>>
>>> Link: https://www.sochub.fi # for power on testing new SoC's with a minimal stack
>>> Link: https://lore.kernel.org/all/[email protected]/ # continuation
>>> Signed-off-by: Jarkko Sakkinen <[email protected]>
>>> ---
>>> v5:
>>> - No changes, expect removing alloc_execmem() call which should have
>>> been part of the previous patch.
>>> v4:
>>> - Include linux/execmem.h.
>>> v3:
>>> - Architecture independent parts have been split to separate patches.
>>> - Do not change arch/riscv/kernel/module.c as it is out of scope for
>>> this patch set now.
>>> v2:
>>> - Better late than never right? :-)
>>> - Focus only to RISC-V for now to make the patch more digestable. This
>>> is the arch where I use the patch on a daily basis to help with QA.
>>> - Introduce HAVE_KPROBES_ALLOC flag to help with more gradual migration.
>>> ---
>>> arch/riscv/Kconfig | 1 +
>>> arch/riscv/kernel/Makefile | 3 +++
>>> arch/riscv/kernel/execmem.c | 22 ++++++++++++++++++++++
>>> 3 files changed, 26 insertions(+)
>>> create mode 100644 arch/riscv/kernel/execmem.c
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index e3142ce531a0..499512fb17ff 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -132,6 +132,7 @@ config RISCV
>>> select HAVE_KPROBES if !XIP_KERNEL
>>> select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
>>> select HAVE_KRETPROBES if !XIP_KERNEL
>>> + select HAVE_ALLOC_EXECMEM if !XIP_KERNEL
>>> # https://github.com/ClangBuiltLinux/linux/issues/1881
>>> select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
>>> select HAVE_MOVE_PMD
>>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>>> index 604d6bf7e476..337797f10d3e 100644
>>> --- a/arch/riscv/kernel/Makefile
>>> +++ b/arch/riscv/kernel/Makefile
>>> @@ -73,6 +73,9 @@ obj-$(CONFIG_SMP) += cpu_ops.o
>>>
>>> obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
>>> obj-$(CONFIG_MODULES) += module.o
>>> +ifeq ($(CONFIG_ALLOC_EXECMEM),y)
>>> +obj-y += execmem.o
>>> +endif
>>> obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
>>>
>>> obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
>>> diff --git a/arch/riscv/kernel/execmem.c b/arch/riscv/kernel/execmem.c
>>> new file mode 100644
>>> index 000000000000..3e52522ead32
>>> --- /dev/null
>>> +++ b/arch/riscv/kernel/execmem.c
>>> @@ -0,0 +1,22 @@
>>> +// SPDX-License-Identifier: GPL-2.0-or-later
>>> +
>>> +#include <linux/mm.h>
>>> +#include <linux/execmem.h>
>>> +#include <linux/vmalloc.h>
>>> +#include <asm/sections.h>
>>> +
>>> +void *alloc_execmem(unsigned long size, gfp_t /* gfp */)
> Need to have the parameter name here. I guess this could just as well
> pass through gfp to vmalloc from the caller as kprobes does call
> module_alloc() with GFP_KERNEL set in RISC-V.
>
>>> +{
>>> + return __vmalloc_node_range(size, 1, MODULES_VADDR,
>>> + MODULES_END, GFP_KERNEL,
>>> + PAGE_KERNEL, 0, NUMA_NO_NODE,
>>> + __builtin_return_address(0));
>>> +}
>>
>> The __vmalloc_node_range() line ^^ must be from an old kernel since we
>> added VM_FLUSH_RESET_PERMS in 6.8, see 749b94b08005 ("riscv: Fix
>> module_alloc() that did not reset the linear mapping permissions").
>>
>> In addition, I guess module_alloc() should now use alloc_execmem() right?
> Ack for the first comment. For the 2nd it is up to arch/<arch> to choose
> whether to have shared or separate allocators.
>
> So if you want I can change it that way but did not want to make the
> call myself.


I'd say module_alloc() should use alloc_execmem() then since there are
no differences for now.


>>
>>> +
>>> +void free_execmem(void *region)
>>> +{
>>> + if (in_interrupt())
>>> + pr_warn("In interrupt context: vmalloc may not work.\n");
>>> +
>>> + vfree(region);
>>> +}
>>
>> I remember Mike Rapoport sent a patchset to introduce an API for
>> executable memory allocation
>> (https://lore.kernel.org/linux-mm/[email protected]/),
>> how does this intersect with your work? I don't know the status of his
>> patchset though.
>>
>> Thanks,
>>
>> Alex
> I have also made a patch set for kprobes in the 2022:
>
> https://lore.kernel.org/all/[email protected]/
>
> I think this Calvin's, Mike's and my early patch set have the same
> problem: they try to choke all architectures at once. And further,
> Calvin's and Mike's work also try to cover also tracing subsystems
> at once.
>
> I feel that my relatively small patch set which deals only with
> trivial kprobe (which is more in the leaf than e.g. bpf which
> is more like orchestrator tool) and implements one arch of which
> dog food I actually eat is a better starting point.
>
> Arch code is always something where you need to have genuine
> understanding so full architecture coverage from day one is
> just too risky for stability. Linux is better off if people who
> work on a specific arch proactively will "fill the holes".
>
> So the way I see my patch set is "lowest common denominator"
> in both architecture axis and tracing subsystem axist. It should
> not interfere that much with the other work (like bpf).


I understand your point. But is there any consensus among you on how to
deal with this issue? I'm not opposed at all to your patch, and it's
small enough that we can revert it later on if that's not the right
solution anyway.


>
> BR, Jarkko
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv