Tracing with kprobes while running a monolithic kernel is currently
impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
dependency is a result of kprobes code using the module allocator for the
trampoline code.
Detaching kprobes from modules helps to squeeze down the user space,
e.g. when developing new core kernel features, while still having all
the nice tracing capabilities.
For kernel/ and arch/*, move module_alloc() and module_memfree() to
module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
code with CONFIG_MODULES.
As the result, kprobes can be used with a monolithic kernel.
Signed-off-by: Jarkko Sakkinen <[email protected]>
---
Tested with the help of BuildRoot and QEMU:
- arm (function tracer)
- arm64 (function tracer)
- mips (function tracer)
- powerpc (function tracer)
- riscv (function tracer)
- s390 (function tracer)
- sparc (function tracer)
- x86 (function tracer)
- sh (function tracer, for the "pure" kernel/modules_alloc.c path)
---
arch/Kconfig | 1 -
arch/arm/kernel/Makefile | 5 +++
arch/arm/kernel/module.c | 32 ----------------
arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++
arch/arm64/kernel/Makefile | 5 +++
arch/arm64/kernel/module.c | 47 -----------------------
arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++
arch/mips/kernel/Makefile | 5 +++
arch/mips/kernel/module.c | 9 -----
arch/mips/kernel/module_alloc.c | 18 +++++++++
arch/parisc/kernel/Makefile | 5 +++
arch/parisc/kernel/module.c | 11 ------
arch/parisc/kernel/module_alloc.c | 23 +++++++++++
arch/powerpc/kernel/Makefile | 5 +++
arch/powerpc/kernel/module.c | 37 ------------------
arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++
arch/riscv/kernel/Makefile | 5 +++
arch/riscv/kernel/module.c | 10 -----
arch/riscv/kernel/module_alloc.c | 19 ++++++++++
arch/s390/kernel/Makefile | 5 +++
arch/s390/kernel/module.c | 17 ---------
arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++
arch/sparc/kernel/Makefile | 5 +++
arch/sparc/kernel/module.c | 30 ---------------
arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++
arch/x86/kernel/Makefile | 5 +++
arch/x86/kernel/module.c | 50 ------------------------
arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++
kernel/Makefile | 5 +++
kernel/kprobes.c | 10 +++++
kernel/module/main.c | 17 ---------
kernel/module_alloc.c | 26 +++++++++++++
kernel/trace/trace_kprobe.c | 10 ++++-
33 files changed, 434 insertions(+), 262 deletions(-)
create mode 100644 arch/arm/kernel/module_alloc.c
create mode 100644 arch/arm64/kernel/module_alloc.c
create mode 100644 arch/mips/kernel/module_alloc.c
create mode 100644 arch/parisc/kernel/module_alloc.c
create mode 100644 arch/powerpc/kernel/module_alloc.c
create mode 100644 arch/riscv/kernel/module_alloc.c
create mode 100644 arch/s390/kernel/module_alloc.c
create mode 100644 arch/sparc/kernel/module_alloc.c
create mode 100644 arch/x86/kernel/module_alloc.c
create mode 100644 kernel/module_alloc.c
diff --git a/arch/Kconfig b/arch/Kconfig
index fcf9a41a4ef5..e8e3e7998a2e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -39,7 +39,6 @@ config GENERIC_ENTRY
config KPROBES
bool "Kprobes"
- depends on MODULES
depends on HAVE_KPROBES
select KALLSYMS
select TASKS_RCU if PREEMPTION
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 553866751e1a..d2bb954cd54f 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
obj-$(CONFIG_ISA_DMA_API) += dma.o
obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
obj-$(CONFIG_MODULES) += armksyms.o module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_ARM_MODULE_PLTS) += module-plts.o
obj-$(CONFIG_ISA_DMA) += dma-isa.o
obj-$(CONFIG_PCI) += bios32.o isa.o
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index d59c36dc0494..054e799e7091 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -22,38 +22,6 @@
#include <asm/unwind.h>
#include <asm/opcodes.h>
-#ifdef CONFIG_XIP_KERNEL
-/*
- * The XIP kernel text is mapped in the module area for modules and
- * some other stuff to work without any indirect relocations.
- * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
- * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
- */
-#undef MODULES_VADDR
-#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
-#endif
-
-#ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
-{
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
-
- /* Silence the initial allocation */
- if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
- gfp_mask |= __GFP_NOWARN;
-
- p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
- gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
- return p;
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-#endif
-
bool module_init_section(const char *name)
{
return strstarts(name, ".init") ||
diff --git a/arch/arm/kernel/module_alloc.c b/arch/arm/kernel/module_alloc.c
new file mode 100644
index 000000000000..c3cfb2b60575
--- /dev/null
+++ b/arch/arm/kernel/module_alloc.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2002 Russell King.
+ * Modified for nommu by Hyok S. Choi
+ *
+ * Module allocation method suggested by Andi Kleen.
+ */
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_XIP_KERNEL
+/*
+ * The XIP kernel text is mapped in the module area for modules and
+ * some other stuff to work without any indirect relocations.
+ * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
+ * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
+ */
+#undef MODULES_VADDR
+#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
+#endif
+
+#ifdef CONFIG_MMU
+void *module_alloc(unsigned long size)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+ void *p;
+
+ /* Silence the initial allocation */
+ if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
+ gfp_mask |= __GFP_NOWARN;
+
+ p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
+ return p;
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+#endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index fa7981d0d917..d61c02da857e 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -43,6 +43,11 @@ obj-$(CONFIG_COMPAT) += sigreturn32.o
obj-$(CONFIG_KUSER_HELPERS) += kuser32.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index f2d4bb14bfab..d2a979746da0 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,53 +20,6 @@
#include <asm/insn.h>
#include <asm/sections.h>
-void *module_alloc(unsigned long size)
-{
- u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
-
- /* Silence the initial allocation */
- if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
- gfp_mask |= __GFP_NOWARN;
-
- if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
- IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- /* don't exceed the static module region - see below */
- module_alloc_end = MODULES_END;
-
- p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
- module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
- NUMA_NO_NODE, __builtin_return_address(0));
-
- if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
- (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
- (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
- /*
- * KASAN without KASAN_VMALLOC can only deal with module
- * allocations being served from the reserved module region,
- * since the remainder of the vmalloc region is already
- * backed by zero shadow pages, and punching holes into it
- * is non-trivial. Since the module region is not randomized
- * when KASAN is enabled without KASAN_VMALLOC, it is even
- * less likely that the module region gets exhausted, so we
- * can simply omit this fallback in that case.
- */
- p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
- module_alloc_base + SZ_2G, GFP_KERNEL,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-
- if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
- vfree(p);
- return NULL;
- }
-
- /* Memory is intended to be executable, reset the pointer tag. */
- return kasan_reset_tag(p);
-}
-
enum aarch64_reloc_op {
RELOC_OP_NONE,
RELOC_OP_ABS,
diff --git a/arch/arm64/kernel/module_alloc.c b/arch/arm64/kernel/module_alloc.c
new file mode 100644
index 000000000000..0340c8c0b782
--- /dev/null
+++ b/arch/arm64/kernel/module_alloc.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 ARM Limited
+ *
+ * Author: Will Deacon <[email protected]>
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *module_alloc(unsigned long size)
+{
+ u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
+ gfp_t gfp_mask = GFP_KERNEL;
+ void *p;
+
+ /* Silence the initial allocation */
+ if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
+ gfp_mask |= __GFP_NOWARN;
+
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ /* don't exceed the static module region - see below */
+ module_alloc_end = MODULES_END;
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+ module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
+ if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+ (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
+ /*
+ * KASAN without KASAN_VMALLOC can only deal with module
+ * allocations being served from the reserved module region,
+ * since the remainder of the vmalloc region is already
+ * backed by zero shadow pages, and punching holes into it
+ * is non-trivial. Since the module region is not randomized
+ * when KASAN is enabled without KASAN_VMALLOC, it is even
+ * less likely that the module region gets exhausted, so we
+ * can simply omit this fallback in that case.
+ */
+ p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+ module_alloc_base + SZ_2G, GFP_KERNEL,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ /* Memory is intended to be executable, reset the pointer tag. */
+ return kasan_reset_tag(p);
+}
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index 7c96282bff2e..cb9297b613a1 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -38,6 +38,11 @@ obj-$(CONFIG_SYNC_R4K) += sync-r4k.o
obj-$(CONFIG_DEBUG_FS) += segment.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 14f46d17500a..214b2d1868a5 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -31,15 +31,6 @@ struct mips_hi16 {
static LIST_HEAD(dbe_list);
static DEFINE_SPINLOCK(dbe_lock);
-#ifdef MODULE_START
-void *module_alloc(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
- GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-#endif
-
static void apply_r_mips_32(u32 *location, u32 base, Elf_Addr v)
{
*location = base + v;
diff --git a/arch/mips/kernel/module_alloc.c b/arch/mips/kernel/module_alloc.c
new file mode 100644
index 000000000000..582ab5ed8916
--- /dev/null
+++ b/arch/mips/kernel/module_alloc.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2001 Rusty Russell.
+ * Copyright (C) 2003, 2004 Ralf Baechle ([email protected])
+ * Copyright (C) 2005 Thiemo Seufer
+ */
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#ifdef MODULE_START
+void *module_alloc(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+ GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+#endif
diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
index d0bfac89a842..cd38083ae077 100644
--- a/arch/parisc/kernel/Makefile
+++ b/arch/parisc/kernel/Makefile
@@ -25,6 +25,11 @@ obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PA11) += pci-dma.o
obj-$(CONFIG_PCI) += pci.o
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_64BIT) += sys_parisc32.o signal32.o
obj-$(CONFIG_STACKTRACE)+= stacktrace.o
obj-$(CONFIG_AUDIT) += audit.o
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index 7df140545b22..c81e63e2549b 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -192,17 +192,6 @@ static inline int reassemble_22(int as22)
((as22 & 0x0003ff) << 3));
}
-void *module_alloc(unsigned long size)
-{
- /* using RWX means less protection for modules, but it's
- * easier than trying to map the text, data, init_text and
- * init_data correctly */
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL,
- PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-
#ifndef CONFIG_64BIT
static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n)
{
diff --git a/arch/parisc/kernel/module_alloc.c b/arch/parisc/kernel/module_alloc.c
new file mode 100644
index 000000000000..4e4e4ce1132e
--- /dev/null
+++ b/arch/parisc/kernel/module_alloc.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Linux/PA-RISC Project
+ * Copyright (C) 2003 Randolph Chung <tausq at debian . org>
+ * Copyright (C) 2008 Helge Deller <[email protected]>
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *module_alloc(unsigned long size)
+{
+ /*
+ * Using RWX means less protection for modules, but it's
+ * easier than trying to map the text, data, init_text and
+ * init_data correctly.
+ */
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL,
+ PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2e2a2a9bcf43..5a811cdf230b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o
endif
obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
obj-$(CONFIG_MODULES) += module.o module_$(BITS).o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_44x) += cpu_setup_44x.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o
obj-$(CONFIG_PPC_DOORBELL) += dbell.o
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index f6d6ae0a1692..b30e00964a60 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr,
return 0;
}
-
-static __always_inline void *
-__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
-{
- pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
- gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
-
- /*
- * Don't do huge page allocations for modules yet until more testing
- * is done. STRICT_MODULE_RWX may require extra work to support this
- * too.
- */
- return __vmalloc_node_range(size, 1, start, end, gfp, prot,
- VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
-void *module_alloc(unsigned long size)
-{
-#ifdef MODULES_VADDR
- unsigned long limit = (unsigned long)_etext - SZ_32M;
- void *ptr = NULL;
-
- BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
-
- /* First try within 32M limit from _etext to avoid branch trampolines */
- if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
- ptr = __module_alloc(size, limit, MODULES_END, true);
-
- if (!ptr)
- ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
-
- return ptr;
-#else
- return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
-#endif
-}
diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c
new file mode 100644
index 000000000000..48541c27ce46
--- /dev/null
+++ b/arch/powerpc/kernel/module_alloc.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel module help for powerpc.
+ * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
+ * Copyright (C) 2008 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+static __always_inline void *
+__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
+{
+ pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
+ gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
+
+ /*
+ * Don't do huge page allocations for modules yet until more testing
+ * is done. STRICT_MODULE_RWX may require extra work to support this
+ * too.
+ */
+ return __vmalloc_node_range(size, 1, start, end, gfp, prot,
+ VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+
+void *module_alloc(unsigned long size)
+{
+#ifdef MODULES_VADDR
+ unsigned long limit = (unsigned long)_etext - SZ_32M;
+ void *ptr = NULL;
+
+ BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+
+ /* First try within 32M limit from _etext to avoid branch trampolines */
+ if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
+ ptr = __module_alloc(size, limit, MODULES_END, true);
+
+ if (!ptr)
+ ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
+
+ return ptr;
+#else
+ return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
+#endif
+}
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index c71d6591d539..7af346ae575e 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -61,6 +61,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o
obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 91fe16bfaa07..1621d5e458f7 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -419,16 +419,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
return 0;
}
-#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-void *module_alloc(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, MODULES_VADDR,
- MODULES_END, GFP_KERNEL,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-#endif
-
static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
const char *name)
diff --git a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c
new file mode 100644
index 000000000000..ed87ed048713
--- /dev/null
+++ b/arch/riscv/kernel/module_alloc.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Zihao Yu
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+#include <asm/sections.h>
+
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+void *module_alloc(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, MODULES_VADDR,
+ MODULES_END, GFP_KERNEL,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+#endif
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 27d6b3c7aa06..71e73e34b441 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -48,6 +48,11 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o
CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_AUDIT) += audit.o
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 26125a9c436d..e15f09fa50e2 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -35,23 +35,6 @@
#define PLT_ENTRY_SIZE 22
-void *module_alloc(unsigned long size)
-{
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
-
- if (PAGE_ALIGN(size) > MODULES_LEN)
- return NULL;
- p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
- gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
- vfree(p);
- return NULL;
- }
- return p;
-}
-
#ifdef CONFIG_FUNCTION_TRACER
void module_arch_cleanup(struct module *mod)
{
diff --git a/arch/s390/kernel/module_alloc.c b/arch/s390/kernel/module_alloc.c
new file mode 100644
index 000000000000..e4c4175fb759
--- /dev/null
+++ b/arch/s390/kernel/module_alloc.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Kernel module help for s390.
+ *
+ * S390 version
+ * Copyright IBM Corp. 2002, 2003
+ * Author(s): Arnd Bergmann ([email protected])
+ * Martin Schwidefsky ([email protected])
+ *
+ * based on i386 version
+ * Copyright (C) 2001 Rusty Russell.
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *module_alloc(unsigned long size)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+ void *p;
+
+ if (PAGE_ALIGN(size) > MODULES_LEN)
+ return NULL;
+ p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
+ gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+ return p;
+}
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index d3a0e072ebe8..53905c00a094 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -89,6 +89,11 @@ obj-$(CONFIG_SUN_PM) += apc.o pmc.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULES) += sparc_ksyms.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_SPARC_LED) += led.o
obj-$(CONFIG_KGDB) += kgdb_$(BITS).o
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index df39580f398d..f2babc69f189 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -21,36 +21,6 @@
#include "entry.h"
-#ifdef CONFIG_SPARC64
-
-#include <linux/jump_label.h>
-
-static void *module_map(unsigned long size)
-{
- if (PAGE_ALIGN(size) > MODULES_LEN)
- return NULL;
- return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
- GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
-}
-#else
-static void *module_map(unsigned long size)
-{
- return vmalloc(size);
-}
-#endif /* CONFIG_SPARC64 */
-
-void *module_alloc(unsigned long size)
-{
- void *ret;
-
- ret = module_map(size);
- if (ret)
- memset(ret, 0, size);
-
- return ret;
-}
-
/* Make generic code ignore STT_REGISTER dummy undefined symbols. */
int module_frob_arch_sections(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
diff --git a/arch/sparc/kernel/module_alloc.c b/arch/sparc/kernel/module_alloc.c
new file mode 100644
index 000000000000..4bacf8cff012
--- /dev/null
+++ b/arch/sparc/kernel/module_alloc.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Kernel module help for sparc64.
+ *
+ * Copyright (C) 2001 Rusty Russell.
+ * Copyright (C) 2002 David S. Miller.
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_SPARC64
+#include <linux/jump_label.h>
+
+static void *module_map(unsigned long size)
+{
+ if (PAGE_ALIGN(size) > MODULES_LEN)
+ return NULL;
+ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+#else
+static void *module_map(unsigned long size)
+{
+ return vmalloc(size);
+}
+#endif /* CONFIG_SPARC64 */
+
+void *module_alloc(unsigned long size)
+{
+ void *ret;
+
+ ret = module_map(size);
+ if (ret)
+ memset(ret, 0, size);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 03364dc40d8d..1f0304140f44 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -108,6 +108,11 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_X86_32) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b98ffcf4d250..40bb10b5a68d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -36,56 +36,6 @@ do { \
} while (0)
#endif
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
-
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
- if (kaslr_enabled()) {
- mutex_lock(&module_kaslr_mutex);
- /*
- * Calculate the module_load_offset the first time this
- * code is called. Once calculated it stays the same until
- * reboot.
- */
- if (module_load_offset == 0)
- module_load_offset =
- (get_random_int() % 1024 + 1) * PAGE_SIZE;
- mutex_unlock(&module_kaslr_mutex);
- }
- return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
-{
- return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
-{
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
-
- if (PAGE_ALIGN(size) > MODULES_LEN)
- return NULL;
-
- p = __vmalloc_node_range(size, MODULE_ALIGN,
- MODULES_VADDR + get_module_load_offset(),
- MODULES_END, gfp_mask,
- PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
- vfree(p);
- return NULL;
- }
-
- return p;
-}
-
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
diff --git a/arch/x86/kernel/module_alloc.c b/arch/x86/kernel/module_alloc.c
new file mode 100644
index 000000000000..a7df2af1467a
--- /dev/null
+++ b/arch/x86/kernel/module_alloc.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel module help for x86.
+ * Copyright (C) 2001 Rusty Russell.
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_RANDOMIZE_BASE
+static unsigned long module_load_offset;
+
+/* Mutex protects the module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+static unsigned long get_module_load_offset(void)
+{
+ if (kaslr_enabled()) {
+ mutex_lock(&module_kaslr_mutex);
+ /*
+ * Calculate the module_load_offset the first time this
+ * code is called. Once calculated it stays the same until
+ * reboot.
+ */
+ if (module_load_offset == 0)
+ module_load_offset =
+ (get_random_int() % 1024 + 1) * PAGE_SIZE;
+ mutex_unlock(&module_kaslr_mutex);
+ }
+ return module_load_offset;
+}
+#else
+static unsigned long get_module_load_offset(void)
+{
+ return 0;
+}
+#endif
+
+void *module_alloc(unsigned long size)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+ void *p;
+
+ if (PAGE_ALIGN(size) > MODULES_LEN)
+ return NULL;
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, gfp_mask,
+ PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
+}
diff --git a/kernel/Makefile b/kernel/Makefile
index 318789c728d3..2981fe42060d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -53,6 +53,11 @@ obj-y += livepatch/
obj-y += dma/
obj-y += entry/
obj-$(CONFIG_MODULES) += module/
+ifeq ($(CONFIG_MODULES),y)
+obj-y += module_alloc.o
+else
+obj-$(CONFIG_KPROBES) += module_alloc.o
+endif
obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f214f8c088ed..3f9876374cd3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
+#ifdef CONFIG_MODULES
/* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
@@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
ret = -ENOENT;
}
}
+#endif
+
out:
preempt_enable();
jump_label_unlock();
@@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
return 0;
}
+#ifdef CONFIG_MODULES
/* Remove all symbols in given area from kprobe blacklist */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
@@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
kprobe_remove_area_blacklist(entry, entry + 1);
}
+#endif /* CONFIG_MODULES */
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
char *type, char *sym)
@@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret ? : arch_populate_kprobe_blacklist();
}
+#ifdef CONFIG_MODULES
static void add_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
@@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
.priority = 0
};
+#endif /* CONFIG_MODULES */
void kprobe_free_init_mem(void)
{
@@ -2717,8 +2724,11 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
+
+#ifdef CONFIG_MODULES
if (!err)
err = register_module_notifier(&kprobe_module_nb);
+#endif
kprobes_initialized = (err == 0);
kprobe_sysctls_init();
diff --git a/kernel/module/main.c b/kernel/module/main.c
index fed58d30725d..7fa182b78550 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod,
return ksym;
}
-void __weak module_memfree(void *module_region)
-{
- /*
- * This memory may be RO, and freeing RO memory in an interrupt is not
- * supported by vmalloc.
- */
- WARN_ON(in_interrupt());
- vfree(module_region);
-}
-
void __weak module_arch_cleanup(struct module *mod)
{
}
@@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
ddebug_remove_module(mod->name);
}
-void * __weak module_alloc(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
bool __weak module_init_section(const char *name)
{
return strstarts(name, ".init");
diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c
new file mode 100644
index 000000000000..26a4c60998ad
--- /dev/null
+++ b/kernel/module_alloc.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ */
+
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void * __weak module_alloc(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+
+void __weak module_memfree(void *module_region)
+{
+ /*
+ * This memory may be RO, and freeing RO memory in an interrupt is not
+ * supported by vmalloc.
+ */
+ WARN_ON(in_interrupt());
+ vfree(module_region);
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 93507330462c..050b2975332e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
return kprobe_gone(&tk->rp.kp);
}
+#ifdef CONFIG_MODULES
static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
struct module *mod)
{
@@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}
+#endif /* CONFIG_MODULES */
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
+ bool ret = false;
+#ifdef CONFIG_MODULES
char *p;
- bool ret;
if (!tk->symbol)
return false;
@@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
ret = !!find_module(tk->symbol);
rcu_read_unlock_sched();
*p = ':';
+#endif /* CONFIG_MODULES */
return ret;
}
@@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
return ret;
}
+#ifdef CONFIG_MODULES
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
.priority = 1 /* Invoked after kprobe module callback */
};
+#endif /* CONFIG_MODULES */
static int __trace_kprobe_create(int argc, const char *argv[])
{
@@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;
+#ifdef CONFIG_MODULES
if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
+#endif /* CONFIG_MODULES */
return 0;
}
--
2.36.1
Hello Jarkko,
On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <[email protected]> wrote:
>
> Tracing with kprobes while running a monolithic kernel is currently
> impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> dependency is a result of kprobes code using the module allocator for the
> trampoline code.
>
> Detaching kprobes from modules helps to squeeze down the user space,
> e.g. when developing new core kernel features, while still having all
> the nice tracing capabilities.
>
> For kernel/ and arch/*, move module_alloc() and module_memfree() to
> module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> code with CONFIG_MODULES.
>
> As the result, kprobes can be used with a monolithic kernel.
I think I may have mentioned this the previous time as well, but I
don't think this is the right approach.
Kprobes uses alloc_insn_page() to allocate executable memory, but the
requirements for this memory are radically different compared to
loadable modules, which need to be within an arch-specific distance of
the core kernel, need KASAN backing etc etc.
This is why arm64, for instance, does not implement alloc_insn_page()
in terms of module_alloc() [and likely does not belong in this patch
for that reason]
Is there any reason kprobes cannot simply use vmalloc()?
>
> Signed-off-by: Jarkko Sakkinen <[email protected]>
> ---
> Tested with the help of BuildRoot and QEMU:
> - arm (function tracer)
> - arm64 (function tracer)
> - mips (function tracer)
> - powerpc (function tracer)
> - riscv (function tracer)
> - s390 (function tracer)
> - sparc (function tracer)
> - x86 (function tracer)
> - sh (function tracer, for the "pure" kernel/modules_alloc.c path)
> ---
> arch/Kconfig | 1 -
> arch/arm/kernel/Makefile | 5 +++
> arch/arm/kernel/module.c | 32 ----------------
> arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++
> arch/arm64/kernel/Makefile | 5 +++
> arch/arm64/kernel/module.c | 47 -----------------------
> arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++
> arch/mips/kernel/Makefile | 5 +++
> arch/mips/kernel/module.c | 9 -----
> arch/mips/kernel/module_alloc.c | 18 +++++++++
> arch/parisc/kernel/Makefile | 5 +++
> arch/parisc/kernel/module.c | 11 ------
> arch/parisc/kernel/module_alloc.c | 23 +++++++++++
> arch/powerpc/kernel/Makefile | 5 +++
> arch/powerpc/kernel/module.c | 37 ------------------
> arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++
> arch/riscv/kernel/Makefile | 5 +++
> arch/riscv/kernel/module.c | 10 -----
> arch/riscv/kernel/module_alloc.c | 19 ++++++++++
> arch/s390/kernel/Makefile | 5 +++
> arch/s390/kernel/module.c | 17 ---------
> arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++
> arch/sparc/kernel/Makefile | 5 +++
> arch/sparc/kernel/module.c | 30 ---------------
> arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++
> arch/x86/kernel/Makefile | 5 +++
> arch/x86/kernel/module.c | 50 ------------------------
> arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++
> kernel/Makefile | 5 +++
> kernel/kprobes.c | 10 +++++
> kernel/module/main.c | 17 ---------
> kernel/module_alloc.c | 26 +++++++++++++
> kernel/trace/trace_kprobe.c | 10 ++++-
> 33 files changed, 434 insertions(+), 262 deletions(-)
> create mode 100644 arch/arm/kernel/module_alloc.c
> create mode 100644 arch/arm64/kernel/module_alloc.c
> create mode 100644 arch/mips/kernel/module_alloc.c
> create mode 100644 arch/parisc/kernel/module_alloc.c
> create mode 100644 arch/powerpc/kernel/module_alloc.c
> create mode 100644 arch/riscv/kernel/module_alloc.c
> create mode 100644 arch/s390/kernel/module_alloc.c
> create mode 100644 arch/sparc/kernel/module_alloc.c
> create mode 100644 arch/x86/kernel/module_alloc.c
> create mode 100644 kernel/module_alloc.c
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index fcf9a41a4ef5..e8e3e7998a2e 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -39,7 +39,6 @@ config GENERIC_ENTRY
>
> config KPROBES
> bool "Kprobes"
> - depends on MODULES
> depends on HAVE_KPROBES
> select KALLSYMS
> select TASKS_RCU if PREEMPTION
> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> index 553866751e1a..d2bb954cd54f 100644
> --- a/arch/arm/kernel/Makefile
> +++ b/arch/arm/kernel/Makefile
> @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> obj-$(CONFIG_ISA_DMA_API) += dma.o
> obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
> obj-$(CONFIG_MODULES) += armksyms.o module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_ARM_MODULE_PLTS) += module-plts.o
> obj-$(CONFIG_ISA_DMA) += dma-isa.o
> obj-$(CONFIG_PCI) += bios32.o isa.o
> diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
> index d59c36dc0494..054e799e7091 100644
> --- a/arch/arm/kernel/module.c
> +++ b/arch/arm/kernel/module.c
> @@ -22,38 +22,6 @@
> #include <asm/unwind.h>
> #include <asm/opcodes.h>
>
> -#ifdef CONFIG_XIP_KERNEL
> -/*
> - * The XIP kernel text is mapped in the module area for modules and
> - * some other stuff to work without any indirect relocations.
> - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
> - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
> - */
> -#undef MODULES_VADDR
> -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
> -#endif
> -
> -#ifdef CONFIG_MMU
> -void *module_alloc(unsigned long size)
> -{
> - gfp_t gfp_mask = GFP_KERNEL;
> - void *p;
> -
> - /* Silence the initial allocation */
> - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
> - gfp_mask |= __GFP_NOWARN;
> -
> - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
> - return p;
> - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> -}
> -#endif
> -
> bool module_init_section(const char *name)
> {
> return strstarts(name, ".init") ||
> diff --git a/arch/arm/kernel/module_alloc.c b/arch/arm/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..c3cfb2b60575
> --- /dev/null
> +++ b/arch/arm/kernel/module_alloc.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2002 Russell King.
> + * Modified for nommu by Hyok S. Choi
> + *
> + * Module allocation method suggested by Andi Kleen.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef CONFIG_XIP_KERNEL
> +/*
> + * The XIP kernel text is mapped in the module area for modules and
> + * some other stuff to work without any indirect relocations.
> + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
> + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
> + */
> +#undef MODULES_VADDR
> +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
> +#endif
> +
> +#ifdef CONFIG_MMU
> +void *module_alloc(unsigned long size)
> +{
> + gfp_t gfp_mask = GFP_KERNEL;
> + void *p;
> +
> + /* Silence the initial allocation */
> + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
> + gfp_mask |= __GFP_NOWARN;
> +
> + p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> + gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> + if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
> + return p;
> + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index fa7981d0d917..d61c02da857e 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -43,6 +43,11 @@ obj-$(CONFIG_COMPAT) += sigreturn32.o
> obj-$(CONFIG_KUSER_HELPERS) += kuser32.o
> obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
> obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
> obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
> diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> index f2d4bb14bfab..d2a979746da0 100644
> --- a/arch/arm64/kernel/module.c
> +++ b/arch/arm64/kernel/module.c
> @@ -20,53 +20,6 @@
> #include <asm/insn.h>
> #include <asm/sections.h>
>
> -void *module_alloc(unsigned long size)
> -{
> - u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> - gfp_t gfp_mask = GFP_KERNEL;
> - void *p;
> -
> - /* Silence the initial allocation */
> - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> - gfp_mask |= __GFP_NOWARN;
> -
> - if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
> - IS_ENABLED(CONFIG_KASAN_SW_TAGS))
> - /* don't exceed the static module region - see below */
> - module_alloc_end = MODULES_END;
> -
> - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> - module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
> - NUMA_NO_NODE, __builtin_return_address(0));
> -
> - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> - (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
> - (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
> - !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
> - /*
> - * KASAN without KASAN_VMALLOC can only deal with module
> - * allocations being served from the reserved module region,
> - * since the remainder of the vmalloc region is already
> - * backed by zero shadow pages, and punching holes into it
> - * is non-trivial. Since the module region is not randomized
> - * when KASAN is enabled without KASAN_VMALLOC, it is even
> - * less likely that the module region gets exhausted, so we
> - * can simply omit this fallback in that case.
> - */
> - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> - module_alloc_base + SZ_2G, GFP_KERNEL,
> - PAGE_KERNEL, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> -
> - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> - vfree(p);
> - return NULL;
> - }
> -
> - /* Memory is intended to be executable, reset the pointer tag. */
> - return kasan_reset_tag(p);
> -}
> -
> enum aarch64_reloc_op {
> RELOC_OP_NONE,
> RELOC_OP_ABS,
> diff --git a/arch/arm64/kernel/module_alloc.c b/arch/arm64/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..0340c8c0b782
> --- /dev/null
> +++ b/arch/arm64/kernel/module_alloc.c
> @@ -0,0 +1,57 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2012 ARM Limited
> + *
> + * Author: Will Deacon <[email protected]>
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *module_alloc(unsigned long size)
> +{
> + u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> + gfp_t gfp_mask = GFP_KERNEL;
> + void *p;
> +
> + /* Silence the initial allocation */
> + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> + gfp_mask |= __GFP_NOWARN;
> +
> + if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
> + IS_ENABLED(CONFIG_KASAN_SW_TAGS))
> + /* don't exceed the static module region - see below */
> + module_alloc_end = MODULES_END;
> +
> + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
> + NUMA_NO_NODE, __builtin_return_address(0));
> +
> + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> + (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
> + (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
> + !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
> + /*
> + * KASAN without KASAN_VMALLOC can only deal with module
> + * allocations being served from the reserved module region,
> + * since the remainder of the vmalloc region is already
> + * backed by zero shadow pages, and punching holes into it
> + * is non-trivial. Since the module region is not randomized
> + * when KASAN is enabled without KASAN_VMALLOC, it is even
> + * less likely that the module region gets exhausted, so we
> + * can simply omit this fallback in that case.
> + */
> + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> + module_alloc_base + SZ_2G, GFP_KERNEL,
> + PAGE_KERNEL, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +
> + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> + vfree(p);
> + return NULL;
> + }
> +
> + /* Memory is intended to be executable, reset the pointer tag. */
> + return kasan_reset_tag(p);
> +}
> diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
> index 7c96282bff2e..cb9297b613a1 100644
> --- a/arch/mips/kernel/Makefile
> +++ b/arch/mips/kernel/Makefile
> @@ -38,6 +38,11 @@ obj-$(CONFIG_SYNC_R4K) += sync-r4k.o
> obj-$(CONFIG_DEBUG_FS) += segment.o
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
>
> obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
> obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o
> diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
> index 14f46d17500a..214b2d1868a5 100644
> --- a/arch/mips/kernel/module.c
> +++ b/arch/mips/kernel/module.c
> @@ -31,15 +31,6 @@ struct mips_hi16 {
> static LIST_HEAD(dbe_list);
> static DEFINE_SPINLOCK(dbe_lock);
>
> -#ifdef MODULE_START
> -void *module_alloc(unsigned long size)
> -{
> - return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
> - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> -}
> -#endif
> -
> static void apply_r_mips_32(u32 *location, u32 base, Elf_Addr v)
> {
> *location = base + v;
> diff --git a/arch/mips/kernel/module_alloc.c b/arch/mips/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..582ab5ed8916
> --- /dev/null
> +++ b/arch/mips/kernel/module_alloc.c
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2001 Rusty Russell.
> + * Copyright (C) 2003, 2004 Ralf Baechle ([email protected])
> + * Copyright (C) 2005 Thiemo Seufer
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef MODULE_START
> +void *module_alloc(unsigned long size)
> +{
> + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
> + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
> index d0bfac89a842..cd38083ae077 100644
> --- a/arch/parisc/kernel/Makefile
> +++ b/arch/parisc/kernel/Makefile
> @@ -25,6 +25,11 @@ obj-$(CONFIG_SMP) += smp.o
> obj-$(CONFIG_PA11) += pci-dma.o
> obj-$(CONFIG_PCI) += pci.o
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_64BIT) += sys_parisc32.o signal32.o
> obj-$(CONFIG_STACKTRACE)+= stacktrace.o
> obj-$(CONFIG_AUDIT) += audit.o
> diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
> index 7df140545b22..c81e63e2549b 100644
> --- a/arch/parisc/kernel/module.c
> +++ b/arch/parisc/kernel/module.c
> @@ -192,17 +192,6 @@ static inline int reassemble_22(int as22)
> ((as22 & 0x0003ff) << 3));
> }
>
> -void *module_alloc(unsigned long size)
> -{
> - /* using RWX means less protection for modules, but it's
> - * easier than trying to map the text, data, init_text and
> - * init_data correctly */
> - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> - GFP_KERNEL,
> - PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> -}
> -
> #ifndef CONFIG_64BIT
> static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n)
> {
> diff --git a/arch/parisc/kernel/module_alloc.c b/arch/parisc/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..4e4e4ce1132e
> --- /dev/null
> +++ b/arch/parisc/kernel/module_alloc.c
> @@ -0,0 +1,23 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Linux/PA-RISC Project
> + * Copyright (C) 2003 Randolph Chung <tausq at debian . org>
> + * Copyright (C) 2008 Helge Deller <[email protected]>
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *module_alloc(unsigned long size)
> +{
> + /*
> + * Using RWX means less protection for modules, but it's
> + * easier than trying to map the text, data, init_text and
> + * init_data correctly.
> + */
> + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> + GFP_KERNEL,
> + PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 2e2a2a9bcf43..5a811cdf230b 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o
> endif
> obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
> obj-$(CONFIG_MODULES) += module.o module_$(BITS).o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_44x) += cpu_setup_44x.o
> obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o
> obj-$(CONFIG_PPC_DOORBELL) += dbell.o
> diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
> index f6d6ae0a1692..b30e00964a60 100644
> --- a/arch/powerpc/kernel/module.c
> +++ b/arch/powerpc/kernel/module.c
> @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr,
>
> return 0;
> }
> -
> -static __always_inline void *
> -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
> -{
> - pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
> - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
> -
> - /*
> - * Don't do huge page allocations for modules yet until more testing
> - * is done. STRICT_MODULE_RWX may require extra work to support this
> - * too.
> - */
> - return __vmalloc_node_range(size, 1, start, end, gfp, prot,
> - VM_FLUSH_RESET_PERMS,
> - NUMA_NO_NODE, __builtin_return_address(0));
> -}
> -
> -void *module_alloc(unsigned long size)
> -{
> -#ifdef MODULES_VADDR
> - unsigned long limit = (unsigned long)_etext - SZ_32M;
> - void *ptr = NULL;
> -
> - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> -
> - /* First try within 32M limit from _etext to avoid branch trampolines */
> - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> - ptr = __module_alloc(size, limit, MODULES_END, true);
> -
> - if (!ptr)
> - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
> -
> - return ptr;
> -#else
> - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
> -#endif
> -}
> diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..48541c27ce46
> --- /dev/null
> +++ b/arch/powerpc/kernel/module_alloc.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Kernel module help for powerpc.
> + * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
> + * Copyright (C) 2008 Freescale Semiconductor, Inc.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +static __always_inline void *
> +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
> +{
> + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
> + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
> +
> + /*
> + * Don't do huge page allocations for modules yet until more testing
> + * is done. STRICT_MODULE_RWX may require extra work to support this
> + * too.
> + */
> + return __vmalloc_node_range(size, 1, start, end, gfp, prot,
> + VM_FLUSH_RESET_PERMS,
> + NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +
> +void *module_alloc(unsigned long size)
> +{
> +#ifdef MODULES_VADDR
> + unsigned long limit = (unsigned long)_etext - SZ_32M;
> + void *ptr = NULL;
> +
> + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> +
> + /* First try within 32M limit from _etext to avoid branch trampolines */
> + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> + ptr = __module_alloc(size, limit, MODULES_END, true);
> +
> + if (!ptr)
> + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
> +
> + return ptr;
> +#else
> + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
> +#endif
> +}
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index c71d6591d539..7af346ae575e 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -61,6 +61,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o
>
> obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
>
> obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o
> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> index 91fe16bfaa07..1621d5e458f7 100644
> --- a/arch/riscv/kernel/module.c
> +++ b/arch/riscv/kernel/module.c
> @@ -419,16 +419,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
> return 0;
> }
>
> -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> -void *module_alloc(unsigned long size)
> -{
> - return __vmalloc_node_range(size, 1, MODULES_VADDR,
> - MODULES_END, GFP_KERNEL,
> - PAGE_KERNEL, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> -}
> -#endif
> -
> static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
> const Elf_Shdr *sechdrs,
> const char *name)
> diff --git a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..ed87ed048713
> --- /dev/null
> +++ b/arch/riscv/kernel/module_alloc.c
> @@ -0,0 +1,19 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2017 Zihao Yu
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +#include <asm/sections.h>
> +
> +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> +void *module_alloc(unsigned long size)
> +{
> + return __vmalloc_node_range(size, 1, MODULES_VADDR,
> + MODULES_END, GFP_KERNEL,
> + PAGE_KERNEL, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
> index 27d6b3c7aa06..71e73e34b441 100644
> --- a/arch/s390/kernel/Makefile
> +++ b/arch/s390/kernel/Makefile
> @@ -48,6 +48,11 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o
> CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
>
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
> obj-$(CONFIG_NUMA) += numa.o
> obj-$(CONFIG_AUDIT) += audit.o
> diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
> index 26125a9c436d..e15f09fa50e2 100644
> --- a/arch/s390/kernel/module.c
> +++ b/arch/s390/kernel/module.c
> @@ -35,23 +35,6 @@
>
> #define PLT_ENTRY_SIZE 22
>
> -void *module_alloc(unsigned long size)
> -{
> - gfp_t gfp_mask = GFP_KERNEL;
> - void *p;
> -
> - if (PAGE_ALIGN(size) > MODULES_LEN)
> - return NULL;
> - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
> - gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
> - __builtin_return_address(0));
> - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> - vfree(p);
> - return NULL;
> - }
> - return p;
> -}
> -
> #ifdef CONFIG_FUNCTION_TRACER
> void module_arch_cleanup(struct module *mod)
> {
> diff --git a/arch/s390/kernel/module_alloc.c b/arch/s390/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..e4c4175fb759
> --- /dev/null
> +++ b/arch/s390/kernel/module_alloc.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Kernel module help for s390.
> + *
> + * S390 version
> + * Copyright IBM Corp. 2002, 2003
> + * Author(s): Arnd Bergmann ([email protected])
> + * Martin Schwidefsky ([email protected])
> + *
> + * based on i386 version
> + * Copyright (C) 2001 Rusty Russell.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *module_alloc(unsigned long size)
> +{
> + gfp_t gfp_mask = GFP_KERNEL;
> + void *p;
> +
> + if (PAGE_ALIGN(size) > MODULES_LEN)
> + return NULL;
> + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
> + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
> + __builtin_return_address(0));
> + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> + vfree(p);
> + return NULL;
> + }
> + return p;
> +}
> diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
> index d3a0e072ebe8..53905c00a094 100644
> --- a/arch/sparc/kernel/Makefile
> +++ b/arch/sparc/kernel/Makefile
> @@ -89,6 +89,11 @@ obj-$(CONFIG_SUN_PM) += apc.o pmc.o
>
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULES) += sparc_ksyms.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_SPARC_LED) += led.o
> obj-$(CONFIG_KGDB) += kgdb_$(BITS).o
>
> diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
> index df39580f398d..f2babc69f189 100644
> --- a/arch/sparc/kernel/module.c
> +++ b/arch/sparc/kernel/module.c
> @@ -21,36 +21,6 @@
>
> #include "entry.h"
>
> -#ifdef CONFIG_SPARC64
> -
> -#include <linux/jump_label.h>
> -
> -static void *module_map(unsigned long size)
> -{
> - if (PAGE_ALIGN(size) > MODULES_LEN)
> - return NULL;
> - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> - __builtin_return_address(0));
> -}
> -#else
> -static void *module_map(unsigned long size)
> -{
> - return vmalloc(size);
> -}
> -#endif /* CONFIG_SPARC64 */
> -
> -void *module_alloc(unsigned long size)
> -{
> - void *ret;
> -
> - ret = module_map(size);
> - if (ret)
> - memset(ret, 0, size);
> -
> - return ret;
> -}
> -
> /* Make generic code ignore STT_REGISTER dummy undefined symbols. */
> int module_frob_arch_sections(Elf_Ehdr *hdr,
> Elf_Shdr *sechdrs,
> diff --git a/arch/sparc/kernel/module_alloc.c b/arch/sparc/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..4bacf8cff012
> --- /dev/null
> +++ b/arch/sparc/kernel/module_alloc.c
> @@ -0,0 +1,39 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Kernel module help for sparc64.
> + *
> + * Copyright (C) 2001 Rusty Russell.
> + * Copyright (C) 2002 David S. Miller.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef CONFIG_SPARC64
> +#include <linux/jump_label.h>
> +
> +static void *module_map(unsigned long size)
> +{
> + if (PAGE_ALIGN(size) > MODULES_LEN)
> + return NULL;
> + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> + __builtin_return_address(0));
> +}
> +#else
> +static void *module_map(unsigned long size)
> +{
> + return vmalloc(size);
> +}
> +#endif /* CONFIG_SPARC64 */
> +
> +void *module_alloc(unsigned long size)
> +{
> + void *ret;
> +
> + ret = module_map(size);
> + if (ret)
> + memset(ret, 0, size);
> +
> + return ret;
> +}
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 03364dc40d8d..1f0304140f44 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -108,6 +108,11 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
> obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
> obj-y += kprobes/
> obj-$(CONFIG_MODULES) += module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
> obj-$(CONFIG_X86_32) += doublefault_32.o
> obj-$(CONFIG_KGDB) += kgdb.o
> obj-$(CONFIG_VM86) += vm86_32.o
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index b98ffcf4d250..40bb10b5a68d 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -36,56 +36,6 @@ do { \
> } while (0)
> #endif
>
> -#ifdef CONFIG_RANDOMIZE_BASE
> -static unsigned long module_load_offset;
> -
> -/* Mutex protects the module_load_offset. */
> -static DEFINE_MUTEX(module_kaslr_mutex);
> -
> -static unsigned long int get_module_load_offset(void)
> -{
> - if (kaslr_enabled()) {
> - mutex_lock(&module_kaslr_mutex);
> - /*
> - * Calculate the module_load_offset the first time this
> - * code is called. Once calculated it stays the same until
> - * reboot.
> - */
> - if (module_load_offset == 0)
> - module_load_offset =
> - (get_random_int() % 1024 + 1) * PAGE_SIZE;
> - mutex_unlock(&module_kaslr_mutex);
> - }
> - return module_load_offset;
> -}
> -#else
> -static unsigned long int get_module_load_offset(void)
> -{
> - return 0;
> -}
> -#endif
> -
> -void *module_alloc(unsigned long size)
> -{
> - gfp_t gfp_mask = GFP_KERNEL;
> - void *p;
> -
> - if (PAGE_ALIGN(size) > MODULES_LEN)
> - return NULL;
> -
> - p = __vmalloc_node_range(size, MODULE_ALIGN,
> - MODULES_VADDR + get_module_load_offset(),
> - MODULES_END, gfp_mask,
> - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
> - __builtin_return_address(0));
> - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> - vfree(p);
> - return NULL;
> - }
> -
> - return p;
> -}
> -
> #ifdef CONFIG_X86_32
> int apply_relocate(Elf32_Shdr *sechdrs,
> const char *strtab,
> diff --git a/arch/x86/kernel/module_alloc.c b/arch/x86/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..a7df2af1467a
> --- /dev/null
> +++ b/arch/x86/kernel/module_alloc.c
> @@ -0,0 +1,61 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Kernel module help for x86.
> + * Copyright (C) 2001 Rusty Russell.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/random.h>
> +#include <linux/vmalloc.h>
> +#include <asm/setup.h>
> +
> +#ifdef CONFIG_RANDOMIZE_BASE
> +static unsigned long module_load_offset;
> +
> +/* Mutex protects the module_load_offset. */
> +static DEFINE_MUTEX(module_kaslr_mutex);
> +
> +static unsigned long get_module_load_offset(void)
> +{
> + if (kaslr_enabled()) {
> + mutex_lock(&module_kaslr_mutex);
> + /*
> + * Calculate the module_load_offset the first time this
> + * code is called. Once calculated it stays the same until
> + * reboot.
> + */
> + if (module_load_offset == 0)
> + module_load_offset =
> + (get_random_int() % 1024 + 1) * PAGE_SIZE;
> + mutex_unlock(&module_kaslr_mutex);
> + }
> + return module_load_offset;
> +}
> +#else
> +static unsigned long get_module_load_offset(void)
> +{
> + return 0;
> +}
> +#endif
> +
> +void *module_alloc(unsigned long size)
> +{
> + gfp_t gfp_mask = GFP_KERNEL;
> + void *p;
> +
> + if (PAGE_ALIGN(size) > MODULES_LEN)
> + return NULL;
> +
> + p = __vmalloc_node_range(size, MODULE_ALIGN,
> + MODULES_VADDR + get_module_load_offset(),
> + MODULES_END, gfp_mask,
> + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
> + __builtin_return_address(0));
> + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> + vfree(p);
> + return NULL;
> + }
> +
> + return p;
> +}
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 318789c728d3..2981fe42060d 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -53,6 +53,11 @@ obj-y += livepatch/
> obj-y += dma/
> obj-y += entry/
> obj-$(CONFIG_MODULES) += module/
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
>
> obj-$(CONFIG_KCMP) += kcmp.o
> obj-$(CONFIG_FREEZER) += freezer.o
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index f214f8c088ed..3f9876374cd3 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> goto out;
> }
>
> +#ifdef CONFIG_MODULES
> /* Check if 'p' is probing a module. */
> *probed_mod = __module_text_address((unsigned long) p->addr);
> if (*probed_mod) {
> @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
> ret = -ENOENT;
> }
> }
> +#endif
> +
> out:
> preempt_enable();
> jump_label_unlock();
> @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> return 0;
> }
>
> +#ifdef CONFIG_MODULES
> /* Remove all symbols in given area from kprobe blacklist */
> static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> {
> @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> {
> kprobe_remove_area_blacklist(entry, entry + 1);
> }
> +#endif /* CONFIG_MODULES */
>
> int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
> char *type, char *sym)
> @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> return ret ? : arch_populate_kprobe_blacklist();
> }
>
> +#ifdef CONFIG_MODULES
> static void add_module_kprobe_blacklist(struct module *mod)
> {
> unsigned long start, end;
> @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = {
> .notifier_call = kprobes_module_callback,
> .priority = 0
> };
> +#endif /* CONFIG_MODULES */
>
> void kprobe_free_init_mem(void)
> {
> @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void)
> err = arch_init_kprobes();
> if (!err)
> err = register_die_notifier(&kprobe_exceptions_nb);
> +
> +#ifdef CONFIG_MODULES
> if (!err)
> err = register_module_notifier(&kprobe_module_nb);
> +#endif
>
> kprobes_initialized = (err == 0);
> kprobe_sysctls_init();
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index fed58d30725d..7fa182b78550 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod,
> return ksym;
> }
>
> -void __weak module_memfree(void *module_region)
> -{
> - /*
> - * This memory may be RO, and freeing RO memory in an interrupt is not
> - * supported by vmalloc.
> - */
> - WARN_ON(in_interrupt());
> - vfree(module_region);
> -}
> -
> void __weak module_arch_cleanup(struct module *mod)
> {
> }
> @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
> ddebug_remove_module(mod->name);
> }
>
> -void * __weak module_alloc(unsigned long size)
> -{
> - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> - NUMA_NO_NODE, __builtin_return_address(0));
> -}
> -
> bool __weak module_init_section(const char *name)
> {
> return strstarts(name, ".init");
> diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..26a4c60998ad
> --- /dev/null
> +++ b/kernel/module_alloc.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2002 Richard Henderson
> + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void * __weak module_alloc(unsigned long size)
> +{
> + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> + NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +
> +void __weak module_memfree(void *module_region)
> +{
> + /*
> + * This memory may be RO, and freeing RO memory in an interrupt is not
> + * supported by vmalloc.
> + */
> + WARN_ON(in_interrupt());
> + vfree(module_region);
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 93507330462c..050b2975332e 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
> return kprobe_gone(&tk->rp.kp);
> }
>
> +#ifdef CONFIG_MODULES
> static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> struct module *mod)
> {
> @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
>
> return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
> }
> +#endif /* CONFIG_MODULES */
>
> static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> {
> + bool ret = false;
> +#ifdef CONFIG_MODULES
> char *p;
> - bool ret;
>
> if (!tk->symbol)
> return false;
> @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> ret = !!find_module(tk->symbol);
> rcu_read_unlock_sched();
> *p = ':';
> +#endif /* CONFIG_MODULES */
>
> return ret;
> }
> @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> return ret;
> }
>
> +#ifdef CONFIG_MODULES
> /* Module notifier call back, checking event on the module */
> static int trace_kprobe_module_callback(struct notifier_block *nb,
> unsigned long val, void *data)
> @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> .notifier_call = trace_kprobe_module_callback,
> .priority = 1 /* Invoked after kprobe module callback */
> };
> +#endif /* CONFIG_MODULES */
>
> static int __trace_kprobe_create(int argc, const char *argv[])
> {
> @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void)
> if (ret)
> return ret;
>
> +#ifdef CONFIG_MODULES
> if (register_module_notifier(&trace_kprobe_module_nb))
> return -EINVAL;
> +#endif /* CONFIG_MODULES */
>
> return 0;
> }
> --
> 2.36.1
>
On Wed, Jun 8, 2022 at 9:28 AM Ard Biesheuvel <[email protected]> wrote:
>
> Hello Jarkko,
>
> On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <[email protected]> wrote:
> >
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> > dependency is a result of kprobes code using the module allocator for the
> > trampoline code.
> >
> > Detaching kprobes from modules helps to squeeze down the user space,
> > e.g. when developing new core kernel features, while still having all
> > the nice tracing capabilities.
> >
> > For kernel/ and arch/*, move module_alloc() and module_memfree() to
> > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> > code with CONFIG_MODULES.
> >
> > As the result, kprobes can be used with a monolithic kernel.
>
> I think I may have mentioned this the previous time as well, but I
> don't think this is the right approach.
>
> Kprobes uses alloc_insn_page() to allocate executable memory, but the
> requirements for this memory are radically different compared to
> loadable modules, which need to be within an arch-specific distance of
> the core kernel, need KASAN backing etc etc.
I think the distance of core kernel requirement is the same for kprobe
alloc_insn_page and modules, no?
Thanks,
Song
>
> This is why arm64, for instance, does not implement alloc_insn_page()
> in terms of module_alloc() [and likely does not belong in this patch
> for that reason]
>
> Is there any reason kprobes cannot simply use vmalloc()?
>
On Wed, Jun 08, 2022 at 06:27:51PM +0200, Ard Biesheuvel wrote:
> Hello Jarkko,
>
> On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <[email protected]> wrote:
> >
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> > dependency is a result of kprobes code using the module allocator for the
> > trampoline code.
> >
> > Detaching kprobes from modules helps to squeeze down the user space,
> > e.g. when developing new core kernel features, while still having all
> > the nice tracing capabilities.
> >
> > For kernel/ and arch/*, move module_alloc() and module_memfree() to
> > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> > code with CONFIG_MODULES.
> >
> > As the result, kprobes can be used with a monolithic kernel.
>
> I think I may have mentioned this the previous time as well, but I
> don't think this is the right approach.
OK, I apologize for my ignorance. It's been a while.
> Kprobes uses alloc_insn_page() to allocate executable memory, but the
> requirements for this memory are radically different compared to
> loadable modules, which need to be within an arch-specific distance of
> the core kernel, need KASAN backing etc etc.
>
> This is why arm64, for instance, does not implement alloc_insn_page()
> in terms of module_alloc() [and likely does not belong in this patch
> for that reason]
>
> Is there any reason kprobes cannot simply use vmalloc()?
All arch's, except nios2 use vmalloc() in the end for module_alloc().
nios2 uses kmalloc() for the reasons that I'm not aware of, but it does
not support kprobes in the first place.
Based on this, I think that could work out just fine.
I could cope with that.
BR, Jarkko
On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote:
> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> index 553866751e1a..d2bb954cd54f 100644
> --- a/arch/arm/kernel/Makefile
> +++ b/arch/arm/kernel/Makefile
> @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> obj-$(CONFIG_ISA_DMA_API) += dma.o
> obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
> obj-$(CONFIG_MODULES) += armksyms.o module.o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif
Doesn't:
obj-$(CONFIG_MODULES) += module_alloc.o
obj-$(CONFIG_KPROBES) += module_alloc.o
work just as well? The kbuild modules.rst documentation says:
The order of files in $(obj-y) is significant. Duplicates in
the lists are allowed: the first instance will be linked into
built-in.a and succeeding instances will be ignored.
so you should be fine... or the documentation is wrong!
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote:
> On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote:
> > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> > index 553866751e1a..d2bb954cd54f 100644
> > --- a/arch/arm/kernel/Makefile
> > +++ b/arch/arm/kernel/Makefile
> > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> > obj-$(CONFIG_ISA_DMA_API) += dma.o
> > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
> > obj-$(CONFIG_MODULES) += armksyms.o module.o
> > +ifeq ($(CONFIG_MODULES),y)
> > +obj-y += module_alloc.o
> > +else
> > +obj-$(CONFIG_KPROBES) += module_alloc.o
> > +endif
>
> Doesn't:
>
> obj-$(CONFIG_MODULES) += module_alloc.o
> obj-$(CONFIG_KPROBES) += module_alloc.o
>
> work just as well? The kbuild modules.rst documentation says:
>
> The order of files in $(obj-y) is significant. Duplicates in
> the lists are allowed: the first instance will be linked into
> built-in.a and succeeding instances will be ignored.
>
> so you should be fine... or the documentation is wrong!
OK, I did not know this. Thanks for the tip!
BR, Jarkko
On Thu, Jun 09, 2022 at 08:30:12AM +0000, Christophe Leroy wrote:
>
>
> Le 08/06/2022 ? 01:59, Jarkko Sakkinen a ?crit?:
> > [You don't often get email from [email protected]. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> >
> > Tracing with kprobes while running a monolithic kernel is currently
> > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> > dependency is a result of kprobes code using the module allocator for the
> > trampoline code.
> >
> > Detaching kprobes from modules helps to squeeze down the user space,
> > e.g. when developing new core kernel features, while still having all
> > the nice tracing capabilities.
>
> Nice idea, could also be nice to have BPF without MODULES.
Yeah, for sure. You have to start from somewhere :-) I'd guess this
a step forward also for BPF.
> >
> > For kernel/ and arch/*, move module_alloc() and module_memfree() to
> > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> > code with CONFIG_MODULES.
>
> Nice, but that's not enough. You have to audit every peace of code that
> depends on CONFIG_MODULES and see if it needs to be activated for your
> case as well. For instance some powerpc configurations don't honor exec
> page faults on kernel pages when CONFIG_MODULES is not selected.
Thanks for pointing this out. With "every peace of code" you probably
are referring to the 13 arch-folders, which support kprobes in the first
place (just checking)?
> > As the result, kprobes can be used with a monolithic kernel.
> >
> > Signed-off-by: Jarkko Sakkinen <[email protected]>
>
> I think this patch should be split in a several patches, one (or even
> one per architectures ?) to make modules_alloc() independant of
> CONFIG_MODULES, then a patch to make CONFIG_KPROBES independant on
> CONFIG_MOUDLES.
Agreed. And also because of your previous remark, i.e. each arch needs
it own conclusions of the changes. I purposely did this first as a one
patch in order to get a better picture of the situation.
> > ---
> > Tested with the help of BuildRoot and QEMU:
> > - arm (function tracer)
> > - arm64 (function tracer)
> > - mips (function tracer)
> > - powerpc (function tracer)
> > - riscv (function tracer)
> > - s390 (function tracer)
> > - sparc (function tracer)
> > - x86 (function tracer)
> > - sh (function tracer, for the "pure" kernel/modules_alloc.c path)
> > ---
> > arch/Kconfig | 1 -
> > arch/arm/kernel/Makefile | 5 +++
> > arch/arm/kernel/module.c | 32 ----------------
> > arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++
> > arch/arm64/kernel/Makefile | 5 +++
> > arch/arm64/kernel/module.c | 47 -----------------------
> > arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++
> > arch/mips/kernel/Makefile | 5 +++
> > arch/mips/kernel/module.c | 9 -----
> > arch/mips/kernel/module_alloc.c | 18 +++++++++
> > arch/parisc/kernel/Makefile | 5 +++
> > arch/parisc/kernel/module.c | 11 ------
> > arch/parisc/kernel/module_alloc.c | 23 +++++++++++
> > arch/powerpc/kernel/Makefile | 5 +++
> > arch/powerpc/kernel/module.c | 37 ------------------
> > arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++
>
> You are missing necessary changes for powerpc.
>
> On powerpc 8xx or powerpc 603, software TLB handlers don't honor
> instruction TLB miss when CONFIG_MODULES are not set, look into
> head_8xx.S and head_book3s_32.S
>
> On powerpc book3s/32, all kernel space is set to NX except the module
> segment. When CONFIG_MODULES is all space is set NX. See
> mmu_mark_initmem_nx() and is_module_segment().
Thank you! I'll go this through and also try to build an environment
with BuildRoot where I can test-run this configuration.
> > arch/riscv/kernel/Makefile | 5 +++
> > arch/riscv/kernel/module.c | 10 -----
> > arch/riscv/kernel/module_alloc.c | 19 ++++++++++
> > arch/s390/kernel/Makefile | 5 +++
> > arch/s390/kernel/module.c | 17 ---------
> > arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++
> > arch/sparc/kernel/Makefile | 5 +++
> > arch/sparc/kernel/module.c | 30 ---------------
> > arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++
> > arch/x86/kernel/Makefile | 5 +++
> > arch/x86/kernel/module.c | 50 ------------------------
> > arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++
> > kernel/Makefile | 5 +++
> > kernel/kprobes.c | 10 +++++
> > kernel/module/main.c | 17 ---------
> > kernel/module_alloc.c | 26 +++++++++++++
> > kernel/trace/trace_kprobe.c | 10 ++++-
> > 33 files changed, 434 insertions(+), 262 deletions(-)
> > create mode 100644 arch/arm/kernel/module_alloc.c
> > create mode 100644 arch/arm64/kernel/module_alloc.c
> > create mode 100644 arch/mips/kernel/module_alloc.c
> > create mode 100644 arch/parisc/kernel/module_alloc.c
> > create mode 100644 arch/powerpc/kernel/module_alloc.c
> > create mode 100644 arch/riscv/kernel/module_alloc.c
> > create mode 100644 arch/s390/kernel/module_alloc.c
> > create mode 100644 arch/sparc/kernel/module_alloc.c
> > create mode 100644 arch/x86/kernel/module_alloc.c
> > create mode 100644 kernel/module_alloc.c
> >
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index fcf9a41a4ef5..e8e3e7998a2e 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -39,7 +39,6 @@ config GENERIC_ENTRY
> >
> > config KPROBES
> > bool "Kprobes"
> > - depends on MODULES
> > depends on HAVE_KPROBES
> > select KALLSYMS
> > select TASKS_RCU if PREEMPTION
>
> > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> > index 2e2a2a9bcf43..5a811cdf230b 100644
> > --- a/arch/powerpc/kernel/Makefile
> > +++ b/arch/powerpc/kernel/Makefile
> > @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o
> > endif
> > obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
> > obj-$(CONFIG_MODULES) += module.o module_$(BITS).o
> > +ifeq ($(CONFIG_MODULES),y)
> > +obj-y += module_alloc.o
> > +else
> > +obj-$(CONFIG_KPROBES) += module_alloc.o
> > +endif
>
> Why not just do:
>
> obj-$(CONFIG_MODULES) += module_alloc.o
> obj-$(CONFIG_KPROBES) += module_alloc.o
>
> However, a new hidden config item (eg: CONFIG_DYNAMIC_TEXT) selected by
> both CONFIG_MODULES and CONFIG_KPROBES would make like easier when
> you'll come to do the changes required.
I'll do this. Russell King also pointed out the same thing.
> > obj-$(CONFIG_44x) += cpu_setup_44x.o
> > obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o
> > obj-$(CONFIG_PPC_DOORBELL) += dbell.o
> > diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
> > index f6d6ae0a1692..b30e00964a60 100644
> > --- a/arch/powerpc/kernel/module.c
> > +++ b/arch/powerpc/kernel/module.c
> > @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr,
> >
> > return 0;
> > }
> > -
> > -static __always_inline void *
> > -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
> > -{
> > - pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
> > - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
> > -
> > - /*
> > - * Don't do huge page allocations for modules yet until more testing
> > - * is done. STRICT_MODULE_RWX may require extra work to support this
> > - * too.
> > - */
> > - return __vmalloc_node_range(size, 1, start, end, gfp, prot,
> > - VM_FLUSH_RESET_PERMS,
> > - NUMA_NO_NODE, __builtin_return_address(0));
> > -}
> > -
> > -void *module_alloc(unsigned long size)
> > -{
> > -#ifdef MODULES_VADDR
> > - unsigned long limit = (unsigned long)_etext - SZ_32M;
> > - void *ptr = NULL;
> > -
> > - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> > -
> > - /* First try within 32M limit from _etext to avoid branch trampolines */
> > - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> > - ptr = __module_alloc(size, limit, MODULES_END, true);
> > -
> > - if (!ptr)
> > - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
> > -
> > - return ptr;
> > -#else
> > - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
> > -#endif
> > -}
> > diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c
> > new file mode 100644
> > index 000000000000..48541c27ce46
> > --- /dev/null
> > +++ b/arch/powerpc/kernel/module_alloc.c
> > @@ -0,0 +1,47 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Kernel module help for powerpc.
> > + * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
> > + * Copyright (C) 2008 Freescale Semiconductor, Inc.
> > + */
> > +
> > +#include <linux/mm.h>
> > +#include <linux/moduleloader.h>
> > +#include <linux/vmalloc.h>
> > +
> > +static __always_inline void *
> > +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
> > +{
> > + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
> > + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
> > +
> > + /*
> > + * Don't do huge page allocations for modules yet until more testing
> > + * is done. STRICT_MODULE_RWX may require extra work to support this
> > + * too.
> > + */
> > + return __vmalloc_node_range(size, 1, start, end, gfp, prot,
> > + VM_FLUSH_RESET_PERMS,
> > + NUMA_NO_NODE, __builtin_return_address(0));
> > +}
> > +
> > +void *module_alloc(unsigned long size)
> > +{
> > +#ifdef MODULES_VADDR
>
> Is MODULES_VADDR defined even when CONFIG_MODULES is not ?
Yes, by this in ppc's asm/pgtable.h:
#ifdef CONFIG_PPC_BOOK3S
#include <asm/book3s/pgtable.h>
#else
#include <asm/nohash/pgtable.h>
#endif /* !CONFIG_PPC_BOOK3S */
> > + unsigned long limit = (unsigned long)_etext - SZ_32M;
> > + void *ptr = NULL;
> > +
> > + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> > +
> > + /* First try within 32M limit from _etext to avoid branch trampolines */
> > + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> > + ptr = __module_alloc(size, limit, MODULES_END, true);
> > +
> > + if (!ptr)
> > + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
> > +
> > + return ptr;
> > +#else
> > + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
> > +#endif
> > +}
>
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 318789c728d3..2981fe42060d 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -53,6 +53,11 @@ obj-y += livepatch/
> > obj-y += dma/
> > obj-y += entry/
> > obj-$(CONFIG_MODULES) += module/
> > +ifeq ($(CONFIG_MODULES),y)
> > +obj-y += module_alloc.o
> > +else
> > +obj-$(CONFIG_KPROBES) += module_alloc.o
> > +endif
>
> Same comment, could be:
>
> obj-$(CONFIG_MODULES) += module_alloc.o
> obj-$(CONFIG_KPROBES) += module_alloc.o
Ditto.
>
> >
> > obj-$(CONFIG_KCMP) += kcmp.o
> > obj-$(CONFIG_FREEZER) += freezer.o
> > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > index f214f8c088ed..3f9876374cd3 100644
> > --- a/kernel/kprobes.c
> > +++ b/kernel/kprobes.c
> > @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > goto out;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Check if 'p' is probing a module. */
> > *probed_mod = __module_text_address((unsigned long) p->addr);
> > if (*probed_mod) {
> > @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > ret = -ENOENT;
> > }
> > }
> > +#endif
> > +
> > out:
> > preempt_enable();
> > jump_label_unlock();
> > @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> > return 0;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Remove all symbols in given area from kprobe blacklist */
> > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> > {
> > @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> > {
> > kprobe_remove_area_blacklist(entry, entry + 1);
> > }
> > +#endif /* CONFIG_MODULES */
> >
> > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
> > char *type, char *sym)
> > @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> > return ret ? : arch_populate_kprobe_blacklist();
> > }
> >
> > +#ifdef CONFIG_MODULES
> > static void add_module_kprobe_blacklist(struct module *mod)
> > {
> > unsigned long start, end;
> > @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = {
> > .notifier_call = kprobes_module_callback,
> > .priority = 0
> > };
> > +#endif /* CONFIG_MODULES */
> >
> > void kprobe_free_init_mem(void)
> > {
> > @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void)
> > err = arch_init_kprobes();
> > if (!err)
> > err = register_die_notifier(&kprobe_exceptions_nb);
> > +
> > +#ifdef CONFIG_MODULES
> > if (!err)
> > err = register_module_notifier(&kprobe_module_nb);
> > +#endif
> >
> > kprobes_initialized = (err == 0);
> > kprobe_sysctls_init();
> > diff --git a/kernel/module/main.c b/kernel/module/main.c
> > index fed58d30725d..7fa182b78550 100644
> > --- a/kernel/module/main.c
> > +++ b/kernel/module/main.c
> > @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod,
> > return ksym;
> > }
> >
> > -void __weak module_memfree(void *module_region)
> > -{
> > - /*
> > - * This memory may be RO, and freeing RO memory in an interrupt is not
> > - * supported by vmalloc.
> > - */
> > - WARN_ON(in_interrupt());
> > - vfree(module_region);
> > -}
> > -
> > void __weak module_arch_cleanup(struct module *mod)
> > {
> > }
> > @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
> > ddebug_remove_module(mod->name);
> > }
> >
> > -void * __weak module_alloc(unsigned long size)
> > -{
> > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> > - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> > - NUMA_NO_NODE, __builtin_return_address(0));
> > -}
> > -
> > bool __weak module_init_section(const char *name)
> > {
> > return strstarts(name, ".init");
> > diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c
> > new file mode 100644
> > index 000000000000..26a4c60998ad
> > --- /dev/null
> > +++ b/kernel/module_alloc.c
> > @@ -0,0 +1,26 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Copyright (C) 2002 Richard Henderson
> > + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
> > + */
> > +
> > +#include <linux/mm.h>
> > +#include <linux/moduleloader.h>
> > +#include <linux/vmalloc.h>
> > +
> > +void * __weak module_alloc(unsigned long size)
> > +{
> > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> > + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> > + NUMA_NO_NODE, __builtin_return_address(0));
> > +}
> > +
> > +void __weak module_memfree(void *module_region)
> > +{
> > + /*
> > + * This memory may be RO, and freeing RO memory in an interrupt is not
> > + * supported by vmalloc.
> > + */
> > + WARN_ON(in_interrupt());
> > + vfree(module_region);
> > +}
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index 93507330462c..050b2975332e 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
> > return kprobe_gone(&tk->rp.kp);
> > }
> >
> > +#ifdef CONFIG_MODULES
> > static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> > struct module *mod)
> > {
> > @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> >
> > return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
> > }
> > +#endif /* CONFIG_MODULES */
> >
> > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> > {
> > + bool ret = false;
> > +#ifdef CONFIG_MODULES
> > char *p;
> > - bool ret;
> >
> > if (!tk->symbol)
> > return false;
> > @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> > ret = !!find_module(tk->symbol);
> > rcu_read_unlock_sched();
> > *p = ':';
> > +#endif /* CONFIG_MODULES */
> >
> > return ret;
> > }
> > @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> > return ret;
> > }
> >
> > +#ifdef CONFIG_MODULES
> > /* Module notifier call back, checking event on the module */
> > static int trace_kprobe_module_callback(struct notifier_block *nb,
> > unsigned long val, void *data)
> > @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> > .notifier_call = trace_kprobe_module_callback,
> > .priority = 1 /* Invoked after kprobe module callback */
> > };
> > +#endif /* CONFIG_MODULES */
> >
> > static int __trace_kprobe_create(int argc, const char *argv[])
> > {
> > @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void)
> > if (ret)
> > return ret;
> >
> > +#ifdef CONFIG_MODULES
> > if (register_module_notifier(&trace_kprobe_module_nb))
> > return -EINVAL;
> > +#endif /* CONFIG_MODULES */
> >
> > return 0;
> > }
> > --
> > 2.36.1
> >
Thanks for the well-considered remarks!
BR, Jarkko
On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote:
> On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote:
> > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> > index 553866751e1a..d2bb954cd54f 100644
> > --- a/arch/arm/kernel/Makefile
> > +++ b/arch/arm/kernel/Makefile
> > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> > obj-$(CONFIG_ISA_DMA_API) += dma.o
> > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
> > obj-$(CONFIG_MODULES) += armksyms.o module.o
> > +ifeq ($(CONFIG_MODULES),y)
> > +obj-y += module_alloc.o
> > +else
> > +obj-$(CONFIG_KPROBES) += module_alloc.o
> > +endif
>
> Doesn't:
>
> obj-$(CONFIG_MODULES) += module_alloc.o
> obj-$(CONFIG_KPROBES) += module_alloc.o
That just begs for a new kconfig symbol for the object, and for
the object then to be built with it.
The archs which override the default can use ARCH_HAS_VM_ALLOC_EXEC.
Please note that the respective free is important as well and its
not clear if we need an another define for the free. Someone has
to do that work. We want to ensure to noexec the code on free and
this can vary on each arch.
> work just as well? The kbuild modules.rst documentation says:
>
> The order of files in $(obj-y) is significant. Duplicates in
> the lists are allowed: the first instance will be linked into
> built-in.a and succeeding instances will be ignored.
>
> so you should be fine... or the documentation is wrong!
Agreed, but this is just sloppy, better to use a new kconfig symbol
to represent what is actually being required.
Luis
On Thu, Jun 9, 2022 at 1:34 AM Christophe Leroy
<[email protected]> wrote:
>
>
>
> Le 08/06/2022 à 18:12, Song Liu a écrit :
> > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <[email protected]> wrote:
> >>
> >> Hi Jarkko,
> >>
> >> On Wed, 8 Jun 2022 08:25:38 +0300
> >> Jarkko Sakkinen <[email protected]> wrote:
> >>
> >>> On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote:
> >>>> .
> >>>>
> >>>> On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <[email protected]> wrote:
> >>>>>
> >>>>> Tracing with kprobes while running a monolithic kernel is currently
> >>>>> impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> >>>>> dependency is a result of kprobes code using the module allocator for the
> >>>>> trampoline code.
> >>>>>
> >>>>> Detaching kprobes from modules helps to squeeze down the user space,
> >>>>> e.g. when developing new core kernel features, while still having all
> >>>>> the nice tracing capabilities.
> >>>>>
> >>>>> For kernel/ and arch/*, move module_alloc() and module_memfree() to
> >>>>> module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> >>>>> or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> >>>>> code with CONFIG_MODULES.
> >>>>>
> >>>>> As the result, kprobes can be used with a monolithic kernel.
> >>>> It's strange when MODULES is n, but vmlinux still obtains module_alloc.
> >>>>
> >>>> Maybe we need a kprobe_alloc, right?
> >>>
> >>> Perhaps not the best name but at least it documents the fact that
> >>> they use the same allocator.
> >>>
> >>> Few years ago I carved up something "half-way there" for kprobes,
> >>> and I used the name text_alloc() [*].
> >>>
> >>> [*] https://lore.kernel.org/all/[email protected]/
> >>
> >> Yeah, I remember that. Thank you for updating your patch!
> >> I think the idea (split module_alloc() from CONFIG_MODULE) is good to me.
> >> If module support maintainers think this name is not good, you may be
> >> able to rename it as text_alloc() and make the module_alloc() as a
> >> wrapper of it.
> >
> > IIUC, most users of module_alloc() use it to allocate memory for text, except
> > that module code uses it for both text and data. Therefore, I guess calling it
> > text_alloc() is not 100% accurate until we change the module code (to use
> > a different API to allocate memory for data).
>
> When CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC, module code uses
> module_alloc() for text and vmalloc() for data, see function
> move_module() in kernel/module/main.c
Thanks for the pointer! I will play with it.
Song
On Wed, 8 Jun 2022 11:19:19 -0700
Song Liu <[email protected]> wrote:
> On Wed, Jun 8, 2022 at 9:28 AM Ard Biesheuvel <[email protected]> wrote:
> >
> > Hello Jarkko,
> >
> > On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <[email protected]> wrote:
> > >
> > > Tracing with kprobes while running a monolithic kernel is currently
> > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> > > dependency is a result of kprobes code using the module allocator for the
> > > trampoline code.
> > >
> > > Detaching kprobes from modules helps to squeeze down the user space,
> > > e.g. when developing new core kernel features, while still having all
> > > the nice tracing capabilities.
> > >
> > > For kernel/ and arch/*, move module_alloc() and module_memfree() to
> > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> > > code with CONFIG_MODULES.
> > >
> > > As the result, kprobes can be used with a monolithic kernel.
> >
> > I think I may have mentioned this the previous time as well, but I
> > don't think this is the right approach.
> >
> > Kprobes uses alloc_insn_page() to allocate executable memory, but the
> > requirements for this memory are radically different compared to
> > loadable modules, which need to be within an arch-specific distance of
> > the core kernel, need KASAN backing etc etc.
>
> I think the distance of core kernel requirement is the same for kprobe
> alloc_insn_page and modules, no?
This strongly depends on how kprobes (software breakpoint and
single-step) is implemented on the arch. For example, x86 implements
the so-called "kprobe-booster" which jumps back from the single
stepping trampoline buffer. Then the buffer address must be within
the range where it can jump to the original address.
However, if the arch implements single-step as an instruction
emulation, it has no such limitation. As far as I know, arm64
will do emulation for the instructions which change PC register
and will do direct execution with another software breakpoint
for other instructions.
Why I'm using module_alloc() for a generic function, is that
can cover the limitation most widely.
Thus, if we have CONFIG_ARCH_HAVE_ALLOC_INSN_PAGE flag and
kprobes can check it instead of using __weak function, the
kprobes may not need to depend on module_alloc() in general.
Thank you,
>
> Thanks,
> Song
>
> >
> > This is why arm64, for instance, does not implement alloc_insn_page()
> > in terms of module_alloc() [and likely does not belong in this patch
> > for that reason]
>
>
>
> >
> > Is there any reason kprobes cannot simply use vmalloc()?
> >
--
Masami Hiramatsu (Google) <[email protected]>
On Thu, Jun 09, 2022 at 06:44:45AM -0700, Luis Chamberlain wrote:
> On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote:
> > On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote:
> > > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> > > index 553866751e1a..d2bb954cd54f 100644
> > > --- a/arch/arm/kernel/Makefile
> > > +++ b/arch/arm/kernel/Makefile
> > > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> > > obj-$(CONFIG_ISA_DMA_API) += dma.o
> > > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
> > > obj-$(CONFIG_MODULES) += armksyms.o module.o
> > > +ifeq ($(CONFIG_MODULES),y)
> > > +obj-y += module_alloc.o
> > > +else
> > > +obj-$(CONFIG_KPROBES) += module_alloc.o
> > > +endif
> >
> > Doesn't:
> >
> > obj-$(CONFIG_MODULES) += module_alloc.o
> > obj-$(CONFIG_KPROBES) += module_alloc.o
>
> That just begs for a new kconfig symbol for the object, and for
> the object then to be built with it.
>
> The archs which override the default can use ARCH_HAS_VM_ALLOC_EXEC.
> Please note that the respective free is important as well and its
> not clear if we need an another define for the free. Someone has
> to do that work. We want to ensure to noexec the code on free and
> this can vary on each arch.
Let me check if I understand this (not 100% sure).
So if arch define ARCH_HAS_VMALLOC_EXEC, then this would set
config flag CONFIG_VMALLOC_EXEC, which would be used to include
the compilation unit?
BR, Jarkko
On Sun, Jun 12, 2022 at 09:30:41PM +0900, Masami Hiramatsu wrote:
> On Wed, 8 Jun 2022 11:19:19 -0700
> Song Liu <[email protected]> wrote:
>
> > On Wed, Jun 8, 2022 at 9:28 AM Ard Biesheuvel <[email protected]> wrote:
> > >
> > > Hello Jarkko,
> > >
> > > On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <[email protected]> wrote:
> > > >
> > > > Tracing with kprobes while running a monolithic kernel is currently
> > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This
> > > > dependency is a result of kprobes code using the module allocator for the
> > > > trampoline code.
> > > >
> > > > Detaching kprobes from modules helps to squeeze down the user space,
> > > > e.g. when developing new core kernel features, while still having all
> > > > the nice tracing capabilities.
> > > >
> > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to
> > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific
> > > > code with CONFIG_MODULES.
> > > >
> > > > As the result, kprobes can be used with a monolithic kernel.
> > >
> > > I think I may have mentioned this the previous time as well, but I
> > > don't think this is the right approach.
> > >
> > > Kprobes uses alloc_insn_page() to allocate executable memory, but the
> > > requirements for this memory are radically different compared to
> > > loadable modules, which need to be within an arch-specific distance of
> > > the core kernel, need KASAN backing etc etc.
> >
> > I think the distance of core kernel requirement is the same for kprobe
> > alloc_insn_page and modules, no?
>
> This strongly depends on how kprobes (software breakpoint and
> single-step) is implemented on the arch. For example, x86 implements
> the so-called "kprobe-booster" which jumps back from the single
> stepping trampoline buffer. Then the buffer address must be within
> the range where it can jump to the original address.
> However, if the arch implements single-step as an instruction
> emulation, it has no such limitation. As far as I know, arm64
> will do emulation for the instructions which change PC register
> and will do direct execution with another software breakpoint
> for other instructions.
>
> Why I'm using module_alloc() for a generic function, is that
> can cover the limitation most widely.
> Thus, if we have CONFIG_ARCH_HAVE_ALLOC_INSN_PAGE flag and
> kprobes can check it instead of using __weak function, the
> kprobes may not need to depend on module_alloc() in general.
OK, I guess this is what Luis meant.
I'll try to carve up something based on this.
BR, Jarkko
On Tue, Jun 14, 2022 at 12:36:25PM +0000, Christophe Leroy wrote:
>
>
> Le 14/06/2022 ? 14:26, Jarkko Sakkinen a ?crit?:
> > On Thu, Jun 09, 2022 at 06:44:45AM -0700, Luis Chamberlain wrote:
> >> On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote:
> >>> On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote:
> >>>> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> >>>> index 553866751e1a..d2bb954cd54f 100644
> >>>> --- a/arch/arm/kernel/Makefile
> >>>> +++ b/arch/arm/kernel/Makefile
> >>>> @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> >>>> obj-$(CONFIG_ISA_DMA_API) += dma.o
> >>>> obj-$(CONFIG_FIQ) += fiq.o fiqasm.o
> >>>> obj-$(CONFIG_MODULES) += armksyms.o module.o
> >>>> +ifeq ($(CONFIG_MODULES),y)
> >>>> +obj-y += module_alloc.o
> >>>> +else
> >>>> +obj-$(CONFIG_KPROBES) += module_alloc.o
> >>>> +endif
> >>>
> >>> Doesn't:
> >>>
> >>> obj-$(CONFIG_MODULES) += module_alloc.o
> >>> obj-$(CONFIG_KPROBES) += module_alloc.o
> >>
> >> That just begs for a new kconfig symbol for the object, and for
> >> the object then to be built with it.
> >>
> >> The archs which override the default can use ARCH_HAS_VM_ALLOC_EXEC.
> >> Please note that the respective free is important as well and its
> >> not clear if we need an another define for the free. Someone has
> >> to do that work. We want to ensure to noexec the code on free and
> >> this can vary on each arch.
> >
> > Let me check if I understand this (not 100% sure).
> >
> > So if arch define ARCH_HAS_VMALLOC_EXEC, then this would set
> > config flag CONFIG_VMALLOC_EXEC, which would be used to include
> > the compilation unit?
> >
>
> I guess you have two possible approaches.
>
> Either architectures select CONFIG_ARCH_HAS_VMALLOC_EXEC at all time and
> then you add a CONFIG_VMALLOC_EXEC which depends on
> CONFIG_ARCH_HAS_VMALLOC_EXEC and CONFIG_MODULES or CONFIG_KPROBES,
>
> Or architectures select CONFIG_ARCH_HAS_VMALLOC_EXEC only when either
> CONFIG_MODULES or CONFIG_KPROBES is selected, in that case there is no
> need for a CONFIG_VMALLOC_EXEC.
Right, got it now. Thanks for the elaboration.
> Christophe
BR, Jarkko