Remove MODULES dependency and migrate from module_alloc to vmalloc().
According to Andi, the history behind this dependency is that kprobes
originally required custom LKMs, which no longer holds today.
Right now one has to compile in LKM support just to enable kprobes. With
this change applied, it is somewhat easier to create custom test
kernels with proper debugging capabilities, thus making Linux more
developer friendly.
Cc: Andi Kleen <[email protected]>
Signed-off-by: Jarkko Sakkinen <[email protected]>
---
arch/Kconfig | 1 -
arch/x86/kernel/kprobes/core.c | 5 +++--
kernel/kprobes.c | 22 ++++++++++++++++++++--
kernel/trace/trace_kprobe.c | 12 ++++++++++++
4 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 8cc35dc556c7..bb59cdf335ab 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -61,7 +61,6 @@ config OPROFILE_NMI_TIMER
config KPROBES
bool "Kprobes"
- depends on MODULES
depends on HAVE_KPROBES
select KALLSYMS
help
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index ada39ddbc922..dc7b8d6fd20d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -42,6 +42,7 @@
#include <linux/moduleloader.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
+#include <linux/vmalloc.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -423,7 +424,7 @@ void *alloc_insn_page(void)
{
void *page;
- page = module_alloc(PAGE_SIZE);
+ page = vmalloc(PAGE_SIZE);
if (!page)
return NULL;
@@ -446,7 +447,7 @@ void *alloc_insn_page(void)
/* Recover page to RW mode before releasing it */
void free_insn_page(void *page)
{
- module_memfree(page);
+ vfree(page);
}
static int arch_copy_kprobe(struct kprobe *p)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4a904cc56d68..02680642ea11 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -111,12 +112,12 @@ enum kprobe_slot_state {
void __weak *alloc_insn_page(void)
{
- return module_alloc(PAGE_SIZE);
+ return vmalloc(PAGE_SIZE);
}
void __weak free_insn_page(void *page)
{
- module_memfree(page);
+ vfree(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
@@ -563,8 +564,11 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_lock(&kprobe_mutex);
cpus_read_lock();
mutex_lock(&text_mutex);
+
+#ifdef CONFIG_MODULES
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
+#endif
/*
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -589,7 +593,9 @@ static void kprobe_optimizer(struct work_struct *work)
/* Step 4: Free cleaned kprobes after quiesence period */
do_free_cleaned_kprobes();
+#ifdef CONFIG_MODULES
mutex_unlock(&module_mutex);
+#endif
mutex_unlock(&text_mutex);
cpus_read_unlock();
@@ -739,6 +745,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
return 0;
}
+#ifdef CONFIG_MODULE
/* Remove optimized instructions */
static void kill_optimized_kprobe(struct kprobe *p)
{
@@ -764,6 +771,7 @@ static void kill_optimized_kprobe(struct kprobe *p)
/* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
+#endif
static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
@@ -1608,6 +1616,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
+#ifdef CONFIG_MODULE
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
@@ -1618,6 +1627,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
*probed_mod = NULL;
ret = -ENOENT;
}
+#endif
}
out:
preempt_enable();
@@ -2090,6 +2100,7 @@ NOKPROBE_SYMBOL(pre_handler_kretprobe);
#endif /* CONFIG_KRETPROBES */
+#ifdef CONFIG_MODULE
/* Set the kprobe gone and remove its instruction buffer. */
static void kill_kprobe(struct kprobe *p)
{
@@ -2114,6 +2125,7 @@ static void kill_kprobe(struct kprobe *p)
*/
arch_remove_kprobe(p);
}
+#endif
/* Disable one kprobe */
int disable_kprobe(struct kprobe *kp)
@@ -2214,6 +2226,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
return 0;
}
+#ifdef CONFIG_MODULE
/* Remove all symbols in given area from kprobe blacklist */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
@@ -2231,6 +2244,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
kprobe_remove_area_blacklist(entry, entry + 1);
}
+#endif
int __init __weak arch_populate_kprobe_blacklist(void)
{
@@ -2274,6 +2288,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret ? : arch_populate_kprobe_blacklist();
}
+#ifdef CONFIG_MODULE
static void add_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
@@ -2375,6 +2390,7 @@ static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
.priority = 0
};
+#endif /* CONFIG_MODULE */
/* Markers of _kprobe_blacklist section */
extern unsigned long __start_kprobe_blacklist[];
@@ -2425,8 +2441,10 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
+#ifdef CONFIG_MODULE
if (!err)
err = register_module_notifier(&kprobe_module_nb);
+#endif
kprobes_initialized = (err == 0);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aefb6065b508..30969b38fce8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -103,6 +103,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
return !!(kprobe_gone(&tk->rp.kp));
}
+#ifdef CONFIG_MODULE
static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
struct module *mod)
{
@@ -110,7 +111,9 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
const char *name = trace_kprobe_symbol(tk);
return strncmp(mod->name, name, len) == 0 && name[len] == ':';
}
+#endif
+#ifdef CONFIG_MODULE
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
char *p;
@@ -129,6 +132,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
return ret;
}
+#endif
static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
@@ -608,10 +612,12 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
/* Register k*probe */
ret = __register_trace_kprobe(tk);
+#ifdef CONFIG_MODULE
if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
ret = 0;
}
+#endif
if (ret)
trace_probe_unlink(&tk->tp);
@@ -651,10 +657,12 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register k*probe */
ret = __register_trace_kprobe(tk);
+#ifdef CONFIG_MODULE
if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
ret = 0;
}
+#endif
if (ret < 0)
unregister_kprobe_event(tk);
@@ -666,6 +674,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
return ret;
}
+#ifdef CONFIG_MODULE
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -700,6 +709,7 @@ static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
.priority = 1 /* Invoked after kprobe module callback */
};
+#endif
/* Convert certain expected symbols into '_' when generating event names */
static inline void sanitize_event_name(char *name)
@@ -1891,8 +1901,10 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;
+#ifdef CONFIG_MODULE
if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
+#endif
return 0;
}
--
2.25.1
On Fri, Jul 10, 2020 at 02:45:19AM +0300, Jarkko Sakkinen wrote:
> Remove MODULES dependency and migrate from module_alloc to vmalloc().
> According to Andi, the history behind this dependency is that kprobes
> originally required custom LKMs, which no longer holds today.
>
> Right now one has to compile in LKM support just to enable kprobes. With
> this change applied, it is somewhat easier to create custom test
> kernels with proper debugging capabilities, thus making Linux more
> developer friendly.
>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Jarkko Sakkinen <[email protected]>
NAK
this patch is horrific, it sprinkles a metric ton of #ifdef and silently
disables a lot of kprobe features (like all the opt stuff).
How about unconditionally providing module_alloc() instead?
Hi Jarkko,
On Fri, 10 Jul 2020 02:45:19 +0300
Jarkko Sakkinen <[email protected]> wrote:
> Remove MODULES dependency and migrate from module_alloc to vmalloc().
> According to Andi, the history behind this dependency is that kprobes
> originally required custom LKMs, which no longer holds today.
>
> Right now one has to compile in LKM support just to enable kprobes. With
> this change applied, it is somewhat easier to create custom test
> kernels with proper debugging capabilities, thus making Linux more
> developer friendly.
Agreed. Now we have several ways to access kprobes via ftrace/perf/bpf
without modules. So it's a good time to remove the CONFIG_MODULE dependency.
But I also would like to minimize the changes for code readability.
I made some comments below.
>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Jarkko Sakkinen <[email protected]>
> ---
> arch/Kconfig | 1 -
> arch/x86/kernel/kprobes/core.c | 5 +++--
> kernel/kprobes.c | 22 ++++++++++++++++++++--
> kernel/trace/trace_kprobe.c | 12 ++++++++++++
> 4 files changed, 35 insertions(+), 5 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 8cc35dc556c7..bb59cdf335ab 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -61,7 +61,6 @@ config OPROFILE_NMI_TIMER
>
> config KPROBES
> bool "Kprobes"
> - depends on MODULES
> depends on HAVE_KPROBES
> select KALLSYMS
> help
> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> index ada39ddbc922..dc7b8d6fd20d 100644
> --- a/arch/x86/kernel/kprobes/core.c
> +++ b/arch/x86/kernel/kprobes/core.c
> @@ -42,6 +42,7 @@
> #include <linux/moduleloader.h>
> #include <linux/vmalloc.h>
> #include <linux/pgtable.h>
> +#include <linux/vmalloc.h>
>
> #include <asm/text-patching.h>
> #include <asm/cacheflush.h>
> @@ -423,7 +424,7 @@ void *alloc_insn_page(void)
> {
> void *page;
>
> - page = module_alloc(PAGE_SIZE);
> + page = vmalloc(PAGE_SIZE);
No, you can not use vmalloc here. The reason why we use module_alloc()
is to allocate the executable memory for trampoline code.
So, you need to use vmalloc_exec() instead.
> if (!page)
> return NULL;
>
> @@ -446,7 +447,7 @@ void *alloc_insn_page(void)
> /* Recover page to RW mode before releasing it */
> void free_insn_page(void *page)
> {
> - module_memfree(page);
> + vfree(page);
> }
>
> static int arch_copy_kprobe(struct kprobe *p)
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 4a904cc56d68..02680642ea11 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -35,6 +35,7 @@
> #include <linux/ftrace.h>
> #include <linux/cpu.h>
> #include <linux/jump_label.h>
> +#include <linux/vmalloc.h>
>
> #include <asm/sections.h>
> #include <asm/cacheflush.h>
> @@ -111,12 +112,12 @@ enum kprobe_slot_state {
>
> void __weak *alloc_insn_page(void)
> {
> - return module_alloc(PAGE_SIZE);
> + return vmalloc(PAGE_SIZE);
Ditto.
> }
>
> void __weak free_insn_page(void *page)
> {
> - module_memfree(page);
> + vfree(page);
> }
>
> struct kprobe_insn_cache kprobe_insn_slots = {
> @@ -563,8 +564,11 @@ static void kprobe_optimizer(struct work_struct *work)
> mutex_lock(&kprobe_mutex);
> cpus_read_lock();
> mutex_lock(&text_mutex);
> +
> +#ifdef CONFIG_MODULES
> /* Lock modules while optimizing kprobes */
> mutex_lock(&module_mutex);
> +#endif
Hmm, can you reduce these "#ifdef CONFIG_MODULE"s ?
e.g.
#ifdef CONFIG_MODULES
static void lock_modules(void)
{
mutex_lock(&module_mutex);
}
...
#else
#define lock_modules() do { } while (0)
...
#endif
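Spelled out, the pair might look like this (just a sketch of the same
idea, with the matching unlock helper added):

#ifdef CONFIG_MODULES
static void lock_modules(void)
{
	mutex_lock(&module_mutex);
}

static void unlock_modules(void)
{
	mutex_unlock(&module_mutex);
}
#else
#define lock_modules() do { } while (0)
#define unlock_modules() do { } while (0)
#endif

Then kprobe_optimizer() can call lock_modules()/unlock_modules()
unconditionally.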
>
> /*
> * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
> @@ -589,7 +593,9 @@ static void kprobe_optimizer(struct work_struct *work)
> /* Step 4: Free cleaned kprobes after quiesence period */
> do_free_cleaned_kprobes();
>
> +#ifdef CONFIG_MODULES
> mutex_unlock(&module_mutex);
> +#endif
> mutex_unlock(&text_mutex);
> cpus_read_unlock();
>
> @@ -739,6 +745,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
> return 0;
> }
>
> +#ifdef CONFIG_MODULE
> /* Remove optimized instructions */
> static void kill_optimized_kprobe(struct kprobe *p)
> {
> @@ -764,6 +771,7 @@ static void kill_optimized_kprobe(struct kprobe *p)
> /* Don't touch the code, because it is already freed. */
> arch_remove_optimized_kprobe(op);
> }
> +#endif
This optprobe related one is OK to keep in this place.
>
> static inline
> void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
> @@ -1608,6 +1616,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> goto out;
> }
>
> +#ifdef CONFIG_MODULE
> /*
> * If the module freed .init.text, we couldn't insert
> * kprobes in there.
> @@ -1618,6 +1627,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> *probed_mod = NULL;
> ret = -ENOENT;
> }
> +#endif
> }
> out:
> preempt_enable();
> @@ -2090,6 +2100,7 @@ NOKPROBE_SYMBOL(pre_handler_kretprobe);
>
> #endif /* CONFIG_KRETPROBES */
>
> +#ifdef CONFIG_MODULE
> /* Set the kprobe gone and remove its instruction buffer. */
> static void kill_kprobe(struct kprobe *p)
> {
> @@ -2114,6 +2125,7 @@ static void kill_kprobe(struct kprobe *p)
> */
> arch_remove_kprobe(p);
> }
> +#endif
>
> /* Disable one kprobe */
> int disable_kprobe(struct kprobe *kp)
> @@ -2214,6 +2226,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> return 0;
> }
>
> +#ifdef CONFIG_MODULE
> /* Remove all symbols in given area from kprobe blacklist */
> static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> {
> @@ -2231,6 +2244,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> {
> kprobe_remove_area_blacklist(entry, entry + 1);
> }
> +#endif
>
> int __init __weak arch_populate_kprobe_blacklist(void)
> {
> @@ -2274,6 +2288,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> return ret ? : arch_populate_kprobe_blacklist();
> }
>
> +#ifdef CONFIG_MODULE
> static void add_module_kprobe_blacklist(struct module *mod)
> {
> unsigned long start, end;
> @@ -2375,6 +2390,7 @@ static struct notifier_block kprobe_module_nb = {
> .notifier_call = kprobes_module_callback,
> .priority = 0
> };
> +#endif /* CONFIG_MODULE */
>
> /* Markers of _kprobe_blacklist section */
> extern unsigned long __start_kprobe_blacklist[];
> @@ -2425,8 +2441,10 @@ static int __init init_kprobes(void)
> err = arch_init_kprobes();
> if (!err)
> err = register_die_notifier(&kprobe_exceptions_nb);
> +#ifdef CONFIG_MODULE
> if (!err)
> err = register_module_notifier(&kprobe_module_nb);
> +#endif
>
> kprobes_initialized = (err == 0);
>
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index aefb6065b508..30969b38fce8 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -103,6 +103,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
> return !!(kprobe_gone(&tk->rp.kp));
> }
>
> +#ifdef CONFIG_MODULE
> static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> struct module *mod)
> {
> @@ -110,7 +111,9 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> const char *name = trace_kprobe_symbol(tk);
> return strncmp(mod->name, name, len) == 0 && name[len] == ':';
> }
> +#endif
>
> +#ifdef CONFIG_MODULE
> static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> {
> char *p;
> @@ -129,6 +132,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
>
> return ret;
> }
> +#endif
Here, too. If you make a dummy function in case of !CONFIG_MODULE,
we don't need to modify caller-side code.
>
> static bool trace_kprobe_is_busy(struct dyn_event *ev)
> {
> @@ -608,10 +612,12 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
>
> /* Register k*probe */
> ret = __register_trace_kprobe(tk);
> +#ifdef CONFIG_MODULE
> if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> ret = 0;
> }
> +#endif
So, please make a dummy trace_kprobe_module_exist() and keep this untouched.
>
> if (ret)
> trace_probe_unlink(&tk->tp);
> @@ -651,10 +657,12 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
>
> /* Register k*probe */
> ret = __register_trace_kprobe(tk);
> +#ifdef CONFIG_MODULE
> if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> ret = 0;
> }
> +#endif
Ditto.
>
> if (ret < 0)
> unregister_kprobe_event(tk);
> @@ -666,6 +674,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> return ret;
> }
>
> +#ifdef CONFIG_MODULE
> /* Module notifier call back, checking event on the module */
> static int trace_kprobe_module_callback(struct notifier_block *nb,
> unsigned long val, void *data)
> @@ -700,6 +709,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> .notifier_call = trace_kprobe_module_callback,
> .priority = 1 /* Invoked after kprobe module callback */
> };
> +#endif
Unless it causes a build error, please keep it untouched.
>
> /* Convert certain expected symbols into '_' when generating event names */
> static inline void sanitize_event_name(char *name)
> @@ -1891,8 +1901,10 @@ static __init int init_kprobe_trace_early(void)
> if (ret)
> return ret;
>
> +#ifdef CONFIG_MODULE
> if (register_module_notifier(&trace_kprobe_module_nb))
> return -EINVAL;
> +#endif
Ditto.
Thank you,
>
> return 0;
> }
> --
> 2.25.1
>
--
Masami Hiramatsu <[email protected]>
On Fri, Jul 10, 2020 at 11:03:44AM +0200, Peter Zijlstra wrote:
> On Fri, Jul 10, 2020 at 02:45:19AM +0300, Jarkko Sakkinen wrote:
> > Remove MODULES dependency and migrate from module_alloc to vmalloc().
> > According to Andi, the history behind this dependency is that kprobes
> > originally required custom LKMs, which no longer holds today.
> >
> > Right now one has to compile in LKM support just to enable kprobes. With
> > this change applied, it is somewhat easier to create custom test
> > kernels with proper debugging capabilities, thus making Linux more
> > developer friendly.
> >
> > Cc: Andi Kleen <[email protected]>
> > Signed-off-by: Jarkko Sakkinen <[email protected]>
>
> NAK
>
> this patch is horrific, it sprinkles a metric ton of #ifdef and silently
> disables a lot of kprobe features (like all the opt stuff).
Perfectly understandable. I just drafted something quick and dirty
together for the idea's sake (and added the RFC tag to state that).
The application where I use this change is when I refactor the large patch
set that I'm working on (namely the SGX patch set in my case). I just want
to squeeze all the extra out of the kernel build and still have a means for
instrumentation. A static kernel is very convenient for this kind of
purpose, as with an EFI stub and statically linked user space you can have
a single test binary.
> How about unconditionally providing module_alloc() instead?
I believe so, yes.
Just so that I know (and learn), what exactly disabled optprobes?
I'm not too familiar with this part of the kernel - that's why I'm asking.
Does the module_alloc to vmalloc change disable it?
/Jarkko
On Fri, Jul 10, 2020 at 01:36:38PM +0300, Jarkko Sakkinen wrote:
> Just so that I know (and learn), what exactly disabled optprobes?
So regular, old-skool style kprobe is:
- copy original instruction out
- replace instruction with breakpoint (int3 on x86)
- have exception handler return to the copied instruction with
single-step on
- have single step exception handler return to the original
instruction stream
which is 2 exceptions.
optprobes avoid the single-step by not only writing a single
instruction, but additionally placing a JMP instruction behind it such
that it will automagically continue in the original instruction stream.
This brings the requirement that the copied instruction is placed
within the JMP displacement of the regular kernel text (s32 on x86).
module_alloc() ensures the memory provided is within that range.
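For reference, the x86 module_alloc() looks roughly like this
(simplified from arch/x86/kernel/module.c; KASAN handling omitted):

void *module_alloc(unsigned long size)
{
	if (PAGE_ALIGN(size) > MODULES_LEN)
		return NULL;

	/* The MODULES_VADDR..MODULES_END window is laid out within s32
	 * (+-2GB) reach of the kernel text, which is the property the
	 * optimizations above depend on. */
	return __vmalloc_node_range(size, MODULE_ALIGN,
				MODULES_VADDR + get_module_load_offset(),
				MODULES_END, GFP_KERNEL, PAGE_KERNEL,
				0, NUMA_NO_NODE,
				__builtin_return_address(0));
}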
On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > - page = module_alloc(PAGE_SIZE);
> > + page = vmalloc(PAGE_SIZE);
>
> No, you can not use vmalloc here. The reason why we use module_alloc()
> is to allocate the executable memory for trampoline code.
> So, you need to use vmalloc_exec() instead.
vmalloc_exec() would be broken too, also hch recently got rid of that
thing.
module_alloc() really is the only sane choice here.
We should make module_alloc() unconditionally available, and maybe even
rename it to text_alloc().
On Fri, Jul 10, 2020 at 01:32:38PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > > - page = module_alloc(PAGE_SIZE);
> > > + page = vmalloc(PAGE_SIZE);
> >
> > No, you can not use vmalloc here. The reason why we use module_alloc()
> > is to allocate the executable memory for trampoline code.
> > So, you need to use vmalloc_exec() instead.
>
> vmalloc_exec() would be broken too, also hch recently got rid of that
> thing.
>
> module_alloc() really is the only sane choice here.
>
> We should make module_alloc() unconditionally available, and maybe even
> rename it to text_alloc().
I think unconditionally might be a bit too much, but
MODULES || KPROBES or a new symbol selected by them makes sense. As does
the rename.
On Fri, 10 Jul 2020 13:32:38 +0200
Peter Zijlstra <[email protected]> wrote:
> On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > > - page = module_alloc(PAGE_SIZE);
> > > + page = vmalloc(PAGE_SIZE);
> >
> > No, you can not use vmalloc here. The reason why we use module_alloc()
> > is to allocate the executable memory for trampoline code.
> > So, you need to use vmalloc_exec() instead.
>
> vmalloc_exec() would be broken too, also hch recently got rid of that
> thing.
>
> module_alloc() really is the only sane choice here.
>
> We should make module_alloc() unconditionally available, and maybe even
> rename it to text_alloc().
Agreed. As far as I know, ftrace and bpf also depend on module_alloc(),
so text_alloc() will help them too.
Thank you,
--
Masami Hiramatsu <[email protected]>
On Fri, 10 Jul 2020 22:18:02 +0900
Masami Hiramatsu <[email protected]> wrote:
>
> Agreed. As far as I know, ftrace and bpf also depend on module_alloc(),
> so text_alloc() will help them too.
>
Yes please.
arch/x86/kernel/ftrace.c:
#ifdef CONFIG_MODULES
#include <linux/moduleloader.h>
/* Module allocation simplifies allocating memory for code */
static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
static inline void tramp_free(void *tramp)
{
module_memfree(tramp);
}
#else
/* Trampolines can only be created if modules are supported */
static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
static inline void tramp_free(void *tramp) { }
#endif
-- Steve
On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> On Fri, 10 Jul 2020 02:45:19 +0300
> Jarkko Sakkinen <[email protected]> wrote:
> > +#ifdef CONFIG_MODULES
> > /* Lock modules while optimizing kprobes */
> > mutex_lock(&module_mutex);
> > +#endif
>
> Hmm, can you reduce these "#ifdef CONFIG_MODULE"s ?
>
> e.g.
>
> #ifdef CONFIG_MODULES
> static void lock_modules(void)
> {
> mutex_lock(&module_mutex);
> }
> ...
> #else
> #define lock_modules() do { } while (0)
> ...
> #endif
I prefer using "static inline" for no-op functions just because they
will maintain argument type validation by the compiler regardless of the
CONFIG state (though it doesn't really matter here since it's void).
#else
static inline void lock_modules(void) { }
#endif
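With a non-void argument the difference is more visible; e.g. a sketch
using one of the helpers from this patch:

#ifdef CONFIG_MODULES
static void add_module_kprobe_blacklist(struct module *mod)
{
	/* ... real work ... */
}
#else
/* The stub still type-checks the struct module * argument at every
 * call site; a #define stub would not. */
static inline void add_module_kprobe_blacklist(struct module *mod) { }
#endif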
--
Kees Cook
On Fri, Jul 10, 2020 at 12:49:10PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 10, 2020 at 01:36:38PM +0300, Jarkko Sakkinen wrote:
> > Just so that I know (and learn), what exactly disabled optprobes?
>
> So regular, old-skool style kprobe is:
>
> - copy original instruction out
> - replace instruction with breakpoint (int3 on x86)
> - have exception handler return to the copied instruction with
> single-step on
> - have single step exception handler return to the original
> instruction stream
>
> which is 2 exceptions.
Out of pure interest, how does it handle a jump (as the original
opcode), given that it single steps a copy?
> optprobes avoid the single-step by not only writing a single
> instruction, but additionally placing a JMP instruction behind it such
> that it will automagically continue in the original instruction stream.
>
> This brings the requirement that the copied instruction is placed
> within the JMP displacement of the regular kernel text (s32 on x86).
>
> module_alloc() ensures the memory provided is within that range.
Right, a relative jump is placed instead of the 0xcc breakpoint?
/Jarkko
On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> Hi Jarkko,
>
> On Fri, 10 Jul 2020 02:45:19 +0300
> Jarkko Sakkinen <[email protected]> wrote:
>
> > Remove MODULES dependency and migrate from module_alloc to vmalloc().
> > According to Andi, the history behind this dependency is that kprobes
> > originally required custom LKMs, which no longer holds today.
> >
> > Right now one has to compile in LKM support just to enable kprobes. With
> > this change applied, it is somewhat easier to create custom test
> > kernels with proper debugging capabilities, thus making Linux more
> > developer friendly.
>
> Agreed. Now we have several ways to access kprobes via ftrace/perf/bpf
> without modules. So it's a good time to remove the CONFIG_MODULE dependency.
>
> But I also would like to minimize the changes for code readability.
> I made some comments below.
Thanks, appreciate this.
> > Cc: Andi Kleen <[email protected]>
> > Signed-off-by: Jarkko Sakkinen <[email protected]>
> > ---
> > arch/Kconfig | 1 -
> > arch/x86/kernel/kprobes/core.c | 5 +++--
> > kernel/kprobes.c | 22 ++++++++++++++++++++--
> > kernel/trace/trace_kprobe.c | 12 ++++++++++++
> > 4 files changed, 35 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index 8cc35dc556c7..bb59cdf335ab 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -61,7 +61,6 @@ config OPROFILE_NMI_TIMER
> >
> > config KPROBES
> > bool "Kprobes"
> > - depends on MODULES
> > depends on HAVE_KPROBES
> > select KALLSYMS
> > help
> > diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> > index ada39ddbc922..dc7b8d6fd20d 100644
> > --- a/arch/x86/kernel/kprobes/core.c
> > +++ b/arch/x86/kernel/kprobes/core.c
> > @@ -42,6 +42,7 @@
> > #include <linux/moduleloader.h>
> > #include <linux/vmalloc.h>
> > #include <linux/pgtable.h>
> > +#include <linux/vmalloc.h>
> >
> > #include <asm/text-patching.h>
> > #include <asm/cacheflush.h>
> > @@ -423,7 +424,7 @@ void *alloc_insn_page(void)
> > {
> > void *page;
> >
> > - page = module_alloc(PAGE_SIZE);
> > + page = vmalloc(PAGE_SIZE);
>
> No, you can not use vmalloc here. The reason why we use module_alloc()
> is to allocate the executable memory for trampoline code.
> So, you need to use vmalloc_exec() instead.
Right, I'm on board with this after reading Peter's explanation.
>
> > if (!page)
> > return NULL;
> >
> > @@ -446,7 +447,7 @@ void *alloc_insn_page(void)
> > /* Recover page to RW mode before releasing it */
> > void free_insn_page(void *page)
> > {
> > - module_memfree(page);
> > + vfree(page);
> > }
> >
> > static int arch_copy_kprobe(struct kprobe *p)
> > diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> > index 4a904cc56d68..02680642ea11 100644
> > --- a/kernel/kprobes.c
> > +++ b/kernel/kprobes.c
> > @@ -35,6 +35,7 @@
> > #include <linux/ftrace.h>
> > #include <linux/cpu.h>
> > #include <linux/jump_label.h>
> > +#include <linux/vmalloc.h>
> >
> > #include <asm/sections.h>
> > #include <asm/cacheflush.h>
> > @@ -111,12 +112,12 @@ enum kprobe_slot_state {
> >
> > void __weak *alloc_insn_page(void)
> > {
> > - return module_alloc(PAGE_SIZE);
> > + return vmalloc(PAGE_SIZE);
>
> Ditto.
>
> > }
> >
> > void __weak free_insn_page(void *page)
> > {
> > - module_memfree(page);
> > + vfree(page);
> > }
> >
> > struct kprobe_insn_cache kprobe_insn_slots = {
> > @@ -563,8 +564,11 @@ static void kprobe_optimizer(struct work_struct *work)
> > mutex_lock(&kprobe_mutex);
> > cpus_read_lock();
> > mutex_lock(&text_mutex);
> > +
> > +#ifdef CONFIG_MODULES
> > /* Lock modules while optimizing kprobes */
> > mutex_lock(&module_mutex);
> > +#endif
>
> Hmm, can you reduce these "#ifdef CONFIG_MODULE"s ?
>
> e.g.
>
> #ifdef CONFIG_MODULES
> static void lock_modules(void)
> {
> mutex_lock(&module_mutex);
> }
> ...
> #else
> #define lock_modules() do { } while (0)
> ...
> #endif
Yes, I'll also use this idea to address the issues reported below.
>
> >
> > /*
> > * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
> > @@ -589,7 +593,9 @@ static void kprobe_optimizer(struct work_struct *work)
> > /* Step 4: Free cleaned kprobes after quiesence period */
> > do_free_cleaned_kprobes();
> >
> > +#ifdef CONFIG_MODULES
> > mutex_unlock(&module_mutex);
> > +#endif
> > mutex_unlock(&text_mutex);
> > cpus_read_unlock();
> >
> > @@ -739,6 +745,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
> > return 0;
> > }
> >
>
> > +#ifdef CONFIG_MODULE
> > /* Remove optimized instructions */
> > static void kill_optimized_kprobe(struct kprobe *p)
> > {
> > @@ -764,6 +771,7 @@ static void kill_optimized_kprobe(struct kprobe *p)
> > /* Don't touch the code, because it is already freed. */
> > arch_remove_optimized_kprobe(op);
> > }
> > +#endif
>
> This optprobe related one is OK to keep in this place.
>
> >
> > static inline
> > void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
> > @@ -1608,6 +1616,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > goto out;
> > }
> >
> > +#ifdef CONFIG_MODULE
> > /*
> > * If the module freed .init.text, we couldn't insert
> > * kprobes in there.
> > @@ -1618,6 +1627,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
> > *probed_mod = NULL;
> > ret = -ENOENT;
> > }
> > +#endif
> > }
> > out:
> > preempt_enable();
> > @@ -2090,6 +2100,7 @@ NOKPROBE_SYMBOL(pre_handler_kretprobe);
> >
> > #endif /* CONFIG_KRETPROBES */
> >
> > +#ifdef CONFIG_MODULE
> > /* Set the kprobe gone and remove its instruction buffer. */
> > static void kill_kprobe(struct kprobe *p)
> > {
> > @@ -2114,6 +2125,7 @@ static void kill_kprobe(struct kprobe *p)
> > */
> > arch_remove_kprobe(p);
> > }
> > +#endif
> >
> > /* Disable one kprobe */
> > int disable_kprobe(struct kprobe *kp)
> > @@ -2214,6 +2226,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
> > return 0;
> > }
> >
> > +#ifdef CONFIG_MODULE
> > /* Remove all symbols in given area from kprobe blacklist */
> > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
> > {
> > @@ -2231,6 +2244,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
> > {
> > kprobe_remove_area_blacklist(entry, entry + 1);
> > }
> > +#endif
> >
> > int __init __weak arch_populate_kprobe_blacklist(void)
> > {
> > @@ -2274,6 +2288,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
> > return ret ? : arch_populate_kprobe_blacklist();
> > }
> >
> > +#ifdef CONFIG_MODULE
> > static void add_module_kprobe_blacklist(struct module *mod)
> > {
> > unsigned long start, end;
> > @@ -2375,6 +2390,7 @@ static struct notifier_block kprobe_module_nb = {
> > .notifier_call = kprobes_module_callback,
> > .priority = 0
> > };
> > +#endif /* CONFIG_MODULE */
> >
> > /* Markers of _kprobe_blacklist section */
> > extern unsigned long __start_kprobe_blacklist[];
> > @@ -2425,8 +2441,10 @@ static int __init init_kprobes(void)
> > err = arch_init_kprobes();
> > if (!err)
> > err = register_die_notifier(&kprobe_exceptions_nb);
> > +#ifdef CONFIG_MODULE
> > if (!err)
> > err = register_module_notifier(&kprobe_module_nb);
> > +#endif
> >
> > kprobes_initialized = (err == 0);
> >
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index aefb6065b508..30969b38fce8 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -103,6 +103,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
> > return !!(kprobe_gone(&tk->rp.kp));
> > }
> >
> > +#ifdef CONFIG_MODULE
> > static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> > struct module *mod)
> > {
> > @@ -110,7 +111,9 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> > const char *name = trace_kprobe_symbol(tk);
> > return strncmp(mod->name, name, len) == 0 && name[len] == ':';
> > }
> > +#endif
> >
> > +#ifdef CONFIG_MODULE
> > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> > {
> > char *p;
> > @@ -129,6 +132,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
> >
> > return ret;
> > }
> > +#endif
>
> Here, too. If you make a dummy function in case of !CONFIG_MODULE,
> we don't need to modify caller-side code.
>
> >
> > static bool trace_kprobe_is_busy(struct dyn_event *ev)
> > {
> > @@ -608,10 +612,12 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
> >
> > /* Register k*probe */
> > ret = __register_trace_kprobe(tk);
> > +#ifdef CONFIG_MODULE
> > if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> > pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> > ret = 0;
> > }
> > +#endif
>
> So, please make a dummy trace_kprobe_module_exist() and keep this untouched.
>
> >
> > if (ret)
> > trace_probe_unlink(&tk->tp);
> > @@ -651,10 +657,12 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> >
> > /* Register k*probe */
> > ret = __register_trace_kprobe(tk);
> > +#ifdef CONFIG_MODULE
> > if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
> > pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
> > ret = 0;
> > }
> > +#endif
>
> Ditto.
>
> >
> > if (ret < 0)
> > unregister_kprobe_event(tk);
> > @@ -666,6 +674,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> > return ret;
> > }
> >
> > +#ifdef CONFIG_MODULE
> > /* Module notifier call back, checking event on the module */
> > static int trace_kprobe_module_callback(struct notifier_block *nb,
> > unsigned long val, void *data)
> > @@ -700,6 +709,7 @@ static struct notifier_block trace_kprobe_module_nb = {
> > .notifier_call = trace_kprobe_module_callback,
> > .priority = 1 /* Invoked after kprobe module callback */
> > };
> > +#endif
>
> Unless it causes a build error, please keep it untouched.
>
> >
> > /* Convert certain expected symbols into '_' when generating event names */
> > static inline void sanitize_event_name(char *name)
> > @@ -1891,8 +1901,10 @@ static __init int init_kprobe_trace_early(void)
> > if (ret)
> > return ret;
> >
> > +#ifdef CONFIG_MODULE
> > if (register_module_notifier(&trace_kprobe_module_nb))
> > return -EINVAL;
> > +#endif
>
> Ditto.
>
> Thank you,
Thanks a lot for the remarks.
>
> >
> > return 0;
> > }
> > --
> > 2.25.1
> >
>
>
> --
> Masami Hiramatsu <[email protected]>
/Jarkko
On Fri, Jul 10, 2020 at 01:32:38PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > > - page = module_alloc(PAGE_SIZE);
> > > + page = vmalloc(PAGE_SIZE);
> >
> > No, you can not use vmalloc here. The reason why we use module_alloc()
> > is to allocate the executable memory for trampoline code.
> > So, you need to use vmalloc_exec() instead.
>
> vmalloc_exec() would be broken too, also hch recently got rid of that
> thing.
>
> module_alloc() really is the only sane choice here.
>
> We should make module_alloc() unconditionally available, and maybe even
> rename it to text_alloc().
Thanks for the remarks.
The current module_alloc looks like this:
void * __weak module_alloc(unsigned long size)
{
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
NUMA_NO_NODE, __builtin_return_address(0));
}
What if I create inline functions for text_alloc() and text_memfree() and
convert this function to:
void * __weak module_alloc(unsigned long size)
{
return text_alloc(size);
}
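where the helpers would be something like (a rough sketch; this is just
the generic __weak fallback hoisted out, arch overrides would remain):

static inline void *text_alloc(unsigned long size)
{
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
			NUMA_NO_NODE, __builtin_return_address(0));
}

static inline void text_memfree(void *ptr)
{
	vfree(ptr);
}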
/Jarkko
On Fri, Jul 10, 2020 at 03:04:29PM +0200, Christoph Hellwig wrote:
> On Fri, Jul 10, 2020 at 01:32:38PM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > > > - page = module_alloc(PAGE_SIZE);
> > > > + page = vmalloc(PAGE_SIZE);
> > >
> > > No, you can not use vmalloc here. The reason why we use module_alloc()
> > > is to allocate the executable memory for trampoline code.
> > > So, you need to use vmalloc_exec() instead.
> >
> > vmalloc_exec() would be broken too, also hch recently got rid of that
> > thing.
> >
> > module_alloc() really is the only sane choice here.
> >
> > We should make module_alloc() unconditionally available, and maybe even
> > rename it to text_alloc().
>
> I think unconditionally might be a bit too much, but
> MODULES || KPROBES or a new symbol selected by them makes sense. As does
> the rename.
Given that they are simple wrappers, would it be too much harm to have
inline functions for text_alloc() and text_memfree() and use them inside
module_alloc() and module_memfree() in order not to initially turn things
over too much?
/Jarkko
On Fri, Jul 10, 2020 at 08:51:56AM -0700, Kees Cook wrote:
> On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > On Fri, 10 Jul 2020 02:45:19 +0300
> > Jarkko Sakkinen <[email protected]> wrote:
> > > +#ifdef CONFIG_MODULES
> > > /* Lock modules while optimizing kprobes */
> > > mutex_lock(&module_mutex);
> > > +#endif
> >
> > Hmm, can you reduce these "#ifdef CONFIG_MODULE"s ?
> >
> > e.g.
> >
> > #ifdef CONFIG_MODULES
> > static void lock_modules(void)
> > {
> > mutex_lock(&module_mutex);
> > }
> > ...
> > #else
> > #define lock_modules() do { } while (0)
> > ...
> > #endif
>
> I prefer using "static inline" for no-op functions just because they
> will maintain argument type validation by the compiler regardless of the
> CONFIG state (though it doesn't really matter here since it's void).
>
> #else
> static inline void lock_modules(void) { }
> #endif
>
> --
> Kees Cook
Thanks Kees, good remark.
/Jarkko
On Fri, Jul 10, 2020 at 09:22:43AM -0400, Steven Rostedt wrote:
> On Fri, 10 Jul 2020 22:18:02 +0900
> Masami Hiramatsu <[email protected]> wrote:
>
> >
> > Agreed. As far as I know, ftrace and bpf also depend on module_alloc(),
> > so text_alloc() will help them too.
> >
>
> Yes please.
>
> arch/x86/kernel/ftrace.c:
>
> #ifdef CONFIG_MODULES
> #include <linux/moduleloader.h>
> /* Module allocation simplifies allocating memory for code */
> static inline void *alloc_tramp(unsigned long size)
> {
> return module_alloc(size);
> }
> static inline void tramp_free(void *tramp)
> {
> module_memfree(tramp);
> }
> #else
> /* Trampolines can only be created if modules are supported */
> static inline void *alloc_tramp(unsigned long size)
> {
> return NULL;
> }
> static inline void tramp_free(void *tramp) { }
> #endif
>
> -- Steve
Thanks, note taken. I'll take this into account in the next version.
/Jarkko
On Mon, Jul 13, 2020 at 08:05:49AM +0300, Jarkko Sakkinen wrote:
> On Fri, Jul 10, 2020 at 12:49:10PM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 10, 2020 at 01:36:38PM +0300, Jarkko Sakkinen wrote:
> > > Just so that I know (and learn), what exactly disabled optprobes?
> >
> > So regular, old-skool style kprobe is:
> >
> > - copy original instruction out
> > - replace instruction with breakpoint (int3 on x86)
> > - have exception handler return to the copied instruction with
> > single-step on
> > - have single step exception handler return to the original
> > instruction stream
> >
> > which is 2 exceptions.
>
> Out of pure interest, how does it handle a jump (as the original
> opcode), given that it single steps a copy?
Good question, I hadn't ever looked at that detail. Anyway, I dug around
a little and it disallows 'boosting' (replacing the single-step with a jmp)
for jump instructions and takes the double exception. It single-steps
the original jump into 'thin air' and does a relative fixup of the
resulting IP in the single-step exception.
For more details also see
arch/x86/kernel/kprobes/core.c:resume_execution().
> > optprobes avoid the single-step by not only writing a single
> > instruction, but additionally placing a JMP instruction behind it such
> > that it will automagically continue in the original instruction stream.
> >
> > This brings the requirement that the copied instruction is placed
> > within the JMP displacement of the regular kernel text (s32 on x86).
> >
> > module_alloc() ensures the memory provided is within that range.
>
> Right, a relative jump is placed instead of the 0xcc breakpoint?
So there's all sorts of optimizations. The one I was talking about is
apparently called boosting. That one still uses INT3 but avoids (where
possible) the single-step #DB trap by appending a JMP.d32 after it.
There are also fully optimized kprobes, which avoid all traps by replacing
the original instruction(s) with a JMP.d32 to a trampoline; this
trampoline calls the kprobe handler, after which it runs the original
instruction(s) and then takes a JMP.d32 back to where we came from.
These fully optimized kprobes have very specific constraints, best to
read the code if you want more details.
Anyway, the common theme here is that all the various optimizations rely
on the out-of-line text being within the s32 displacement of relative
jumps.
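For the record, that reachability requirement is enforced when the
detour buffer is set up; simplified from
arch/x86/kernel/kprobes/opt.c:arch_prepare_optimized_kprobe() (exact
names vary between kernel versions):

	long rel;

	/* Check that the gap fits a rel32 (s32) displacement, since a
	 * relative JMP is used to reach the detour buffer. */
	rel = (long)slot - (long)op->kp.addr + RELATIVE_ADDR_SIZE;
	if (abs(rel) > 0x7fffffff) {
		__arch_remove_optimized_kprobe(op, 0);
		return -ERANGE;
	}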
On Mon, 13 Jul 2020 08:49:55 +0300
Jarkko Sakkinen <[email protected]> wrote:
> On Fri, Jul 10, 2020 at 01:32:38PM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 10, 2020 at 07:32:57PM +0900, Masami Hiramatsu wrote:
> > > > - page = module_alloc(PAGE_SIZE);
> > > > + page = vmalloc(PAGE_SIZE);
> > >
> > > No, you can not use vmalloc here. The reason why we use module_alloc()
> > > is to allocate the executable memory for trampoline code.
> > > So, you need to use vmalloc_exec() instead.
> >
> > vmalloc_exec() would be broken too, also hch recently got rid of that
> > thing.
> >
> > module_alloc() really is the only sane choice here.
> >
> > We should make module_alloc() unconditionally available, and maybe even
> > rename it to text_alloc().
>
> Thanks for the remarks.
>
> The current module_alloc looks like this:
>
> void * __weak module_alloc(unsigned long size)
> {
> return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> NUMA_NO_NODE, __builtin_return_address(0));
> }
>
> What if I create inline functions for text_alloc() and text_memfree() and
> convert this function to:
>
> void * __weak module_alloc(unsigned long size)
> {
> return text_alloc(size);
> }
Yeah, that'll be good.
Thank you,
--
Masami Hiramatsu <[email protected]>
On Mon, 13 Jul 2020 08:05:49 +0300
Jarkko Sakkinen <[email protected]> wrote:
> On Fri, Jul 10, 2020 at 12:49:10PM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 10, 2020 at 01:36:38PM +0300, Jarkko Sakkinen wrote:
> > > Just so that I know (and learn), what exactly disabled optprobes?
> >
> > So regular, old-skool style kprobe is:
> >
> > - copy original instruction out
> > - replace instruction with breakpoint (int3 on x86)
> > - have exception handler return to the copied instruction with
> > single-step on
> > - have single step exception handler return to the original
> > instruction stream
> >
> > which is 2 exceptions.
>
> Out of pure interest, how does it handle a jump (as the original
> opcode), given that it single steps a copy?
Yes, the jump will be executed with a single-step in the copy buffer,
and kprobes (on x86) fixes up the result; this means we modify
regs->ip. Also, there are some architectures which emulate the
jump instead of single-stepping.
>
> > optprobes avoid the single-step by not only writing a single
> > instruction, but additionally placing a JMP instruction behind it such
> > that it will automagically continue in the original instruction stream.
> >
> > This brings the requirement that the copied instruction is placed
> > within the JMP displacement of the regular kernel text (s32 on x86).
> >
> > module_alloc() ensures the memory provided is within that range.
>
> Right, a relative jump is placed instead of 0xcc to the breakpoint?
Yes, a relative (far) jump is used. So the target address (the copied buffer)
must be within a +-2GB range of the jump.
Thank you,
--
Masami Hiramatsu <[email protected]>