Hello,
Yet, again, a new version. I integrated a full probe management mechanism. See
below.
Comments are welcome,
Mathieu
---BEGIN---
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1082,6 +1082,8 @@ config KPROBES
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+source "kernel/Kconfig.marker"
+
source "ltt/Kconfig"
endmenu
--- /dev/null
+++ b/include/linux/marker.h
@@ -0,0 +1,118 @@
+/*****************************************************************************
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing.
+ *
+ * Example :
+ *
+ * MARK(subsystem_event, "%d %s", someint, somestring);
+ * Where :
+ * - Subsystem is the name of your subsystem.
+ * - event is the name of the event to mark.
+ * - "%d %s" is the formatted string for printk.
+ * - someint is an integer.
+ * - somestring is a char *.
+ * - subsystem_event must be unique throughout the kernel!
+ *
+ * Dynamically overridable function call based on marker mechanism
+ * from Frank Ch. Eigler <[email protected]>.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#define MARK_KPROBE_PREFIX "__mark_kprobe_"
+#define MARK_CALL_PREFIX "__mark_call_"
+#define MARK_JUMP_SELECT_PREFIX "__mark_jump_select_"
+#define MARK_JUMP_CALL_PREFIX "__mark_jump_call_"
+#define MARK_JUMP_INLINE_PREFIX "__mark_jump_inline_"
+#define MARK_JUMP_OVER_PREFIX "__mark_jump_over_"
+
+#ifdef CONFIG_MARK_SYMBOL
+#define MARK_SYM(name) \
+ do { \
+ __label__ here; \
+ here: asm volatile \
+ (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \
+ } while(0)
+#else
+#define MARK_SYM(name)
+#endif
+
+#ifdef CONFIG_MARK_JUMP_CALL
+#define MARK_JUMP_CALL_PROTOTYPE(name) \
+ static void \
+ (*__mark_call_##name)(const char *fmt, ...) \
+ asm (MARK_CALL_PREFIX#name) = \
+ __mark_empty_function
+#define MARK_JUMP_CALL(name, format, args...) \
+ do { \
+ preempt_disable(); \
+ (*__mark_call_##name)(format, ## args); \
+ preempt_enable_no_resched(); \
+ } while(0)
+#else
+#define MARK_JUMP_CALL_PROTOTYPE(name)
+#define MARK_JUMP_CALL(name, format, args...)
+#endif
+
+#ifdef CONFIG_MARK_JUMP_INLINE
+#define MARK_JUMP_INLINE(name, format, args...) \
+ (void) (__mark_inline_##name(format, ## args))
+#else
+#define MARK_JUMP_INLINE(name, format, args...)
+#endif
+
+#define MARK_JUMP(name, format, args...) \
+ do { \
+ __label__ over_label, call_label, inline_label; \
+ volatile static void *__mark_jump_select_##name \
+ asm (MARK_JUMP_SELECT_PREFIX#name) = \
+ &&over_label; \
+ volatile static void *__mark_jump_call_##name \
+ asm (MARK_JUMP_CALL_PREFIX#name) \
+ __attribute__((unused)) = \
+ &&call_label; \
+ volatile static void *__mark_jump_inline_##name \
+ asm (MARK_JUMP_INLINE_PREFIX#name) \
+ __attribute__((unused)) = \
+ &&inline_label; \
+ volatile static void *__mark_jump_over_##name \
+ asm (MARK_JUMP_OVER_PREFIX#name) \
+ __attribute__((unused)) = \
+ &&over_label; \
+ MARK_JUMP_CALL_PROTOTYPE(name); \
+ goto *__mark_jump_select_##name; \
+call_label: \
+ MARK_JUMP_CALL(name, format, ## args); \
+ goto over_label; \
+inline_label: \
+ MARK_JUMP_INLINE(name, format, ## args); \
+over_label: \
+ do {} while(0); \
+ } while(0)
+
+#define MARK(name, format, args...) \
+ do { \
+ __mark_check_format(format, ## args); \
+ MARK_SYM(name); \
+ MARK_JUMP(name, format, ## args); \
+ } while(0)
+
+enum marker_type { MARKER_CALL, MARKER_INLINE };
+
+typedef void (*marker_probe)(const char *fmt, ...);
+
+static inline __attribute__ ((format (printf, 1, 2)))
+void __mark_check_format(const char *fmt, ...)
+{ }
+
+extern void __mark_empty_function(const char *fmt, ...);
+
+int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type);
+
+void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type);
--- /dev/null
+++ b/kernel/Kconfig.marker
@@ -0,0 +1,31 @@
+# Code markers configuration
+
+menu "Marker configuration"
+
+
+config MARK_SYMBOL
+ bool "Replace markers with symbols"
+ default n
+ help
+ Put symbols in place of markers, useful for kprobe.
+
+config MARK_JUMP_CALL
+ bool "Replace markers with a jump over an inactive function call"
+ default n
+ help
+ Put a jump over a call in place of markers.
+
+config MARK_JUMP_INLINE
+ bool "Replace markers with a jump over an inline function"
+ default n
+ help
+ Put a jump over an inline function.
+
+config MARK_JUMP
+ bool "Jump marker probes set/disable infrastructure"
+ select KALLSYMS
+ default n
+ help
+ Install or remove probes from markers.
+
+endmenu
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_MARK_JUMP) += marker.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,178 @@
+/*****************************************************************************
+ * marker.c
+ *
+ * Code markup for dynamic and static tracing. Marker control module.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ *
+ * Design :
+ * kernel/marker.c deals with all marker activation from a centralized,
+ * coherent mechanism. The functions that will be called will simply sit in
+ * modules.
+ *
+ * Before activating a probe, the marker module :
+ * 1 - takes proper locking
+ * 2 - verifies that the function pointer and jmp target are at their default
+ * values, otherwise the "set" operation fails.
+ * 3 - does function pointer and jump setup.
+ *
+ * Setting them back to disabled is :
+ * 1 - setting back the default jmp and call values
+ * 2 - call synchronize_sched()
+ *
+ * A probe module must call marker disable on all its markers before module
+ * unload.
+ *
+ * The marker module will also deal with inline jump selection, which is
+ * the same case as presented here, but without the function pointer.
+ */
+
+
+#include <linux/marker.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/string.h>
+
+static DEFINE_SPINLOCK(marker_lock);
+
+struct marker_pointers {
+ void **call;
+ void **jmpselect;
+ void **jmpcall;
+ void **jmpinline;
+ void **jmpover;
+};
+
+void __mark_empty_function(const char *fmt, ...)
+{
+}
+EXPORT_SYMBOL(__mark_empty_function);
+
+/* Pointers can be used around preemption disabled */
+static int marker_get_pointers(const char *name,
+ struct marker_pointers *ptrs)
+{
+ char call_sym[KSYM_NAME_LEN] = MARK_CALL_PREFIX;
+ char jmpselect_sym[KSYM_NAME_LEN] = MARK_JUMP_SELECT_PREFIX;
+ char jmpcall_sym[KSYM_NAME_LEN] = MARK_JUMP_CALL_PREFIX;
+ char jmpinline_sym[KSYM_NAME_LEN] = MARK_JUMP_INLINE_PREFIX;
+ char jmpover_sym[KSYM_NAME_LEN] = MARK_JUMP_OVER_PREFIX;
+ unsigned int call_sym_len = sizeof(MARK_CALL_PREFIX);
+ unsigned int jmpselect_sym_len = sizeof(MARK_JUMP_SELECT_PREFIX);
+ unsigned int jmpcall_sym_len = sizeof(MARK_JUMP_CALL_PREFIX);
+ unsigned int jmpinline_sym_len = sizeof(MARK_JUMP_INLINE_PREFIX);
+ unsigned int jmpover_sym_len = sizeof(MARK_JUMP_OVER_PREFIX);
+
+ strncat(call_sym, name, KSYM_NAME_LEN-call_sym_len);
+ strncat(jmpselect_sym, name, KSYM_NAME_LEN-jmpselect_sym_len);
+ strncat(jmpcall_sym, name, KSYM_NAME_LEN-jmpcall_sym_len);
+ strncat(jmpinline_sym, name, KSYM_NAME_LEN-jmpinline_sym_len);
+ strncat(jmpover_sym, name, KSYM_NAME_LEN-jmpover_sym_len);
+
+ ptrs->call = (void**)kallsyms_lookup_name(call_sym);
+ ptrs->jmpselect = (void**)kallsyms_lookup_name(jmpselect_sym);
+ ptrs->jmpcall = (void**)kallsyms_lookup_name(jmpcall_sym);
+ ptrs->jmpinline = (void**)kallsyms_lookup_name(jmpinline_sym);
+ ptrs->jmpover = (void**)kallsyms_lookup_name(jmpover_sym);
+
+ if(!(ptrs->call && ptrs->jmpselect && ptrs->jmpcall
+ && ptrs->jmpinline && ptrs->jmpover)) {
+ return ENOENT;
+ }
+ return 0;
+}
+
+int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type)
+{
+ int result = 0;
+ struct marker_pointers ptrs;
+
+ spin_lock(&marker_lock);
+ result = marker_get_pointers(name, &ptrs);
+ if(result) {
+ printk(KERN_NOTICE
+ "Unable to find kallsyms for markers in %s\n",
+ name);
+ goto unlock;
+ }
+
+ switch(type) {
+ case MARKER_CALL:
+ if(*ptrs.call != __mark_empty_function) {
+ result = EBUSY;
+ printk(KERN_NOTICE
+ "Probe already installed on "
+ "marker in %s\n",
+ name);
+ goto unlock;
+ }
+ /* Setup the call pointer */
+ *ptrs.call = probe;
+ /* Setup the jump */
+ *ptrs.jmpselect = *ptrs.jmpcall;
+ break;
+ case MARKER_INLINE:
+ if(*ptrs.jmpover == *ptrs.jmpinline) {
+ result = ENODEV;
+ printk(KERN_NOTICE
+ "No inline probe exists "
+ "for marker in %s\n",
+ name);
+ goto unlock;
+ }
+ /* Setup the call pointer */
+ *ptrs.call = __mark_empty_function;
+ /* Setup the jump */
+ *ptrs.jmpselect = *ptrs.jmpinline;
+ break;
+ default:
+ result = ENOENT;
+ printk(KERN_ERR
+ "Unknown marker type\n");
+ break;
+ }
+unlock:
+ spin_unlock(&marker_lock);
+ return result;
+}
+EXPORT_SYMBOL_GPL(marker_set_probe);
+
+void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type)
+{
+ int result = 0;
+ struct marker_pointers ptrs;
+
+ spin_lock(&marker_lock);
+ result = marker_get_pointers(name, &ptrs);
+ if(result)
+ goto unlock;
+
+ switch(type) {
+ case MARKER_CALL:
+ if(*ptrs.call == probe) {
+ *ptrs.jmpselect = *ptrs.jmpover;
+ *ptrs.call = __mark_empty_function;
+ }
+ break;
+ case MARKER_INLINE:
+ if(*ptrs.jmpselect == *ptrs.jmpinline)
+ *ptrs.jmpselect = *ptrs.jmpover;
+ break;
+ default:
+ result = ENOENT;
+ printk(KERN_ERR
+ "Unknown marker type\n");
+ break;
+ }
+unlock:
+ spin_unlock(&marker_lock);
+ if(!result && type == MARKER_CALL)
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(marker_disable_probe);
---END---
-- probe module ---
---BEGIN---
/* probe.c
*
* Loads a function at a marker call site.
*
* (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
*
* This file is released under the GPLv2.
* See the file COPYING for more details.
*/
#include <linux/marker.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
/* function to install */
void do_mark1(const char *format, int value)
{
printk("value is %d\n", value);
}
/*
 * Module load: attach do_mark1() to the "subsys_mark1" marker as a
 * call-type probe.  A non-zero return (marker not found, or a probe is
 * already installed there) aborts the module load.
 */
int init_module(void)
{
	int err;

	err = marker_set_probe("subsys_mark1", (marker_probe)do_mark1,
			       MARKER_CALL);
	return err;
}
/*
 * Module unload: detach do_mark1() from "subsys_mark1".  A probe module
 * must disable all its probes before unloading; marker_disable_probe()
 * ends with synchronize_sched(), so no CPU can still be inside the
 * probe (the marker call is wrapped in preempt_disable/enable) when
 * this module's text goes away.
 */
void cleanup_module(void)
{
marker_disable_probe("subsys_mark1", (marker_probe)do_mark1,
MARKER_CALL);
}
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("Probe");
---END---
-- sample test marked module ---
---BEGIN---
/* test-mark.c
*
*/
#include <linux/marker.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
int x=7;	/* sample value passed through the subsys_mark3 marker */
struct proc_dir_entry *pentry = NULL;	/* /proc/testmark entry created in init_module() */
/*
 * open() handler for /proc/testmark: fires the three example marker
 * sites and then refuses the open, so e.g. "cat /proc/testmark" is a
 * convenient way to trigger the markers without ever handing out a
 * file handle (the -EPERM is deliberate, not an error case).
 */
static int my_open(struct inode *inode, struct file *file)
{
MARK(subsys_mark1, "%d", 1);
MARK(subsys_mark2, "%d %s", 2, "blah2");
MARK(subsys_mark3, "%d %s", x, "blah3");
return -EPERM;
}
/* Only .open is provided; it fires the markers and always fails. */
static struct file_operations my_operations = {
.open = my_open,
};
/*
 * Module load: create /proc/testmark; opening it runs my_open(), which
 * fires the example markers.
 *
 * Returns 0 on success, -ENOMEM when the proc entry cannot be created.
 * (The original returned 0 unconditionally, silently loading a module
 * with no way to trigger its markers.)
 */
int init_module(void)
{
	pentry = create_proc_entry("testmark", 0444, NULL);
	if (!pentry)
		return -ENOMEM;
	pentry->proc_fops = &my_operations;
	return 0;
}
/*
 * Module unload: remove /proc/testmark.  No marker teardown is needed
 * here — this module only contains marker *sites*; probes are managed
 * by the separate probe module.
 */
void cleanup_module(void)
{
remove_proc_entry("testmark", NULL);
}
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("Marker Test");
---END---
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Mathieu Desnoyers <[email protected]> wrote:
> +config MARK_SYMBOL
> +config MARK_JUMP_CALL
> +config MARK_JUMP_INLINE
> +config MARK_JUMP
same NACK over the proliferation of options as before:
http://marc.theaimsgroup.com/?l=linux-kernel&m=115881457219562&w=2
Tap, tap, is this thing on? ;)
found one related reply from you that i didnt answer yet:
"As an example, LTTng traces the page fault handler, when kprobes just
can't instrument it."
but tracing a raw pagefault at the arch level is a bad idea anyway, we
want to trace __handle_mm_fault(). That way you can avoid having to
modify every architecture's pagefault handler ...
but the other points remained unanswered as far as i can see.
Ingo
Hi -
> Yet, again, a new version. I integrated a full probe management
> mechanism. [...]
Thank you for continuing on.
> +#define MARK_SYM(name) \
> + here: asm volatile \
> + (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \
Regarding MARK_SYM, if I read Ingo's message correctly, this is the
only type of marker he believes is necessary, since it would put a
place for kprobes to put a breakpoint. FWIW, this still appears to be
applicable only if the int3 overheads are tolerable, and if parameter
data extraction is unnecessary or sufficiently robust "by accident".
Regarding:
> +#define MARK_JUMP(name, format, args...) \
> [...]
All this leap/jump/branch elaboration may be based upon scanty
benchmarks. The plain conditional-indirect function call is already
"fast", especially if its hosting compilation unit is compiled with
-freorder-blocks.
Direct jumps to instrumentation are unlikely to be of a great deal of
use, in that there would have to be some assembly-level code in
between to save/restore affected registers. Remember, ultimately the
handler will be written in C.
Regarding use of an empty_function() sentinel instead of NULLs, it is
worth measuring carefully whether a unconditional indirect call to
such a dummy function is faster than a conditional indirect call. It
may have to be a per-architecture internal implementation option.
Regarding varargs, I still believe that varargs have poorer
type-safety. Remember, it's not just type-safety at the marker site
(which gcc's printf-format-checker could validate), but the handler's
too. I believe it is incorrect that a non-varargs function can always
safely take a call from a varargs call - varargs changes the calling
conventions. This would mean that the handler would itself have to be
varargs, with all the attendant run-time overheads. (Plus the idea of
using a build-time script to generate the non-varargs handlers from
analyzing particular MARK() calls is itself probably a bit complex for
its own good.)
Regarding measurements in general, one must avoid being misled by
microbenchmarks such as those that represent an extreme of caching
friendliness. It may be necessary to run large system-level tests to
meaningfully compare alternatives.
> +int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...),
> + enum marker_type type);
> +void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...),
> + enum marker_type type);
I'm unclear about what a marker_type value represents. Could you go
over that again?
> +static int marker_get_pointers(const char *name,
> + [...]
> + ptrs->call = (void**)kallsyms_lookup_name(call_sym);
> + ptrs->jmpselect = (void**)kallsyms_lookup_name(jmpselect_sym);
> + ptrs->jmpcall = (void**)kallsyms_lookup_name(jmpcall_sym);
> + ptrs->jmpinline = (void**)kallsyms_lookup_name(jmpinline_sym);
> + ptrs->jmpover = (void**)kallsyms_lookup_name(jmpover_sym);
> [...]
I wonder whether, as for kprobes, it will be necessary to lock into
memory a module containing an active marker.
- FChE
* Frank Ch. Eigler <[email protected]> wrote:
> > +#define MARK_SYM(name) \
> > + here: asm volatile \
> > + (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \
>
> Regarding MARK_SYM, if I read Ingo's message correctly, this is the
> only type of marker he believes is necessary, since it would put a
> place for kprobes to put a breakpoint. FWIW, this still appears to be
> applicable only if the int3 overheads are tolerable, and if parameter
> data extraction is unnecessary or sufficiently robust "by accident".
let me qualify that: parameters must be prepared there too - but no
actual function call inserted. (at most a NOP inserted). The register
filling doesnt even have to be function-calling-convention compliant -
that makes the symbolic probe almost zero-impact to register
allocation/scheduling, the only thing it should ensure is that the
parameters that are annotated to be available in register, stack or
memory _somewhere_. (i.e. not hidden or destroyed at that point by gcc)
Does a simple asm() that takes read-only parameters but only adds a NOP
achieve this result?
Ingo
Ingo Molnar wrote:
> let me qualify that: parameters must be prepared there too - but no
> actual function call inserted. (at most a NOP inserted). The register
> filling doesnt even have to be function-calling-convention compliant -
> that makes the symbolic probe almost zero-impact to register
> allocation/scheduling, the only thing it should ensure is that the
> parameters that are annotated to be available in register, stack or
> memory _somewhere_. (i.e. not hidden or destroyed at that point by gcc)
> Does a simple asm() that takes read-only parameters but only adds a NOP
> achieve this result?
Do you mean using the asm to make sure gcc puts a reference to a
variable into the DWARF info, or some other way of encoding the value
locations?
Hm, another problem. If the mark is in a loop, and gcc decides to
unroll the loop, then you'll probably only get a mark in one iteration
of the loop (or 1/Nth of the iterations). Or worse, assembler errors.
The only way I can see to deal with this is to not use symbols, but put
records in a special section. That way, if the asm() inserting them
gets duplicated, you'll get duplicate records in the marker section.
I guess you'd get a similar problem with markers inserted in inlined
functions.
(How does gdb deal with breakpoints in unrolled loops?)
J
* Frank Ch. Eigler ([email protected]) wrote:
> Thank you for continuing on.
>
Thanks for the comments, see below :
> > +#define MARK_SYM(name) \
> > + here: asm volatile \
> > + (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \
>
> Regarding MARK_SYM, if I read Ingo's message correctly, this is the
> only type of marker he believes is necessary, since it would put a
> place for kprobes to put a breakpoint. FWIW, this still appears to be
> applicable only if the int3 overheads are tolerable, and if parameter
> data extraction is unnecessary or sufficiently robust "by accident".
>
I agree. And I don't see how the int3 approach fills the need of embedded
systems and high performance computing developers, so I think that a "call"
alternative, where the stack is set up by gcc itself, is worthy.
>
> Regarding:
>
> > +#define MARK_JUMP(name, format, args...) \
> > [...]
>
> All this leap/jump/branch elaboration may be based upon scanty
> benchmarks. The plain conditional-indirect function call is already
> "fast", especially if its hosting compilation unit is compiled with
> -freorder-blocks.
>
Yes, I just wanted to show that there was some performance merits to this
technique, more benchmarks will be needed. The main problem I have seen with
your condition+call approach is that there is no safe way to remove the function
pointer without causing a huge mess (null pointer dereference). This is why I
thought of this alternative that has the ability to keep a consistent execution
flow at _all_ times.
> Direct jumps to instrumentation are unlikely to be of a great deal of
> use, in that there would have to be some assembly-level code in
> between to save/restore affected registers. Remember, ultimately the
> handler will be written in C.
>
Yup, I agree, but it's nice to have the ability to jump over inline functions :)
> Regarding use of an empty_function() sentinel instead of NULLs, it is
> worth measuring carefully whether a unconditional indirect call to
> such a dummy function is faster than a conditional indirect call. It
> may have to be a per-architecture internal implementation option.
>
Yes, but you also have to take into account the stack setup cost. I doubt that it
will be faster than a jump, but this is worthy to test.
> Regarding varargs, I still believe that varargs have poorer
> type-safety. Remember, it's not just type-safety at the marker site
> (which gcc's printf-format-checker could validate), but the handler's
> too. I believe it is incorrect that a non-varargs function can always
> safely take a call from a varargs call - varargs changes the calling
> conventions. This would mean that the handler would itself have to be
> varargs, with all the attendant run-time overheads. (Plus the idea of
> using a build-time script to generate the non-varargs handlers from
> analyzing particular MARK() calls is itself probably a bit complex for
> its own good.)
>
Absolutely, you are right. Good catch! I am putting a asmlinkage prefix to the
probe function, which will make sure that every argument is passed on the stack.
From http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
"regparm (number)
On the Intel 386, the regparm attribute causes the compiler to
pass arguments number one to number if they are of integral type in
registers EAX, EDX, and ECX instead of on the stack. Functions that
take a variable number of arguments will continue to be passed all
of their arguments on the stack.
Beware that on some ELF systems this attribute is unsuitable for
global functions in shared libraries with lazy binding (which is the
default). Lazy binding will send the first call via resolving code
in the loader, which might assume EAX, EDX and ECX can be clobbered,
as per the standard calling conventions. Solaris 8 is affected by
this. GNU systems with GLIBC 2.1 or higher, and FreeBSD, are believed
to be safe since the loaders there save all registers. (Lazy binding
can be disabled with the linker or the loader if desired, to avoid
the problem.)"
so a regparm(0) seems like a good solution there.
> Regarding measurements in general, one must avoid being misled by
> microbenchmarks such as those that represent an extreme of caching
> friendliness. It may be necessary to run large system-level tests to
> meaningfully compare alternatives.
>
Yes, the problem is that, after having run such tests on a slower approach, the
macrobenchmarks failed to show any difference. Yes, those tests should be done,
but I don't expect any revelation from them.
>
> > +int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...),
> > + enum marker_type type);
> > +void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...),
> > + enum marker_type type);
>
> I'm unclear about what a marker_type value represents. Could you go
> over that again?
>
marker_type MARKER_CALL
Sets the jump to a function call
marker_type MARKER_INLINE
Sets the jump to the provided inline function
>
> > +static int marker_get_pointers(const char *name,
> > + [...]
> > + ptrs->call = (void**)kallsyms_lookup_name(call_sym);
> > + ptrs->jmpselect = (void**)kallsyms_lookup_name(jmpselect_sym);
> > + ptrs->jmpcall = (void**)kallsyms_lookup_name(jmpcall_sym);
> > + ptrs->jmpinline = (void**)kallsyms_lookup_name(jmpinline_sym);
> > + ptrs->jmpover = (void**)kallsyms_lookup_name(jmpover_sym);
> > [...]
>
> I wonder whether, as for kprobes, it will be necessary to lock into
> memory a module containing an active marker.
>
Nope, as the module_exit _has_ to call marker_disable_probe and that
marker_disable_probe does a synchronize_sched(). Because the function call is
surrounded by preempt_disable(), there will be no execution stream within the
function when the module will be unloaded.
Regards,
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Ingo Molnar ([email protected]) wrote:
> "As an example, LTTng traces the page fault handler, when kprobes just
> can't instrument it."
>
> but tracing a raw pagefault at the arch level is a bad idea anyway, we
> want to trace __handle_mm_fault(). That way you can avoid having to
> modify every architecture's pagefault handler ...
>
Then you lose the ability to trace in-kernel minor page faults.
> but the other points remained unanswered as far as i can see.
>
Hi Ingo,
I clearly expressed my position in the previous emails, so did you. You argued
about a use of tracing that is not relevant to my vision of reality, which is :
- Embedded systems developers won't want a breakpoint-based probe
- High performance computing users won't want a breakpoint-based probe
- djprobe is far away from being in an acceptable state on architectures with
very inconvenient errata (x86).
- kprobe and djprobe cannot access local variables in every case
For those reasons, I prefer a jump-over-call approach which lets gcc give us the
local variables. No need of DWARF or SystemTAP macro Kung Fu. Just C and a
loadable module.
By no means is it a replacement for a completely dynamic breakpoint-based
instrumentation mechanism. I really think that both mechanism should coexist.
This is my position : I let the distribution/user decide what is appropriate for
their use. My goal is to provide them a flexible mechanism that takes the
multiple variety of uses in account without performance impact if they are not
willing to pay it to benefit from tracing.
With all due respect, yes, there are Linux users different from the typical
Redhat client. If your vision is still limited to this scope after a 500
emails debate, I am afraid that there is very little I can do about it in
one more.
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Mathieu Desnoyers ([email protected]) wrote:
> * Ingo Molnar ([email protected]) wrote:
>
> > "As an example, LTTng traces the page fault handler, when kprobes just
> > can't instrument it."
> >
> > but tracing a raw pagefault at the arch level is a bad idea anyway, we
> > want to trace __handle_mm_fault(). That way you can avoid having to
> > modify every architecture's pagefault handler ...
> >
>
> Then you lose the ability to trace in-kernel minor page faults.
>
But I agree with you that an upstream MARKER makes more sense in
__handle_mm_fault(). :-)
Regards,
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
And now for some friendly fire :) No just kidding, but I thought I'd
pour some water in your wine here.
Mathieu Desnoyers wrote:
> With all due respect, yes, there are Linux users different from the typical
> Redhat client. If your vision is still limited to this scope after a 500
> emails debate, I am afraid that there is very little I can do about it in
> one more.
Actually I think Ingo should be commended for having come around on
a significant number of crucial and sensitive issues. He's accepted
on principle the need for static markup, the fact that said markup
needs to somewhat be contextually accurate (variables, etc.) and the
fact that any form of markup made could be trivially used for
fast-path direct calls (albeit he wants you to package that as a
separate patch -- which you obviously don't like very much, but I
think I've fixed this one to the satisfaction of both of you below.)
Now, we should stop for a moment, look back and contrast Ingo's
current position to this week's painful thread, and we should thank
him for having come around on these things. I can't say any of it was
enjoyable myself and I'm sure others would agree, but it remains that
Ingo has taken a second look at things. He hasn't come to our specific
position, but that, I think, can be diplomatically resolved, more below.
Now with regards to the fact that Ingo works for RedHat, I think this
should only be used to understand his position. For if the "gut feeling"
of one of RedHat's top kernel engineers is that something is wrong,
then maybe we should try to understand why. Now I don't want to judge
or qualify whether he successfully conveyed the real essence of why he's
got this "gut feeling", we've lost enough time already on non-essential
banter. So let's stick to the facts. Just the facts ma'am. And to that
end, I suggest you and I (and everyone for that matter that agrees with
us) do some role-playing. Now what I say below could be off, but I'm
convinced there's some element of truth here -- and the reason I say this
is that I can imagine myself thinking the same as Ingo if I were in his
position *and* the assumptions I make below were correct.
I believe everyone at RedHat understands the value of tracing. In fact,
I believe their customers have been asking for just that. Their
investment in SystemTap can only be seen in this light. But why, oh
why, if the customers are asking for it is nobody just stick the
freeking thing in (all static) and be done with it? Well, it's
unfortunately not that simple. There are tons of issues (and that's
obviously why LTT has been held up for so many years -- the arguments
against it were just a big ball of wax and nobody ever took the time
to look at what that ball of wax was made of.) Some of them, indeed
I think the most important ones, we've already covered, and I don't
wish to get into those again. Suffice it to say, though, that we've
taken enough wax off the ball that some things have settled in:
some form of static markup being a major one.
Ingo's continued insistence, though, seems to point to the fact that
there's just a little bit more wax to remove. So on with my little
thought experiment.
Say you're RedHat -- and I'm just using RedHat as an example, but I'm
sure much of this applies to any other distro or distro maintainer. Say
you've been shipping LTT for a couple of versions (and, you'll have to
admit Mathieu, that's what we'd like), and obviously yours being the most
widely used distro out there, LTT gets a huge following and everybody
and his little sister uses it for tracing. All is well and everybody's
happy, right? Unfortunately not. Somewhere down the road somebody
tries to push a change into mainline that breaks a tracepoint. Ooops,
that's a problem for Ingo (and all other kernel developers from
RedHat I'd say) you see. Because then he has two nasty choices. Either
he opposes the change -- which may in context be totally nonsensical,
like trying to preserve a variable that doesn't exist no more or
worse a tracepoint that makes absolutely no sense (and I think that's
what Ingo means when he compares those calls to "System calls" that
will have to be maintained forever.) Or, and that's probably worse
from RedHat's position and Ingo's of course, somebody has to go fix
all of LTT's now complicated stack of analysis tools to get it to
continue working for the next release. Judging from the patches Ingo
has been pumping out for the last year, I'd say fixing up userspace
analysis tools isn't on his radars. Of course as the LTT maintainer,
in defense, you would say either: a) you don't need to fix LTT
because it gracefully fails when events go missing (which is true,
but doesn't help RedHat because now its customers don't get the
exact same analysis they were used to), b) that you'd obviously fix
LTT's analysis stack as a consequence (but this too is problematic
because of the reasons that follow.) And then Ingo could say:
1) you don't work for RedHat
2) even if you did work for RedHat (and Ingo could just pull your
ear to have you "fix" LTT), there's nobody that can predict two
or three years down the road what will have been built on top of
your tool by 3rd parties and which may actually be used by RedHat
customers.
This is as best I make of Ingo's continued insistence on keeping the
direct call stuff out of tree. Because if it's dynamic, then the
above issue *somewhat* goes away: If it's a statically-marked-up dynamic
point, then even if he doesn't control the stack of tools that get
built around those, he can certainly hack up the SystemTap scripts to
continue showing the same "events" to the upper-layer stack, even if
somebody down the line makes the event obsolete. With a tool that
depends on direct non-dynamic instrumentation point, he wouldn't be
able to do that. He'd then have to maintain a RedHat-specific patch
for making that event continue showing up, and looking at the list
of patches those distros end up including, I'm sure he's got zero
interest in doing that. And with the temporary hack in the SystemTap
scripts done, he can then hand that off to someone else and inform
them that eventually that specific event has become obsolete. With
a tool that depends on direct static calls he can't do that. The
stack of analysis tools breaks right away. So when Ingo insists that
you keep your direct call stuff out of tree, what he's really
telling you is that he doesn't want to have to fix your stuff for
your users. If your users depend on your external script to make it
work, RedHat (and therefore Ingo in some way) is not accountable for
that (though it could be some contract work), you are.
Now bear with me, I'm not asking you to drop your position, I think
there's a middle ground everybody will agree on -- of course if I
got the above right.
So below is kind of like a "review" of all the mechanisms that have
been put forth and how they rank depending on perspective. The closer
to "1" the better. And here are the mechanisms I'm reviewing:
- kprobes
- djprobes
- bprobes (Martin's thing amended with a 5-byte filler per instrumented
function.)
- direct calls (i.e. "static" stuff.)
I've deliberately included Martin's idea here because everything I
have seen about it points to it working. Within the specific case
of where a 5-byte filler "single instruction" is put at the entry of
a function of interest, then djprobes can be safely used to get to
an alternate function. And don't worry, I was one of the first to
actually doubt that djprobes could work. See the SystemTap archives
if you don't believe me. What I've always doubted, and continue to
doubt actually, is its ability to replace bytes covering multiple
instructions with a 5-byte jump. But that's a different topic. Very
early on in my argumentation against djprobes on the SystemTap list,
it became evident that it could be used to effectively replace
anything that had 5 bytes or more. And I knew about the errata
stuff, but this ability was confirmed by the folks at IBM who had
to deal with this errata before. So if they said it works, I had
no reason to doubt that it could safely do what it claimed it could.
So, in sum, unless somebody can mount a solid case against Martin's
slightly-amended proposed mechanism (with the 5-byte filler) in
combination with djprobes, I think we can proceed. Note that the
need for djprobes is x86-specific. On other archs errata limitations
may not be present. And, if worst came to worst, the filler could
be a simple static-jump to an outside replacement (without any
parameters being passed); even with that crude a method the
advantages of bprobes would remain unchanged: a) fast (both while
dormant or active), b) flexible, c) easily used by distro kernel
maintainers for trivially fixing up instrumentation.
One more thing. I sometimes make a distinction below about
djprobes. The "full djprobes" is djprobes that can replace
instructions of any length -- this hasn't been figured out yet as
others have stated. The "limited djprobes" is djprobes that can
replace any instruction of 5 bytes or more. I've yet to see
anything showing this can't work today.
So here's the summary of this "review" with a comment on which
of Ingo or Mathieu insists on which. There are, of course, many
other people that may agree with either, but let's just keep
this simple for now.
In terms of runtime speed of event-to-log (this view of things
is favored by Mathieu):
1- direct calls
2- bprobes
3- full djprobes
4- kprobes
It could be argued that 1, 2, and 3 are the same here, and I
won't disagree too much with that -- if 3 existed.
Lowest cost in "dormant" state (this view of things is favored
by Ingo):
1- kprobes/full djprobes
2- bprobes (single 5byte filler for each instrumented function,
no matter how many events)
3- direct call (linearly incrementing cost with each event)
Technical simplicity of implementation on x86 (this one is tricky)
(this view of things is favored by Roman -- well, ok, he's not part
of my two choice rule above):
1- direct calls
2- bprobes without binary fixup (filler is direct call or function
pointer -- not elegant, but very simple)
3- kprobes
4- limited djprobes (can replace any instruction of 5 bytes or more)
5- bprobes with binary fixup (filler is 5-byte replaced using
limited djprobes.)
6- full djprobes (can replace any length of instruction -- this
has yet to be shown to work.)
Technical simplicity of implementation on "non-errata" arch:
1- direct calls
2- bprobes without binary fixup
3- bprobes with binary fixup
4- kprobes
5- djprobes
Least potential for maintainability nightmare for vendor kernel
engineers (a.k.a. simplicity of fixup in case the event doesn't exist
anymore or variables are rendered non-relevant) (this is inferred
by me as being Ingo's point-of-view, but I could be wrong):
1- kprobes/full djprobes/bprobes
2- direct calls
Note: I've put bprobes up there with kprobes and djprobes in this
evaluation because I think that the above mind-experiment about
how Ingo could do a simple fixup without imposing the requirement
for a patch to the kernel shipped by RedHat works in the same way.
IOW, what works for SystemTap scripts in this regards, works just
fine for bprobes. I'll take Ingo's positive comments on Martin's
idea as an indication that my conclusion here is correct. And
indeed Martin himself would like to have this in order to avoid
having to do with SystemTap scripts altogether.
How closely does the fixup environment mimic normal kernel code
hacking (this would be Martin's point-of-view):
1- direct calls / bprobes
2- kprobes / full djprobes (in the form of SystemTap of course)
Ability to insert probe points on non-previously marked up regions
(I think everybody would agree on this one):
1- kprobes / full djprobes
2- bprobes (even if the given event we're looking for isn't
specifically marked up, if the encompassing function is, then
we can still get our way.)
3- direct calls (of course this sucks, you've got to recompile,
and that's why it comes strong third and would likely move
down the list if other mechanisms were found that didn't
require rebuilding.)
And there may be other criteria which I missed, but I think the
important ones are there.
Now, if you tally this whole thing up, I think you would get that
bprobes is the single thing that would make everyone content.
Really, it does. It's very fast during active tracing, there is no
correlation between the number of markers in a function and
the cost on that function when no tracing is conducted, it should
be relatively easy to implement (all right, I'm using the word
"easy" quite liberally here), it's easy to fixup for distro kernel
maintainers, it provides a work environment which kernel developers
are already very familiar with (the kernel source code), etc.
Now I've never been a fan of binary gunk work. But, gut feelings
running rampant here, I was kind of naturally attracted to
Martin's idea. The above more rational analysis seems to support
such intuition.
For all these reasons, I think bprobes warrants some very serious
investigation. If made to work, and I have yet to see any reason
why it shouldn't, I think it would make most everybody happy --
at least that's what I can make up of all this. Mathieu, in fact,
if you look at it from the LTT perspective, this is just fine,
it provides the events without perturbing the system. And for
Ingo, he gets to be able to do very much what he would have been
able to do with SystemTap for fixing up events without having to
go fix some userspace analysis tool.
And with regards to the fate of embedded systems, which trust me
I know everything about, then I'm sure Ingo won't object to your
markers generating direct static calls if that's tied to
CONFIG_EMBEDDED. And if you talk to him on a sunny day, he might
just agree that direct static calls would just be fine on m68k
too ;)
And if, and only if, bprobes can't be made to work, then you might
just have to keep that static direct call outside the tree until
you can prove to Ingo that on the long run you'll be able to manage
keeping LTT in-sync with kernel changes -- or unless Ingo's point
of view is ignored by everybody upstairs, which I have my doubts
about. Which, of course, wouldn't preclude RedHat shipping LTT so
long as it depended on a mechanism which allowed Ingo to fix up
events for it and not have to delve into fixing the LTT analysis
stack for RedHat customers to be happy.
Hope this helps dissipate some confusion and focus attention where
progress would help achieving a more universal consensus,
Karim
--
President / Opersys Inc.
Embedded Linux Training and Expertise
http://www.opersys.com / 1.866.677.4546
* Mathieu Desnoyers <[email protected]> wrote:
> > "As an example, LTTng traces the page fault handler, when kprobes
> > just can't instrument it."
> >
> > but tracing a raw pagefault at the arch level is a bad idea anyway,
> > we want to trace __handle_mm_fault(). That way you can avoid having
> > to modify every architecture's pagefault handler ...
>
> Then you lose the ability to trace in-kernel minor page faults.
that's wrong, minor pagefaults go through __handle_mm_fault() just as
much.
Ingo
* Mathieu Desnoyers <[email protected]> wrote:
> I clearly expressed my position in the previous emails, so did you.
> You argued about a use of tracing that is not relevant to my vision of
> reality, which is :
>
> - Embedded systems developers won't want a breakpoint-based probe
are you arguing that i'm trying to force breakpoint-based probing on
you? I dont. In fact i explicitly mentioned that i'd accept and support
a 5-byte NOP in the body of the marker, in the following mails:
"just go for [...] the 5-NOP variant"
http://marc.theaimsgroup.com/?l=linux-kernel&m=115859771924187&w=2
(my reply to your second proposal)
"or at most one NOP"
http://marc.theaimsgroup.com/?l=linux-kernel&m=115865412332230&w=2
(my reply to your third proposal)
"at most a NOP inserted"
http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2
(my reply to your fifth proposal)
That enables the probe to be turned into a function call - not an INT3
breakpoint. Does it take some effort to implement that on your part?
Yes, of course, but getting code upstream is never easy, /especially/ in
cases where most of the users wont use a particular feature.
> - High performance computing users won't want a breakpoint-based probe
I am not forcing breakpoint-based probing, at all. I dont want _static,
build-time function call based_ probing, and there is a big difference.
And one reason why i want to avoid "static, build-time function call
based probing" is because high-performance computing users dont want any
overhead at all in the kernel fastpath.
> - djprobe is far away from being in an acceptable state on
> architectures with very inconvenient erratas (x86).
djprobes over a NOP marker are perfectly usable and safe: just add a
simple constraint to them to only allow a djprobes insertion if it
replaces a 5-byte NOP.
> - kprobe and djprobe cannot access local variables in every cases
it is possible with the marker mechanism i outlined before:
http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2
have i missed to address any concern of yours?
Ingo
Ingo Molnar wrote:
> are you arguing that i'm trying to force breakpoint-based probing on
> you? I dont. In fact i explicitly mentioned that i'd accept and support
> a 5-byte NOP in the body of the marker, in the following mails:
Actually that won't work if the kernel runs directly from rom/flash
simply because it's not possible to do any form of binary editing
on the image, as is possible in the more common desktop or workstation
case where the kernel runs from ram.
Karim
--
President / Opersys Inc.
Embedded Linux Training and Expertise
http://www.opersys.com / 1.866.677.4546
* Ingo Molnar ([email protected]) wrote:
>
> * Mathieu Desnoyers <[email protected]> wrote:
>
> > > "As an example, LTTng traces the page fault handler, when kprobes
> > > just can't instrument it."
> > >
> > > but tracing a raw pagefault at the arch level is a bad idea anyway,
> > > we want to trace __handle_mm_fault(). That way you can avoid having
> > > to modify every architecture's pagefault handler ...
> >
> > Then you lose the ability to trace in-kernel minor page faults.
>
> that's wrong, minor pagefaults go through __handle_mm_fault() just as
> much.
>
Hi Ingo,
On a 2.6.17 kernel tree :
In do_page_fault()
if (unlikely(address >= TASK_SIZE)) {
if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
return;
In vmalloc_fault()
static inline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
*
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
pgd_paddr = read_cr3();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
return -1;
return 0;
}
It seems like a shortcut path that will never call __handle_mm_fault. This path
is precisely used to handle vmalloc faults.
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
I hate AOL-style me-toos, but there's nothing to add to this mail.
Thanks for this coherent writeup Mathieu.
On Thu, Sep 21, 2006 at 05:42:48PM -0400, Mathieu Desnoyers wrote:
> Hi Ingo,
>
> I clearly expressed my position in the previous emails, so did you. You argued
> about a use of tracing that is not relevant to my vision of reality, which is :
>
> - Embedded systems developers won't want a breakpoint-based probe
> - High performance computing users won't want a breakpoint-based probe
> - djprobe is far away from being in an acceptable state on architectures with
> very inconvenient erratas (x86).
> - kprobe and djprobe cannot access local variables in every cases
>
> For those reasons, I prefer a jump-over-call approach which lets gcc give us the
> local variables. No need of DWARF or SystemTAP macro Kung Fu. Just C and a
> loadable module.
>
> By no means is it a replacement for a completely dynamic breakpoint-based
> instrumentation mechanism. I really think that both mechanism should coexist.
>
> This is my position : I let the distribution/user decide what is appropriate for
> their use. My goal is to provide them a flexible mechanism that takes the
> multiple variety of uses in account without performance impact if they are not
> willing to pay it to benefit from tracing.
>
> With all due respect, yes, there are Linux users different from the typical
> Redhat client. If your vision is still limited to this scope after a 500
> emails debate, I am afraid that there is very little I can do about it in
> one more.
Good morning Ingo,
* Ingo Molnar ([email protected]) wrote:
>
> * Mathieu Desnoyers <[email protected]> wrote:
>
> > I clearly expressed my position in the previous emails, so did you.
> > You argued about a use of tracing that is not relevant to my vision of
> > reality, which is :
> >
> > - Embedded systems developers won't want a breakpoint-based probe
>
> are you arguing that i'm trying to force breakpoint-based probing on
> you? I dont. In fact i explicitly mentioned that i'd accept and support
> a 5-byte NOP in the body of the marker, in the following mails:
>
> "just go for [...] the 5-NOP variant"
> http://marc.theaimsgroup.com/?l=linux-kernel&m=115859771924187&w=2
> (my reply to your second proposal)
>
> "or at most one NOP"
> http://marc.theaimsgroup.com/?l=linux-kernel&m=115865412332230&w=2
> (my reply to your third proposal)
>
> "at most a NOP inserted"
> http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2
> (my reply to your fifth proposal)
>
> That enables the probe to be turned into a function call - not an INT3
> breakpoint. Does it take some effort to implement that on your part?
> Yes, of course, but getting code upstream is never easy, /especially/ in
> cases where most of the users wont use a particular feature.
>
Some details are worth to be mentioned :
- The 5-NOP variant will imply replacing five 1-byte instructions with one
5-byte instruction, which is trickier. Masami Hiramatsu's proposal of a 2-byte
near jump + 3 NOPs is nicer.
- Patching such a 5-bytes instruction memory region doesn't turn markers into a
complete function call, which includes argument passing.
- The argument "most of the users wont use a particular feature" contradicts
what you said earlier about every distribution wanting to enable a tracing
mechanism for their users.
> > - High performance computing users won't want a breakpoint-based probe
>
> I am not forcing breakpoint-based probing, at all. I dont want _static,
> build-time function call based_ probing, and there is a big difference.
> And one reason why i want to avoid "static, build-time function call
> based probing" is because high-performance computing users dont want any
> overhead at all in the kernel fastpath.
>
I think that the performance benefits gained by using tracing information for
studying a system makes the overhead of a jump in the kernel fast path
insignificant. Having a stack setup + function call already put there by the
compiler has the following advantages :
- It is very robust (I could think of using it on a live server, which is not
true of the djprobe approach).
- It is predictable on every architecture.
- The information extracted is _always_ coherent with the marked variables,
because the compiler itself created the full function call (stack setup
included).
> > - djprobe is far away from being in an acceptable state on
> > architectures with very inconvenient erratas (x86).
>
> djprobes over a NOP marker are perfectly usable and safe: just add a
> simple constraint to them to only allow a djprobes insertion if it
> replaces a 5-byte NOP.
>
2 bytes jump + 3 bytes nops.. Yes, it should modify it without causing an
illegal instruction, but how ? Are you aware that their approach has to :
- put an int3
- wait for _all_ the CPUs to execute this int3
- then change the 5 bytes instruction
I can think of a lot of cases where the CPUs will never execute this int3.
Probably that sending an IPI or launching a kernel thread on each CPU to make
sure that this int3 is executed could give more guarantees there. But my point
is not even there : I have seen very skillful teams work hard on those
hardware-caused problems for years and the result is still not usable. It looks
to me like a race between software developers and hardware manufacturers, where
the software guy is always one step behind. This kind of scenario happens when
you want to use an architecture in a way it was not designed and tested for.
As long as CPU manufacturers won't design for live instruction patching (and why
should they do that ? the int3 breakpoint is all that is needed from their
perspective), this will be a race where software developers will lose.
> > - kprobe and djprobe cannot access local variables in every cases
>
> it is possible with the marker mechanism i outlined before:
>
> http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2
>
> have i missed to address any concern of yours?
>
Interesting idea. That would make it possible to probe local variables at the
marker site. That's very good for use of kprobes on low rate debug-type markers,
but that doesn't solve my concern about the cat-and-mouse race expressed earlier
about live kernel polymorphic code.
I would be all in for this kind of combo :
If you can find a way to make a kprobe-based probe extract the variables from
such a variable-dependency marked site, that would be great for dynamic of low
event rate code paths. For the high event rate, and while we wait for such a
probe to exist, I think that the load+jump over a complete call is the lowest
cost, most robust, coherent, predictable and portable mechanism I have seen
so far.
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Funny, I never thought I'd be defending djprobes ...
Mathieu Desnoyers wrote:
> 2 bytes jump + 3 bytes nops.. Yes, it should modify it without causing an
> illegal instruction, but how ? Are you aware that their approach has to :
> - put an int3
> - wait for _all_ the CPUs to execute this int3
> - then change the 5 bytes instruction
>
> I can think of a lot of cases where the CPUs will never execute this int3.
> Probably that sending an IPI or launching a kernel thread on each CPU to make
> sure that this int3 is executed could give more guarantees there. But my point
> is not even there : I have seen very skillful teams work hard on those
> hardware-caused problems for years and the result is still not usable. It looks
> to me like a race between software developers and hardware manufacturers, where
> the software guy is always one step behind. This kind of scenario happens when
> you want to use an architecture in a way it was not designed and tested for.
>
> As long as CPU manufacturers won't design for live instruction patching (and why
> should they do that ? the in3 breakpoint is all what is needed from their
> perspective), this will be a race where software developers will lose.
I'm with you all the way if we're talking about patching instructions
which are less than 5 bytes. And I must fault djprobes backers
for their insistence on trying to get it to work for all possible
instruction lengths. But in the specific case discussed between
Hiramatsu-san and myself (5 byte short jmp + nops) I have no reason
to believe it doesn't work. Continuing to try to get it to work on
any instruction length can be argued to be a waste of time, but not
what has been talked of of late.
With regards to all CPUs executing the int3, here's a rather savage,
but effective way of making this work:
- launch one thread per cpu (as you explained)
- have each thread make a direct jump to the location of the int3
- catch the int3 and kill active thread if this is a forced jump
This is deterministic.
So if your proposal is to amend the markup to use the short-jmp+nops
at every marker site instead of my earlier suggestion for the bprobes
thing, I'm all with you.
And I agree, 5 NOPs is *not* the right thing. 1 short jmp + 3 nops,
this works.
Karim
--
President / Opersys Inc.
Embedded Linux Training and Expertise
http://www.opersys.com / 1.866.677.4546
* Karim Yaghmour ([email protected]) wrote:
> So if your proposal is to amend the markup to use the short-jmp+nops
> at every marker site instead of my earlier suggestion for the bprobes
> thing, I'm all with you.
>
First of all, I think that specific architecture-specific optimisations can and
should be integrated in a more generic portable framework.
Hrm, your comment makes me think of an interesting idea :
.align
jump_address:
near jump to end
setup_stack_address:
setup stack
call empty function
end:
So, instead of putting nops in the target area, we fill it with a useful
function call. Near jump being 2 bytes, it might be much easier to modify.
If necessary, making sure the instruction is aligned would help to change it
atomically. If we mark the jump address, the setup stack address and the end
tag address with symbols, we can easily calculate (portably) the offset of the
near jump to activate either the setup_stack_address or end tags.
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Mathieu Desnoyers wrote:
> First of all, I think that specific architecture-specific optimisations can and
> should be integrated in a more generic portable framework.
No disagreement there. If Ingo would care to comment, I think it might
be an acceptable compromise to have x86 fully use kprobes/djprobes
immediately, and the other archs could walk there at their rate.
Practically, some stuff in include/asm-i386/markers.h and
include/asm-x86_64/markers.h would contain the binary modifiable stuff
and include/asm-generic/markers.h could contain a platform-independent
fallback.
> Hrm, your comment makes me think of an interesting idea :
>
> .align
> jump_address:
> near jump to end
> setup_stack_address:
> setup stack
> call empty function
> end:
>
> So, instead of putting nops in the target area, we fill it with a useful
> function call. Near jump being 2 bytes, it might be much easier to modify.
> If necessary, making sure the instruction is aligned would help to change it
> atomically. If we mark the jump address, the setup stack address and the end
> tag address with symbols, we can easily calculate (portably) the offset of the
> near jump to activate either the setup_stack_address or end tags.
That's another possibility. It seems more C friendly than the simple
short-jump+3bytes.
Ingo?
Karim
--
President / Opersys Inc.
Embedded Linux Training and Expertise
http://www.opersys.com / 1.866.677.4546
* Mathieu Desnoyers <[email protected]> wrote:
> Some details are worth mentioning :
>
> - The 5-NOP variant will imply a replacement of 5 1 bytes instructions
> with 1 5 bytes one, which is trickier. Masami Hiramatsu's proposal
> of 2 bytes near jump + 3 NOPS is nicer.
yeah.
> - Patching such a 5-bytes instruction memory region doesn't turn
> markers into a complete function call, which includes argument
> passing.
please read:
Message-ID: <[email protected]>
about how to turn it into a complete function call / jump.
[ i came back to this point from the end of the mail, where you consider
this an "interesting idea" - so i guess your above claim is moot. If
it is not moot then by all means i'm happy to further debate it. ]
> > And one reason why i want to avoid "static, build-time function call
> > based probing" is because high-performance computing users dont want
> > any overhead at all in the kernel fastpath.
>
> - The argument "most of the users wont use a particular feature"
> contradicts what you said earlier about every distribution wanting
> to enable a tracing mechanism for their users.
enabling a feature does not mean it's actually used by most users!
For example, Fedora currently enables:
CONFIG_NETPOLL=y
but only a tiny, tiny minority of users actually make use of it. Then
why is it still enabled? Because it has little to zero overhead to have
it enabled but not utilized. In the fastpath it has zero overhead. (and
yes, i originally designed and implemented netpoll/netconsole to be like
that, and i intentionally shaped it to be zero-overhead because i knew
it would be used as a debug feature.)
the same goes for tracing. We want to have it enabled in distros, but
only if it's near zero-cost. But because tracing wants to be there in
every fastpath, we have to work /hard/ to achieve this desired
end-result. I am "in your way" unfortunately /precisely/ because tracing
we want to enable in Fedora, but we want to pay as little cycles for it
on 99.998% of the Fedora boxes that wont be using any tracing at all.
> > > - High performance computing users won't want a breakpoint-based probe
> >
> > I am not forcing breakpoint-based probing, at all. I dont want _static,
> > build-time function call based_ probing, and there is a big difference.
> > And one reason why i want to avoid "static, build-time function call
> > based probing" is because high-performance computing users dont want any
> > overhead at all in the kernel fastpath.
> >
>
> I think that the performance benefits gained by using tracing
> information for studying a system makes the overhead of a jump in the
> kernel fast path insignificant. [...]
sorry, but as Alan mentioned it before, this leads to "death by a
million cuts". See above to understand what kind of feature is
acceptable to a distro (and hence to the upstream kernel) and what not.
> > > - djprobe is far away from being in an acceptable state on
> > > architectures with very inconvenient erratas (x86).
> >
> > djprobes over a NOP marker are perfectly usable and safe: just add a
> > simple constraint to them to only allow a djprobes insertion if it
> > replaces a 5-byte NOP.
> >
>
> 2 bytes jump + 3 bytes nops.. Yes, it should modify it without causing an
> illegal instruction, but how ? Are you aware that their approach has to :
> - put an int3
> - wait for _all_ the CPUs to execute this int3
> - then change the 5 bytes instruction
>
> I can think of a lot of cases where the CPUs will never execute this
> int3. Probably that sending an IPI or launching a kernel thread on
> each CPU to make sure that this int3 is executed could give more
> guarantees there. [...]
this is easy to solve, for example via the use of freeze_processes() and
thaw_processes().
> > > - kprobe and djprobe cannot access local variables in every cases
> >
> > it is possible with the marker mechanism i outlined before:
> >
> > http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2
> >
> > have i missed to address any concern of yours?
>
> Interesting idea. That would make it possible to probe local variables
> at the marker site. That's very good for use of kprobes on low rate
> debug-type markers, but that doesn't solve my concern about the
> cat-and-mouse race expressed earlier about live kernel polymorphic
> code.
i dont consider polymorphic code a problem at all. Java JITs (and the
dot-NET runtime) are faced with it every day and they have no problem
_dynamically rewriting high-performance code all the time_. In fact, in
the kernel we have /more/ tools to solve such problems. We can disable
interrupts, we can flush caches, we can send IRQs, we have total control
over every task in the system, etc., etc.
Furthermore, it is even more of a side issue in our case because unlike
Java JITs, for us the polymorphic code is in the /absolute slowpath/. We
could even turn off all caches while doing the code patching and nobody
would notice. (but such drastic measures are not needed at all)
Ingo
* Mathieu Desnoyers <[email protected]> wrote:
> > > Then you lose the ability to trace in-kernel minor page faults.
> >
> > that's wrong, minor pagefaults go through __handle_mm_fault() just as
> > much.
> >
>
> Hi Ingo,
>
> On a 2.6.17 kernel tree :
> It seems like a shortcut path that will never call __handle_mm_fault.
> This path is precisely used to handle vmalloc faults.
yes, but you said "minor fault", not "vmalloc fault".
minor faults are the things that happen when a task does read-after-COW
or read-mmap-ed-pagecache-page, and they very much go through
__handle_mm_fault().
vmalloc faults are extremely rare, x86-specific and they are a pure
kernel-internal matter. (I'd never want to trace them, especially if it
pushes tracepoints into every architecture's page fault handler. I
implemented the initial version of them IIRC, but my memory fails
precisely why. I think it was 4:4 related, but i'm unsure.)
(i now realize that above you said "in-kernel minor faults" - under that
you meant vmalloc faults?)
Ingo
* Ingo Molnar ([email protected]) wrote:
>
> * Mathieu Desnoyers <[email protected]> wrote:
>
> > > > Then you lose the ability to trace in-kernel minor page faults.
> > >
> > > that's wrong, minor pagefaults go through __handle_mm_fault() just as
> > > much.
> > >
> >
> > Hi Ingo,
> >
> > On a 2.6.17 kernel tree :
>
> > It seems like a shortcut path that will never call __handle_mm_fault.
> > This path is precisely used to handle vmalloc faults.
>
> yes, but you said "minor fault", not "vmalloc fault".
>
> minor faults are the things that happen when a task does read-after-COW
> or read-mmap-ed-pagecache-page, and they very much go through
> __handle_mm_fault().
>
> vmalloc faults are extremely rare, x86-specific and they are a pure
> kernel-internal matter. (I'd never want to trace them, especially if it
> pushes tracepoints into every architecture's page fault handler. I
> implemented the initial version of them IIRC, but my memory fails
> precisely why. I think it was 4:4 related, but i'm unsure.)
>
> (i now realize that above you said "in-kernel minor faults" - under that
> you meant vmalloc faults?)
>
Yes, sorry, my mistake. This kind of fault is not as infrequent as you may
think, as every newly allocated vmalloc region will cause vmalloc faults in
every process on the system that tries to access it. I agree that
it should not be a standard event people would be interested in.
Regards,
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Mathieu Desnoyers <[email protected]> wrote:
> * Ingo Molnar ([email protected]) wrote:
> >
> > * Mathieu Desnoyers <[email protected]> wrote:
> >
> > > > > Then you lose the ability to trace in-kernel minor page faults.
> > > >
> > > > that's wrong, minor pagefaults go through __handle_mm_fault() just as
> > > > much.
> > > >
> > >
> > > Hi Ingo,
> > >
> > > On a 2.6.17 kernel tree :
> >
> > > It seems like a shortcut path that will never call __handle_mm_fault.
> > > This path is precisely used to handle vmalloc faults.
> >
> > yes, but you said "minor fault", not "vmalloc fault".
> >
> > minor faults are the things that happen when a task does read-after-COW
> > or read-mmap-ed-pagecache-page, and they very much go through
> > __handle_mm_fault().
> >
> > vmalloc faults are extremely rare, x86-specific and they are a pure
> > kernel-internal matter. (I'd never want to trace them, especially if it
> > pushes tracepoints into every architecture's page fault handler. I
> > implemented the initial version of them IIRC, but my memory fails
> > precisely why. I think it was 4:4 related, but i'm unsure.)
> >
> > (i now realize that above you said "in-kernel minor faults" - under that
> > you meant vmalloc faults?)
> >
>
> Yes, sorry, my mistake. This kind of fault is not as infrequent as you
> may think, as every newly allocated vmalloc region will cause vmalloc
> faults on every processes on the system that are trying to access
> them. I agree that it should not be a standard event people would be
> interested in.
most of the vmalloc area allocated on a typical system belongs to
modules - and they get loaded on bootup and rarely unloaded. Even for
other vmalloc-ed areas like netfilter, the activation of them is during
bootup. So from that point on the number of vmalloc faults is quite low.
(zero on most systems) If you still want to trace it i'd suggest a
separate type of event for it.
(meanwhile i remember why i implemented vmalloc faults to begin with:
during vmalloc() we used to have a for_each_process() over all
kernel-pagetables of tasks to fix up their pagetables. This caused both
high latencies and overhead back in the days when we still were frequent
vmalloc()ers.)
Ingo
* Ingo Molnar ([email protected]) wrote:
> >
> > Yes, sorry, my mistake. This kind of fault is not as infrequent as you
> > may think, as every newly allocated vmalloc region will cause vmalloc
> > faults on every processes on the system that are trying to access
> > them. I agree that it should not be a standard event people would be
> > interested in.
>
> most of the vmalloc area that is allocated on a typical system are
> modules - and they get loaded on bootup and rarely unloaded. Even for
> other vmalloc-ed areas like netfilter, the activation of them is during
> bootup. So from that point on the number of vmalloc faults is quite low.
> (zero on most systems) If you still want to trace it i'd suggest a
> separate type of event for it.
Yes, with this typical vmalloc usage pattern in perspective, we completely
agree. We also agree on having a separate kind of event for this, as it requires
the tracer code to be vmalloc-fault-reentrant (very tricky).
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Karim Yaghmour ([email protected]) wrote:
>
> > Hrm, your comment makes me think of an interesting idea :
> >
> > .align
> > jump_address:
> > near jump to end
> > setup_stack_address:
> > setup stack
> > call empty function
> > end:
> >
> > So, instead of putting nops in the target area, we fill it with a useful
> > function call. Near jump being 2 bytes, it might be much easier to modify.
> > If necessary, making sure the instruction is aligned would help to change it
> > atomically. If we mark the jump address, the setup stack address and the end
> > tag address with symbols, we can easily calculate (portably) the offset of the
> > near jump to activate either the setup_stack_address or end tags.
>
> That's another possibility. It seems more C friendly than the simple
> short-jump+3bytes.
>
Here is the implementation :-)
Comments are welcome.
/*****************************************************************************
* marker.h
*
* Code markup for dynamic and static tracing. i386 architecture optimisations.
*
* (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
*
* This file is released under the GPLv2.
* See the file COPYING for more details.
*/
#define MARK_NEAR_JUMP_PREFIX "__mark_near_jump_"
#define MARK_NEAR_JUMP_SELECT_PREFIX "__mark_near_jump_select_"
/* Note : max 256 bytes between over_label and near_jump */
#define MARK_DO_JUMP(name) \
do { \
__label__ near_jump; \
volatile static void *__mark_near_jump_##name \
asm (MARK_NEAR_JUMP_PREFIX#name) \
__attribute__((unused)) = &&near_jump; \
volatile static void *__mark_near_jump_select_##name \
asm (MARK_NEAR_JUMP_SELECT_PREFIX#name) \
__attribute__((unused)) = &&near_jump_select; \
asm volatile ( ".align 16;\n\t" : : ); \
asm volatile ( ".byte 0xeb;\n\t" : : ); \
near_jump_select: \
asm volatile ( ".byte %0-%1;\n\t" : : \
"m" (*&&over_label), "m" (*&&near_jump)); \
near_jump: \
asm volatile ( "" : : ); \
} while(0)
To change it, we can dynamically overwrite the __mark_near_jump_select_##name
value (a byte) to (__mark_jump_call_##name - __mark_near_jump_##name).
So we have one architecture specific optimisation within the architecture
agnostic marking mechanism.
Comments are welcome.
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Mathieu Desnoyers wrote:
> Here is the implementation :-)
Trigger happy :)
> To change it, we can dynamically overwrite the __mark_near_jump_select_##name
> value (a byte) to (__mark_jump_call_##name - __mark_near_jump_##name).
Hmm... I don't know if you won't still need to resort to int3 and
then overwrite the byte. From my understanding it sounds like you
wouldn't but that's where Richard's insight on the errata stuff
might come in handy.
Have you actually tried to run this code by any chance? *If* it did
work, I think this might just mean that you don't need either
kprobes or djprobes.
> So we have one architecture specific optimisation within the architecture
> agnostic marking mechanism.
That would seem reasonable, I think. You might want to test out
your mechanism, get confirmation from Richard and then post an
update to your patches.
Karim
* Ingo Molnar ([email protected]) wrote:
> but tracing a raw pagefault at the arch level is a bad idea anyway, we
> want to trace __handle_mm_fault(). That way you can avoid having to
> modify every architecture's pagefault handler ...
>
The problem with __handle_mm_fault() is that the struct pt_regs * is not
passed to this function. An event containing both the address where the fault
occurs and the instruction pointer that caused the fault is very useful.
Mathieu
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Hi -
On Thu, Sep 21, 2006 at 08:50:29PM +0200, Ingo Molnar wrote:
> [...] let me qualify that: parameters must be prepared there too -
> but no actual function call inserted. (at most a NOP
> inserted). [...] Does a simple asm() that takes read-only
> parameters but only adds a NOP achieve this result?
You mean something like this?
#define MARK(n,v1,v2,v3) asm ("__mark_" #n ": nop" :: \
"X" (v1), "X" (v2), "X" (v3))
I haven't been able to get gcc to emit any better debuginfo for
parameters pseudo-passed like this.
(I've tested such a marker inserted into an inner loop of dhrystone.
It was compiled with "-ggdb -O3". Neither gdb nor systemtap could
resolve the same values/symbols being passed as MARK() arguments,
though at least the breakpoint address was nicely marked.)
- FChE