2007-05-10 02:53:53

by Mathieu Desnoyers

[permalink] [raw]
Subject: [patch 05/10] Linux Kernel Markers - i386 optimized version

[[email protected]: marker exports must be EXPORT_SYMBOL_GPL]
Signed-off-by: Mathieu Desnoyers <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/kernel/Makefile | 1
arch/i386/kernel/marker.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++
include/asm-i386/marker.h | 84 +++++++++++++++++++++++++++++++++++++++
3 files changed, 184 insertions(+)

Index: linux-2.6-lttng/arch/i386/kernel/Makefile
===================================================================
--- linux-2.6-lttng.orig/arch/i386/kernel/Makefile 2007-05-09 18:14:51.000000000 -0400
+++ linux-2.6-lttng/arch/i386/kernel/Makefile 2007-05-09 18:16:03.000000000 -0400
@@ -33,6 +33,7 @@
obj-y += sysenter.o vsyscall.o
obj-$(CONFIG_ACPI_SRAT) += srat.o
obj-$(CONFIG_EFI) += efi.o efi_stub.o
+obj-$(CONFIG_MARKERS_ENABLE_OPTIMIZATION) += marker.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
obj-$(CONFIG_SERIAL_8250) += legacy_serial.o
obj-$(CONFIG_VM86) += vm86.o
Index: linux-2.6-lttng/arch/i386/kernel/marker.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/arch/i386/kernel/marker.c 2007-05-09 18:16:03.000000000 -0400
@@ -0,0 +1,99 @@
+/* marker.c
+ *
+ * Erratum 49 fix for Intel PIII and higher.
+ *
+ * Permits marker activation by XMC with correct serialization.
+ *
+ * Reentrant for NMI and trap handler instrumentation. Permits XMC to a
+ * location that has preemption enabled because it involves no temporary or
+ * reused data structure.
+ *
+ * Mathieu Desnoyers <[email protected]>
+ */
+
+#include <linux/notifier.h>
+#include <linux/mutex.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/kdebug.h>
+
+#include <asm/cacheflush.h>
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+#define BREAKPOINT_INS_LEN 1
+
+static DEFINE_MUTEX(mark_mutex);
+static long target_eip = 0;
+
+static void mark_synchronize_core(void *info)
+{
+ sync_core(); /* use cpuid to stop speculative execution */
+}
+
+/* We simply skip the 2 bytes load immediate here, leaving the register in an
+ * undefined state. We don't care about the content (0 or !0), because we are
+ * changing the value 0->1 or 1->0. This small window of undefined value
+ * doesn't matter.
+ */
+static int mark_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ enum die_val die_val = (enum die_val) val;
+ struct die_args *args = (struct die_args *)data;
+
+ if (!args->regs || user_mode_vm(args->regs))
+ return NOTIFY_DONE;
+
+ if (die_val == DIE_INT3 && args->regs->eip == target_eip) {
+ args->regs->eip += 1; /* Skip the next byte of load immediate */
+ return NOTIFY_STOP;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mark_notify = {
+ .notifier_call = mark_notifier,
+ .priority = 0x7fffffff, /* we need to be notified first */
+};
+
+int marker_optimized_set_enable(void *address, char enable)
+{
+ char saved_byte;
+ int ret;
+ char *dest = address;
+
+ if (!(enable ^ dest[1])) /* Must be a state change 0<->1 to execute */
+ return 0;
+
+ mutex_lock(&mark_mutex);
+ target_eip = (long)address + BREAKPOINT_INS_LEN;
+ /* register_die_notifier has memory barriers */
+ register_die_notifier(&mark_notify);
+ saved_byte = *dest;
+ *dest = BREAKPOINT_INSTRUCTION;
+ wmb();
+ /* Execute serializing instruction on each CPU.
+ * Acts as a memory barrier. */
+ ret = on_each_cpu(mark_synchronize_core, NULL, 1, 1);
+ BUG_ON(ret != 0);
+
+ dest[1] = enable;
+ wmb();
+ *dest = saved_byte;
+ /* Wait for all int3 handlers to end
+ (interrupts are disabled in int3).
+ This CPU is clearly not in a int3 handler
+ (not preemptible).
+ synchronize_sched has memory barriers */
+ synchronize_sched();
+ unregister_die_notifier(&mark_notify);
+ /* unregister_die_notifier has memory barriers */
+ target_eip = 0;
+ mutex_unlock(&mark_mutex);
+ flush_icache_range(address, size);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(marker_optimized_set_enable);
Index: linux-2.6-lttng/include/asm-i386/marker.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/include/asm-i386/marker.h 2007-05-09 18:16:03.000000000 -0400
@@ -0,0 +1,84 @@
+#ifndef _ASM_I386_MARKER_H
+#define _ASM_I386_MARKER_H
+
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. i386 architecture optimisations.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+
+#ifdef CONFIG_MARKERS
+
+#define MF_DEFAULT (MF_OPTIMIZED | MF_LOCKDEP | MF_PRINTK)
+
+/* Optimized version of the markers */
+#define trace_mark_optimized(flags, name, format, args...) \
+ do { \
+ static const char __mstrtab_name_##name[] \
+ __attribute__((section("__markers_strings"))) \
+ = #name; \
+ static const char __mstrtab_format_##name[] \
+ __attribute__((section("__markers_strings"))) \
+ = format; \
+ static const char __mstrtab_args_##name[] \
+ __attribute__((section("__markers_strings"))) \
+ = #args; \
+ static struct __mark_marker_data __mark_data_##name \
+ __attribute__((section("__markers_data"))) = \
+ { __mstrtab_name_##name, __mstrtab_format_##name, \
+ __mstrtab_args_##name, \
+ (flags) | MF_OPTIMIZED, __mark_empty_function, NULL }; \
+ char condition; \
+ asm volatile( ".section __markers, \"a\", @progbits;\n\t" \
+ ".long %1, 0f;\n\t" \
+ ".previous;\n\t" \
+ ".align 2\n\t" \
+ "0:\n\t" \
+ "movb $0,%0;\n\t" \
+ : "=r" (condition) \
+ : "m" (__mark_data_##name)); \
+ __mark_check_format(format, ## args); \
+ if (likely(!condition)) { \
+ } else { \
+ preempt_disable(); \
+ (*__mark_data_##name.call)(&__mark_data_##name, \
+ format, ## args); \
+ preempt_enable(); \
+ } \
+ } while (0)
+
+/* Marker macro selecting the generic or optimized version of marker, depending
+ * on the flags specified. */
+#define _trace_mark(flags, format, args...) \
+do { \
+ if (((flags) & MF_LOCKDEP) && ((flags) & MF_OPTIMIZED)) \
+ trace_mark_optimized(flags, format, ## args); \
+ else \
+ trace_mark_generic(flags, format, ## args); \
+} while (0)
+
+/* Marker with default behavior */
+#define trace_mark(format, args...) _trace_mark(MF_DEFAULT, format, ## args)
+
+/* Architecture dependant marker information, used internally for marker
+ * activation. */
+
+/* Offset of the immediate value from the start of the movb instruction, in
+ * bytes. */
+#define MARK_OPTIMIZED_ENABLE_IMMEDIATE_OFFSET 1
+#define MARK_OPTIMIZED_ENABLE_TYPE char
+/* Dereference enable as lvalue from a pointer to its instruction */
+#define MARK_OPTIMIZED_ENABLE(a) \
+ *(MARK_OPTIMIZED_ENABLE_TYPE*) \
+ ((char*)a+MARK_OPTIMIZED_ENABLE_IMMEDIATE_OFFSET)
+
+extern int marker_optimized_set_enable(void *address, char enable);
+
+#endif
+#endif //_ASM_I386_MARKER_H

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68


2007-05-10 09:07:14

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

On Wed, May 09, 2007 at 09:56:00PM -0400, Mathieu Desnoyers wrote:
> @@ -0,0 +1,99 @@
> +/* marker.c
> + *
> + * Erratum 49 fix for Intel PIII and higher.

Errata are CPU specific so they can't be higher. You mean it's a P3
erratum only?

In general you need some more description why the int3 handler is needed.

> + *
> + * Permits marker activation by XMC with correct serialization.

This doesn't follow the Intel recommended algorithm for cross modifying
code (7.1.3 in the IA32 manual) which requires the other CPUs to spin
during the modification.

If you used that would you really need the INT3 handler?

> +static DEFINE_MUTEX(mark_mutex);
> +static long target_eip = 0;
> +
> +static void mark_synchronize_core(void *info)
> +{
> + sync_core(); /* use cpuid to stop speculative execution */
> +}
> +
> +/* We simply skip the 2 bytes load immediate here, leaving the register in an
> + * undefined state. We don't care about the content (0 or !0), because we are
> + * changing the value 0->1 or 1->0. This small window of undefined value
> + * doesn't matter.
> + */
> +static int mark_notifier(struct notifier_block *nb,
> + unsigned long val, void *data)
> +{
> + enum die_val die_val = (enum die_val) val;
> + struct die_args *args = (struct die_args *)data;
> +
> + if (!args->regs || user_mode_vm(args->regs))

I don't think regs should be ever NULL

> + return NOTIFY_DONE;
> +
> + if (die_val == DIE_INT3 && args->regs->eip == target_eip) {
> + args->regs->eip += 1; /* Skip the next byte of load immediate */

In what instruction results that then? This is definitely underdocumented.

> +int marker_optimized_set_enable(void *address, char enable)
> +{
> + char saved_byte;
> + int ret;
> + char *dest = address;
> +
> + if (!(enable ^ dest[1])) /* Must be a state change 0<->1 to execute */
> + return 0;

Wouldn't you need that inside the mutex too to avoid races?


> +
> + mutex_lock(&mark_mutex);
> + target_eip = (long)address + BREAKPOINT_INS_LEN;
> + /* register_die_notifier has memory barriers */
> + register_die_notifier(&mark_notify);
> + saved_byte = *dest;
> + *dest = BREAKPOINT_INSTRUCTION;
> + wmb();

wmb is a nop

> + /* Execute serializing instruction on each CPU.
> + * Acts as a memory barrier. */
> + ret = on_each_cpu(mark_synchronize_core, NULL, 1, 1);
> + BUG_ON(ret != 0);
> +
> + dest[1] = enable;
> + wmb();
> + *dest = saved_byte;
> + /* Wait for all int3 handlers to end
> + (interrupts are disabled in int3).
> + This CPU is clearly not in a int3 handler
> + (not preemptible).

So what happens when the kernel is preemptible?

> + synchronize_sched has memory barriers */
> + synchronize_sched();
> + unregister_die_notifier(&mark_notify);
> + /* unregister_die_notifier has memory barriers */
> + target_eip = 0;
> + mutex_unlock(&mark_mutex);
> + flush_icache_range(address, size);

That's a nop on x86

> +
> +#ifdef CONFIG_MARKERS
> +
> +#define MF_DEFAULT (MF_OPTIMIZED | MF_LOCKDEP | MF_PRINTK)
> +
> +/* Optimized version of the markers */
> +#define trace_mark_optimized(flags, name, format, args...) \
> + do { \
> + static const char __mstrtab_name_##name[] \
> + __attribute__((section("__markers_strings"))) \
> + = #name; \
> + static const char __mstrtab_format_##name[] \
> + __attribute__((section("__markers_strings"))) \
> + = format; \
> + static const char __mstrtab_args_##name[] \
> + __attribute__((section("__markers_strings"))) \
> + = #args; \

For what do you need special string sections?

If it's something to be read by external programs the interface
would need to be clearly defined and commented.
If not just use normal variables.

> + static struct __mark_marker_data __mark_data_##name \
> + __attribute__((section("__markers_data"))) = \
> + { __mstrtab_name_##name, __mstrtab_format_##name, \
> + __mstrtab_args_##name, \
> + (flags) | MF_OPTIMIZED, __mark_empty_function, NULL }; \
> + char condition; \
> + asm volatile( ".section __markers, \"a\", @progbits;\n\t" \

The volatile is not needed because the output is used.

> + ".long %1, 0f;\n\t" \
> + ".previous;\n\t" \
> + ".align 2\n\t" \
> + "0:\n\t" \
> + "movb $0,%0;\n\t" \

This should be a generic facility in a separate include file / separate
macros etc so that it can be used by other code too.

> + : "=r" (condition) \
> + : "m" (__mark_data_##name)); \
> + __mark_check_format(format, ## args); \
> + if (likely(!condition)) { \
> + } else { \
> + preempt_disable(); \

Why the preempt disable here and why can't it be in the hook?

> + (*__mark_data_##name.call)(&__mark_data_##name, \
> + format, ## args); \
> + preempt_enable(); \
> + } \
> + } while (0)
> +
> +/* Marker macro selecting the generic or optimized version of marker, depending
> + * on the flags specified. */
> +#define _trace_mark(flags, format, args...) \
> +do { \
> + if (((flags) & MF_LOCKDEP) && ((flags) & MF_OPTIMIZED)) \
> + trace_mark_optimized(flags, format, ## args); \
> + else \
> + trace_mark_generic(flags, format, ## args); \

Is there ever a good reason not to use optimized markers?

> + * bytes. */
> +#define MARK_OPTIMIZED_ENABLE_IMMEDIATE_OFFSET 1
> +#define MARK_OPTIMIZED_ENABLE_TYPE char

unsigned char is usually safer to avoid sign extension

-Andi

2007-05-10 15:55:23

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

* Andi Kleen ([email protected]) wrote:
> On Wed, May 09, 2007 at 09:56:00PM -0400, Mathieu Desnoyers wrote:
> > @@ -0,0 +1,99 @@
> > +/* marker.c
> > + *
> > + * Erratum 49 fix for Intel PIII and higher.
>
> Errata are CPU specific so they can't be higher. You mean it's a P3
> erratum only?
>
> In general you need some more description why the int3 handler is needed.
>

From :
Intel® Core™2 Duo Processor for
Intel® Centrino® Duo Processor
Technology
Specification Update

AH33.
Unsynchronized Cross-Modifying Code Operations Can Cause
Unexpected Instruction Execution Results

It is therefore still relevant on newer CPUs. I can add a reference to this
documentation in the header.


> > + *
> > + * Permits marker activation by XMC with correct serialization.
>
> This doesn't follow the Intel recommended algorithm for cross modifying
> code (7.1.3 in the IA32 manual) which requires the other CPUs to spin
> during the modification.
>
> If you used that would you really need the INT3 handler?
>

Let's go through the exercise of applying their algorithm to a running
kernel.

You would have to do something like this :
CPU A wants to modify code; it initializes 2 completion barriers (one to
know when all other CPUs are spinning and another to tell them they can
stop spinning) and sends an IPI to every other CPU. All other CPUs,
upon entry in the IPI handler, disable interrupts, indicate in the 1st
barrier that they are spinning and wait for the completion variable to
be reset by CPU A. When all other CPUs are ready, CPU A disables
interrupts and proceeds to the change, then releases the second barrier
to let the other CPUs exit their loops.
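
Roughly, in code, that would look like this (just a sketch to illustrate the
idea; the xmc_* names are made up and none of this is part of the patch) :

/* assumes <linux/smp.h>, <asm/atomic.h> and <asm/processor.h> */
static atomic_t xmc_cpus_parked;
static atomic_t xmc_release;

static void xmc_spin_ipi(void *info)
{
	unsigned long flags;

	local_irq_save(flags);
	atomic_inc(&xmc_cpus_parked);		/* tell CPU A we are parked */
	while (!atomic_read(&xmc_release))
		cpu_relax();			/* spin with interrupts off */
	sync_core();				/* serialize before resuming */
	local_irq_restore(flags);
}

static void xmc_modify_byte(unsigned char *addr, unsigned char newbyte)
{
	unsigned long flags;

	atomic_set(&xmc_cpus_parked, 0);
	atomic_set(&xmc_release, 0);
	/* IPI every other CPU, do not wait for the handlers to complete */
	smp_call_function(xmc_spin_ipi, NULL, 1, 0);
	while (atomic_read(&xmc_cpus_parked) != num_online_cpus() - 1)
		cpu_relax();			/* wait until everyone spins */
	local_irq_save(flags);
	*addr = newbyte;			/* CPU A performs the change */
	local_irq_restore(flags);
	atomic_set(&xmc_release, 1);		/* let the other CPUs go */
}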

* First issue : Impact on the system. If we try to make this system
scale, we will create very long irq disable sections. The expected
duration is the worst-case IPI latency plus the time it takes CPU A
to change the variable. We therefore directly grow the system's
worst-case interrupt latency.

* Second issue : irq disabling does not protect us from NMI and traps.
We cannot use this algorithm to mark these code segments.

* Third issue : Scalability. Changing code will stop every CPU on the
system for a while. Compared to this, the int3-based approach will run
through the breakpoint handler "if" one of the CPUs happens to execute
this code at the wrong time. The standard case is just an IPI (to
issue one "iret") and a synchronize_sched().

Also note that djprobes, a jump-based enhancement of kprobes, uses a
mechanism very similar to mine. I did not use their implementation
because I consider that kprobes has too many side-effects to be used in
"hard to instrument" sites (traps, NMI handlers, lockdep code). kprobes
also uses a temporary data structure to hold the stepped-over
instructions, something I wanted to avoid. It enables me to do the
teardown of the breakpoint even when it is placed in preemptible code.

> > +static DEFINE_MUTEX(mark_mutex);
> > +static long target_eip = 0;
> > +
> > +static void mark_synchronize_core(void *info)
> > +{
> > + sync_core(); /* use cpuid to stop speculative execution */
> > +}
> > +
> > +/* We simply skip the 2 bytes load immediate here, leaving the register in an
> > + * undefined state. We don't care about the content (0 or !0), because we are
> > + * changing the value 0->1 or 1->0. This small window of undefined value
> > + * doesn't matter.
> > + */
> > +static int mark_notifier(struct notifier_block *nb,
> > + unsigned long val, void *data)
> > +{
> > + enum die_val die_val = (enum die_val) val;
> > + struct die_args *args = (struct die_args *)data;
> > +
> > + if (!args->regs || user_mode_vm(args->regs))
>
> I don't think regs should be ever NULL
>

I played it safe and used the same tests done in kprobes. I'll let them
explain. See

arch/i386/kernel/kprobes.c

int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct die_args *args = (struct die_args *)data;
int ret = NOTIFY_DONE;

if (args->regs && user_mode_vm(args->regs))
return ret;
...


> > + return NOTIFY_DONE;
> > +
> > + if (die_val == DIE_INT3 && args->regs->eip == target_eip) {
> > + args->regs->eip += 1; /* Skip the next byte of load immediate */
>
> In what instruction results that then? This is definitely underdocumented.
>

Should maybe document this more :

The eip there points right after the 1 byte breakpoint. The breakpoint
replaces the 1st byte of a 2 bytes load immediate. Therefore, if I skip
the following byte (the load immediate value), I am then at the
instruction following the load immediate. As I stated in my comments,
since I check that there is a state change to the variable (0->1 or
1->0), I can afford having garbage in my registers at this point,
because it will be either the state prior to the change I am currently
doing or the new state after the change.

I hope it makes the following comment, found just above mark_notifier(),
clearer :
/* We simply skip the 2 bytes load immediate here, leaving the register in an
* undefined state. We don't care about the content (0 or !0), because we are
* changing the value 0->1 or 1->0. This small window of undefined value
* doesn't matter.
*/

I will add this paragraph prior to the existing one to make things clearer :

/* The eip value points right after the breakpoint instruction, in the second
* byte of the movb. Incrementing it by 1 byte makes the code resume right after
* the movb instruction, effectively skipping this instruction.
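
To illustrate with bytes (illustrative only -- the exact opcode depends on
which byte register gcc picks for the "=r" output; %al is assumed here) :

  b0 00   movb $0x0,%al   <- the site as generated (2 bytes)
  cc 00   int3 + old immediate, while the breakpoint is armed; eip after
          the trap points at the 0x00 byte, the notifier adds 1, so
          execution resumes at the next instruction with %al undefined
  b0 01   movb $0x1,%al   <- the site once "enable" has been written back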

> > +int marker_optimized_set_enable(void *address, char enable)
> > +{
> > + char saved_byte;
> > + int ret;
> > + char *dest = address;
> > +
> > + if (!(enable ^ dest[1])) /* Must be a state change 0<->1 to execute */
> > + return 0;
>
> Wouldn't you need that inside the mutex too to avoid races?
>

Absolutely, will fix.
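
Something like this (a sketch of the intended fix, not the actual patch yet) :

	mutex_lock(&mark_mutex);
	if (!(enable ^ dest[1])) {	/* Must be a 0<->1 state change */
		mutex_unlock(&mark_mutex);
		return 0;
	}
	target_eip = (long)address + BREAKPOINT_INS_LEN;
	...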

>
> > +
> > + mutex_lock(&mark_mutex);
> > + target_eip = (long)address + BREAKPOINT_INS_LEN;
> > + /* register_die_notifier has memory barriers */
> > + register_die_notifier(&mark_notify);
> > + saved_byte = *dest;
> > + *dest = BREAKPOINT_INSTRUCTION;
> > + wmb();
>
> wmb is a nop
>

Quoting asm-i386/system.h :

* Some non intel clones support out of order store. wmb() ceases to be
* a nop for these.

#ifdef CONFIG_X86_OOSTORE
/* Actually there are no OOO store capable CPUs for now that do SSE,
but make it already an possibility. */
#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
#else
#define wmb() __asm__ __volatile__ ("": : :"memory")
#endif

As I will soon reuse this code on x86_64, wmb() is not a nop under
CONFIG_UNORDERED_IO.

Therefore, I think I am playing it safe by leaving a wmb() there.

> > + /* Execute serializing instruction on each CPU.
> > + * Acts as a memory barrier. */
> > + ret = on_each_cpu(mark_synchronize_core, NULL, 1, 1);
> > + BUG_ON(ret != 0);
> > +
> > + dest[1] = enable;
> > + wmb();
> > + *dest = saved_byte;
> > + /* Wait for all int3 handlers to end
> > + (interrupts are disabled in int3).
> > + This CPU is clearly not in a int3 handler
> > + (not preemptible).
>
> So what happens when the kernel is preemptible?
>

This comment may be unclear : the "(not preemptible)" applies to the
int3 handler itself. I'll rephrase it.

Because the int3 handler disables interrupts, it is not preemptible.
Therefore, the CPU running this code, doing the synchronize_sched(),
cannot reschedule an int3 handler running on a preempted thread stack
waiting to be scheduled again. Since the original mov instruction has been
placed back before the call to synchronize_sched(), and there is a
memory barrier in synchronize_sched, we know there is no possibility for
an int3 handler to run for this instruction on the CPU that runs
synchronize_sched(). At the end of the synchronize_sched(), we know we
have exited every currently executing non-preemptible section, which
includes the int3 handlers on every other CPU.
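
Schematically (an illustration of the ordering only, not code from the patch) :

	CPU A (doing the update)          another CPU
	                                  hits the int3 -> mark_notifier()
	                                  runs with irqs off (not preemptible)
	*dest = saved_byte;
	synchronize_sched();  <---------- cannot complete until that CPU leaves
	                                  its non-preemptible section, i.e.
	                                  until the int3 handler has returned
	unregister_die_notifier(&mark_notify);	/* now safe */
	target_eip = 0;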


> > + synchronize_sched has memory barriers */
> > + synchronize_sched();
> > + unregister_die_notifier(&mark_notify);
> > + /* unregister_die_notifier has memory barriers */
> > + target_eip = 0;
> > + mutex_unlock(&mark_mutex);
> > + flush_icache_range(address, size);
>
> That's a nop on x86
>

Right, and eventually on x86_64 too. Will remove.

> > +
> > +#ifdef CONFIG_MARKERS
> > +
> > +#define MF_DEFAULT (MF_OPTIMIZED | MF_LOCKDEP | MF_PRINTK)
> > +
> > +/* Optimized version of the markers */
> > +#define trace_mark_optimized(flags, name, format, args...) \
> > + do { \
> > + static const char __mstrtab_name_##name[] \
> > + __attribute__((section("__markers_strings"))) \
> > + = #name; \
> > + static const char __mstrtab_format_##name[] \
> > + __attribute__((section("__markers_strings"))) \
> > + = format; \
> > + static const char __mstrtab_args_##name[] \
> > + __attribute__((section("__markers_strings"))) \
> > + = #args; \
>
> For what do you need special string sections?
>
> If it's something to be read by external programs the interface
> would need to be clearly defined and commented.
> If not just use normal variables.
>

Markers, by their nature, will be declared throughout the kernel
code. Therefore, the strings would pollute the kernel data and use up
space in the data caches even when the markers are disabled. This is why
I think it makes sense to put this data in a separate section.


> > + static struct __mark_marker_data __mark_data_##name \
> > + __attribute__((section("__markers_data"))) = \
> > + { __mstrtab_name_##name, __mstrtab_format_##name, \
> > + __mstrtab_args_##name, \
> > + (flags) | MF_OPTIMIZED, __mark_empty_function, NULL }; \
> > + char condition; \
> > + asm volatile( ".section __markers, \"a\", @progbits;\n\t" \
>
> The volatile is not needed because the output is used.
>

True. Will fix. The same applies to asm-powerpc/marker.h.


> > + ".long %1, 0f;\n\t" \
> > + ".previous;\n\t" \
> > + ".align 2\n\t" \
> > + "0:\n\t" \
> > + "movb $0,%0;\n\t" \
>
> This should be a generic facility in a separate include file / separate
> macros etc so that it can be used by other code too.
>

Yep, the split into a "conditional call" infrastructure will come soon.
I was thinking of something like cond_call(func(arg1, arg2));. If you
think of a better macro name, I am open to suggestions.
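
The shape I have in mind is roughly this (hypothetical sketch only; the name
is a placeholder, and the real version would use the same patched movb
immediate trick as trace_mark_optimized() above instead of a plain load) :

#define cond_call(call)						\
	do {							\
		/* set/cleared by the (de)activation code */	\
		static char __cond_call_enable;			\
		if (unlikely(__cond_call_enable))		\
			call;					\
	} while (0)

/* usage : cond_call(do_expensive_debug_check(a, b)); */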

> > + : "=r" (condition) \
> > + : "m" (__mark_data_##name)); \
> > + __mark_check_format(format, ## args); \
> > + if (likely(!condition)) { \
> > + } else { \
> > + preempt_disable(); \
>
> Why the preempt disable here and why can't it be in the hook?
>

Because I want to safely disconnect hooks. And I need to issue a
synchronize_sched() after disconnection to make sure every handler has
finished running before I unload the module that implements the hook
code. Therefore, I must surround the call with preempt_disable.
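
The disconnection side then looks roughly like this (a sketch; the function
name is made up, only the ordering matters) :

void marker_disarm_probe(struct __mark_marker_data *mdata)
{
	mdata->call = __mark_empty_function;	/* unhook the site */
	/*
	 * Wait for every preempt-disabled section -- and therefore every
	 * handler invocation surrounded by the preempt_disable() above --
	 * that is currently in flight to complete.
	 */
	synchronize_sched();
	/* Now the module providing the old handler can safely be unloaded. */
}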

> > + (*__mark_data_##name.call)(&__mark_data_##name, \
> > + format, ## args); \
> > + preempt_enable(); \
> > + } \
> > + } while (0)
> > +
> > +/* Marker macro selecting the generic or optimized version of marker, depending
> > + * on the flags specified. */
> > +#define _trace_mark(flags, format, args...) \
> > +do { \
> > + if (((flags) & MF_LOCKDEP) && ((flags) & MF_OPTIMIZED)) \
> > + trace_mark_optimized(flags, format, ## args); \
> > + else \
> > + trace_mark_generic(flags, format, ## args); \
>
> Is there ever a good reason not to use optimized markers?
>

Yep, instrumentation of the breakpoint handler, and instrumentation of
anything that can be called from the breakpoint handler, namely
lockdep.c code.

> > + * bytes. */
> > +#define MARK_OPTIMIZED_ENABLE_IMMEDIATE_OFFSET 1
> > +#define MARK_OPTIMIZED_ENABLE_TYPE char
>
> unsigned char is usually safer to avoid sign extension
>

Ok, will fix.

Thanks for the thorough review,

Mathieu

> -Andi

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-05-10 16:25:33

by Alan

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

> * First issue : Impact on the system. If we try to make this system
> scale, we will create very long irq disable sections. The expected
> duration is the worst-case IPI latency plus the time it takes CPU A
> to change the variable. We therefore directly grow the system's
> worst-case interrupt latency.

Not a huge problem. It doesn't scale in really horrible ways and the IPI
latency on a PIV or later is actually very good. Also the impact is less
than you might think as on huge huge boxes you want multiple copies of
the kernel text pages to reduce NUMA traffic, so you only have to sync
the group of processors involved

> * Second issue : irq disabling does not protect us from NMI and traps.
> We cannot use this algorithm to mark these code segments.

If you synchronize all the other processors and disable local interrupts
then the only traps you have to worry about are those you cause, and the
only person taking the trap will be you so you're ok.

NMI is hard but NMI is a special case not worth solving IMHO.

> * Third issue : Scalability. Changing code will stop every CPU on the
> system for a while. Compared to this, the int3-based approach will run
> through the breakpoint handler "if" one of the CPUs happens to execute
> this code at the wrong time. The standard case is just an IPI (to

If I read the errata right then patching in an int3 will itself trigger
the errata so anything could happen.

I believe there are other safe sequences for doing code patching - perhaps
one of the Intel folk can advise ?

Alan

2007-05-10 16:59:44

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

* Alan Cox ([email protected]) wrote:
> > * First issue : Impact on the system. If we try to make this system
> > scale, we will create very long irq disable sections. The expected
> > duration is the worst-case IPI latency plus the time it takes CPU A
> > to change the variable. We therefore directly grow the system's
> > worst-case interrupt latency.
>
> Not a huge problem. It doesn't scale in really horrible ways and the IPI
> latency on a PIV or later is actually very good. Also the impact is less
> than you might think as on huge huge boxes you want multiple copies of
> the kernel text pages to reduce NUMA traffic, so you only have to sync
> the group of processors involved
>
> > * Second issue : irq disabling does not protect us from NMI and traps.
> > We cannot use this algorithm to mark these code segments.
>
> If you synchronize all the other processors and disable local interrupts
> then the only traps you have to worry about are those you cause, and the
> only person taking the trap will be you so you're ok.
>
> NMI is hard but NMI is a special case not worth solving IMHO.
>

Not caring about NMIs may have more impact than one could expect. You
have to be aware that (at least) the following code is executed in NMI
context. Trying to patch any of these functions could result in a dying
CPU :

default_do_nmi
notify_die
nmi_watchdog_tick
printk
die_nmi (should cause an OOPS, not kill the CPU..)
do_nmi_callback
unknown_nmi_panic_callback
sprintf
unknown_nmi_error
panic
reassert_nmi

In entry.S, there is also a call to local_irq_enable(), which falls into
lockdep code.

Tracing those core kernel functions is a fundamental need of crash
tracing. So, in my point of view, it is not "just" about tracing NMIs,
but it's about tracing code that can be touched by NMIs.

> > * Third issue : Scalability. Changing code will stop every CPU on the
> > system for a while. Compared to this, the int3-based approach will run
> > through the breakpoint handler "if" one of the CPUs happens to execute
> > this code at the wrong time. The standard case is just an IPI (to
>
> If I read the errata right then patching in an int3 will itself trigger
> the errata so anything could happen.
>
> I believe there are other safe sequences for doing code patching - perhaps
> one of the Intel folk can advise ?
>

I'll let the Intel guys confirm this, I don't have the reference nearby
(I got this information by talking with the kprobe team members, and
they got this information directly from Intel developers) but the
int3 is the one special case to which the errata does not apply.
Otherwise, kprobes and gdb would have a big, big issue.

Mathieu


> Alan

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

by Ananth N Mavinakayanahalli

[permalink] [raw]

Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

On Thu, May 10, 2007 at 12:59:18PM -0400, Mathieu Desnoyers wrote:
> * Alan Cox ([email protected]) wrote:

...
> > > * Third issue : Scalability. Changing code will stop every CPU on the
> > > system for a while. Compared to this, the int3-based approach will run
> > > through the breakpoint handler "if" one of the CPUs happens to execute
> > > this code at the wrong time. The standard case is just an IPI (to
> >
> > If I read the errata right then patching in an int3 will itself trigger
> > the errata so anything could happen.
> >
> > I believe there are other safe sequences for doing code patching - perhaps
> > one of the Intel folk can advise ?

IIRC, when the first implementation of what exists now as kprobes was
done (as part of the dprobes framework), this question did come up. I
think the conclusion was that the errata applies only to multi-byte
modifications and single-byte changes are guaranteed to be atomic.
Given int3 on Intel is just 1-byte, we are safe.

> I'll let the Intel guys confirm this, I don't have the reference nearby
> (I got this information by talking with the kprobe team members, and
> they got this information directly from Intel developers) but the
> int3 is the one special case to which the errata does not apply.
> Otherwise, kprobes and gdb would have a big, big issue.

Perhaps Richard/Suparna can confirm.

Ananth

2007-05-11 06:04:54

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

On Thu, May 10, 2007 at 12:59:18PM -0400, Mathieu Desnoyers wrote:
> * Alan Cox ([email protected]) wrote:
> > > * First issue : Impact on the system. If we try to make this system
> > > scale, we will create very long irq disable sections. The expected
> > > duration is the worst-case IPI latency plus the time it takes CPU A
> > > to change the variable. We therefore directly grow the system's
> > > worst-case interrupt latency.
> >
> > Not a huge problem. It doesn't scale in really horrible ways and the IPI
> > latency on a PIV or later is actually very good. Also the impact is less
> > than you might think as on huge huge boxes you want multiple copies of
> > the kernel text pages to reduce NUMA traffic, so you only have to sync
> > the group of processors involved

I agree with Alan and disagree with you on the impact on the system.

> >
> > > * Second issue : irq disabling does not protect us from NMI and traps.
> > > We cannot use this algorithm to mark these code segments.
> >
> > If you synchronize all the other processors and disable local interrupts
> > then the only traps you have to worry about are those you cause, and the
> > only person taking the trap will be you so you're ok.
> >
> > NMI is hard but NMI is a special case not worth solving IMHO.
> >
>
> Not caring about NMIs may have more impact than one could expect. You
> have to be aware that (at least) the following code is executed in NMI
> context. Trying to patch any of these functions could result in a dying
> CPU :

There is a function to disable the nmi watchdog temporarily now


> In entry.S, there is also a call to local_irq_enable(), which falls into
> lockdep code.

??

>
> Tracing those core kernel functions is a fundamental need of crash
> tracing. So, in my point of view, it is not "just" about tracing NMIs,
> but it's about tracing code that can be touched by NMIs.

You only need to handle the errata during the modification, not during
the whole lifetime of the marker.

The only frequent NMIs are watchdog and oprofile which both can
be stopped. Other NMIs are very infrequent.

BTW if you worry about NMI you would need to worry about machine
check and SMI too.

-Andi

2007-05-11 18:12:33

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

* Andi Kleen ([email protected]) wrote:
> On Thu, May 10, 2007 at 12:59:18PM -0400, Mathieu Desnoyers wrote:
> > * Alan Cox ([email protected]) wrote:
> > > > * First issue : Impact on the system. If we try to make this system
> > > > scale, we will create very long irq disable sections. The expected
> > > > duration is the worst-case IPI latency plus the time it takes CPU A
> > > > to change the variable. We therefore directly grow the system's
> > > > worst-case interrupt latency.
> > >
> > > Not a huge problem. It doesn't scale in really horrible ways and the IPI
> > > latency on a PIV or later is actually very good. Also the impact is less
> > > than you might think as on huge huge boxes you want multiple copies of
> > > the kernel text pages to reduce NUMA traffic, so you only have to sync
> > > the group of processors involved
>
> I agree with Alan and disagree with you on the impact on the system.
>

I just want to make sure I understand your disagreement. You do not seem
to provide any counter-argument to the following technical fact : the
proposed algorithm will increase the worst-case interrupt latency of the
kernel.

The IPI might be fast, but I have seen interrupts being disabled for
quite a long time in some kernel code paths. Having interrupts disabled
on _each cpu_ while running an IPI handler waiting to be synchronized
with other CPUs has this side-effect. Therefore, if I understand well,
you object that the worst-case interrupt latency in the Linux kernel is
not important. Since I have some difficulty agreeing with your
objection, I'll leave the debate about the importance of such
side-effects to others, since it is mostly a political issue.

Or maybe I am not understanding you correctly ?


> > >
> > > > * Second issue : irq disabling does not protect us from NMI and traps.
> > > > We cannot use this algorithm to mark these code segments.
> > >
> > > If you synchronize all the other processors and disable local interrupts
> > > then the only traps you have to worry about are those you cause, and the
> > > only person taking the trap will be you so you're ok.
> > >
> > > NMI is hard but NMI is a special case not worth solving IMHO.
> > >
> >
> > Not caring about NMIs may have more impact than one could expect. You
> > have to be aware that (at least) the following code is executed in NMI
> > context. Trying to patch any of these functions could result in a dying
> > CPU :
>
> There is a function to disable the nmi watchdog temporarily now
>

As you pointed out below, NMI is only one example of the interrupt sources
that cannot be masked by disabling irqs; MCE and SMI are others. See below
for the rest of the discussion about this point.

>
> > In entry.S, there is also a call to local_irq_enable(), which falls into
> > lockdep code.
>
> ??
>

arch/i386/kernel/entry.S :
iret_exc:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code

include/asm-i386/irqflags.h
# define TRACE_IRQS_ON \
pushl %eax; \
pushl %ecx; \
pushl %edx; \
call trace_hardirqs_on; \
popl %edx; \
popl %ecx; \
popl %eax;

Which falls into the lockdep.c code.

> >
> > Tracing those core kernel functions is a fundamental need of crash
> > tracing. So, in my point of view, it is not "just" about tracing NMIs,
> > but it's about tracing code that can be touched by NMIs.
>
> You only need to handle the errata during the modification, not during
> the whole lifetime of the marker.
>

I agree with you, but you need to make the modification of every callee
of functions such as printk() safe in order to be able to trace them
later.

> The only frequent NMIs are watchdog and oprofile which both can
> be stopped. Other NMIs are very infrequent.

If we race with an "infrequent" NMI with this algorithm, it will result
in an unspecified trap, most likely a GPF. So having a solution that is
correct most of the time is not an option here. It will not just cause a
glitch, but bring the whole system down.

>
> BTW if you worry about NMI you would need to worry about machine
> check and SMI too.
>

Absolutely. I use NMIs as an example of these conditions, but MCE and
SMI also present the same issue.

So we get another example (I am sure we could easily find more) :

arch/i386/kernel/cpu/mcheck/p4.c:intel_machine_check
printk (and everything that printk calls)
vprintk
printk_clock
sched_clock
release_console_sem
call_console_drivers
.. therefore serial port code..
wake_up_klogd
wake_up_interruptible
try_to_wake_up

So, just the fact that the MCE handler uses printk involves scheduler code
execution. If you plan not to support NMI, MCE or SMI, you have to
forbid instrumentation of any of those code paths.

Mathieu

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-05-11 18:55:26

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

* Ananth N Mavinakayanahalli ([email protected]) wrote:
> On Thu, May 10, 2007 at 12:59:18PM -0400, Mathieu Desnoyers wrote:
> > * Alan Cox ([email protected]) wrote:
>
> ...
> > > > * Third issue : Scalability. Changing code will stop every CPU on the
> > > > system for a while. Compared to this, the int3-based approach will run
> > > > through the breakpoint handler "if" one of the CPUs happens to execute
> > > > this code at the wrong time. The standard case is just an IPI (to
> > >
> > > If I read the errata right then patching in an int3 will itself trigger
> > > the errata so anything could happen.
> > >
> > > I believe there are other safe sequences for doing code patching - perhaps
> > > one of the Intel folk can advise ?
>
> IIRC, when the first implementation of what exists now as kprobes was
> done (as part of the dprobes framework), this question did come up. I
> think the conclusion was that the errata applies only to multi-byte
> modifications and single-byte changes are guaranteed to be atomic.
> Given int3 on Intel is just 1-byte, we are safe.
>
> > I'll let the Intel guys confirm this, I don't have the reference nearby
> > (I got this information by talking with the kprobe team members, and
> > they got this information directly from Intel developers) but the
> > int3 is the one special case to which the errata does not apply.
> > Otherwise, kprobes and gdb would have a big, big issue.
>
> Perhaps Richard/Suparna can confirm.
>

Ha-ha! I found the reference. It's worth quoting in full :
http://sourceware.org/ml/systemtap/2005-q3/msg00208.html
------
From: Richard J Moore <richardj_moore at uk dot ibm dot com>

There is another issue to consider when looking into using probes other
then int3:

Intel erratum 54 - Unsynchronized Cross-modifying code - refers to the
practice of modifying code on one processor where another has prefetched
the unmodified version of the code. Intel states that unpredictable
general protection faults may result if a synchronizing instruction
(iret, int, int3, cpuid, etc ) is not executed on the second processor
before it executes the pre-fetched out-of-date copy of the instruction.

When we became aware of this I had a long discussion with Intel's
microarchitecture guys. It turns out that the reason for this erratum
(which incidentally Intel does not intend to fix) is because the trace
cache - the stream of micro-ops resulting from instruction interpretation
- cannot be guaranteed to be valid. Reading between the lines I assume this
issue arises because of optimization done in the trace cache, where it
is no longer possible to identify the original instruction boundaries.
If the CPU discovers that the trace cache has been invalidated because
of unsynchronized cross-modification then instruction execution will be
aborted with a GPF. Further discussion with Intel revealed that
replacing the first opcode byte with an int3 would not be subject to
this erratum.

So, is cmpxchg reliable? One has to guarantee more than mere atomicity.

-----

Therefore, this is exactly what my implementation is doing : I make sure
that no CPU sees an out-of-date copy of a pre-fetched instruction by 1 -
using a breakpoint, which skips the instruction that is going to be
modified, 2 - issuing an IPI to every CPU to execute a sync_core(), to
make sure that even when the breakpoint is removed, no CPU could
possibly still have the out-of-date copy of the instruction, modify the
now unused 2nd byte of the instruction, and then put back the original
1st byte of the instruction.
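
Mapped onto marker_optimized_set_enable() above, the sequence is :

	*dest = BREAKPOINT_INSTRUCTION;	/* 1) arm the int3: any CPU executing
					      the site goes through the notifier
					      and skips the immediate */
	on_each_cpu(mark_synchronize_core, NULL, 1, 1);
					/* 2) cpuid on every CPU: nobody keeps a
					      stale prefetch of the old bytes */
	dest[1] = enable;		/* 3) modify the immediate, which is
					      skipped over while the int3 is armed */
	*dest = saved_byte;		/* 4) restore the original first byte */
	synchronize_sched();		/* 5) wait for in-flight int3 handlers
					      before tearing down the notifier */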

It has exactly the same intent as the algorithm proposed by Intel, but
it has fewer side-effects, scales better and supports NMI, SMI and MCE.

Mathieu

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-05-11 21:53:16

by Alan

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

> The IPI might be fast, but I have seen interrupts being disabled for
> quite a long time in some kernel code paths. Having interrupts disabled
> on _each cpu_ while running an IPI handler waiting to be synchronized
> with other CPUs has this side-effect. Therefore, if I understand well,

This can already occur worst case when we spin on an IPI (eg a cross CPU
TLB shootdown)

If the INT3 is acknowledged as safe by intel either as is or by some
specific usage like lock mov then great. If not it isn't too bad a
problem.

And to be real about this - how many benchmarks do you know that care
about mega-kernel-debugs per second ?

2007-05-12 05:28:25

by Suparna Bhattacharya

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

On Fri, May 11, 2007 at 10:27:29AM +0530, Ananth N Mavinakayanahalli wrote:
> On Thu, May 10, 2007 at 12:59:18PM -0400, Mathieu Desnoyers wrote:
> > * Alan Cox ([email protected]) wrote:
>
> ...
> > > > * Third issue : Scalability. Changing code will stop every CPU on the
> > > > system for a while. Compared to this, the int3-based approach will run
> > > > through the breakpoint handler "if" one of the CPUs happens to execute
> > > > this code at the wrong time. The standard case is just an IPI (to
> > >
> > > If I read the errata right then patching in an int3 will itself trigger
> > > the errata so anything could happen.
> > >
> > > I believe there are other safe sequences for doing code patching - perhaps
> > > one of the Intel folk can advise ?
>
> IIRC, when the first implementation of what exists now as kprobes was
> done (as part of the dprobes framework), this question did come up. I
> think the conclusion was that the errata applies only to multi-byte
> modifications and single-byte changes are guaranteed to be atomic.
> Given int3 on Intel is just 1-byte, we are safe.
>
> > I'll let the Intel guys confirm this, I don't have the reference nearby
> > (I got this information by talking with the kprobe team members, and
> > they got this information directly from Intel developers) but the
> > int3 is the one special case to which the errata does not apply.
> > Otherwise, kprobes and gdb would have a big, big issue.
>
> Perhaps Richard/Suparna can confirm.

I just tried digging up past discussions on this from Richard, about int3
being safe

http://sourceware.org/ml/systemtap/2005-q3/msg00208.html
http://lkml.org/lkml/2006/9/20/30

Regards
Suparna

>
> Ananth

--
Suparna Bhattacharya ([email protected])
Linux Technology Center
IBM Software Lab, India

2007-05-13 16:21:15

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [patch 05/10] Linux Kernel Markers - i386 optimized version

* Alan Cox ([email protected]) wrote:
> > The IPI might be fast, but I have seen interrupts being disabled for
> > quite a long time in some kernel code paths. Having interrupts disabled
> > on _each cpu_ while running an IPI handler waiting to be synchronized
> > with other CPUs has this side-effect. Therefore, if I understand well,
>
> This can already occur worst case when we spin on an IPI (eg a cross CPU
> TLB shootdown)
>

Hrm, maybe I am understanding something incorrectly here :

arch/i386/kernel/smp.c: native_flush_tlb_others() takes a spinlock, but
does not disable interrupts, while spinning waiting for other CPUs.
smp_invalidate_interrupt(), in the same file, does not spin waiting for
other CPUs. Therefore, I understand that none of these functions spin
with interrupts disabled, so this TLB flush does not show the same
behavior.


> If the INT3 is acknowledged as safe by intel either as is or by some
> specific usage like lock mov then great. If not it isn't too bad a
> problem.
>

Another mail in this thread explains that the main issue is not the
atomicity of the code modification operation (although it must be atomic
for the CPU to see a correct instruction), but the fact that the CPU
expects the pre-fetched instruction and the executed instruction to be
the same, except for the int3 case.

> And to be real about this - how many benchmarks do you know that care
> about mega-kernel-debugs per second ?

For users with real-time needs, the overall IRQ latency of the system
puts an upper bound on what can be executed by the application in a
given time-frame. People doing audio/video acquisition should be quite
interested in this metric.

So this is mostly a matter of how this action (enabling a marker) can
influence the system's overall latency. One of my goals is to provide
tracing in the Linux kernel with minimal performance and behavioral
impact on the system, so it does not make the system flakier than normal
and can be activated on a bogus system and still reproduce the original
problem.


Mathieu

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68