2006-12-21 01:51:11

by Mathieu Desnoyers

Subject: [PATCH 0/4] Linux Kernel Markers

Hi,

You will find, in the following posts, the latest revision of the Linux Kernel
Markers. Given the need some tracing projects (LTTng, SystemTAP) have for this
kind of mechanism, it would be nice to consider it for mainstream inclusion.

The following patches apply on 2.6.20-rc1-git7.
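
As an illustration, instrumenting a code path is a one-liner at the marker
site. A hypothetical call site (event name and arguments made up for the
example; the MARK() macro itself comes in patch 1) would look like:

	MARK(kernel_sched_switch, "%d %d", prev->pid, next->pid);

A tracer then attaches a probe to the site at runtime through
marker_set_probe(), also introduced in patch 1.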

Signed-off-by: Mathieu Desnoyers <[email protected]>



2006-12-20 23:58:07

by Mathieu Desnoyers

Subject: [PATCH 1/4] Linux Kernel Markers : Architecture agnostic code.

Linux Kernel Markers, architecture-independent code.

This patch also includes the non-optimised default behaviour from asm-generic
for each architecture where optimised support is not implemented.
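
For context, here is a minimal sketch of a consumer of this interface. The
module and probe names are hypothetical; the entry points are the ones
declared in include/linux/marker.h below:

	#include <linux/module.h>
	#include <linux/marker.h>

	static void my_probe(const char *fmt, ...)
	{
		/* a real probe would fetch the arguments with
		 * va_start()/va_end() and hand them to a tracer */
	}

	static int __init my_probe_init(void)
	{
		/* attach to every instance of the "subsystem_event" marker */
		if (!marker_set_probe("subsystem_event", "%d %s", my_probe))
			return -ENODEV;
		return 0;
	}

	static void __exit my_probe_exit(void)
	{
		marker_remove_probe(my_probe);
	}

	module_init(my_probe_init);
	module_exit(my_probe_exit);
	MODULE_LICENSE("GPL");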

Signed-off-by: Mathieu Desnoyers <[email protected]>

--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -118,6 +118,14 @@ #define RODATA \
__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \
*(__ksymtab_strings) \
} \
+ /* Kernel markers : pointers */ \
+ .markers : AT(ADDR(.markers) - LOAD_OFFSET) { \
+ VMLINUX_SYMBOL(__start___markers) = .; \
+ *(.markers) \
+ VMLINUX_SYMBOL(__stop___markers) = .; \
+ } \
+ __end_rodata = .; \
+ . = ALIGN(4096); \
\
/* Built-in module parameters. */ \
__param : AT(ADDR(__param) - LOAD_OFFSET) { \
--- /dev/null
+++ b/kernel/Kconfig.marker
@@ -0,0 +1,17 @@
+# Code markers configuration
+
+config MARKERS
+ bool "Activate markers"
+ select MODULES
+ default n
+ help
+ Place an empty function call at each marker site. It can be
+ dynamically exchanged for a probe function.
+
+config MARKERS_DISABLE_OPTIMIZATION
+ bool "Disable architecture specific marker optimization"
+ depends on EMBEDDED
+ default n
+ help
+ Disable code replacement jump optimisations. Especially useful if your
+ code is in a read-only ROM/flash.
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -356,6 +356,9 @@ #endif
/* The command line arguments (may be mangled). People like
keeping pointers to this stuff */
char *args;
+
+ const struct __mark_marker *markers;
+ unsigned int num_markers;
};

/* FIXME: It'd be nice to isolate modules during init, too, so they
@@ -469,6 +472,7 @@ extern void print_modules(void);
struct device_driver;
void module_add_driver(struct module *, struct device_driver *);
void module_remove_driver(struct device_driver *);
+extern void list_modules(void);

#else /* !CONFIG_MODULES... */
#define EXPORT_SYMBOL(sym)
--- /dev/null
+++ b/include/linux/marker.h
@@ -0,0 +1,65 @@
+#ifndef _LINUX_MARKER_H
+#define _LINUX_MARKER_H
+
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing.
+ *
+ * Example :
+ *
+ * MARK(subsystem_event, "%d %s", someint, somestring);
+ * Where :
+ * - subsystem is the name of your subsystem.
+ * - event is the name of the event to mark.
+ * - "%d %s" is the formatted string for printk.
+ * - someint is an integer.
+ * - somestring is a char *.
+ *
+ * - Dynamically overridable function call based on marker mechanism
+ * from Frank Ch. Eigler <[email protected]>.
+ * - Thanks to Jeremy Fitzhardinge <[email protected]> for his constructive
+ * criticism about gcc optimization related issues.
+ *
+ * The marker mechanism supports multiple instances of the same marker.
+ * Markers can be put in inline functions, inlined static functions and
+ * unrolled loops.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef __ASSEMBLY__
+
+typedef void marker_probe_func(const char *fmt, ...);
+
+#ifndef CONFIG_MARKERS_DISABLE_OPTIMIZATION
+#include <asm/marker.h>
+#else
+#include <asm-generic/marker.h>
+#endif
+
+#define MARK_NOARGS " "
+#define MARK_MAX_FORMAT_LEN 1024
+
+#ifndef CONFIG_MARKERS
+#define MARK(name, format, args...) \
+ __mark_check_format(format, ## args)
+#endif
+
+static inline __attribute__ ((format (printf, 1, 2)))
+void __mark_check_format(const char *fmt, ...)
+{ }
+
+extern marker_probe_func __mark_empty_function;
+
+extern int marker_set_probe(const char *name, const char *format,
+ marker_probe_func *probe);
+
+extern int marker_remove_probe(marker_probe_func *probe);
+extern int marker_list_probe(marker_probe_func *probe);
+
+#endif
+#endif
--- /dev/null
+++ b/include/asm-arm/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-cris/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-frv/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-generic/marker.h
@@ -0,0 +1,49 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Generic header.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ *
+ * Note : the empty asm volatile with read constraint is used here instead of a
+ * "used" attribute to fix a gcc 4.1.x bug.
+ */
+
+struct __mark_marker_c {
+ const char *name;
+ marker_probe_func **call;
+ const char *format;
+} __attribute__((packed));
+
+struct __mark_marker {
+ const struct __mark_marker_c *cmark;
+ volatile char *enable;
+} __attribute__((packed));
+
+#ifdef CONFIG_MARKERS
+
+#define MARK(name, format, args...) \
+ do { \
+ static marker_probe_func *__mark_call_##name = \
+ __mark_empty_function; \
+ volatile static char __marker_enable_##name = 0; \
+ static const struct __mark_marker_c __mark_c_##name \
+ __attribute__((section(".markers.c"))) = \
+ { #name, &__mark_call_##name, format } ; \
+ static const struct __mark_marker __mark_##name \
+ __attribute__((section(".markers"))) = \
+ { &__mark_c_##name, &__marker_enable_##name } ; \
+ asm volatile ( "" : : "i" (&__mark_##name)); \
+ __mark_check_format(format, ## args); \
+ if (unlikely(__marker_enable_##name)) { \
+ preempt_disable(); \
+ (*__mark_call_##name)(format, ## args); \
+ preempt_enable_no_resched(); \
+ } \
+ } while (0)
+
+
+#define MARK_ENABLE_IMMEDIATE_OFFSET 0
+
+#endif
--- /dev/null
+++ b/include/asm-h8300/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-ia64/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-m32r/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-m68k/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-m68knommu/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-mips/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-parisc/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-ppc/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-s390/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-sh64/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-sh/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-sparc64/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-sparc/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-um/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-v850/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-x86_64/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>
--- /dev/null
+++ b/include/asm-xtensa/marker.h
@@ -0,0 +1,13 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. Architecture specific
+ * optimisations.
+ *
+ * No optimisation implemented.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm-generic/marker.h>


2006-12-20 23:59:36

by Mathieu Desnoyers

Subject: [PATCH 2/4] Linux Kernel Markers : kconfig menus

Linux Kernel Markers : kconfig menus

Signed-off-by: Mathieu Desnoyers <[email protected]>

--- a/Makefile
+++ b/Makefile
@@ -308,7 +308,8 @@ # Use LINUXINCLUDE when you must referen
# Needed to be compatible with the O= option
LINUXINCLUDE := -Iinclude \
$(if $(KBUILD_SRC),-Iinclude2 -I$(srctree)/include) \
- -include include/linux/autoconf.h
+ -include include/linux/autoconf.h \
+ -include linux/marker.h

CPPFLAGS := -D__KERNEL__ $(LINUXINCLUDE)

--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -638,6 +638,12 @@ source "fs/Kconfig"

source "arch/alpha/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/alpha/Kconfig.debug"

source "security/Kconfig"
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -191,6 +191,12 @@ source "sound/Kconfig"

source "drivers/usb/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/cris/Kconfig.debug"

source "security/Kconfig"
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -375,6 +375,12 @@ source "drivers/Kconfig"

source "fs/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/frv/Kconfig.debug"

source "security/Kconfig"
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -205,6 +205,12 @@ endmenu

source "fs/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/h8300/Kconfig.debug"

source "security/Kconfig"
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1169,6 +1169,9 @@ config KPROBES
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+
+source "kernel/Kconfig.marker"
+
endmenu

source "arch/i386/Kconfig.debug"
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -564,6 +564,9 @@ config KPROBES
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+
+source "kernel/Kconfig.marker"
+
endmenu

source "arch/ia64/Kconfig.debug"
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -394,6 +394,12 @@ source "fs/Kconfig"

source "arch/m32r/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/m32r/Kconfig.debug"

source "security/Kconfig"
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -660,6 +660,12 @@ endmenu

source "fs/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/m68k/Kconfig.debug"

source "security/Kconfig"
--- a/arch/m68knommu/Kconfig
+++ b/arch/m68knommu/Kconfig
@@ -669,6 +669,12 @@ source "drivers/Kconfig"

source "fs/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/m68knommu/Kconfig.debug"

source "security/Kconfig"
--- a/arch/ppc/Kconfig
+++ b/arch/ppc/Kconfig
@@ -1443,8 +1443,14 @@ source "lib/Kconfig"

source "arch/powerpc/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/ppc/Kconfig.debug"

source "security/Kconfig"

source "crypto/Kconfig"
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1185,6 +1185,9 @@ config KPROBES
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+
+source "kernel/Kconfig.marker"
+
endmenu

source "arch/powerpc/Kconfig.debug"
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -263,6 +263,12 @@ source "fs/Kconfig"

source "arch/parisc/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/parisc/Kconfig.debug"

source "security/Kconfig"
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -968,6 +968,12 @@ source "fs/Kconfig"

source "arch/arm/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/arm/Kconfig.debug"

source "security/Kconfig"
--- a/arch/arm26/Kconfig
+++ b/arch/arm26/Kconfig
@@ -240,6 +240,12 @@ source "drivers/misc/Kconfig"

source "drivers/usb/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/arm26/Kconfig.debug"

source "security/Kconfig"
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2068,6 +2068,12 @@ source "fs/Kconfig"

source "arch/mips/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/mips/Kconfig.debug"

source "security/Kconfig"
@@ -2075,3 +2081,7 @@ source "security/Kconfig"
source "crypto/Kconfig"

source "lib/Kconfig"
+
+menu "Instrumentation Support"
+
+endmenu
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -518,6 +518,8 @@ config KPROBES
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".

+source "kernel/Kconfig.marker"
+
endmenu

source "arch/s390/Kconfig.debug"
--- a/arch/sh64/Kconfig
+++ b/arch/sh64/Kconfig
@@ -284,6 +284,12 @@ source "fs/Kconfig"

source "arch/sh64/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/sh64/Kconfig.debug"

source "security/Kconfig"
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -707,6 +707,12 @@ source "fs/Kconfig"

source "arch/sh/oprofile/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/sh/Kconfig.debug"

source "security/Kconfig"
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -443,6 +443,9 @@ config KPROBES
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+
+source "kernel/Kconfig.marker"
+
endmenu

source "arch/sparc64/Kconfig.debug"
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -302,6 +302,8 @@ menu "Instrumentation Support"

source "arch/sparc/oprofile/Kconfig"

+source "kernel/Kconfig.marker"
+
endmenu

source "arch/sparc/Kconfig.debug"
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -344,4 +344,10 @@ config INPUT
bool
default n

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/um/Kconfig.debug"
--- a/arch/v850/Kconfig
+++ b/arch/v850/Kconfig
@@ -332,6 +332,12 @@ source "sound/Kconfig"

source "drivers/usb/Kconfig"

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/v850/Kconfig.debug"

source "security/Kconfig"
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -244,6 +244,12 @@ config EMBEDDED_RAMDISK_IMAGE
provide one yourself.
endmenu

+menu "Instrumentation Support"
+
+source "kernel/Kconfig.marker"
+
+endmenu
+
source "arch/xtensa/Kconfig.debug"

source "security/Kconfig"
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -731,6 +731,9 @@ config KPROBES
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+
+source "kernel/Kconfig.marker"
+
endmenu

source "arch/x86_64/Kconfig.debug"


2006-12-21 00:00:50

by Mathieu Desnoyers

Subject: [PATCH 3/4] Linux Kernel Markers : i386 optimisation

This is the i386 optimisation for the Linux Kernel Markers.
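
For readers new to the trick: the enable flag is not a variable in memory but
the immediate operand of a movb instruction, so testing it costs no d-cache
access, and enabling a marker means patching a single code byte. Conceptually,
the MARK() macro below expands to something like the following (a sketch of
the intent, not the literal expansion):

	char condition;
	/* the $0 immediate is the byte that gets patched to $1 on enable */
	asm volatile("movb $0,%0" : "=r" (condition));
	if (unlikely(condition))
		(*__mark_call_name)(format, args);

The entry emitted into the .markers section records the address of that movb,
so MARK_ENABLE_IMMEDIATE_OFFSET below lets the enable code locate and patch
the immediate later.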

Signed-off-by: Mathieu Desnoyers <[email protected]>

--- /dev/null
+++ b/include/asm-i386/marker.h
@@ -0,0 +1,54 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. i386 architecture optimisations.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+
+struct __mark_marker_c {
+ const char *name;
+ marker_probe_func **call;
+ const char *format;
+} __attribute__((packed));
+
+struct __mark_marker {
+ struct __mark_marker_c *cmark;
+ volatile char *enable;
+} __attribute__((packed));
+
+#ifdef CONFIG_MARKERS
+#define MARK(name, format, args...) \
+ do { \
+ static marker_probe_func *__mark_call_##name = \
+ __mark_empty_function; \
+ static const struct __mark_marker_c __mark_c_##name \
+ __attribute__((section(".markers.c"))) = \
+ { #name, &__mark_call_##name, format } ; \
+ char condition; \
+ asm volatile( ".section .markers, \"a\";\n\t" \
+ ".long %1, 0f;\n\t" \
+ ".previous;\n\t" \
+ ".align 2\n\t" \
+ "0:\n\t" \
+ "movb $0,%0;\n\t" \
+ : "=r" (condition) \
+ : "m" (__mark_c_##name)); \
+ __mark_check_format(format, ## args); \
+ if (unlikely(condition)) { \
+ preempt_disable(); \
+ (*__mark_call_##name)(format, ## args); \
+ preempt_enable_no_resched(); \
+ } \
+ } while (0)
+
+/* Offset of the immediate value from the start of the movb instruction, in
+ * bytes. */
+#define MARK_ENABLE_IMMEDIATE_OFFSET 1
+#define MARK_POLYMORPHIC
+
+#endif


2006-12-21 00:01:45

by Mathieu Desnoyers

Subject: [PATCH 4/4] Linux Kernel Markers : powerpc optimisation

This is the powerpc Linux Kernel Markers optimised version.

Signed-off-by: Mathieu Desnoyers <[email protected]>

--- /dev/null
+++ b/include/asm-powerpc/marker.h
@@ -0,0 +1,58 @@
+/*
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing. PowerPC architecture
+ * optimisations.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <asm/asm-compat.h>
+
+struct __mark_marker_c {
+ const char *name;
+ marker_probe_func **call;
+ const char *format;
+} __attribute__((packed));
+
+struct __mark_marker {
+ struct __mark_marker_c *cmark;
+ volatile short *enable;
+} __attribute__((packed));
+
+#ifdef CONFIG_MARKERS
+
+#define MARK(name, format, args...) \
+ do { \
+ static marker_probe_func *__mark_call_##name = \
+ __mark_empty_function; \
+ static const struct __mark_marker_c __mark_c_##name \
+ __attribute__((section(".markers.c"))) = \
+ { #name, &__mark_call_##name, format } ; \
+ char condition; \
+ asm volatile( ".section .markers, \"a\";\n\t" \
+ PPC_LONG "%1, 0f;\n\t" \
+ ".previous;\n\t" \
+ ".align 4\n\t" \
+ "0:\n\t" \
+ "li %0,0;\n\t" \
+ : "=r" (condition) \
+ : "i" (&__mark_c_##name)); \
+ __mark_check_format(format, ## args); \
+ if (unlikely(condition)) { \
+ preempt_disable(); \
+ (*__mark_call_##name)(format, ## args); \
+ preempt_enable_no_resched(); \
+ } \
+ } while (0)
+
+
+/* Offset of the immediate value from the start of the addi instruction (result
+ * of the li mnemonic), in bytes. */
+#define MARK_ENABLE_IMMEDIATE_OFFSET 2
+#define MARK_POLYMORPHIC
+
+#endif


2007-01-13 01:33:40

by Richard J Moore

Subject: Re: [PATCH 0/4] Linux Kernel Markers



Mathieu Desnoyers <[email protected]> wrote on 20/12/2006 23:52:16:

> Hi,
>
> You will find, in the following posts, the latest revision of the Linux
> Kernel Markers. Given the need some tracing projects (LTTng, SystemTAP)
> have for this kind of mechanism, it would be nice to consider it for
> mainstream inclusion.
>
> The following patches apply on 2.6.20-rc1-git7.
>
> Signed-off-by: Mathieu Desnoyers <[email protected]>

Mathieu, FWIW I like this idea. A few years ago I implemented something
similar, but that had no explicit clients. Consequently I made my hooks
code more generalized than is needed in practice. I do remember that Karim
reworked the LTT instrumentation to use hooks and it worked fine.

You've got the same optimizations for x86 by modifying an instruction's
immediate operand and thus avoiding a d-cache hit. The only real caveat is
the need to avoid the unsynchronised cross-modification erratum, which means
that all processors will need to issue a serializing operation before
executing a marker whose state is changed. How is that handled?

One additional thing we did, which might be useful at some future point,
was adding a /proc interface. We reflected the current instrumentation
though /proc and gave the status of each hook. We even talked about being
able to enable or disabled instrumentation by writing to /proc but I don't
think we ever implemented this.

It's high time we settled the issue of instrumentation. It gets my vote,

Good luck!

Richard

--
Richard J Moore
IBM Linux Technology Centre

2007-01-13 05:50:50

by Mathieu Desnoyers

Subject: Re: [PATCH 0/4] Linux Kernel Markers

Hi Richard,

* Richard J Moore ([email protected]) wrote:
>
>
> Mathieu Desnoyers <[email protected]> wrote on 20/12/2006 23:52:16:
>
> > Hi,
> >
> > You will find, in the following posts, the latest revision of the Linux
> > Kernel Markers. Given the need some tracing projects (LTTng, SystemTAP)
> > have for this kind of mechanism, it would be nice to consider it for
> > mainstream inclusion.
> >
> > The following patches apply on 2.6.20-rc1-git7.
> >
> > Signed-off-by: Mathieu Desnoyers <[email protected]>
>
> Mathieu, FWIW I like this idea. A few years ago I implemented something
> similar, but that had no explicit clients. Consequently I made my hooks
> code more generalized than is needed in practice. I do remember that Karim
> reworked the LTT instrumentation to use hooks and it worked fine.
>

Yes, I think some features you implemented in GKHI, like chained calls to
multiple probes, should be implemented in a "probe management module" built on
top of the marker infrastructure. One of my goals is to concentrate on getting
the core right so that, afterward, building on top of it will be easy.
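
Something like the following dispatcher, registered as the single probe for a
marker and fanning out to a list of handlers, is the kind of thing I have in
mind (a rough sketch; every name in it is hypothetical):

	#define MAX_HANDLERS 8
	static marker_probe_func *chain[MAX_HANDLERS];

	static void chain_dispatch(const char *fmt, ...)
	{
		int i;

		/* a real implementation would forward the variadic
		 * arguments to each handler as a va_list */
		for (i = 0; i < MAX_HANDLERS && chain[i]; i++)
			chain[i](fmt);
	}

	static int __init chain_init(void)
	{
		/* registered once per marker name through the core interface */
		return marker_set_probe("subsystem_event", "%d %s",
				chain_dispatch) ? 0 : -ENODEV;
	}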

> You've got the same optimizations for x86 by modifying an instruction's
> immediate operand and thus avoiding a d-cache hit. The only real caveat is
> the need to avoid the unsynchronised cross-modification erratum, which means
> that all processors will need to issue a serializing operation before
> executing a marker whose state is changed. How is that handled?
>

Good catch. I thought that modifying only 1 byte would spare us from this
erratum, but looking at it in detail tells me that it's not the case.

I see three different ways to address the problem :
1 - Adding some synchronization code in the marker and using
synchronize_sched().
2 - Using an IPI to make other CPUs busy loop while we change the code and then
execute a serializing instruction (iret, cpuid...).
3 - First write an int3 instead of the instruction's first byte. The handler
would do the following :
int3_handler :
single-step the original instruction.
iret

Secondly, we call an IPI that does a smp_processor_id() on each CPU and
wait for them to complete. It will make sure we execute a synchronizing
instruction on every CPU even if we do not execute the trap handler.

Then, we write the new 2 bytes instruction atomically instead of the int3
and immediate value.


I exclude (1) because of the performance impact and (2) because it does not
deal with NMIs. That leaves (3). Does it make sense?
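
In pseudo-code, the sequence for (3) would be (sketch):

	/* 1. Arm: overwrite the first opcode byte with int3 (0xcc); the
	 *    breakpoint handler single-steps a copy of the original
	 *    instruction, so concurrent executions stay correct. */
	/* 2. Sync: IPI every CPU so each one executes a serializing
	 *    instruction (the iret returning from the IPI qualifies). */
	/* 3. Patch: atomically write the new 2-byte instruction over the
	 *    int3 and the immediate operand. */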


> One additional thing we did, which might be useful at some future point,
> was adding a /proc interface. We reflected the current instrumentation
> though /proc and gave the status of each hook. We even talked about being
> able to enable or disabled instrumentation by writing to /proc but I don't
> think we ever implemented this.
>

Adding a /proc output listing the active probes and their callbacks to the
markers will be trivial. I think the probe management module should also have
its own /proc file to list the chains of connected handlers once we get there.

> It's high time we settled the issue of instrumentation. It gets my vote,
>
> Good luck!
>
> Richard
>

Thanks,

Mathieu

> --
> Richard J Moore
> IBM Linux Technology Centre
>


2007-01-16 17:42:08

by Mathieu Desnoyers

Subject: Re: [PATCH 0/4 update] Linux Kernel Markers - i386 : pIII erratum 49 : XMC

Hi Richard,

* Mathieu Desnoyers ([email protected]) wrote:
> > You've got the same optimizations for x86 by modifying an instruction's
> > immediate operand and thus avoiding a d-cache hit. The only real caveat is
> > the need to avoid the unsynchronised cross-modification erratum, which means
> > that all processors will need to issue a serializing operation before
> > executing a marker whose state is changed. How is that handled?
> >
>
> Good catch. I thought that modifying only 1 byte would spare us from this
> erratum, but looking at it in detail tells me that it's not the case.
>
> I see three different ways to address the problem :
[...]
> 3 - First write an int3 instead of the instruction's first byte. The handler
> would do the following :
> int3_handler :
> single-step the original instruction.
> iret
>
> Secondly, we call an IPI that does a smp_processor_id() on each CPU and
> wait for them to complete. It will make sure we execute a synchronizing
> instruction on every CPU even if we do not execute the trap handler.
>
> Then, we write the new 2 bytes instruction atomically instead of the int3
> and immediate value.
>
>

Here is the implementation of my proposal using a slightly enhanced kprobes. I
add the ability to single-step a different instruction than the original one,
and then install the new instruction in place of the original one when removing
the kprobe. It is an improvement on the djprobes design: AFAIK, djprobes
required the int3 to be executed by _every_ CPU before the instruction could be
replaced, which was problematic with rarely used code paths (error handling)
and with thread CPU affinity. Comments are welcome.

I noticed that it restricts LTTng by removing the ability to probe
do_general_protection, do_nmi, do_trap, do_debug and do_page_fault.
hardirq on/off in lockdep.c must also be tweaked to allow
local_irq_enable/disable usage within the debug trap handler.

It would be nice to push the study of the kprobes debug trap handler so it can
become possible to use it to put breakpoints in trap handlers. For now, kprobes
refuses to insert breakpoints in __kprobes marked functions. However, as we
instrument specific spots of the functions (not necessarily the function entry),
it is sometimes correct to use kprobes on a marker within the function even if
it is not correct to use it in the prologue. Insight from the SystemTAP team
would be welcome on this kprobe limitation.

Mathieu

Signed-off-by: Mathieu Desnoyers <[email protected]>

--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -31,6 +31,7 @@
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
+#include <linux/kallsyms.h>
#include <asm/cacheflush.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
@@ -753,6 +754,73 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}

+static struct kprobe xmc_kp;
+DEFINE_MUTEX(kprobe_xmc_mutex);
+
+static int xmc_handler_pre(struct kprobe *p, struct pt_regs *regs)
+{
+ return 0;
+}
+
+static void xmc_handler_post(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+}
+
+static int xmc_handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
+{
+ return 0;
+}
+
+static void xmc_synchronize_cpu(void *info)
+{
+ smp_processor_id();
+}
+
+/* Think of it as a memcpy of newi into address that is safe with regard to
+ * Intel PIII erratum 49. Only valid for modifying a single instruction with
+ * an instruction of the same size, or with smaller instructions totalling the
+ * same size as the original instruction. */
+int arch_xmc(void *address, char *newi, int size)
+{
+ int ret = 0;
+ char *dest = (char*)address;
+ char str[KSYM_NAME_LEN + 1];
+ unsigned long sym_offs, sym_size;
+ char *modname;
+ const char *sym_name;
+
+ mutex_lock(&kprobe_xmc_mutex);
+ xmc_kp.pre_handler = xmc_handler_pre;
+ xmc_kp.post_handler = xmc_handler_post;
+ xmc_kp.fault_handler = xmc_handler_fault;
+ xmc_kp.addr = address;
+ xmc_kp.symbol_name = NULL;
+ xmc_kp.offset = 0;
+
+ ret = register_kprobe_swapi(&xmc_kp, newi, size);
+ if (ret < 0) {
+ sym_name = kallsyms_lookup((unsigned long)address,
+ &sym_size, &sym_offs,
+ &modname, str);
+ printk("register_kprobe failed for %p in %s:%s returned %d\n",
+ address, modname?modname:"kernel", sym_name, ret);
+ goto end;
+ }
+ /* Copy every byte of the new instruction except the first one */
+ memcpy(dest+1, newi+1, size-1);
+ flush_icache_range((unsigned long)dest, (unsigned long)dest + size);
+ /* Execute serializing instruction on each CPU */
+ ret = on_each_cpu(xmc_synchronize_cpu, NULL, 1, 1);
+ BUG_ON(ret != 0);
+
+ unregister_kprobe(&xmc_kp);
+end:
+ mutex_unlock(&kprobe_xmc_mutex);
+
+ return ret;
+}
+
int __init arch_init_kprobes(void)
{
return 0;
--- a/include/asm-generic/marker.h
+++ b/include/asm-generic/marker.h
@@ -18,7 +18,7 @@ struct __mark_marker_c {

struct __mark_marker {
const struct __mark_marker_c *cmark;
- volatile char *enable;
+ char *enable;
} __attribute__((packed));

#ifdef CONFIG_MARKERS
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -90,4 +90,7 @@ static inline void restore_interrupts(struct pt_regs *regs)

extern int kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data);
+/* pIII code cross modification which follows erratum 49. */
+#define ARCH_HAS_XMC
+int arch_xmc(void *address, char *newi, int size);
#endif /* _ASM_KPROBES_H */
--- a/include/asm-i386/marker.h
+++ b/include/asm-i386/marker.h
@@ -18,7 +18,7 @@ struct __mark_marker_c {

struct __mark_marker {
struct __mark_marker_c *cmark;
- volatile char *enable;
+ char *enable;
} __attribute__((packed));

#ifdef CONFIG_MARKERS
--- a/include/asm-powerpc/marker.h
+++ b/include/asm-powerpc/marker.h
@@ -20,7 +20,7 @@ struct __mark_marker_c {

struct __mark_marker {
struct __mark_marker_c *cmark;
- volatile short *enable;
+ short *enable;
} __attribute__((packed));

#ifdef CONFIG_MARKERS
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -189,6 +189,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
}

int register_kprobe(struct kprobe *p);
+int register_kprobe_swapi(struct kprobe *p, char *newi, int size);
void unregister_kprobe(struct kprobe *p);
int setjmp_pre_handler(struct kprobe *, struct pt_regs *);
int longjmp_break_handler(struct kprobe *, struct pt_regs *);
@@ -203,6 +204,9 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp);
void add_rp_inst(struct kretprobe_instance *ri);
void kprobe_flush_task(struct task_struct *tk);
void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
+/* Architecture specific code cross-modification. Valid for overwriting
+ * a single instruction. Safe for Intel PIII erratum 49. */
+int kprobe_xmc(void *address, char *newi, int size);
#else /* CONFIG_KPROBES */

#define __kprobes /**/
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -40,7 +40,7 @@

typedef void marker_probe_func(const char *fmt, ...);

-#ifndef CONFIG_MARKERS_DISABLE_OPTIMIZATION
+#ifdef CONFIG_MARKERS_ENABLE_OPTIMIZATION
#include <asm/marker.h>
#else
#include <asm-generic/marker.h>
--- a/kernel/Kconfig.marker
+++ b/kernel/Kconfig.marker
@@ -8,9 +8,10 @@ config MARKERS
Place an empty function call at each marker site. It can be
dynamically exchanged for a probe function.

-config MARKERS_DISABLE_OPTIMIZATION
- bool "Disable architecture specific marker optimization"
- depends on EMBEDDED
+config MARKERS_ENABLE_OPTIMIZATION
+ bool "Enable marker optimization (EXPERIMENTAL)"
+ depends on MARKERS
+ select KPROBES
default n
help
Disable code replacement jump optimisations. Especially useful if your
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -534,7 +534,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
}

static int __kprobes __register_kprobe(struct kprobe *p,
- unsigned long called_from)
+ unsigned long called_from, char *newi, int size)
{
int ret = 0;
struct kprobe *old_p;
@@ -587,6 +587,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,

if ((ret = arch_prepare_kprobe(p)) != 0)
goto out;
+ if (newi) {
+ memcpy(p->ainsn.insn, newi, size);
+ p->opcode = *newi;
+ }

INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
@@ -609,7 +613,14 @@ out:
int __kprobes register_kprobe(struct kprobe *p)
{
return __register_kprobe(p,
- (unsigned long)__builtin_return_address(0));
+ (unsigned long)__builtin_return_address(0),
+ NULL, 0);
+}
+
+int __kprobes register_kprobe_swapi(struct kprobe *p, char *newi, int size)
+{
+ return __register_kprobe(p,
+ (unsigned long)__builtin_return_address(0), newi, size);
}

void __kprobes unregister_kprobe(struct kprobe *p)
@@ -699,7 +710,8 @@ int __kprobes register_jprobe(struct jprobe *jp)
jp->kp.break_handler = longjmp_break_handler;

return __register_kprobe(&jp->kp,
- (unsigned long)__builtin_return_address(0));
+ (unsigned long)__builtin_return_address(0),
+ NULL, 0);
}

void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -760,7 +772,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
rp->nmissed = 0;
/* Establish function entry probe point */
if ((ret = __register_kprobe(&rp->kp,
- (unsigned long)__builtin_return_address(0))) != 0)
+ (unsigned long)__builtin_return_address(0),
+ NULL, 0)) != 0)
free_rp_inst(rp);
return ret;
}
@@ -790,6 +803,33 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
free_rp_inst(rp);
}

+#ifdef ARCH_HAS_XMC
+int kprobe_xmc(void *address, char *newi, int size)
+{
+ return arch_xmc(address, newi, size);
+}
+#else
+/* Limited XMC : only overwrite an instruction with an atomic operation
+ * (writing at most an aligned long). */
+int kprobe_xmc(void *address, char *newi, int size)
+{
+ if (size > sizeof(long)) {
+ printk("Limited XMC : cannot overwrite instructions bigger "\
+ "than %zu. XMC of size %d fails.\n", sizeof(long),
+ size);
+ return -EPERM;
+ }
+ if (hweight32(size) != 1 || (unsigned long)address % size != 0) {
+ printk("Limited XMC : cannot write %d bytes unaligned "\
+ "instruction. XMC fails.\n", size);
+ return -EPERM;
+ }
+ memcpy(address, newi, size);
+ flush_icache_range((unsigned long)address, (unsigned long)address + size);
+ return 0;
+}
+#endif /* ARCH_HAS_XMC */
+EXPORT_SYMBOL(kprobe_xmc);
+
static int __init init_kprobes(void)
{
int i, err = 0;
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/unwind.h>
+#include <linux/kprobes.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
@@ -309,6 +310,26 @@ EXPORT_SYMBOL(__mark_empty_function);
#define MARK_ENABLE_OFFSET(a) \
(typeof(a))((char*)a+MARK_ENABLE_IMMEDIATE_OFFSET)

+/* Writes stay ordered; state stays coherent for interrupts/other CPUs. */
+#ifdef MARK_POLYMORPHIC
+static int marker_set_enable(char *address, char enable)
+{
+ char newi[MARK_ENABLE_IMMEDIATE_OFFSET+1];
+
+ memcpy(newi, address, MARK_ENABLE_IMMEDIATE_OFFSET+1);
+ *MARK_ENABLE_OFFSET(&newi[0]) = enable;
+ return kprobe_xmc(address, newi, MARK_ENABLE_IMMEDIATE_OFFSET+1);
+}
+#else
+static int marker_set_enable(char *address, char enable)
+{
+ *MARK_ENABLE_OFFSET(address) = enable;
+ return 0;
+}
+#endif /* MARK_POLYMORPHIC */
+
+/* enable and function address are set out of order, and it's ok :
+ * the state is always coherent. */
static int marker_set_probe_range(const char *name,
const char *format,
marker_probe_func *probe,
@@ -336,13 +357,7 @@ static int marker_set_probe_range(const char *name,
*iter->cmark->call =
__mark_empty_function;
}
- /* Can have many enables for one function */
- *MARK_ENABLE_OFFSET(iter->enable) = 0;
-#ifdef MARK_POLYMORPHIC
- flush_icache_range(
- MARK_ENABLE_OFFSET(iter->enable),
- sizeof(*(iter->enable)));
-#endif
+ marker_set_enable(iter->enable, 0);
} else {
if (*iter->cmark->call
!= __mark_empty_function) {
@@ -360,12 +375,7 @@ static int marker_set_probe_range(const char *name,
*iter->cmark->call = probe;
}
/* Can have many enables for one function */
- *MARK_ENABLE_OFFSET(iter->enable) = 1;
-#ifdef MARK_POLYMORPHIC
- flush_icache_range(
- MARK_ENABLE_OFFSET(iter->enable),
- sizeof(*(iter->enable)));
-#endif
+ marker_set_enable(iter->enable, 1);
}
found++;
}
@@ -382,12 +392,7 @@ static int marker_remove_probe_range(marker_probe_func *probe,

for (iter = begin; iter < end; iter++) {
if (*iter->cmark->call == probe) {
- *MARK_ENABLE_OFFSET(iter->enable) = 0;
-#ifdef MARK_POLYMORPHIC
- flush_icache_range(
- MARK_ENABLE_OFFSET(iter->enable),
- sizeof(*(iter->enable)));
-#endif
+ marker_set_enable(iter->enable, 0);
*iter->cmark->call = __mark_empty_function;
found++;
}
@@ -419,9 +424,8 @@ int marker_set_probe(const char *name, const char *format,
{
struct module *mod;
int found = 0;
- unsigned long flags;

- spin_lock_irqsave(&modlist_lock, flags);
+ mutex_lock(&module_mutex);
/* Core kernel markers */
found += marker_set_probe_range(name, format, probe,
__start___markers, __stop___markers);
@@ -431,7 +435,7 @@ int marker_set_probe(const char *name, const char *format,
found += marker_set_probe_range(name, format, probe,
mod->markers, mod->markers+mod->num_markers);
}
- spin_unlock_irqrestore(&modlist_lock, flags);
+ mutex_unlock(&module_mutex);
return found;
}
EXPORT_SYMBOL(marker_set_probe);
@@ -440,9 +444,8 @@ int marker_remove_probe(marker_probe_func *probe)
{
struct module *mod;
int found = 0;
- unsigned long flags;

- spin_lock_irqsave(&modlist_lock, flags);
+ mutex_lock(&module_mutex);
/* Core kernel markers */
found += marker_remove_probe_range(probe,
__start___markers, __stop___markers);
@@ -452,7 +455,7 @@ int marker_remove_probe(marker_probe_func *probe)
found += marker_remove_probe_range(probe,
mod->markers, mod->markers+mod->num_markers);
}
- spin_unlock_irqrestore(&modlist_lock, flags);
+ mutex_unlock(&module_mutex);
return found;
}
EXPORT_SYMBOL(marker_remove_probe);
@@ -461,9 +464,8 @@ int marker_list_probe(marker_probe_func *probe)
{
struct module *mod;
int found = 0;
- unsigned long flags;

- spin_lock_irqsave(&modlist_lock, flags);
+ mutex_lock(&module_mutex);
/* Core kernel markers */
printk("Listing kernel markers\n");
found += marker_list_probe_range(probe,
@@ -477,7 +479,7 @@ int marker_list_probe(marker_probe_func *probe)
mod->markers, mod->markers+mod->num_markers);
}
}
- spin_unlock_irqrestore(&modlist_lock, flags);
+ mutex_unlock(&module_mutex);
return found;
}
EXPORT_SYMBOL(marker_list_probe);


2007-01-16 17:56:29

by Mathieu Desnoyers

Subject: [PATCH 1/2] lockdep missing barrier()

This patch adds a barrier() to lockdep.c lockdep_recursion updates. This
variable behaves like the preemption count and should therefore use similar
memory barriers.

This patch applies on 2.6.20-rc4-git3.
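
For comparison, this is the same pattern the preemption counter uses, where
barrier() keeps the compiler from reordering the protected region across the
count update (as in include/linux/preempt.h):

	#define preempt_disable() \
	do { \
		inc_preempt_count(); \
		barrier(); \
	} while (0)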

Signed-off-by: Mathieu Desnoyers <[email protected]>

--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -166,12 +166,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
void lockdep_off(void)
{
current->lockdep_recursion++;
+ barrier();
}

EXPORT_SYMBOL(lockdep_off);

void lockdep_on(void)
{
+ barrier();
current->lockdep_recursion--;
}


2007-01-16 17:56:35

by Mathieu Desnoyers

Subject: [PATCH 2/2] lockdep reentrancy

Here is a patch to lockdep.c so it behaves correctly when a kprobe breakpoint is
put on a marker within hardirq tracing functions as long as the marker is within
the lockdep_recursion incremented boundaries. It should apply on
2.6.20-rc4-git3.
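
The change follows the usual recursion-guard pattern: increment a per-task
counter before touching anything that may itself be instrumented, and route
every early return through a single exit that restores it. In sketch form
(simplified from the diff below):

	void trace_hardirqs_on(void)
	{
		if (unlikely(!debug_locks || current->lockdep_recursion))
			return;
		current->lockdep_recursion++;
		barrier();
		/* ... body unchanged, but each early return becomes
		 * a goto to the label below ... */
	end:
		barrier();
		current->lockdep_recursion--;
	}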

Mathieu

Signed-off-by: Mathieu Desnoyers <[email protected]>


--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1841,33 +1843,36 @@ void trace_hardirqs_on(void)
struct task_struct *curr = current;
unsigned long ip;

if (unlikely(!debug_locks || current->lockdep_recursion))
return;

+ current->lockdep_recursion++;
+ barrier();
+
if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
- return;
+ goto end;

if (unlikely(curr->hardirqs_enabled)) {
debug_atomic_inc(&redundant_hardirqs_on);
- return;
+ goto end;
}
/* we'll do an OFF -> ON transition: */
curr->hardirqs_enabled = 1;
ip = (unsigned long) __builtin_return_address(0);

if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return;
+ goto end;
if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
- return;
+ goto end;
/*
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
if (!mark_held_locks(curr, 1, ip))
- return;
+ goto end;
/*
* If we have softirqs enabled, then set the usage
* bit for all held locks. (disabled hardirqs prevented
@@ -1875,11 +1880,14 @@ void trace_hardirqs_on(void)
*/
if (curr->softirqs_enabled)
if (!mark_held_locks(curr, 0, ip))
- return;
+ goto end;

curr->hardirq_enable_ip = ip;
curr->hardirq_enable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_on_events);
+end:
+ barrier();
+ current->lockdep_recursion--;
}

EXPORT_SYMBOL(trace_hardirqs_on);
@@ -1888,14 +1896,17 @@ void trace_hardirqs_off(void)
{
struct task_struct *curr = current;

if (unlikely(!debug_locks || current->lockdep_recursion))
return;

+ current->lockdep_recursion++;
+ barrier();
+
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return;
+ goto end;

if (curr->hardirqs_enabled) {
/*
@@ -1910,6 +1921,9 @@ void trace_hardirqs_off(void)
debug_atomic_inc(&hardirqs_off_events);
} else
debug_atomic_inc(&redundant_hardirqs_off);
+end:
+ barrier();
+ current->lockdep_recursion--;
}

EXPORT_SYMBOL(trace_hardirqs_off);


2007-01-16 18:46:39

by Frank Ch. Eigler

Subject: Re: [PATCH 0/4 update] Linux Kernel Markers - i386 : pIII erratum 49 : XMC

Mathieu Desnoyers <[email protected]> writes:

> [...]
> It would be nice to push the study of the kprobes debug trap handler so it can
> become possible to use it to put breakpoints in trap handlers. For now, kprobes
> refuses to insert breakpoints in __kprobes marked functions. However, as we
> instrument specific spots of the functions (not necessarily the function entry),
> it is sometimes correct to use kprobes on a marker within the function even if
> it is not correct to use it in the prologue. [...]

It may help to note that the issue with __kprobes attributes is
separate from putting probes in the prologue vs. elsewhere. The
__kprobes functions are so marked because they can cause infinite
regress if probed. Examples are fault handlers that would service
vmalloc-related faults, and some other functions unavoidably callable
from early probe handling context. Over time, the list has shrunk.

Indeed, __kprobes marking is a conservative measure, in that there may
be spots in such functions that are immune from recursion hazards.
But so far, we haven't encountered enough examples of this to warrant
refining this blacklist somehow.

- FChE

2007-01-16 21:27:52

by Mathieu Desnoyers

Subject: Re: [PATCH 0/4 update] kprobes and traps

Hi,

I have looked at the kprobes code and have some questions for you. I would
really like to use it to dynamically patch my markers' immediate values. Using
an int3 seems like the right way to handle this wrt pIII erratum 49.

Everything is ok, except for a limitation important to the LTTng project :
kprobes cannot probe trap handlers. Looking at the code, I see that the kprobes
trap notifier expects interrupts to be disabled when it is run. Looking a little
deeper in the code, I notice that you use per-cpu data structures to keep the
probe control information that is needed for single stepping, which clearly
requires you to disable interrupts so no interrupt handler with a kprobe in it
fires on top of the kprobe handler. It also forbids trap handler and NMI
handler instrumentation, as traps can be triggered by the kprobes handler and
NMIs can come at any point during execution.

Would it be possible to put these data structures on the stack or on a
separate stack accessible through thread_info instead?
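
For reference, the per-cpu control block in question looks like this in the
current kprobes code (include/linux/kprobes.h):

	DECLARE_PER_CPU(struct kprobe *, current_kprobe);
	DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

	static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
	{
		return &__get_cpu_var(kprobe_ctlblk);
	}

Moving it to the stack or to thread_info would have to preserve these
accessors for the existing users.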

Mathieu


* Mathieu Desnoyers ([email protected]) wrote:
> Hi Richard,
>
> * Mathieu Desnoyers ([email protected]) wrote:
> > > You've got the same optimizations for x86 by modifying an instruction's
> > > immediate operand and thus avoiding a d-cache hit. The only real caveat is
> > > the need to avoid the unsynchronised cross-modification erratum, which means
> > > that all processors will need to issue a serializing operation before
> > > executing a marker whose state is changed. How is that handled?
> > >
> >
> > Good catch. I thought that modifying only 1 byte would spare us from this
> > erratum, but looking at it in detail tells me that it's not the case.
> >
> > I see three different ways to address the problem :
> [...]
> > 3 - First write an int3 instead of the instruction's first byte. The handler
> > would do the following :
> > int3_handler :
> > single-step the original instruction.
> > iret
> >
> > Secondly, we call an IPI that does a smp_processor_id() on each CPU and
> > wait for them to complete. It will make sure we execute a synchronizing
> > instruction on every CPU even if we do not execute the trap handler.
> >
> > Then, we write the new 2 bytes instruction atomically instead of the int3
> > and immediate value.
> >
> >
>
> Here is the implementation of my proposal using a slightly enhanced kprobes. I
> add the ability to single step a different instruction than the original one,
> and then put the new instruction instead of the original one when removing the
> kprobe. It is an improvement on the djprobes design : AFAIK, djprobes required
> the int3 to be executed by _every_ CPU before the instruction could be
> replaced. It was problematic with rarely used code paths (error handling) and
> with thread CPU affinity. Comments are welcome.
>
> I noticed that it restrains LTTng by removing the ability to probe
> do_general_protection, do_nmi, do_trap, do_debug and do_page_fault.
> hardirq on/off in lockdep.c must also be tweaked to allow
> local_irq_enable/disable usage within the debug trap handler.
>
> It would be nice to push the study of the kprobes debug trap handler so it can
> become possible to use it to put breakpoints in trap handlers. For now, kprobes
> refuses to insert breakpoints in __kprobes marked functions. However, as we
> instrument specific spots of the functions (not necessarily the function entry),
> it is sometimes correct to use kprobes on a marker within the function even if
> it is not correct to use it in the prologue. Insight from the SystemTAP team
> would be welcome on this kprobe limitation.
>
> Mathieu
>
> Signed-off-by: Mathieu Desnoyers <[email protected]>
>
> --- a/arch/i386/kernel/kprobes.c
> +++ b/arch/i386/kernel/kprobes.c
> @@ -31,6 +31,7 @@
> #include <linux/kprobes.h>
> #include <linux/ptrace.h>
> #include <linux/preempt.h>
> +#include <linux/kallsyms.h>
> #include <asm/cacheflush.h>
> #include <asm/kdebug.h>
> #include <asm/desc.h>
> @@ -753,6 +754,73 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
> return 0;
> }
>
> +static struct kprobe xmc_kp;
> +DEFINE_MUTEX(kprobe_xmc_mutex);
> +
> +static int xmc_handler_pre(struct kprobe *p, struct pt_regs *regs)
> +{
> + return 0;
> +}
> +
> +static void xmc_handler_post(struct kprobe *p, struct pt_regs *regs,
> + unsigned long flags)
> +{
> +}
> +
> +static int xmc_handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
> +{
> + return 0;
> +}
> +
> +static void xmc_synchronize_cpu(void *info)
> +{
> + smp_processor_id();
> +}
> +
> +/* Think of it as a memcpy of new into address with safety with regard to
> + * Intel PIII errata 49. Only valid for modifying a single instruction with
> + * an instruction of the same size or in smaller instructions of the total
> + * same size as the original instruction. */
> +int arch_xmc(void *address, char *newi, int size)
> +{
> + int ret = 0;
> + char *dest = (char*)address;
> + char str[KSYM_NAME_LEN + 1];
> + unsigned long sym_offs, sym_size;
> + char *modname;
> + const char *sym_name;
> +
> + mutex_lock(&kprobe_xmc_mutex);
> + xmc_kp.pre_handler = xmc_handler_pre;
> + xmc_kp.post_handler = xmc_handler_post;
> + xmc_kp.fault_handler = xmc_handler_fault;
> + xmc_kp.addr = address;
> + xmc_kp.symbol_name = NULL;
> + xmc_kp.offset = 0;
> +
> + ret = register_kprobe_swapi(&xmc_kp, newi, size);
> + if (ret < 0) {
> + sym_name = kallsyms_lookup((unsigned long)address,
> + &sym_size, &sym_offs,
> + &modname, str);
> + printk("register_kprobe failed for %p in %s:%s returned %d\n",
> + address, modname?modname:"kernel", sym_name, ret);
> + goto end;
> + }
> + /* Copy every bytes of the new instruction except the first one */
> + memcpy(dest+1, newi+1, size-1);
> + flush_icache_range(dest, size);
> + /* Execute serializing instruction on each CPU */
> + ret = on_each_cpu(xmc_synchronize_cpu, NULL, 1, 1);
> + BUG_ON(ret != 0);
> +
> + unregister_kprobe(&xmc_kp);
> +end:
> + mutex_unlock(&kprobe_xmc_mutex);
> +
> + return ret;
> +}
> +
> int __init arch_init_kprobes(void)
> {
> return 0;
> --- a/include/asm-generic/marker.h
> +++ b/include/asm-generic/marker.h
> @@ -18,7 +18,7 @@ struct __mark_marker_c {
>
> struct __mark_marker {
> const struct __mark_marker_c *cmark;
> - volatile char *enable;
> + char *enable;
> } __attribute__((packed));
>
> #ifdef CONFIG_MARKERS
> --- a/include/asm-i386/kprobes.h
> +++ b/include/asm-i386/kprobes.h
> @@ -90,4 +90,7 @@ static inline void restore_interrupts(struct pt_regs *regs)
>
> extern int kprobe_exceptions_notify(struct notifier_block *self,
> unsigned long val, void *data);
> +/* pIII code cross modification which follows erratum 49. */
> +#define ARCH_HAS_XMC
> +int arch_xmc(void *address, char *newi, int size);
> #endif /* _ASM_KPROBES_H */
> --- a/include/asm-i386/marker.h
> +++ b/include/asm-i386/marker.h
> @@ -18,7 +18,7 @@ struct __mark_marker_c {
>
> struct __mark_marker {
> struct __mark_marker_c *cmark;
> - volatile char *enable;
> + char *enable;
> } __attribute__((packed));
>
> #ifdef CONFIG_MARKERS
> --- a/include/asm-powerpc/marker.h
> +++ b/include/asm-powerpc/marker.h
> @@ -20,7 +20,7 @@ struct __mark_marker_c {
>
> struct __mark_marker {
> struct __mark_marker_c *cmark;
> - volatile short *enable;
> + short *enable;
> } __attribute__((packed));
>
> #ifdef CONFIG_MARKERS
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -189,6 +189,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
> }
>
> int register_kprobe(struct kprobe *p);
> +int register_kprobe_swapi(struct kprobe *p, char *newi, int size);
> void unregister_kprobe(struct kprobe *p);
> int setjmp_pre_handler(struct kprobe *, struct pt_regs *);
> int longjmp_break_handler(struct kprobe *, struct pt_regs *);
> @@ -203,6 +204,9 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp);
> void add_rp_inst(struct kretprobe_instance *ri);
> void kprobe_flush_task(struct task_struct *tk);
> void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
> +/* Architecture specific code cross-modification. Valid for overwriting
> + * a single instruction. Safe for Intel PIII erratum 49. */
> +int kprobe_xmc(void *address, char *newi, int size);
> #else /* CONFIG_KPROBES */
>
> #define __kprobes /**/
> --- a/include/linux/marker.h
> +++ b/include/linux/marker.h
> @@ -40,7 +40,7 @@
>
> typedef void marker_probe_func(const char *fmt, ...);
>
> -#ifndef CONFIG_MARKERS_DISABLE_OPTIMIZATION
> +#ifdef CONFIG_MARKERS_ENABLE_OPTIMIZATION
> #include <asm/marker.h>
> #else
> #include <asm-generic/marker.h>
> --- a/kernel/Kconfig.marker
> +++ b/kernel/Kconfig.marker
> @@ -8,9 +8,10 @@ config MARKERS
> Place an empty function call at each marker site. Can be
> dynamically changed for a probe function.
>
> -config MARKERS_DISABLE_OPTIMIZATION
> - bool "Disable architecture specific marker optimization"
> - depends EMBEDDED
> +config MARKERS_ENABLE_OPTIMIZATION
> + bool "Enable marker optimization (EXPERIMENTAL)"
> + depends on MARKERS
> + select KPROBES
> default n
> help
> - Disable code replacement jump optimisations. Especially useful if your
> + Enable code replacement jump optimisations. Do not use these if your
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -534,7 +534,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
> }
>
> static int __kprobes __register_kprobe(struct kprobe *p,
> - unsigned long called_from)
> + unsigned long called_from, char *newi, int size)
> {
> int ret = 0;
> struct kprobe *old_p;
> @@ -587,6 +587,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
>
> if ((ret = arch_prepare_kprobe(p)) != 0)
> goto out;
> + if (newi) {
> + memcpy(p->ainsn.insn, newi, size);
> + p->opcode = *newi;
> + }
>
> INIT_HLIST_NODE(&p->hlist);
> hlist_add_head_rcu(&p->hlist,
> @@ -609,7 +613,14 @@ out:
> int __kprobes register_kprobe(struct kprobe *p)
> {
> return __register_kprobe(p,
> - (unsigned long)__builtin_return_address(0));
> + (unsigned long)__builtin_return_address(0),
> + NULL, 0);
> +}
> +
> +int __kprobes register_kprobe_swapi(struct kprobe *p, char *newi, int size)
> +{
> + return __register_kprobe(p,
> + (unsigned long)__builtin_return_address(0), newi, size);
> }
>
> void __kprobes unregister_kprobe(struct kprobe *p)
> @@ -699,7 +710,8 @@ int __kprobes register_jprobe(struct jprobe *jp)
> jp->kp.break_handler = longjmp_break_handler;
>
> return __register_kprobe(&jp->kp,
> - (unsigned long)__builtin_return_address(0));
> + (unsigned long)__builtin_return_address(0),
> + NULL, 0);
> }
>
> void __kprobes unregister_jprobe(struct jprobe *jp)
> @@ -760,7 +772,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
> rp->nmissed = 0;
> /* Establish function entry probe point */
> if ((ret = __register_kprobe(&rp->kp,
> - (unsigned long)__builtin_return_address(0))) != 0)
> + (unsigned long)__builtin_return_address(0),
> + NULL, 0)) != 0)
> free_rp_inst(rp);
> return ret;
> }
> @@ -790,6 +803,33 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
> free_rp_inst(rp);
> }
>
> +#ifdef ARCH_HAS_XMC
> +int kprobe_xmc(void *address, char *newi, int size)
> +{
> + return arch_xmc(address, newi, size);
> +}
> +#else
> +/* Limited XMC : only overwrite an instruction with an atomic operation
> + * (writing at most an aligned long). */
> +int kprobe_xmc(void *address, char *newi, int size)
> +{
> + if (size > sizeof(long)) {
> + printk("Limited XMC : cannot overwrite instructions bigger "
> + "than %zu. XMC of size %d fails.\n", sizeof(long), size);
> + return -EPERM;
> + }
> + if (hweight32(size) != 1 || (unsigned long)address % size != 0) {
> + printk("Limited XMC : cannot write %d bytes of unaligned "
> + "instruction. XMC fails.\n", size);
> + return -EPERM;
> + }
> + memcpy(address, newi, size);
> + flush_icache_range((unsigned long)address, (unsigned long)address + size);
> + return 0;
> +}
> +#endif /* ARCH_HAS_XMC */
> +EXPORT_SYMBOL(kprobe_xmc);
> +
> static int __init init_kprobes(void)
> {
> int i, err = 0;
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -40,6 +40,7 @@
> #include <linux/string.h>
> #include <linux/mutex.h>
> #include <linux/unwind.h>
> +#include <linux/kprobes.h>
> #include <asm/uaccess.h>
> #include <asm/semaphore.h>
> #include <asm/cacheflush.h>
> @@ -309,6 +310,26 @@ EXPORT_SYMBOL(__mark_empty_function);
> #define MARK_ENABLE_OFFSET(a) \
> (typeof(a))((char*)a+MARK_ENABLE_IMMEDIATE_OFFSET)
>
> +/* Writes stay ordered, so the state stays coherent for interrupts/other CPUs. */
> +#ifdef MARK_POLYMORPHIC
> +static int marker_set_enable(char *address, char enable)
> +{
> + char newi[MARK_ENABLE_IMMEDIATE_OFFSET+1];
> +
> + memcpy(newi, address, MARK_ENABLE_IMMEDIATE_OFFSET+1);
> + *MARK_ENABLE_OFFSET(&newi[0]) = enable;
> + return kprobe_xmc(address, newi, MARK_ENABLE_IMMEDIATE_OFFSET+1);
> +}
> +#else
> +static int marker_set_enable(char *address, char enable)
> +{
> + *MARK_ENABLE_OFFSET(address) = enable;
> + return 0;
> +}
> +#endif /* MARK_POLYMORPHIC */
> +
> +/* The enable flag and the function address are set out of order, and it's
> + * OK : the state is always coherent. */
> static int marker_set_probe_range(const char *name,
> const char *format,
> marker_probe_func *probe,
> @@ -336,13 +357,7 @@ static int marker_set_probe_range(const char *name,
> *iter->cmark->call =
> __mark_empty_function;
> }
> - /* Can have many enables for one function */
> - *MARK_ENABLE_OFFSET(iter->enable) = 0;
> -#ifdef MARK_POLYMORPHIC
> - flush_icache_range(
> - MARK_ENABLE_OFFSET(iter->enable),
> - sizeof(*(iter->enable)));
> -#endif
> + marker_set_enable(iter->enable, 0);
> } else {
> if (*iter->cmark->call
> != __mark_empty_function) {
> @@ -360,12 +375,7 @@ static int marker_set_probe_range(const char *name,
> *iter->cmark->call = probe;
> }
> /* Can have many enables for one function */
> - *MARK_ENABLE_OFFSET(iter->enable) = 1;
> -#ifdef MARK_POLYMORPHIC
> - flush_icache_range(
> - MARK_ENABLE_OFFSET(iter->enable),
> - sizeof(*(iter->enable)));
> -#endif
> + marker_set_enable(iter->enable, 1);
> }
> found++;
> }
> @@ -382,12 +392,7 @@ static int marker_remove_probe_range(marker_probe_func *probe,
>
> for (iter = begin; iter < end; iter++) {
> if (*iter->cmark->call == probe) {
> - *MARK_ENABLE_OFFSET(iter->enable) = 0;
> -#ifdef MARK_POLYMORPHIC
> - flush_icache_range(
> - MARK_ENABLE_OFFSET(iter->enable),
> - sizeof(*(iter->enable)));
> -#endif
> + marker_set_enable(iter->enable, 0);
> *iter->cmark->call = __mark_empty_function;
> found++;
> }
> @@ -419,9 +424,8 @@ int marker_set_probe(const char *name, const char *format,
> {
> struct module *mod;
> int found = 0;
> - unsigned long flags;
>
> - spin_lock_irqsave(&modlist_lock, flags);
> + mutex_lock(&module_mutex);
> /* Core kernel markers */
> found += marker_set_probe_range(name, format, probe,
> __start___markers, __stop___markers);
> @@ -431,7 +435,7 @@ int marker_set_probe(const char *name, const char *format,
> found += marker_set_probe_range(name, format, probe,
> mod->markers, mod->markers+mod->num_markers);
> }
> - spin_unlock_irqrestore(&modlist_lock, flags);
> + mutex_unlock(&module_mutex);
> return found;
> }
> EXPORT_SYMBOL(marker_set_probe);
> @@ -440,9 +444,8 @@ int marker_remove_probe(marker_probe_func *probe)
> {
> struct module *mod;
> int found = 0;
> - unsigned long flags;
>
> - spin_lock_irqsave(&modlist_lock, flags);
> + mutex_lock(&module_mutex);
> /* Core kernel markers */
> found += marker_remove_probe_range(probe,
> __start___markers, __stop___markers);
> @@ -452,7 +455,7 @@ int marker_remove_probe(marker_probe_func *probe)
> found += marker_remove_probe_range(probe,
> mod->markers, mod->markers+mod->num_markers);
> }
> - spin_unlock_irqrestore(&modlist_lock, flags);
> + mutex_unlock(&module_mutex);
> return found;
> }
> EXPORT_SYMBOL(marker_remove_probe);
> @@ -461,9 +464,8 @@ int marker_list_probe(marker_probe_func *probe)
> {
> struct module *mod;
> int found = 0;
> - unsigned long flags;
>
> - spin_lock_irqsave(&modlist_lock, flags);
> + mutex_lock(&module_mutex);
> /* Core kernel markers */
> printk("Listing kernel markers\n");
> found += marker_list_probe_range(probe,
> @@ -477,7 +479,7 @@ int marker_list_probe(marker_probe_func *probe)
> mod->markers, mod->markers+mod->num_markers);
> }
> }
> - spin_unlock_irqrestore(&modlist_lock, flags);
> + mutex_unlock(&module_mutex);
> return found;
> }
> EXPORT_SYMBOL(marker_list_probe);
>
> --
> OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
> Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
> _______________________________________________
> Ltt-dev mailing list
> [email protected]
> http://listserv.shafik.org/mailman/listinfo/ltt-dev
>

--
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-01-17 12:24:28

by S. P. Prasanna

[permalink] [raw]
Subject: Re: [PATCH 0/4 update] kprobes and traps

On Tue, Jan 16, 2007 at 04:27:40PM -0500, Mathieu Desnoyers wrote:
> Hi,
>
> I have looked at the kprobes code and have some questions for you. I would
> really like to use it to dynamically patch my marker immediate values.
> Using an int3 seems like the right way to handle this wrt PIII erratum
> 49.
>
> Everything is ok, except for a limitation important to the LTTng project :
> kprobes cannot probe trap handlers. Looking at the code, I see that the kprobes
> trap notifier expects interrupts to be disabled when it is run. Looking a little
> deeper in the code, I notice that you use per-cpu data structures to keep the
> probe control information that is needed for single stepping, which clearly
> requires you to disable interrupts so no interrupt handler with a kprobe in it
> fires on top of the kprobe handler. It also forbids trap handler and NMI
> handler instrumentation, as traps can be triggered by the kprobes handler and
> NMIs can come at any point during execution.

From the i386 point of view, your understanding is correct.

>
> Would it be possible to put these data structures on the stack or on a
> separate stack accessible through thread_info instead ?
>

Yes, you could probably put them on the per-thread kernel stack.
But you need to find enough stack space to save the probe control
information, and enough stack space must also be allocated to handle
re-entrant kprobe handlers.
How will you handle the case where nested interrupts happen while you
are in the kprobe handler and those interrupt handlers have probes?
How many levels of nesting will you allow?
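
To make the constraint concrete : the probe control state in the
2.6.20-era kprobes lives in per-CPU data, roughly as below (a simplified
sketch; field names are approximate) :

	/* One control block per CPU : a second probe hit on the same CPU
	 * (from an interrupt, trap or NMI handler that contains a probe)
	 * while single-stepping would clobber this state. */
	struct kprobe_ctlblk {
		unsigned long kprobe_status;		/* e.g. KPROBE_HIT_SS */
		unsigned long kprobe_saved_eflags;
		struct prev_kprobe prev_kprobe;		/* one saved level only */
	};

	static DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

Moving this onto the per-thread kernel stack would have to reserve room for
the deepest expected nesting of interrupt handlers and re-entrant probe
handlers.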

Regards
Prasanna

--
Prasanna S.P.
Linux Technology Center
India Software Labs, IBM Bangalore
Email: [email protected]
Ph: 91-80-41776329

2007-01-24 04:28:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] lockdep missing barrier()

On Tue, 16 Jan 2007 12:56:24 -0500
Mathieu Desnoyers <[email protected]> wrote:

> This patch adds a barrier() to lockdep.c lockdep_recursion updates. This
> variable behaves like the preemption count and should therefore use similar
> memory barriers.
>
> This patch applies on 2.6.20-rc4-git3.
>
> Signed-off-by: Mathieu Desnoyers <[email protected]>
>
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -166,12 +166,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
> void lockdep_off(void)
> {
> current->lockdep_recursion++;
> + barrier();
> }
>
> EXPORT_SYMBOL(lockdep_off);
>
> void lockdep_on(void)
> {
> + barrier();
> current->lockdep_recursion--;
> }

I am allergic to undocumented barriers. It is often unobvious what the
barrier is supposed to protect against, yielding mystifying code. This is
one such case.

Please add code comments.
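
Something along these lines would address it (an illustrative sketch of the
documented form, not a tested patch) :

	void lockdep_off(void)
	{
		current->lockdep_recursion++;
		/*
		 * Compiler barrier : keep the increment ahead of any code
		 * that could recurse into lockdep, mirroring the ordering
		 * used by the preempt_count() accessors.
		 */
		barrier();
	}

	void lockdep_on(void)
	{
		/* Match lockdep_off() : order the decrement after the
		 * protected region. */
		barrier();
		current->lockdep_recursion--;
	}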

2007-01-24 04:30:27

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/2] lockdep reentrancy

On Tue, 16 Jan 2007 12:56:31 -0500
Mathieu Desnoyers <[email protected]> wrote:

> Here is a patch to lockdep.c so it behaves correctly when a kprobe breakpoint is
> put on a marker within hardirq tracing functions as long as the marker is within
> the lockdep_recursion incremented boundaries. It should apply on
> 2.6.20-rc4-git3.
>
> Mathieu
>
> Signed-off-by: Mathieu Desnoyers <[email protected]>
>
>
> @@ -1841,33 +1843,36 @@ void trace_hardirqs_on(void)

You lost the patch headers.

> struct task_struct *curr = current;
> unsigned long ip;
>
> if (unlikely(!debug_locks || current->lockdep_recursion))
> return;
>
> + current->lockdep_recursion++;
> + barrier();

Why can't we use lockdep_off() here?

> if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
> - return;
> + goto end;
>
> if (unlikely(curr->hardirqs_enabled)) {
> debug_atomic_inc(&redundant_hardirqs_on);
> - return;
> + goto end;
> }
> /* we'll do an OFF -> ON transition: */
> curr->hardirqs_enabled = 1;
> ip = (unsigned long) __builtin_return_address(0);
>
> if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
> - return;
> + goto end;
> if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
> - return;
> + goto end;
> /*
> * We are going to turn hardirqs on, so set the
> * usage bit for all held locks:
> */
> if (!mark_held_locks(curr, 1, ip))
> - return;
> + goto end;
> /*
> * If we have softirqs enabled, then set the usage
> * bit for all held locks. (disabled hardirqs prevented
> @@ -1875,11 +1880,14 @@ void trace_hardirqs_on(void)
> */
> if (curr->softirqs_enabled)
> if (!mark_held_locks(curr, 0, ip))
> - return;
> + goto end;
>
> curr->hardirq_enable_ip = ip;
> curr->hardirq_enable_event = ++curr->irq_events;
> debug_atomic_inc(&hardirqs_on_events);
> +end:
> + barrier();
> + current->lockdep_recursion--;

lockdep_on()?

> }
>
> EXPORT_SYMBOL(trace_hardirqs_on);
> @@ -1888,14 +1896,17 @@ void trace_hardirqs_off(void)
> {
> struct task_struct *curr = current;
>
> if (unlikely(!debug_locks || current->lockdep_recursion))
> return;
>
> + current->lockdep_recursion++;
> + barrier();

lockdep_off()?

> if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
> - return;
> + goto end;
>
> if (curr->hardirqs_enabled) {
> /*
> @@ -1910,6 +1921,9 @@ void trace_hardirqs_off(void)
> debug_atomic_inc(&hardirqs_off_events);
> } else
> debug_atomic_inc(&redundant_hardirqs_off);
> +end:
> + barrier();
> + current->lockdep_recursion--;

lockdep_on()?

2007-01-24 16:51:57

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH 1/2] lockdep missing barrier()

* Andrew Morton ([email protected]) wrote:
> On Tue, 16 Jan 2007 12:56:24 -0500
> Mathieu Desnoyers <[email protected]> wrote:
>
> > This patch adds a barrier() to lockdep.c lockdep_recursion updates. This
> > variable behaves like the preemption count and should therefore use similar
> > memory barriers.
> >
> > This patch applies on 2.6.20-rc4-git3.
> >
> > Signed-off-by: Mathieu Desnoyers <[email protected]>
> >
> > --- a/kernel/lockdep.c
> > +++ b/kernel/lockdep.c
> > @@ -166,12 +166,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
> > void lockdep_off(void)
> > {
> > current->lockdep_recursion++;
> > + barrier();
> > }
> >
> > EXPORT_SYMBOL(lockdep_off);
> >
> > void lockdep_on(void)
> > {
> > + barrier();
> > current->lockdep_recursion--;
> > }
>
> I am allergic to undocumented barriers. It is often unobvious what the
> barrier is supposed to protect against, yielding mystifying code. This is
> one such case.
>
> Please add code comments.

It looks like my fix was not the right one, but looking at the code in more
depth, another fix seems to be required. Summary : the order of locking in
vprintk() should be changed.


lockdep on/off used in : printk and nmi_enter/exit.

* In kernel/printk.c :

vprintk() does :

preempt_disable()
local_irq_save()
lockdep_off()
spin_lock(&logbuf_lock)
spin_unlock(&logbuf_lock)
if(!down_trylock(&console_sem))
up(&console_sem)
lockdep_on()
local_irq_restore()
preempt_enable()

The goal here is to make sure we do not call printk() recursively from
kernel/lockdep.c:__lock_acquire() (called from spin_* and down/up) nor from
kernel/lockdep.c:trace_hardirqs_on/off() (called from local_irq_restore/save).
These can then potentially call printk() through mark_held_locks()/mark_lock().

It correctly protects against the spin_lock call and the up/down call, but it
does not protect against local_irq_restore.
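
Concretely, the recursion that the current ordering leaves open looks roughly
like this :

	local_irq_restore(flags)	/* runs after lockdep_on() */
	-> trace_hardirqs_on()
	   -> mark_held_locks() / mark_lock()
	      -> printk()		/* lock-debugging error path */
	         -> local_irq_restore(flags)
	            -> trace_hardirqs_on()
	               -> ...		/* and so on, unbounded */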

If we change the locking so it becomes correct :

preempt_disable()
lockdep_off()
local_irq_save()
spin_lock(&logbuf_lock)
spin_unlock(&logbuf_lock)
if(!down_trylock(&console_sem))
up(&console_sem)
local_irq_restore()
lockdep_on()
preempt_enable()

Everything should be fine without a barrier(), because local_irq_save/restore
should make sure the compiler does not reorder the memory writes across the
cli()/sti(), and the lockdep_recursion variable belongs to the current task.



* In include/linux/hardirq.h:nmi_enter()/nmi_exit()

Used, for instance, in arch/i386/kernel/traps.c:do_nmi()
Calls nmi_enter() (notice : with the "right" config options, i.e. preemption
disabled, there is possibly no barrier between lockdep_off() and the end of
the nmi_enter() code) :
#define nmi_enter() do { lockdep_off(); irq_enter(); } while (0)
#define irq_enter() \
do { \
account_system_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
} while (0)
# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
# define trace_hardirq_enter() do { current->hardirq_context++; } while (0)

Then calls, for instance, arch/i386/kernel/nmi.c:nmi_watchdog_tick(),
which takes a spinlock and may also call printk.

Because we are within a context where irqs are disabled and we use the
per-task lockdep_recursion only within the current task, there is no need to
make it appear ordered to other CPUs. Also, the compiler should not reorder the
lockdep_off() and the call to kernel/lockdep.c:__lock_acquire(), because they
both access the same variable : current->lockdep_recursion. So the NMI case
seems fine without a memory barrier.
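
(Spelled out, the NMI path effectively does :

	current->lockdep_recursion++;		/* lockdep_off() */
	preempt_count() += HARDIRQ_OFFSET;	/* irq_enter()   */
	current->hardirq_context++;
	...
	__lock_acquire();	/* tests current->lockdep_recursion first */

and the call into kernel/lockdep.c is an out-of-line function call, which by
itself already acts as a compiler barrier for these memory accesses.)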

Mathieu

--
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-01-24 16:55:07

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH 2/2] lockdep reentrancy

Hi Andrew,

* Andrew Morton ([email protected]) wrote:
> On Tue, 16 Jan 2007 12:56:31 -0500
> Mathieu Desnoyers <[email protected]> wrote:
>
> > Here is a patch to lockdep.c so it behaves correctly when a kprobe breakpoint is
> > put on a marker within hardirq tracing functions as long as the marker is within
> > the lockdep_recursion incremented boundaries. It should apply on
> > 2.6.20-rc4-git3.
> >
> > Mathieu
> >
> > Signed-off-by: Mathieu Desnoyers <[email protected]>
> >
> >
> > @@ -1841,33 +1843,36 @@ void trace_hardirqs_on(void)
>
> You lost the patch headers.
>
> > struct task_struct *curr = current;
> > unsigned long ip;
> >
> > if (unlikely(!debug_locks || current->lockdep_recursion))
> > return;
> >
> > + current->lockdep_recursion++;
> > + barrier();
>
> Why can't we use lockdep_off() here?
>

Because I thought that changing lockdep_off() for the whole system might be a
little different from adding a local lockdep recursion protection. See my
other email for the lockdep_on/off() discussion.

And please drop this patch : it sometimes makes lockdep trigger false
positives under heavy IRQ load because of a dropped lockdep event.

Mathieu

--
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-01-24 17:24:50

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH] order of lockdep off/on in vprintk() should be changed

The order of locking between lockdep_off/on() and local_irq_save/restore() in
vprintk() should be changed.

* In kernel/printk.c :

vprintk() does :

preempt_disable()
local_irq_save()
lockdep_off()
spin_lock(&logbuf_lock)
spin_unlock(&logbuf_lock)
if(!down_trylock(&console_sem))
up(&console_sem)
lockdep_on()
local_irq_restore()
preempt_enable()

The goal here is to make sure we do not call printk() recursively from
kernel/lockdep.c:__lock_acquire() (called from spin_* and down/up) nor from
kernel/lockdep.c:trace_hardirqs_on/off() (called from local_irq_restore/save).
These can then potentially call printk() through mark_held_locks()/mark_lock().

It correctly protects against the spin_lock call and the up/down call, but it
does not protect against local_irq_restore. This could cause infinite
recursive printk()/trace_hardirqs_on() calls when printk() is called from the
mark_lock() error handling path.

We should change the locking so it becomes correct :

preempt_disable()
lockdep_off()
local_irq_save()
spin_lock(&logbuf_lock)
spin_unlock(&logbuf_lock)
if(!down_trylock(&console_sem))
up(&console_sem)
local_irq_restore()
lockdep_on()
preempt_enable()


Signed-off-by: Mathieu Desnoyers <[email protected]>

--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -530,8 +530,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();

/* This stops the holder of console_sem just where we want him */
- local_irq_save(flags);
lockdep_off();
+ local_irq_save(flags);
spin_lock(&logbuf_lock);
printk_cpu = smp_processor_id();

@@ -640,8 +640,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
console_locked = 0;
up(&console_sem);
}
- lockdep_on();
local_irq_restore(flags);
+ lockdep_on();
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
@@ -650,8 +650,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
*/
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
- lockdep_on();
local_irq_restore(flags);
+ lockdep_on();
}

preempt_enable();
--
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2007-01-24 17:56:01

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH] minimize lockdep_on/off side-effect

Minimize the lockdep_on/off() side-effect on IRQ tracing in vprintk() by using
raw_local_irq_save/restore() _around_ lockdep_off/on().

This applies on top of the previous patch. It has the advantage of not losing
the IRQ events that come between the lockdep disabling and the IRQ disabling.

Signed-off-by: Mathieu Desnoyers <[email protected]>


--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -530,8 +530,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();

/* This stops the holder of console_sem just where we want him */
+ raw_local_irq_save(flags);
lockdep_off();
- local_irq_save(flags);
spin_lock(&logbuf_lock);
printk_cpu = smp_processor_id();

@@ -640,8 +640,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
console_locked = 0;
up(&console_sem);
}
- local_irq_restore(flags);
lockdep_on();
+ raw_local_irq_restore(flags);
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
@@ -650,8 +650,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
*/
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
- local_irq_restore(flags);
lockdep_on();
+ raw_local_irq_restore(flags);
}

preempt_enable();
--
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68