2008-01-30 03:19:49

by Steven Rostedt

Subject: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

If CONFIG_MCOUNT is selected and /proc/sys/kernel/mcount_enabled is set to a
non-zero value, the mcount routine will be called every time we enter a kernel
function that is not marked with the "notrace" attribute.

The mcount routine will then call any function that has been registered with it.

[This code has been highly hacked by Steven Rostedt, so don't
blame Arnaldo for all of this ;-) ]

Update:
It is now possible to register more than one mcount function.
If only one mcount function is registered, that will be the
function that mcount calls directly. If more than one function
is registered, then mcount will call a function that will loop
through the functions to call.
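
Tracing only actually fires once mcount_enabled is set (for example by
writing 1 to /proc/sys/kernel/mcount_enabled). As a minimal sketch of how
a tracer might hook into this API (the my_tracer/my_ops names, the empty
callback body, and the initcall hookup are illustrative only, not part of
this patch):

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/mcount.h>

/* The callback must be notrace, or mcount would recurse into it. */
static notrace void my_tracer(unsigned long ip, unsigned long parent_ip)
{
	/* ip is the traced function, parent_ip is its caller */
}

/* Must be static and never freed; mcount may keep using it. */
static struct mcount_ops my_ops __read_mostly = {
	.func = my_tracer,
};

static int __init my_tracer_init(void)
{
	return register_mcount_function(&my_ops);
}
device_initcall(my_tracer_init);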

Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
Makefile | 3
arch/x86/Kconfig | 1
arch/x86/kernel/entry_32.S | 25 +++++++
arch/x86/kernel/entry_64.S | 36 +++++++++++
include/linux/linkage.h | 2
include/linux/mcount.h | 38 ++++++++++++
kernel/sysctl.c | 11 +++
lib/Kconfig.debug | 1
lib/Makefile | 2
lib/tracing/Kconfig | 10 +++
lib/tracing/Makefile | 3
lib/tracing/mcount.c | 141 +++++++++++++++++++++++++++++++++++++++++++++
12 files changed, 273 insertions(+)

Index: linux-mcount.git/Makefile
===================================================================
--- linux-mcount.git.orig/Makefile 2008-01-29 17:01:56.000000000 -0500
+++ linux-mcount.git/Makefile 2008-01-29 17:26:17.000000000 -0500
@@ -509,6 +509,9 @@ endif

include $(srctree)/arch/$(SRCARCH)/Makefile

+ifdef CONFIG_MCOUNT
+KBUILD_CFLAGS += -pg
+endif
ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
else
Index: linux-mcount.git/arch/x86/Kconfig
===================================================================
--- linux-mcount.git.orig/arch/x86/Kconfig 2008-01-29 16:59:15.000000000 -0500
+++ linux-mcount.git/arch/x86/Kconfig 2008-01-29 17:26:18.000000000 -0500
@@ -19,6 +19,7 @@ config X86_64
config X86
bool
default y
+ select HAVE_MCOUNT

config GENERIC_TIME
bool
Index: linux-mcount.git/arch/x86/kernel/entry_32.S
===================================================================
--- linux-mcount.git.orig/arch/x86/kernel/entry_32.S 2008-01-29 16:59:15.000000000 -0500
+++ linux-mcount.git/arch/x86/kernel/entry_32.S 2008-01-29 17:26:18.000000000 -0500
@@ -75,6 +75,31 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000

+#ifdef CONFIG_MCOUNT
+.globl mcount
+mcount:
+ /* unlikely(mcount_enabled) */
+ cmpl $0, mcount_enabled
+ jnz trace
+ ret
+
+trace:
+ /* taken from glibc */
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %edx
+ movl 0x4(%ebp), %eax
+
+ call *mcount_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ ret
+#endif
+
#ifdef CONFIG_PREEMPT
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
Index: linux-mcount.git/arch/x86/kernel/entry_64.S
===================================================================
--- linux-mcount.git.orig/arch/x86/kernel/entry_64.S 2008-01-29 16:59:15.000000000 -0500
+++ linux-mcount.git/arch/x86/kernel/entry_64.S 2008-01-29 17:26:18.000000000 -0500
@@ -53,6 +53,42 @@

.code64

+#ifdef CONFIG_MCOUNT
+
+ENTRY(mcount)
+ /* unlikely(mcount_enabled) */
+ cmpl $0, mcount_enabled
+ jnz trace
+ retq
+
+trace:
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rsi
+ movq 8(%rbp), %rdi
+
+ call *mcount_trace_function
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ retq
+#endif
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
Index: linux-mcount.git/include/linux/linkage.h
===================================================================
--- linux-mcount.git.orig/include/linux/linkage.h 2008-01-29 16:59:15.000000000 -0500
+++ linux-mcount.git/include/linux/linkage.h 2008-01-29 17:26:18.000000000 -0500
@@ -3,6 +3,8 @@

#include <asm/linkage.h>

+#define notrace __attribute__((no_instrument_function))
+
#ifdef __cplusplus
#define CPP_ASMLINKAGE extern "C"
#else
Index: linux-mcount.git/include/linux/mcount.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-mcount.git/include/linux/mcount.h 2008-01-29 17:26:18.000000000 -0500
@@ -0,0 +1,38 @@
+#ifndef _LINUX_MCOUNT_H
+#define _LINUX_MCOUNT_H
+
+#ifdef CONFIG_MCOUNT
+extern int mcount_enabled;
+
+#include <linux/linkage.h>
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+
+typedef void (*mcount_func_t)(unsigned long ip, unsigned long parent_ip);
+
+struct mcount_ops {
+ mcount_func_t func;
+ struct mcount_ops *next;
+};
+
+/*
+ * The mcount_ops struct must be static and should also
+ * be read_mostly. These functions do modify read_mostly variables,
+ * so use them sparingly. Never free an mcount_op or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
+ */
+int register_mcount_function(struct mcount_ops *ops);
+int unregister_mcount_function(struct mcount_ops *ops);
+void clear_mcount_function(void);
+
+extern void mcount(void);
+
+#else /* !CONFIG_MCOUNT */
+# define register_mcount_function(ops) do { } while (0)
+# define unregister_mcount_function(ops) do { } while (0)
+# define clear_mcount_function() do { } while (0)
+#endif /* CONFIG_MCOUNT */
+#endif /* _LINUX_MCOUNT_H */
Index: linux-mcount.git/kernel/sysctl.c
===================================================================
--- linux-mcount.git.orig/kernel/sysctl.c 2008-01-29 17:02:10.000000000 -0500
+++ linux-mcount.git/kernel/sysctl.c 2008-01-29 17:26:18.000000000 -0500
@@ -46,6 +46,7 @@
#include <linux/nfs_fs.h>
#include <linux/acpi.h>
#include <linux/reboot.h>
+#include <linux/mcount.h>

#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -514,6 +515,16 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#ifdef CONFIG_MCOUNT
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "mcount_enabled",
+ .data = &mcount_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_KMOD
{
.ctl_name = KERN_MODPROBE,
Index: linux-mcount.git/lib/Kconfig.debug
===================================================================
--- linux-mcount.git.orig/lib/Kconfig.debug 2008-01-29 17:02:10.000000000 -0500
+++ linux-mcount.git/lib/Kconfig.debug 2008-01-29 17:26:18.000000000 -0500
@@ -562,5 +562,6 @@ config LATENCYTOP
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.

+source lib/tracing/Kconfig

source "samples/Kconfig"
Index: linux-mcount.git/lib/Makefile
===================================================================
--- linux-mcount.git.orig/lib/Makefile 2008-01-29 17:02:10.000000000 -0500
+++ linux-mcount.git/lib/Makefile 2008-01-29 17:26:18.000000000 -0500
@@ -67,6 +67,8 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o

+obj-$(CONFIG_MCOUNT) += tracing/
+
lib-$(CONFIG_GENERIC_BUG) += bug.o

hostprogs-y := gen_crc32table
Index: linux-mcount.git/lib/tracing/Kconfig
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-mcount.git/lib/tracing/Kconfig 2008-01-29 17:26:18.000000000 -0500
@@ -0,0 +1,10 @@
+
+# Archs that enable MCOUNT should select HAVE_MCOUNT
+config HAVE_MCOUNT
+ bool
+
+# MCOUNT by itself is useless; it only adds overhead.
+# It needs something to register a function with it.
+config MCOUNT
+ bool
+ select FRAME_POINTER
Index: linux-mcount.git/lib/tracing/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-mcount.git/lib/tracing/Makefile 2008-01-29 17:26:18.000000000 -0500
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MCOUNT) += libmcount.o
+
+libmcount-y := mcount.o
Index: linux-mcount.git/lib/tracing/mcount.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-mcount.git/lib/tracing/mcount.c 2008-01-29 17:26:18.000000000 -0500
@@ -0,0 +1,141 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <[email protected]>
+ *
+ * Originally ported from the -rt patch by:
+ * Copyright (C) 2007 Arnaldo Carvalho de Melo <[email protected]>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/module.h>
+#include <linux/mcount.h>
+
+/*
+ * Since nothing protects the window between the test of
+ * mcount_enabled and the call through mcount_trace_function,
+ * we can't set the function pointer to NULL without risking
+ * a race in which the kernel calls a NULL pointer. Instead,
+ * we just point it at a dummy function.
+ */
+notrace void dummy_mcount_tracer(unsigned long ip,
+ unsigned long parent_ip)
+{
+ /* do nothing */
+}
+
+static DEFINE_SPINLOCK(mcount_func_lock);
+static struct mcount_ops mcount_list_end __read_mostly =
+{
+ .func = dummy_mcount_tracer,
+};
+
+static struct mcount_ops *mcount_list __read_mostly = &mcount_list_end;
+mcount_func_t mcount_trace_function __read_mostly = dummy_mcount_tracer;
+int mcount_enabled __read_mostly;
+
+/* mcount is defined per arch in assembly */
+EXPORT_SYMBOL_GPL(mcount);
+
+notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
+{
+ struct mcount_ops *op = mcount_list;
+
+ while (op != &mcount_list_end) {
+ op->func(ip, parent_ip);
+ op = op->next;
+ };
+}
+
+/**
+ * register_mcount_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ * with "notrace", otherwise it will go into a
+ * recursive loop.
+ */
+int register_mcount_function(struct mcount_ops *ops)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mcount_func_lock, flags);
+ ops->next = mcount_list;
+ /* must have next seen before we update the list pointer */
+ smp_wmb();
+ mcount_list = ops;
+ /*
+ * For one func, simply call it directly.
+ * For more than one func, call the chain.
+ */
+ if (ops->next == &mcount_list_end)
+ mcount_trace_function = ops->func;
+ else
+ mcount_trace_function = mcount_list_func;
+ spin_unlock_irqrestore(&mcount_func_lock, flags);
+
+ return 0;
+}
+
+/**
+ * unregister_mcount_function - unregister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by mcount profiling.
+ */
+int unregister_mcount_function(struct mcount_ops *ops)
+{
+ unsigned long flags;
+ struct mcount_ops **p;
+ int ret = 0;
+
+ spin_lock_irqsave(&mcount_func_lock, flags);
+
+ /*
+ * If we are the only function, then the mcount pointer is
+ * pointing directly to that function.
+ */
+ if (mcount_list == ops && ops->next == &mcount_list_end) {
+ mcount_trace_function = dummy_mcount_tracer;
+ mcount_list = &mcount_list_end;
+ goto out;
+ }
+
+ for (p = &mcount_list; *p != &mcount_list_end; p = &(*p)->next)
+ if (*p == ops)
+ break;
+
+ if (*p != ops) {
+ ret = -1;
+ goto out;
+ }
+
+ *p = (*p)->next;
+
+ /* If we only have one func left, then call that directly */
+ if (mcount_list->next == &mcount_list_end)
+ mcount_trace_function = mcount_list->func;
+
+ out:
+ spin_unlock_irqrestore(&mcount_func_lock, flags);
+
+ return ret;
+}
+
+/**
+ * clear_mcount_function - reset the mcount function
+ *
+ * This resets the mcount function to the dummy tracer and in essence
+ * stops tracing. There may be a lag before other CPUs stop tracing.
+ */
+void clear_mcount_function(void)
+{
+ mcount_trace_function = dummy_mcount_tracer;
+}

--


2008-01-30 08:48:19

by Peter Zijlstra

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


On Tue, 2008-01-29 at 22:15 -0500, Steven Rostedt wrote:

> +int register_mcount_function(struct mcount_ops *ops)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&mcount_func_lock, flags);
> + ops->next = mcount_list;
> + /* must have next seen before we update the list pointer */
> + smp_wmb();

That comment does not explain which race it closes; this is esp
important as there is no paired barrier to give hints.

> + mcount_list = ops;
> + /*
> + * For one func, simply call it directly.
> + * For more than one func, call the chain.
> + */
> + if (ops->next == &mcount_list_end)
> + mcount_trace_function = ops->func;
> + else
> + mcount_trace_function = mcount_list_func;
> + spin_unlock_irqrestore(&mcount_func_lock, flags);
> +
> + return 0;
> +}

2008-01-30 13:08:39

by Steven Rostedt

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation



On Wed, 30 Jan 2008, Peter Zijlstra wrote:

>
> On Tue, 2008-01-29 at 22:15 -0500, Steven Rostedt wrote:
>
> > +int register_mcount_function(struct mcount_ops *ops)
> > +{
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&mcount_func_lock, flags);
> > + ops->next = mcount_list;
> > + /* must have next seen before we update the list pointer */
> > + smp_wmb();
>
> That comment does not explain which race it closes; this is esp
> important as there is no paired barrier to give hints.

OK, fair enough. I'll explain it a bit more.

How's this:

/*
* We are entering ops into the mcount_list but another
* CPU might be walking that list. We need to make sure
* the ops->next pointer is valid before another CPU sees
* the ops pointer included into the mcount_list.
*/

-- Steve

>
> > + mcount_list = ops;
> > + /*
> > + * For one func, simply call it directly.
> > + * For more than one func, call the chain.
> > + */
> > + if (ops->next == &mcount_list_end)
> > + mcount_trace_function = ops->func;
> > + else
> > + mcount_trace_function = mcount_list_func;
> > + spin_unlock_irqrestore(&mcount_func_lock, flags);
> > +
> > + return 0;
> > +}
>
>
>

2008-01-30 13:26:10

by Jan Kiszka

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

Steven Rostedt wrote:

> --- linux-mcount.git.orig/arch/x86/kernel/entry_32.S 2008-01-29 16:59:15.000000000 -0500
> +++ linux-mcount.git/arch/x86/kernel/entry_32.S 2008-01-29 17:26:18.000000000 -0500
> @@ -75,6 +75,31 @@ DF_MASK = 0x00000400
> NT_MASK = 0x00004000
> VM_MASK = 0x00020000
>
> +#ifdef CONFIG_MCOUNT
> +.globl mcount
> +mcount:
> + /* unlikely(mcount_enabled) */
> + cmpl $0, mcount_enabled
> + jnz trace
> + ret

(and the corresponding 64-bit version)

Is the impact of this change on the (already expensive) mcount_enabled
case negligible? I'm worried about use cases where we want to gain some
(relative) worst-case numbers via this instrumentation.

In my personal priority scheme, CONFIG_MCOUNT=y && !mcount_enabled comes
after mcount_enabled.

Jan

--
Siemens AG, Corporate Technology, CT SE 2
Corporate Competence Center Embedded Linux

2008-01-30 13:53:50

by Steven Rostedt

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


On Wed, 30 Jan 2008, Jan Kiszka wrote:

> Steven Rostedt wrote:
>
> > --- linux-mcount.git.orig/arch/x86/kernel/entry_32.S 2008-01-29 16:59:15.000000000 -0500
> > +++ linux-mcount.git/arch/x86/kernel/entry_32.S 2008-01-29 17:26:18.000000000 -0500
> > @@ -75,6 +75,31 @@ DF_MASK = 0x00000400
> > NT_MASK = 0x00004000
> > VM_MASK = 0x00020000
> >
> > +#ifdef CONFIG_MCOUNT
> > +.globl mcount
> > +mcount:
> > + /* unlikely(mcount_enabled) */
> > + cmpl $0, mcount_enabled
> > + jnz trace
> > + ret
>
> (and the corresponding 64-bit version)
>
> Is the impact of this change on the (already expensive) mcount_enabled
> case negligible? I worried about use cases where we want to gain some
> (relative) worst-case numbers via these instrumentations.

The goal here was to limit the instruction cache hit that we take when
mcount_enabled = 0.
>
> In my personal priority scheme, CONFIG_MCOUNT=y && !mcount_enabled comes
> after mcount_enabled.

Well, actually, I disagree. I only set mcount_enabled=1 when I'm about to
test something. You're right that we want the test itself to be affected
as little as possible, but when we have mcount_enabled=1 we usually also
have a function attached, and in that case this change is negligible. In
the normal case where mcount_enabled=0, however, this change may have a
bigger impact.

Remember CONFIG_MCOUNT=y && mcount_enabled=0 is (15% overhead)
CONFIG_MCOUNT=y && mcount_enabled=1 dummy func (49% overhead)
CONFIG_MCOUNT=y && mcount_enabled=1 trace func (500% overhead)

The trace func is the one that will most likely be used when analyzing. It
gives hackbench a 500% overhead, so I'm expecting this change to be
negligible in that case. But after I find what's wrong, I like to rebuild
the kernel without rebooting, so I want mcount_enabled=0 to have the
smallest impact ;-)

I'll put back the original code and run some new numbers.

-- Steve

2008-01-30 14:10:07

by Steven Rostedt

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


Paul,

Peter and I are having a discussion on craziness of archs and memory
barriers. You seem to understand crazy archs pretty well, and we would
like some advice. :-)

See below:

On Wed, 30 Jan 2008, Steven Rostedt wrote:

>
>
> On Wed, 30 Jan 2008, Peter Zijlstra wrote:
>
> >
> > On Tue, 2008-01-29 at 22:15 -0500, Steven Rostedt wrote:
> >
> > > +int register_mcount_function(struct mcount_ops *ops)
> > > +{
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&mcount_func_lock, flags);
> > > + ops->next = mcount_list;
> > > + /* must have next seen before we update the list pointer */
> > > + smp_wmb();
> >
> > That comment does not explain which race it closes; this is esp
> > important as there is no paired barrier to give hints.
>
> OK, fair enough. I'll explain it a bit more.
>
> How's this:
>
> /*
> * We are entering ops into the mcount_list but another
> * CPU might be walking that list. We need to make sure
> * the ops->next pointer is valid before another CPU sees
> * the ops pointer included into the mcount_list.
> */
>

The above is my new comment. But Peter says that it's still not good
enough and that all write memory barriers need read barriers. Let me
explain the situation here.

We have a singly linked list called mcount_list that is walked when more
than one function is registered with mcount. Mcount is called at the start
of all C functions that are not annotated with "notrace". When more than
one function is registered, mcount calls a loop function that does the
following:

notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
{
struct mcount_ops *op = mcount_list;

while (op != &mcount_list_end) {
op->func(ip, parent_ip);
op = op->next;
};
}

A registered function must already have a "func" filled, and the mcount
register code takes care of "next". It is documented that the calling
function should "never" change next and always expect that the func can be
called after it is unregistered. That's not the issue here.

The issue is how to insert the ops into the list. I've done the following,
as you can see in the code this text is inserted between.

ops->next = mcount_list;
smp_wmb();
mcount_list = ops;

The read-side pair is the read of ops followed by the read of ops->next,
which should imply an smp_rmb() just by the data dependency. But Peter
tells me things like Alpha are crazy enough to reorder even that! Thus,
I'm asking you.

Can some arch have a reader that receives ops->next before it receives
ops? That would seem to require a psychic arch, to know where ops->next is
before it knows ops!

Remember that the ops being registered is not visible to any
other CPU until mcount_list = ops. I don't see the need for a read barrier
in this case. But I could very well be wrong.

Help!

-- Steve


>
> >
> > > + mcount_list = ops;
> > > + /*
> > > + * For one func, simply call it directly.
> > > + * For more than one func, call the chain.
> > > + */
> > > + if (ops->next == &mcount_list_end)
> > > + mcount_trace_function = ops->func;
> > > + else
> > > + mcount_trace_function = mcount_list_func;
> > > + spin_unlock_irqrestore(&mcount_func_lock, flags);
> > > +
> > > + return 0;
> > > +}
> >
> >
> >
>

2008-01-30 14:25:50

by Peter Zijlstra

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


On Wed, 2008-01-30 at 09:09 -0500, Steven Rostedt wrote:
> Paul,
>
> Peter and I are having a discussion on craziness of archs and memory
> barriers. You seem to understand crazy archs pretty well, and we would
> like some advice. :-)
>
> See below:
>
> On Wed, 30 Jan 2008, Steven Rostedt wrote:
>
> >
> >
> > On Wed, 30 Jan 2008, Peter Zijlstra wrote:
> >
> > >
> > > On Tue, 2008-01-29 at 22:15 -0500, Steven Rostedt wrote:
> > >
> > > > +int register_mcount_function(struct mcount_ops *ops)
> > > > +{
> > > > + unsigned long flags;
> > > > +
> > > > + spin_lock_irqsave(&mcount_func_lock, flags);
> > > > + ops->next = mcount_list;
> > > > + /* must have next seen before we update the list pointer */
> > > > + smp_wmb();
> > >
> > > That comment does not explain which race it closes; this is esp
> > > important as there is no paired barrier to give hints.
> >
> > OK, fair enough. I'll explain it a bit more.
> >
> > How's this:
> >
> > /*
> > * We are entering ops into the mcount_list but another
> > * CPU might be walking that list. We need to make sure
> > * the ops->next pointer is valid before another CPU sees
> > * the ops pointer included into the mcount_list.
> > */
> >
>
> The above is my new comment. But Peter says that it's still not good
> enough and that all write memory barriers need read barriers.

To clarify, either: full mb, rmb or read depend.

> Let me explain the situation here.
>
> We have a single link list called mcount_list that is walked when more
> than one function is registered by mcount. Mcount is called at the start
> of all C functions that are not annotated with "notrace". When more than
> one function is registered, mcount calls a loop function that does the
> following:
>
> notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> {
> struct mcount_ops *op = mcount_list;

When thinking RCU, this would be rcu_dereference and imply a read
barrier.

> while (op != &mcount_list_end) {
> op->func(ip, parent_ip);
> op = op->next;

Same here; the rcu_dereference() would do the read depend barrier.

> };
> }
>
> A registered function must already have a "func" filled, and the mcount
> register code takes care of "next". It is documented that the calling
> function should "never" change next and always expect that the func can be
> called after it is unregistered. That's not the issue here.
>
> The issue is how to insert the ops into the list. I've done the following,
> as you can see in the code this text is inserted between.
>
> ops->next = mcount_list;
> smp_wmb();
> mcount_list = ops;
>
> The read side pair is the reading of ops to ops->next, which should imply
> a smp_rmb() just by the logic. But Peter tells me things like alpha is
> crazy enough to do better than that! Thus, I'm asking you.
>
> Can some arch have a reader where it receives ops->next before it received
> ops? This seems to me to be a phsyic arch, to know where ops->next is
> before it knows ops!
>
> Remember, that the ops that is being registered, is not viewable by any
> other CPU until mcount_list = ops. I don't see the need for a read barrier
> in this case. But I could very well be wrong.
>
> Help!
>
> -- Steve
>
>
> >
> > >
> > > > + mcount_list = ops;
> > > > + /*
> > > > + * For one func, simply call it directly.
> > > > + * For more than one func, call the chain.
> > > > + */
> > > > + if (ops->next == &mcount_list_end)
> > > > + mcount_trace_function = ops->func;
> > > > + else
> > > > + mcount_trace_function = mcount_list_func;
> > > > + spin_unlock_irqrestore(&mcount_func_lock, flags);
> > > > +
> > > > + return 0;
> > > > +}
> > >
> > >
> > >
> >

2008-01-30 14:28:47

by Steven Rostedt

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


On Wed, 30 Jan 2008, Steven Rostedt wrote:
> well, actually, I disagree. I only set mcount_enabled=1 when I'm about to
> test something. You're right that we want the impact of the test least
> affected, but when we have mcount_enabled=1 we usually also have a
> function that's attached and in that case this change is negligible. But
> on the normal case where mcount_enabled=0, this change may have a bigger
> impact.
>
> Remember CONFIG_MCOUNT=y && mcount_enabled=0 is (15% overhead)
> CONFIG_MCOUNT=y && mcount_enabled=1 dummy func (49% overhead)
> CONFIG_MCOUNT=y && mcount_enabled=1 trace func (500% overhead)
>
> The trace func is the one that will be most likely used when analyzing. It
> gives hackbench a 500% overhead, so I'm expecting this change to be
> negligible in that case. But after I find what's wrong, I like to rebuild
> the kernel without rebooting so I like to have mcount_enabled=0 have the
> smallest impact ;-)
>
> I'll put back the original code and run some new numbers.

I just ran with the original version of that test (on x86_64, the same box
the previous tests were done on, with the same kernel and config except
for this change).

Here are the numbers with the new design (the one that was used in this
patch):

mcount disabled:
Avg: 4.8638 (15.934498% overhead)

mcount enabled:
Avg: 6.2819 (49.736610% overhead)

function tracing:
Avg: 25.2035 (500.755607% overhead)

Now changing the code to:

ENTRY(mcount)
/* likely(mcount_enabled) */
cmpl $0, mcount_enabled
jz out

/* taken from glibc */
subq $0x38, %rsp
movq %rax, (%rsp)
movq %rcx, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rsi, 24(%rsp)
movq %rdi, 32(%rsp)
movq %r8, 40(%rsp)
movq %r9, 48(%rsp)

movq 0x38(%rsp), %rsi
movq 8(%rbp), %rdi

call *mcount_trace_function

movq 48(%rsp), %r9
movq 40(%rsp), %r8
movq 32(%rsp), %rdi
movq 24(%rsp), %rsi
movq 16(%rsp), %rdx
movq 8(%rsp), %rcx
movq (%rsp), %rax
addq $0x38, %rsp

out:
retq


mcount disabled:
Avg: 4.908 (16.988058% overhead)

mcount enabled:
Avg: 6.244 (48.840369% overhead)

function tracing:
Avg: 25.1963 (500.583987% overhead)


The change seems to cause a 1% overhead difference. With mcount disabled,
the newer code has a 1% performance benefit. With mcount enabled as well
as with tracing on, the old code has the 1% benefit.

But 1% has a bigger impact on something that is 15% than it does on
something that is 48% or 500%, so I'm keeping the newer version.

-- Steve

2008-02-01 22:34:41

by Paul E. McKenney

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

On Wed, Jan 30, 2008 at 03:25:00PM +0100, Peter Zijlstra wrote:
>
> On Wed, 2008-01-30 at 09:09 -0500, Steven Rostedt wrote:
> > Paul,
> >
> > Peter and I are having a discussion on craziness of archs and memory
> > barriers. You seem to understand crazy archs pretty well, and we would
> > like some advice. :-)

OK, let's see what we have here...

> > See below:
> >
> > On Wed, 30 Jan 2008, Steven Rostedt wrote:
> >
> > >
> > >
> > > On Wed, 30 Jan 2008, Peter Zijlstra wrote:
> > >
> > > >
> > > > On Tue, 2008-01-29 at 22:15 -0500, Steven Rostedt wrote:
> > > >
> > > > > +int register_mcount_function(struct mcount_ops *ops)
> > > > > +{
> > > > > + unsigned long flags;
> > > > > +
> > > > > + spin_lock_irqsave(&mcount_func_lock, flags);
> > > > > + ops->next = mcount_list;
> > > > > + /* must have next seen before we update the list pointer */
> > > > > + smp_wmb();
> > > >
> > > > That comment does not explain which race it closes; this is esp
> > > > important as there is no paired barrier to give hints.
> > >
> > > OK, fair enough. I'll explain it a bit more.
> > >
> > > How's this:
> > >
> > > /*
> > > * We are entering ops into the mcount_list but another
> > > * CPU might be walking that list. We need to make sure
> > > * the ops->next pointer is valid before another CPU sees
> > > * the ops pointer included into the mcount_list.
> > > */
> > >
> >
> > The above is my new comment. But Peter says that it's still not good
> > enough and that all write memory barriers need read barriers.
>
> To clarify, either: full mb, rmb or read depend.

This is true. A write barrier ensures that the writes remain ordered,
but unless the reads are also ordered, the reader can still get confused.
For example (assuming all variables are initially zero):

writer:

a = 1;
smp_wmb(); /* or smp_mb() */
b = 1;

reader:

tb = b;
ta = a;

The writer will (roughly speaking) execute the assignments in order,
but the reader might not. If the reader executes the assignment from
"a" first, it might see tb==1&&ta==0. To prevent this, we do:

reader:

tb = b;
smp_rmb(); /* or smp_mb() */
ta = a;

There are a lot of variations on this theme.
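
For concreteness, the same pattern written as kernel C, with the pairing
made explicit (just a sketch; it assumes the usual barrier primitives are
in scope and that a and b are ints shared between the two CPUs):

int a, b;

void writer(void)
{
	a = 1;
	smp_wmb();	/* order the store to a before the store to b */
	b = 1;
}

void reader(void)
{
	int ta, tb;

	tb = b;
	smp_rmb();	/* pairs with the writer's smp_wmb() */
	ta = a;
	/* at this point, tb == 1 implies ta == 1 */
}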

> > Let me explain the situation here.
> >
> > We have a single link list called mcount_list that is walked when more
> > than one function is registered by mcount. Mcount is called at the start
> > of all C functions that are not annotated with "notrace". When more than
> > one function is registered, mcount calls a loop function that does the
> > following:
> >
> > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > {
> > struct mcount_ops *op = mcount_list;
>
> When thinking RCU, this would be rcu_dereference and imply a read
> barrier.
>
> > while (op != &mcount_list_end) {
> > op->func(ip, parent_ip);
> > op = op->next;
>
> Same here; the rcu_dereference() would do the read depend barrier.

Specifically:

notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
{
struct mcount_ops *op = rcu_dereference(mcount_list);

while (op != &mcount_list_end) {
op->func(ip, parent_ip);
op = rcu_dereference(op->next);

This assumes that you are using call_rcu(), synchronize_rcu(), or
whatever to defer freeing/reuse of the ops structure.
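
If you do keep the open-coded list insertion, the writer-side counterpart
of rcu_dereference() would be rcu_assign_pointer(), which supplies the
smp_wmb() for you. A sketch of the insert step written that way (not what
the posted patch currently does):

	ops->next = mcount_list;
	/*
	 * rcu_assign_pointer() orders the initialization of ops->next
	 * before publishing ops as the new list head.
	 */
	rcu_assign_pointer(mcount_list, ops);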

> > };
> > }
> >
> > A registered function must already have a "func" filled, and the mcount
> > register code takes care of "next". It is documented that the calling
> > function should "never" change next and always expect that the func can be
> > called after it is unregistered. That's not the issue here.
> >
> > The issue is how to insert the ops into the list. I've done the following,
> > as you can see in the code this text is inserted between.
> >
> > ops->next = mcount_list;
> > smp_wmb();
> > mcount_list = ops;
> >
> > The read side pair is the reading of ops to ops->next, which should imply
> > a smp_rmb() just by the logic. But Peter tells me things like alpha is
> > crazy enough to do better than that! Thus, I'm asking you.

Peter is correct when he says that Alpha does not necessarily respect data
dependencies. See the following URL for the official story:

http://www.openvms.compaq.com/wizard/wiz_2637.html

And I give an example hardware cache design that can result in this
situation here:

http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf

See the discussion starting with the "Why Reorder Memory Accesses?"
heading in the second column of the first page.

Strange, but true. It took an Alpha architect quite some time to
convince me of this back in the late 90s. ;-)

> > Can some arch have a reader where it receives ops->next before it received
> > ops? This seems to me to be a phsyic arch, to know where ops->next is
> > before it knows ops!

The trick is that the machine might have a split cache, with (say)
odd-numbered cache lines being processed by one half and even-numbered
lines processed by the other half. If the reading CPU has one half of the
cache extremely busy (e.g., processing invalidation requests from other
CPUs) and the other half idle, memory misordering can happen in the
receiving CPU -- if the pointer is processed by the idle half, and
the pointed-to struct by the busy half, you might see the uninitialized
contents of the pointed-to structure. The reading CPU must execute
a memory barrier to force ordering in this case.

> > Remember, that the ops that is being registered, is not viewable by any
> > other CPU until mcount_list = ops. I don't see the need for a read barrier
> > in this case. But I could very well be wrong.

And I was right there with you before my extended discussions with the
aforementioned Alpha architect!

Thanx, Paul

> > Help!
> >
> > -- Steve
> >
> >
> > >
> > > >
> > > > > + mcount_list = ops;
> > > > > + /*
> > > > > + * For one func, simply call it directly.
> > > > > + * For more than one func, call the chain.
> > > > > + */
> > > > > + if (ops->next == &mcount_list_end)
> > > > > + mcount_trace_function = ops->func;
> > > > > + else
> > > > > + mcount_trace_function = mcount_list_func;
> > > > > + spin_unlock_irqrestore(&mcount_func_lock, flags);
> > > > > +
> > > > > + return 0;
> > > > > +}
> > > >
> > > >
> > > >
> > >
>

2008-02-02 01:56:28

by Steven Rostedt

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation



On Fri, 1 Feb 2008, Paul E. McKenney wrote:

> > > > OK, fair enough. I'll explain it a bit more.
> > > >
> > > > How's this:
> > > >
> > > > /*
> > > > * We are entering ops into the mcount_list but another
> > > > * CPU might be walking that list. We need to make sure
> > > > * the ops->next pointer is valid before another CPU sees
> > > > * the ops pointer included into the mcount_list.
> > > > */
> > > >
> > >
> > > The above is my new comment. But Peter says that it's still not good
> > > enough and that all write memory barriers need read barriers.
> >
> > To clarify, either: full mb, rmb or read depend.
>
> This is true. A write barrier ensures that the writes remain ordered,
> but unless the reads are also ordered, the reader can still get confused.
> For example (assuming all variables are initially zero):
>
> writer:
>
> a = 1;
> smp_wmb(); /* or smp_mb() */
> b = 1;
>
> reader:
>
> tb = b;
> ta = a;
>
> The writer will (roughly speaking) execute the assignments in order,
> but the reader might not. If the reader executes the assignment from
> "a" first, it might see tb==1&&ta==0. To prevent this, we do:
>
> reader:
>
> tb = b;
> smp_rmb(); /* or smp_mb() */
> ta = a;
>
> There are a lot of variations on this theme.

Yep, this is all clear, but not quite what this code does.

>
> > > Let me explain the situation here.
> > >
> > > We have a single link list called mcount_list that is walked when more
> > > than one function is registered by mcount. Mcount is called at the start
> > > of all C functions that are not annotated with "notrace". When more than
> > > one function is registered, mcount calls a loop function that does the
> > > following:
> > >
> > > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > > {
> > > struct mcount_ops *op = mcount_list;
> >
> > When thinking RCU, this would be rcu_dereference and imply a read
> > barrier.
> >
> > > while (op != &mcount_list_end) {
> > > op->func(ip, parent_ip);
> > > op = op->next;
> >
> > Same here; the rcu_dereference() would do the read depend barrier.
>
> Specifically:
>
> notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> {
> struct mcount_ops *op = rcu_dereference(mcount_list);
>
> while (op != &mcount_list_end) {
> op->func(ip, parent_ip);
> op = rcu_dereference(op->next);
>
> This assumes that you are using call_rcu(), synchronize_rcu(), or
> whatever to defer freeing/reuse of the ops structure.

One special part of this is that the ops structure is never to be freed
(this is documented). It should be a static read-mostly structure.
Since it is not to be freed, I did not export the registered functions to
keep modules from using it. I may later add an export that will cause the
module to increment its usage count so that it may never be freed.

There are no guarantees that prevent the func from being called after it was
unregistered, nor should the users of this ever touch the "next" pointer.

This makes things easy when you don't need to free ;-)

>
> > > };
> > > }
> > >
> > > A registered function must already have a "func" filled, and the mcount
> > > register code takes care of "next". It is documented that the calling
> > > function should "never" change next and always expect that the func can be
> > > called after it is unregistered. That's not the issue here.
> > >
> > > The issue is how to insert the ops into the list. I've done the following,
> > > as you can see in the code this text is inserted between.
> > >
> > > ops->next = mcount_list;
> > > smp_wmb();
> > > mcount_list = ops;
> > >
> > > The read side pair is the reading of ops to ops->next, which should imply
> > > a smp_rmb() just by the logic. But Peter tells me things like alpha is
> > > crazy enough to do better than that! Thus, I'm asking you.
>
> Peter is correct when he says that Alpha does not necessarily respect data
> dependencies. See the following URL for the official story:
>
> http://www.openvms.compaq.com/wizard/wiz_2637.html
>
> And I give an example hardware cache design that can result in this
> situation here:
>
> http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
>
> See the discussion starting with the "Why Reorder Memory Accesses?"
> heading in the second column of the first page.
>
> Strange, but true. It took an Alpha architect quite some time to
> convince me of this back in the late 90s. ;-)
>
> > > Can some arch have a reader where it receives ops->next before it received
> > > ops? This seems to me to be a phsyic arch, to know where ops->next is
> > > before it knows ops!
>
> The trick is that the machine might have a split cache, with (say)
> odd-numbered cache lines being processed by one half and even-numbered
> lines processed by the other half. If reading CPU has one half of the
> cache extremely busy (e.g., processing invalidation requests from other
> CPUs) and the other half idle, memory misordering can happen in the
> receiving CPU -- if the pointer is processed by the idle half, and
> the pointed-to struct by the busy half, you might see the unitialized
> contents of the pointed-to structure. The reading CPU must execute
> a memory barrier to force ordering in this case.
>
> > > Remember, that the ops that is being registered, is not viewable by any
> > > other CPU until mcount_list = ops. I don't see the need for a read barrier
> > > in this case. But I could very well be wrong.
>
> And I was right there with you before my extended discussions with the
> aforementioned Alpha architect!
>

hmm, I'm still not convinced ;-)

This is a unique situation. We don't need to worry about items being freed
because there are too many races to allow that. The items exist only to
register functions and are not to be dynamically allocated or freed. In
this situation we do not need to worry about deletions.

The smp_wmb is only for initialization of something that is about to enter
the list. It is not to protect against freeing.

Specifically:

ops->next = mcount_list;
smp_wmb();
mcount_list = ops;


What this is meant to prevent is a new item that has next = NULL being
visible to other CPUs before next is initialized.

On another cpu we have (simplified by removing loop):

op = mcount_list;
op->func();
op = op->next;
if (op->next != NULL)
op->func;

What we want to prevent is reading of the new ops before ops->next is set.

What you are saying is that on alpha, even though the write to ops->next
has completed before mcount_list is set, we can still get a reversed
order?

ops->next = mcount_list; -- in one cache line
smp_wmb();
mcount_list = ops; -- in another cache line

Even though the ops->next is completed, we can have on another cpu:

op = mcount_list; (which is the ops from above)
op = op->next; -- still see the old ops->next?

I just want to understand this. I already put in the read_barrier_depends
because it doesn't hurt on most archs anyway (nops).

Thanks,

-- Steve

2008-02-02 21:42:18

by Paul E. McKenney

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

On Fri, Feb 01, 2008 at 08:56:12PM -0500, Steven Rostedt wrote:
>
>
> On Fri, 1 Feb 2008, Paul E. McKenney wrote:
>
> > > > > OK, fair enough. I'll explain it a bit more.
> > > > >
> > > > > How's this:
> > > > >
> > > > > /*
> > > > > * We are entering ops into the mcount_list but another
> > > > > * CPU might be walking that list. We need to make sure
> > > > > * the ops->next pointer is valid before another CPU sees
> > > > > * the ops pointer included into the mcount_list.
> > > > > */
> > > > >
> > > >
> > > > The above is my new comment. But Peter says that it's still not good
> > > > enough and that all write memory barriers need read barriers.
> > >
> > > To clarify, either: full mb, rmb or read depend.
> >
> > This is true. A write barrier ensures that the writes remain ordered,
> > but unless the reads are also ordered, the reader can still get confused.
> > For example (assuming all variables are initially zero):
> >
> > writer:
> >
> > a = 1;
> > smp_wmb(); /* or smp_mb() */
> > b = 1;
> >
> > reader:
> >
> > tb = b;
> > ta = a;
> >
> > The writer will (roughly speaking) execute the assignments in order,
> > but the reader might not. If the reader executes the assignment from
> > "a" first, it might see tb==1&&ta==0. To prevent this, we do:
> >
> > reader:
> >
> > tb = b;
> > smp_rmb(); /* or smp_mb() */
> > ta = a;
> >
> > There are a lot of variations on this theme.
>
> Yep, this is all clear, but not quite what this code does.

Yep, you have dependencies, so something like the following:

initial state:

struct foo {
int a;
};
struct foo x = { 0 };
struct foo y = { 0 };
struct foo *global_p = &y;
/* other variables are appropriately declared auto variables */

/* No kmalloc() or kfree(), hence no RCU grace periods. */
/* In the terminology of http://lwn.net/Articles/262464/, we */
/* are doing only publish-subscribe, nothing else. */

writer:

x.a = 1;
smp_wmb(); /* or smp_mb() */
global_p = &x;

reader:

p = global_p;
ta = p->a;

Both Alpha and aggressive compiler optimizations can result in the reader
seeing the new value of the pointer (&x) but the old value of the field
(0). Strange but true. The fix is as follows:

reader:

p = global_p;
smp_read_barrier_depends(); /* or use rcu_dereference() */
ta = p->a;

So how can this happen? First note that if smp_read_barrier_depends()
was unnecessary in this case, it would be unnecessary in all cases.

Second, let's start with the compiler. Suppose that a highly optimizing
compiler notices that in almost all cases, the reader finds p==global_p.
Suppose that this compiler also notices that one of the registers (say
r1) almost always contains this expected value of global_p, and that
cache pressure ensures that an actual load from global_p almost always
generates an expensive cache miss. Such a compiler would be within its
rights (as defined by the C standard) to generate code assuming that r1
already had the right value, while also generating code to validate this
assumption, perhaps as follows:

r2 = global_p; /* high latency, other things complete meanwhile */
ta = r1->a;
if (r1 != r2)
ta = r2->a;

Now consider the following sequence of events on a superscalar CPU:

reader: r2 = global_p; /* issued, has not yet completed. */
reader: ta = r1->a; /* which gives zero. */
writer: x.a = 1;
writer: smp_wmb();
writer: global_p = &x;
reader: r2 = global_p; /* this instruction now completes */
reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */

I have great sympathy with the argument that this level of optimization
is simply insane, but the fact is that there are real-world compilers
that actually do this sort of thing. In addition, there are cases where
the compiler might be able to figure out that a value is constant, thus
breaking the dependency chain. This is most common for array references
where the compiler might be able to figure out that a given array index
is always zero, thus optimizing away the load and the dependency that
the programmer might expect to enforce ordering. (I have an example
of this down at the end.)

This sort of misordering is also done by DEC Alpha hardware, assuming
split caches. This can happen if the variable x is in an odd-numbered
cache line and the variable global_p is in an even-numbered cache line.
In this case, the smp_wmb() affects the memory order, but only within
the writing CPU. The ordering can be defeated in the reading CPU as
follows:

writer: x.a = 1;
writer: smp_wmb();
writer: global_p = &x;
reader: p = global_p;
reader: ta = p->a;

But the reader's odd-numbered cache shard is loaded
down with many queued cacheline invalidation requests,
so the old cached version of x.a==0 remains in the
reader's cache, so that the reader sees ta==0.

In contrast:

writer: x.a = 1;
writer: smp_wmb();
writer: global_p = &x;
reader: p = global_p;
reader: smp_read_barrier_depends();

The above barrier forces all cacheline invalidation
requests that have arrived at the reading CPU to be
processed before any subsequent reads, including
the pending invalidation request for the variable x.

reader: ta = p->a;

So ta is now guaranteed to be 1, as desired.

> > > > Let me explain the situation here.
> > > >
> > > > We have a single link list called mcount_list that is walked when more
> > > > than one function is registered by mcount. Mcount is called at the start
> > > > of all C functions that are not annotated with "notrace". When more than
> > > > one function is registered, mcount calls a loop function that does the
> > > > following:
> > > >
> > > > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > > > {
> > > > struct mcount_ops *op = mcount_list;
> > >
> > > When thinking RCU, this would be rcu_dereference and imply a read
> > > barrier.
> > >
> > > > while (op != &mcount_list_end) {
> > > > op->func(ip, parent_ip);
> > > > op = op->next;
> > >
> > > Same here; the rcu_dereference() would do the read depend barrier.
> >
> > Specifically:
> >
> > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > {
> > struct mcount_ops *op = rcu_dereference(mcount_list);
> >
> > while (op != &mcount_list_end) {
> > op->func(ip, parent_ip);
> > op = rcu_dereference(op->next);
> >
> > This assumes that you are using call_rcu(), synchronize_rcu(), or
> > whatever to defer freeing/reuse of the ops structure.
>
> One special part of this is that the ops structure is never to be freed
> (this is documented). It should be a static read-mostly structure.
> Since it is not to be freed, I did not export the registered functions to
> keep modules from using it. I may later add an export that will cause the
> module to increment it's usage count so that it may never be freed.
>
> There's no guarantees that prevent the func from being called after it was
> unregistered, nor should the users of this, ever touch the "next" pointer.
>
> This makes things easy when you don't need to free ;-)

It can indeed make things easier, but it does not help in this case.
This memory-ordering problem appears even if you never free anything, as
described above. Again, in the terminology laid out in the LWN article
at http://lwn.net/Articles/262464/, you are doing a publish-subscribe
operation, and it still must be protected.

But yes, my comment above about using call_rcu() and friends did in fact
incorrectly assume that you were freeing (or otherwise re-using) the
data structures.

> > > > };
> > > > }
> > > >
> > > > A registered function must already have a "func" filled, and the mcount
> > > > register code takes care of "next". It is documented that the calling
> > > > function should "never" change next and always expect that the func can be
> > > > called after it is unregistered. That's not the issue here.
> > > >
> > > > The issue is how to insert the ops into the list. I've done the following,
> > > > as you can see in the code this text is inserted between.
> > > >
> > > > ops->next = mcount_list;
> > > > smp_wmb();
> > > > mcount_list = ops;
> > > >
> > > > The read side pair is the reading of ops to ops->next, which should imply
> > > > a smp_rmb() just by the logic. But Peter tells me things like alpha is
> > > > crazy enough to do better than that! Thus, I'm asking you.
> >
> > Peter is correct when he says that Alpha does not necessarily respect data
> > dependencies. See the following URL for the official story:
> >
> > http://www.openvms.compaq.com/wizard/wiz_2637.html
> >
> > And I give an example hardware cache design that can result in this
> > situation here:
> >
> > http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
> >
> > See the discussion starting with the "Why Reorder Memory Accesses?"
> > heading in the second column of the first page.
> >
> > Strange, but true. It took an Alpha architect quite some time to
> > convince me of this back in the late 90s. ;-)
> >
> > > > Can some arch have a reader where it receives ops->next before it received
> > > > ops? This seems to me to be a phsyic arch, to know where ops->next is
> > > > before it knows ops!
> >
> > The trick is that the machine might have a split cache, with (say)
> > odd-numbered cache lines being processed by one half and even-numbered
> > lines processed by the other half. If reading CPU has one half of the
> > cache extremely busy (e.g., processing invalidation requests from other
> > CPUs) and the other half idle, memory misordering can happen in the
> > receiving CPU -- if the pointer is processed by the idle half, and
> > the pointed-to struct by the busy half, you might see the unitialized
> > contents of the pointed-to structure. The reading CPU must execute
> > a memory barrier to force ordering in this case.
> >
> > > > Remember, that the ops that is being registered, is not viewable by any
> > > > other CPU until mcount_list = ops. I don't see the need for a read barrier
> > > > in this case. But I could very well be wrong.
> >
> > And I was right there with you before my extended discussions with the
> > aforementioned Alpha architect!
>
> hmm, I'm still not convinced ;-)
>
> This is a unique situation. We don't need to worry about items being freed
> because there's too many races to allow that. The items are only to
> register functions and are not to be dynamically allocated or freed. In
> this situation we do not need to worry about deletions.
>
> The smp_wmb is only for initialization of something that is about to enter
> the list. It is not to protect against freeing.

Similarly, the smp_read_barrier_depends() is only for initialization
of something that is about to enter the list. As with the smp_wmb()
primitive, smp_read_barrier_depends() also is not to protect against
freeing. Instead, it is rcu_read_lock() and rcu_read_unlock() that
protect against freeing.

> Specifically:
>
> ops->next = mcount_list;
> smp_wmb();
> mcount_list = ops;
>
> What this is to prevent is a new item that has next = NULL being viewable
> to other CPUS before next is initalized.

Were it not for aggressive compiler optimizations and DEC Alpha, you would
be correct. What this instead does is to do the writer's part of the job
of preventing such new items from being visible to other CPUs before ->next
is initialized. These other CPUs must do their part as well, and that
part is smp_read_barrier_depends() -- or rcu_dereference(), whichever is
most appropriate.

> On another cpu we have (simplified by removing loop):
>
> op = mcount_list;
> op->func();
> op = op->next;
> if (op->next != NULL)
> op->func;
>
> What we want to prevent is reading of the new ops before ops->next is set.

Understood.

> What you are saying is that on alpha, even though the write to ops->next
> has completed before mcount_list is set, we can still get a reversed
> order?

That is exactly what I am saying. In addition, I am saying that
aggressive compiler optimizations can have this same effect, even on
non-Alpha CPUs.

> ops->next = mcount_list; -- in one cache line
> smp_wmb();
> mcount_list = ops; -- in another cache line
>
> Even though the ops->next is completed, we can have on another cpu:
>
> op = mcount_list; (which is the ops from above)
> op = op->next; -- still see the old ops->next?

Yes, this bizarre sequence of events really can happen. The fix is to
do the following:

op = mcount_list; (which is the ops from above)
smp_read_barrier_depends();
op = op->next; -- no longer see the old ops->next

> I just want to understand this. I already put in the read_barrier_depends
> because it doesn't hurt on most archs anyway (nops).

Very good!!!
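
For reference, the list walker with the barrier spelled out might look
something like the following (a sketch; rcu_dereference() would hide the
barrier for you, and on most architectures it compiles away to nothing):

notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
{
	struct mcount_ops *op = mcount_list;

	/* Pairs with the smp_wmb() in register_mcount_function(). */
	smp_read_barrier_depends();
	while (op != &mcount_list_end) {
		op->func(ip, parent_ip);
		op = op->next;
		smp_read_barrier_depends();
	}
}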

And here is the example using array indexes.

initial state:

struct foo {
int a;
};
struct foo x[ARRAY_SIZE] = { 0 };
struct foo *global_p = &x[0];
/* other variables are appropriately declared auto variables */

/* No kmalloc() or kfree(), hence no RCU grace periods. */
/* In the terminology of http://lwn.net/Articles/262464/, we */
/* are doing only publish-subscribe, nothing else. */

writer:

x[cur_idx].a = 1;
smp_wmb(); /* or smp_mb() */
global_idx = cur_idx;

reader:

i = global_idx;
ta = x[i].a

Suppose we have ARRAY_SIZE of 1. Then the standard states that the
results of indexing x[] with a non-zero index are undefined. Since they
are undefined, the compiler is within its rights to assume that the
index will always be zero, so that the reader code would be as follows:

reader:

ta = x[0].a

No dependency, no ordering. So this totally reasonable generated code
could see the pre-initialized value of field a. The job of both
smp_read_barrier_depends() and rcu_dereference() is to tell both the
CPU and the compiler that such assumptions are ill-advised.
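
So the reader in the array case would need something like the following
(again just a sketch):

	i = global_idx;
	smp_read_barrier_depends();	/* tell the CPU and compiler not to assume i */
	ta = x[i].a;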

Thanx, Paul

> Thanks,
>
> -- Steve
>
>

2008-02-04 17:09:29

by Steven Rostedt

Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


Hi Paul,

First I want to say, "Thank you", for taking the time to explain this in
considerable detail. But I still have some minor questions.

(Even though you've already convinced me, I still want full
understanding ;-)


On Sat, 2 Feb 2008, Paul E. McKenney wrote:

> Yep, you have dependencies, so something like the following:
>
> initial state:
>
> struct foo {
> int a;
> };
> struct foo x = { 0 };
> struct foo y = { 0 };
> struct foo *global_p = &y;
> /* other variables are appropriately declared auto variables */
>
> /* No kmalloc() or kfree(), hence no RCU grace periods. */
> /* In the terminology of http://lwn.net/Articles/262464/, we */
> /* are doing only publish-subscribe, nothing else. */
>
> writer:
>
> x.a = 1;
> smp_wmb(); /* or smp_mb() */
> global_p = &x;
>
> reader:
>
> p = global_p;
> ta = p->a;
>
> Both Alpha and aggressive compiler optimizations can result in the reader
> seeing the new value of the pointer (&x) but the old value of the field
> (0). Strange but true. The fix is as follows:
>
> reader:
>
> p = global_p;
> smp_read_barrier_depends(); /* or use rcu_dereference() */
> ta = p->a;
>
> So how can this happen? First note that if smp_read_barrier_depends()
> was unnecessary in this case, it would be unnecessary in all cases.
>
> Second, let's start with the compiler. Suppose that a highly optimizing
> compiler notices that in almost all cases, the reader finds p==global_p.
> Suppose that this compiler also notices that one of the registers (say
> r1) almost always contains this expected value of global_p, and that
> cache pressure ensures that an actual load from global_p almost always
> generates an expensive cache miss. Such a compiler would be within its
> rights (as defined by the C standard) to generate code assuming that r1
> already had the right value, while also generating code to validate this
> assumption, perhaps as follows:
>
> r2 = global_p; /* high latency, other things complete meanwhile */
> ta = r1->a;
> if (r1 != r2)
> ta = r2->a;
>
> Now consider the following sequence of events on a superscalar CPU:

I think you missed one step here (causing my confusion). I don't want to
assume so I'll try to put in the missing step:

writer: r1 = p; /* happens to use r1 to store parameter p */

> reader: r2 = global_p; /* issued, has not yet completed. */
> reader: ta = r1->a; /* which gives zero. */
> writer: x.a = 1;
> writer: smp_wmb();
> writer: global_p = &x;
> reader: r2 = global_p; /* this instruction now completes */
> reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */

Is that the case?

>
> I have great sympathy with the argument that this level of optimization
> is simply insane, but the fact is that there are real-world compilers
> that actually do this sort of thing. In addition, there are cases where
> the compiler might be able to figure out that a value is constant, thus
> breaking the dependency chain. This is most common for array references
> where the compiler might be able to figure out that a given array index
> is always zero, thus optimizing away the load and the dependency that
> the programmer might expect to enforce ordering. (I have an example
> of this down at the end.)
>
> This sort of misordering is also done by DEC Alpha hardware, assuming
> split caches. This can happen if the variable x is in an odd-numbered
> cache line and the variable global_p is in an even-numbered cache line.
> In this case, the smp_wmb() affects the memory order, but only within
> the writing CPU. The ordering can be defeated in the reading CPU as
> follows:
>
> writer: x.a = 1;
> writer: smp_wmb();
> writer: global_p = &x;
> reader: p = global_p;
> reader: ta = p->a;
>
> But the reader's odd-numbered cache shard is loaded
> down with many queued cacheline invalidation requests,
> so the old cached version of x.a==0 remains in the
> reader's cache, so that the reader sees ta==0.
>
> In contrast:
>
> writer: x.a = 1;
> writer: smp_wmb();
> writer: global_p = &x;
> reader: p = global_p;
> reader: smp_read_barrier_depends();
>
> The above barrier forces all cacheline invalidation
> requests that have arrived at the reading CPU to be
> processed before any subsequent reads, including
> the pending invalidation request for the variable x.
>
> reader: ta = p->a;
>
> So ta is now guaranteed to be 1, as desired.

Thanks, this is starting to clear things up for me (And scare me away from
Alphas)

>
> > > > > Let me explain the situation here.
> > > > >
> > > > > We have a single link list called mcount_list that is walked when more
> > > > > than one function is registered by mcount. Mcount is called at the start
> > > > > of all C functions that are not annotated with "notrace". When more than
> > > > > one function is registered, mcount calls a loop function that does the
> > > > > following:
> > > > >
> > > > > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > > > > {
> > > > > struct mcount_ops *op = mcount_list;
> > > >
> > > > When thinking RCU, this would be rcu_dereference and imply a read
> > > > barrier.
> > > >
> > > > > while (op != &mcount_list_end) {
> > > > > op->func(ip, parent_ip);
> > > > > op = op->next;
> > > >
> > > > Same here; the rcu_dereference() would do the read depend barrier.
> > >
> > > Specifically:
> > >
> > > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > > {
> > > struct mcount_ops *op = rcu_dereference(mcount_list);
> > >
> > > while (op != &mcount_list_end) {
> > > op->func(ip, parent_ip);
> > > op = rcu_dereference(op->next);
> > >
> > > This assumes that you are using call_rcu(), synchronize_rcu(), or
> > > whatever to defer freeing/reuse of the ops structure.
> >
> > One special part of this is that the ops structure is never to be freed
> > (this is documented). It should be a static read-mostly structure.
> > Since it is not to be freed, I did not export the registered functions to
> > keep modules from using it. I may later add an export that will cause the
> > module to increment it's usage count so that it may never be freed.
> >
> > There's no guarantees that prevent the func from being called after it was
> > unregistered, nor should the users of this, ever touch the "next" pointer.
> >
> > This makes things easy when you don't need to free ;-)
>
> It can indeed make things easier, but it does not help in this case.
> This memory-ordering problem appears even if you never free anything, as
> described above. Again, in the terminology laid out in the LWN article
> at http://lwn.net/Articles/262464/, you are doing a publish-subscribe
> operation, and it still must be protected.
>
> But yes, my comment above about using call_rcu() and friends did in fact
> incorrectly assume that you were freeing (or otherwise re-using) the
> data structures.
>
> > > > > };
> > > > > }
> > > > >
> > > > > A registered function must already have a "func" filled, and the mcount
> > > > > register code takes care of "next". It is documented that the calling
> > > > > function should "never" change next and always expect that the func can be
> > > > > called after it is unregistered. That's not the issue here.
> > > > >
> > > > > The issue is how to insert the ops into the list. I've done the following,
> > > > > as you can see in the code this text is inserted between.
> > > > >
> > > > > ops->next = mcount_list;
> > > > > smp_wmb();
> > > > > mcount_list = ops;
> > > > >
> > > > > The read side pair is the reading of ops to ops->next, which should imply
> > > > > a smp_rmb() just by the logic. But Peter tells me things like alpha is
> > > > > crazy enough to do better than that! Thus, I'm asking you.
> > >
> > > Peter is correct when he says that Alpha does not necessarily respect data
> > > dependencies. See the following URL for the official story:
> > >
> > > http://www.openvms.compaq.com/wizard/wiz_2637.html
> > >
> > > And I give an example hardware cache design that can result in this
> > > situation here:
> > >
> > > http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
> > >
> > > See the discussion starting with the "Why Reorder Memory Accesses?"
> > > heading in the second column of the first page.
> > >
> > > Strange, but true. It took an Alpha architect quite some time to
> > > convince me of this back in the late 90s. ;-)
> > >
> > > > > Can some arch have a reader where it receives ops->next before it received
> > > > > ops? This seems to me to be a phsyic arch, to know where ops->next is
> > > > > before it knows ops!
> > >
> > > The trick is that the machine might have a split cache, with (say)
> > > odd-numbered cache lines being processed by one half and even-numbered
> > > lines processed by the other half. If reading CPU has one half of the
> > > cache extremely busy (e.g., processing invalidation requests from other
> > > CPUs) and the other half idle, memory misordering can happen in the
> > > receiving CPU -- if the pointer is processed by the idle half, and
> > > the pointed-to struct by the busy half, you might see the unitialized
> > > contents of the pointed-to structure. The reading CPU must execute
> > > a memory barrier to force ordering in this case.
> > >
> > > > > Remember, that the ops that is being registered, is not viewable by any
> > > > > other CPU until mcount_list = ops. I don't see the need for a read barrier
> > > > > in this case. But I could very well be wrong.
> > >
> > > And I was right there with you before my extended discussions with the
> > > aforementioned Alpha architect!
> >
> > hmm, I'm still not convinced ;-)
> >
> > This is a unique situation. We don't need to worry about items being freed
> > because there's too many races to allow that. The items are only to
> > register functions and are not to be dynamically allocated or freed. In
> > this situation we do not need to worry about deletions.
> >
> > The smp_wmb is only for initialization of something that is about to enter
> > the list. It is not to protect against freeing.
>
> Similarly, the smp_read_barrier_depends() is only for initialization
> of something that is about to enter the list. As with the smp_wmb()
> primitive, smp_read_barrier_depends() also is not to protect against
> freeing. Instead, it is rcu_read_lock() and rcu_read_unlock() that
> protect against freeing.
>
> > Specifically:
> >
> > ops->next = mcount_list;
> > smp_wmb();
> > mcount_list = ops;
> >
> > What this is to prevent is a new item that has next = NULL being viewable
> > to other CPUS before next is initalized.
>
> Were it not for aggressive compiler optimizations and DEC Alpha, you would
> be correct. What this instead does is to do the writer's part of the job
> of preventing such new items from being visible to other CPUs before ->next
> is initialized. These other CPUs must do their part as well, and that
> part is smp_read_barrier_depends() -- or rcu_dereference(), whichever is
> most appropriate.

Since the code doesn't use RCU, I'll keep with the
smp_read_barrier_depends().
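
Spelled out, the loop would look something like the sketch below. This just
mirrors the rcu_dereference() version quoted above, with the barrier written
explicitly after each load of the pointer; it is a sketch, not the exact
patch.

notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
{
        struct mcount_ops *op = mcount_list;

        /* pairs with the smp_wmb() in the registration code */
        smp_read_barrier_depends();
        while (op != &mcount_list_end) {
                op->func(ip, parent_ip);
                op = op->next;
                smp_read_barrier_depends();
        }
}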

>
> > On another cpu we have (simplified by removing loop):
> >
> > op = mcount_list;
> > op->func();
> > op = op->next;
> > if (op->next != NULL)
> > op->func;
> >
> > What we want to prevent is reading of the new ops before ops->next is set.
>
> Understood.
>
> > What you are saying is that on alpha, even though the write to ops->next
> > has completed before mcount_list is set, we can still get a reversed
> > order?
>
> That is exactly what I am saying. In addition, I am saying that
> aggressive compiler optimizations can have this same effect, even on
> non-Alpha CPUs.
>
> > ops->next = mcount_list; -- in one cache line
> > smp_wmb();
> > mcount_list = ops; -- in another cache line
> >
> > Even though the ops->next is completed, we can have on another cpu:
> >
> > op = mcount_list; (which is the ops from above)
> > op = op->next; -- still see the old ops->next?
>
> Yes, this bizarre sequence of events really can happen. The fix is to
> do the following:
>
> op = mcount_list; (which is the ops from above)
> smp_read_barrier_depends();
> op = op->next; -- no longer see the old ops->next
>
> > I just want to understand this. I already put in the read_barrier_depends
> > because it doesn't hurt on most archs anyway (nops).
>
> Very good!!!
>
> And here is the example using array indexes.
>
> initial state:
>
> struct foo {
> int a;
> };
> struct foo x[ARRAY_SIZE] = { 0 };
> struct foo *global_p = &x[0];
> /* other variables are appropriately declared auto variables */
>
> /* No kmalloc() or kfree(), hence no RCU grace periods. */
> /* In the terminology of http://lwn.net/Articles/262464/, we */
> /* are doing only publish-subscribe, nothing else. */
>
> writer:
>
> x[cur_idx].a = 1;
> smp_wmb(); /* or smp_mb() */
> global_idx = cur_idx;
>
> reader:
>
> i = global_idx;
> ta = x[i].a
>
> Suppose we have ARRAY_SIZE of 1. Then the standard states that the
> results of indexing x[] with a non-zero index are undefined. Since they
> are undefined, the compiler is within its rights to assume that the
> index will always be zero, so that the reader code would be as follows:
>
> reader:
>
> ta = x[0].a
>
> No dependency, no ordering. So this totally reasonable generated code
> could see the pre-initialized value of field a. The job of both
> smp_read_barrier_depends() and rcu_dereference() is to tell both the
> CPU and the compiler that such assumptions are ill-advised.
>

Paul,

Thanks again for this lengthy email. It took me several readings to absorb
it all.

I recommend that someone have a pointer to this email because it really
does explain why read_barrier_depends is needed.

Excellent job of explaining this!!! Much appreciated.

-- Steve

2008-02-04 21:40:47

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

On Mon, Feb 04, 2008 at 12:09:00PM -0500, Steven Rostedt wrote:
>
> Hi Paul,
>
> First I want to say, "Thank you", for taking the time to explain this in
> considerable detail. But I still have some minor questions.
>
> (Even though you already convinced me, but I still want full
> understanding ;-)

OK, will see what I can do...

> On Sat, 2 Feb 2008, Paul E. McKenney wrote:
>
> > Yep, you have dependencies, so something like the following:
> >
> > initial state:
> >
> > struct foo {
> > int a;
> > };
> > struct foo x = { 0 };
> > struct foo y = { 0 };
> > struct foo *global_p = &y;
> > /* other variables are appropriately declared auto variables */
> >
> > /* No kmalloc() or kfree(), hence no RCU grace periods. */
> > /* In the terminology of http://lwn.net/Articles/262464/, we */
> > /* are doing only publish-subscribe, nothing else. */
> >
> > writer:
> >
> > x.a = 1;
> > smp_wmb(); /* or smp_mb() */
> > global_p = &x;
> >
> > reader:
> >
> > p = global_p;
> > ta = p->a;
> >
> > Both Alpha and aggressive compiler optimizations can result in the reader
> > seeing the new value of the pointer (&x) but the old value of the field
> > (0). Strange but true. The fix is as follows:
> >
> > reader:
> >
> > p = global_p;
> > smp_read_barrier_depends(); /* or use rcu_dereference() */
> > ta = p->a;
> >
> > So how can this happen? First note that if smp_read_barrier_depends()
> > was unnecessary in this case, it would be unnecessary in all cases.
> >
> > Second, let's start with the compiler. Suppose that a highly optimizing
> > compiler notices that in almost all cases, the reader finds p==global_p.
> > Suppose that this compiler also notices that one of the registers (say
> > r1) almost always contains this expected value of global_p, and that
> > cache pressure ensures that an actual load from global_p almost always
> > generates an expensive cache miss. Such a compiler would be within its
> > rights (as defined by the C standard) to generate code assuming that r1
> > already had the right value, while also generating code to validate this
> > assumption, perhaps as follows:
> >
> > r2 = global_p; /* high latency, other things complete meanwhile */
> > ta == r1->a;
> > if (r1 != r2)
> > ta = r2->a;
> >
> > Now consider the following sequence of events on a superscalar CPU:
>
> I think you missed one step here (causing my confusion). I don't want to
> assume so I'll try to put in the missing step:
>
> writer: r1 = p; /* happens to use r1 to store parameter p */

You lost me on this one... The writer has only the following three steps:

writer:

x.a = 1;
smp_wmb(); /* or smp_mb() */
global_p = &x;

Where did the "r1 = p" come from? For that matter, where did "p" come
from?

> > reader: r2 = global_p; /* issued, has not yet completed. */
> > reader: ta = r1->a; /* which gives zero. */
> > writer: x.a = 1;
> > writer: smp_wmb();
> > writer: global_p = &x;
> > reader: r2 = global_p; /* this instruction now completes */
> > reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */
>
> Is that the case?

Ah! Please note that I am doing something unusual here in that I am
working with global variables, as opposed to the normal RCU practice of
dynamically allocating memory. So "x" is just a global struct, not a
pointer to a struct.

> > I have great sympathy with the argument that this level of optimization
> > is simply insane, but the fact is that there are real-world compilers
> > that actually do this sort of thing. In addition, there are cases where
> > the compiler might be able to figure out that a value is constant, thus
> > breaking the dependency chain. This is most common for array references
> > where the compiler might be able to figure out that a given array index
> > is always zero, thus optimizing away the load and the dependency that
> > the programmer might expect to enforce ordering. (I have an example
> > of this down at the end.)
> >
> > This sort of misordering is also done by DEC Alpha hardware, assuming
> > split caches. This can happen if the variable x is in an odd-numbered
> > cache line and the variable global_p is in an even-numbered cache line.
> > In this case, the smp_wmb() affects the memory order, but only within
> > the writing CPU. The ordering can be defeated in the reading CPU as
> > follows:
> >
> > writer: x.a = 1;
> > writer: smp_wmb();
> > writer: global_p = &x;
> > reader: p = global_p;
> > reader: ta = p->a;
> >
> > But the reader's odd-numbered cache shard is loaded
> > down with many queued cacheline invalidation requests,
> > so the old cached version of x.a==0 remains in the
> > reader's cache, so that the reader sees ta==0.
> >
> > In contrast:
> >
> > writer: x.a = 1;
> > writer: smp_wmb();
> > writer: global_p = &x;
> > reader: p = global_p;
> > reader: smp_read_barrier_depends();
> >
> > The above barrier forces all cacheline invalidation
> > requests that have arrived at the reading CPU to be
> > processed before any subsequent reads, including
> > the pending invalidation request for the variable x.
> >
> > reader: ta = p->a;
> >
> > So ta is now guaranteed to be 1, as desired.
>
> Thanks, this is starting to clear things up for me (And scare me away from
> Alpha's)

Of course, fairness would require that it also scare you away from
value-speculation optimizations in compilers. ;-)

Thanx, Paul

> > > > > > Let me explain the situation here.
> > > > > >
> > > > > > We have a single link list called mcount_list that is walked when more
> > > > > > than one function is registered by mcount. Mcount is called at the start
> > > > > > of all C functions that are not annotated with "notrace". When more than
> > > > > > one function is registered, mcount calls a loop function that does the
> > > > > > following:
> > > > > >
> > > > > > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > > > > > {
> > > > > > struct mcount_ops *op = mcount_list;
> > > > >
> > > > > When thinking RCU, this would be rcu_dereference and imply a read
> > > > > barrier.
> > > > >
> > > > > > while (op != &mcount_list_end) {
> > > > > > op->func(ip, parent_ip);
> > > > > > op = op->next;
> > > > >
> > > > > Same here; the rcu_dereference() would do the read depend barrier.
> > > >
> > > > Specifically:
> > > >
> > > > notrace void mcount_list_func(unsigned long ip, unsigned long parent_ip)
> > > > {
> > > > struct mcount_ops *op = rcu_dereference(mcount_list);
> > > >
> > > > while (op != &mcount_list_end) {
> > > > op->func(ip, parent_ip);
> > > > op = rcu_dereference(op->next);
> > > >
> > > > This assumes that you are using call_rcu(), synchronize_rcu(), or
> > > > whatever to defer freeing/reuse of the ops structure.
> > >
> > > One special part of this is that the ops structure is never to be freed
> > > (this is documented). It should be a static read-mostly structure.
> > > Since it is not to be freed, I did not export the registered functions to
> > > keep modules from using it. I may later add an export that will cause the
> > > module to increment it's usage count so that it may never be freed.
> > >
> > > There's no guarantees that prevent the func from being called after it was
> > > unregistered, nor should the users of this, ever touch the "next" pointer.
> > >
> > > This makes things easy when you don't need to free ;-)
> >
> > It can indeed make things easier, but it does not help in this case.
> > This memory-ordering problem appears even if you never free anything, as
> > described above. Again, in the terminology laid out in the LWN article
> > at http://lwn.net/Articles/262464/, you are doing a publish-subscribe
> > operation, and it still must be protected.
> >
> > But yes, my comment above about using call_rcu() and friends did in fact
> > incorrectly assume that you were freeing (or otherwise re-using) the
> > data structures.
> >
> > > > > > };
> > > > > > }
> > > > > >
> > > > > > A registered function must already have a "func" filled, and the mcount
> > > > > > register code takes care of "next". It is documented that the calling
> > > > > > function should "never" change next and always expect that the func can be
> > > > > > called after it is unregistered. That's not the issue here.
> > > > > >
> > > > > > The issue is how to insert the ops into the list. I've done the following,
> > > > > > as you can see in the code this text is inserted between.
> > > > > >
> > > > > > ops->next = mcount_list;
> > > > > > smp_wmb();
> > > > > > mcount_list = ops;
> > > > > >
> > > > > > The read side pair is the reading of ops to ops->next, which should imply
> > > > > > a smp_rmb() just by the logic. But Peter tells me things like alpha is
> > > > > > crazy enough to do better than that! Thus, I'm asking you.
> > > >
> > > > Peter is correct when he says that Alpha does not necessarily respect data
> > > > dependencies. See the following URL for the official story:
> > > >
> > > > http://www.openvms.compaq.com/wizard/wiz_2637.html
> > > >
> > > > And I give an example hardware cache design that can result in this
> > > > situation here:
> > > >
> > > > http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
> > > >
> > > > See the discussion starting with the "Why Reorder Memory Accesses?"
> > > > heading in the second column of the first page.
> > > >
> > > > Strange, but true. It took an Alpha architect quite some time to
> > > > convince me of this back in the late 90s. ;-)
> > > >
> > > > > > Can some arch have a reader where it receives ops->next before it received
> > > > > > ops? This seems to me to be a phsyic arch, to know where ops->next is
> > > > > > before it knows ops!
> > > >
> > > > The trick is that the machine might have a split cache, with (say)
> > > > odd-numbered cache lines being processed by one half and even-numbered
> > > > lines processed by the other half. If reading CPU has one half of the
> > > > cache extremely busy (e.g., processing invalidation requests from other
> > > > CPUs) and the other half idle, memory misordering can happen in the
> > > > receiving CPU -- if the pointer is processed by the idle half, and
> > > > the pointed-to struct by the busy half, you might see the unitialized
> > > > contents of the pointed-to structure. The reading CPU must execute
> > > > a memory barrier to force ordering in this case.
> > > >
> > > > > > Remember, that the ops that is being registered, is not viewable by any
> > > > > > other CPU until mcount_list = ops. I don't see the need for a read barrier
> > > > > > in this case. But I could very well be wrong.
> > > >
> > > > And I was right there with you before my extended discussions with the
> > > > aforementioned Alpha architect!
> > >
> > > hmm, I'm still not convinced ;-)
> > >
> > > This is a unique situation. We don't need to worry about items being freed
> > > because there's too many races to allow that. The items are only to
> > > register functions and are not to be dynamically allocated or freed. In
> > > this situation we do not need to worry about deletions.
> > >
> > > The smp_wmb is only for initialization of something that is about to enter
> > > the list. It is not to protect against freeing.
> >
> > Similarly, the smp_read_barrier_depends() is only for initialization
> > of something that is about to enter the list. As with the smp_wmb()
> > primitive, smp_read_barrier_depends() also is not to protect against
> > freeing. Instead, it is rcu_read_lock() and rcu_read_unlock() that
> > protect against freeing.
> >
> > > Specifically:
> > >
> > > ops->next = mcount_list;
> > > smp_wmb();
> > > mcount_list = ops;
> > >
> > > What this is to prevent is a new item that has next = NULL being viewable
> > > to other CPUS before next is initalized.
> >
> > Were it not for aggressive compiler optimizations and DEC Alpha, you would
> > be correct. What this instead does is to do the writer's part of the job
> > of preventing such new items from being visible to other CPUs before ->next
> > is initialized. These other CPUs must do their part as well, and that
> > part is smp_read_barrier_depends() -- or rcu_dereference(), whichever is
> > most appropriate.
>
> Since the code doesn't use RCU, I'll keep with the
> smp_read_barrier_depends().
>
> >
> > > On another cpu we have (simplified by removing loop):
> > >
> > > op = mcount_list;
> > > op->func();
> > > op = op->next;
> > > if (op->next != NULL)
> > > op->func;
> > >
> > > What we want to prevent is reading of the new ops before ops->next is set.
> >
> > Understood.
> >
> > > What you are saying is that on alpha, even though the write to ops->next
> > > has completed before mcount_list is set, we can still get a reversed
> > > order?
> >
> > That is exactly what I am saying. In addition, I am saying that
> > aggressive compiler optimizations can have this same effect, even on
> > non-Alpha CPUs.
> >
> > > ops->next = mcount_list; -- in one cache line
> > > smp_wmb();
> > > mcount_list = ops; -- in another cache line
> > >
> > > Even though the ops->next is completed, we can have on another cpu:
> > >
> > > op = mcount_list; (which is the ops from above)
> > > op = op->next; -- still see the old ops->next?
> >
> > Yes, this bizarre sequence of events really can happen. The fix is to
> > do the following:
> >
> > op = mcount_list; (which is the ops from above)
> > smp_read_barrier_depends();
> > op = op->next; -- no longer see the old ops->next
> >
> > > I just want to understand this. I already put in the read_barrier_depends
> > > because it doesn't hurt on most archs anyway (nops).
> >
> > Very good!!!
> >
> > And here is the example using array indexes.
> >
> > initial state:
> >
> > struct foo {
> > int a;
> > };
> > struct foo x[ARRAY_SIZE] = { 0 };
> > struct foo *global_p = &x[0];
> > /* other variables are appropriately declared auto variables */
> >
> > /* No kmalloc() or kfree(), hence no RCU grace periods. */
> > /* In the terminology of http://lwn.net/Articles/262464/, we */
> > /* are doing only publish-subscribe, nothing else. */
> >
> > writer:
> >
> > x[cur_idx].a = 1;
> > smp_wmb(); /* or smp_mb() */
> > global_idx = cur_idx;
> >
> > reader:
> >
> > i = global_idx;
> > ta = x[i].a
> >
> > Suppose we have ARRAY_SIZE of 1. Then the standard states that the
> > results of indexing x[] with a non-zero index are undefined. Since they
> > are undefined, the compiler is within its rights to assume that the
> > index will always be zero, so that the reader code would be as follows:
> >
> > reader:
> >
> > ta = x[0].a
> >
> > No dependency, no ordering. So this totally reasonable generated code
> > could see the pre-initialized value of field a. The job of both
> > smp_read_barrier_depends() and rcu_dereference() is to tell both the
> > CPU and the compiler that such assumptions are ill-advised.
> >
>
> Paul,
>
> Thanks again for this lengthy email. It took me several readings to absorb
> it all in.
>
> I recommend that someone have a pointer to this email because it really
> does explain why read_barrier_depends is needed.
>
> Excellent job of explaining this!!! Much appreciated.

Glad you liked it!!!

Thanx, Paul

2008-02-04 22:04:17

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation


On Mon, 4 Feb 2008, Paul E. McKenney wrote:
> OK, will see what I can do...
>
> > On Sat, 2 Feb 2008, Paul E. McKenney wrote:
> >
> > > Yep, you have dependencies, so something like the following:
> > >
> > > initial state:
> > >
> > > struct foo {
> > > int a;
> > > };
> > > struct foo x = { 0 };
> > > struct foo y = { 0 };
> > > struct foo *global_p = &y;
> > > /* other variables are appropriately declared auto variables */
> > >
> > > /* No kmalloc() or kfree(), hence no RCU grace periods. */
> > > /* In the terminology of http://lwn.net/Articles/262464/, we */
> > > /* are doing only publish-subscribe, nothing else. */
> > >
> > > writer:
> > >
> > > x.a = 1;
> > > smp_wmb(); /* or smp_mb() */
> > > global_p = &x;
> > >
> > > reader:
> > >
> > > p = global_p;
> > > ta = p->a;
> > >
> > > Both Alpha and aggressive compiler optimizations can result in the reader
> > > seeing the new value of the pointer (&x) but the old value of the field
> > > (0). Strange but true. The fix is as follows:
> > >
> > > reader:
> > >
> > > p = global_p;
> > > smp_read_barrier_depends(); /* or use rcu_dereference() */
> > > ta = p->a;
> > >
> > > So how can this happen? First note that if smp_read_barrier_depends()
> > > was unnecessary in this case, it would be unnecessary in all cases.
> > >
> > > Second, let's start with the compiler. Suppose that a highly optimizing
> > > compiler notices that in almost all cases, the reader finds p==global_p.
> > > Suppose that this compiler also notices that one of the registers (say
> > > r1) almost always contains this expected value of global_p, and that
> > > cache pressure ensures that an actual load from global_p almost always
> > > generates an expensive cache miss. Such a compiler would be within its
> > > rights (as defined by the C standard) to generate code assuming that r1
> > > already had the right value, while also generating code to validate this
> > > assumption, perhaps as follows:
> > >
> > > r2 = global_p; /* high latency, other things complete meanwhile */
> > > ta == r1->a;
> > > if (r1 != r2)
> > > ta = r2->a;
> > >
> > > Now consider the following sequence of events on a superscalar CPU:
> >
> > I think you missed one step here (causing my confusion). I don't want to
> > assume so I'll try to put in the missing step:
> >
> > writer: r1 = p; /* happens to use r1 to store parameter p */
>
> You lost me on this one... The writer has only the following three steps:

You're right. I meant "writer: r1 = x;"

>
> writer:
>
> x.a = 1;
> smp_wmb(); /* or smp_mb() */
> global_p = &x;
>
> Where did the "r1 = p" come from? For that matter, where did "p" come
> from?
>
> > > reader: r2 = global_p; /* issued, has not yet completed. */
> > > reader: ta = r1->a; /* which gives zero. */
> > > writer: x.a = 1;
> > > writer: smp_wmb();
> > > writer: global_p = &x;
> > > reader: r2 = global_p; /* this instruction now completes */
> > > reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */
> >
> > Is that the case?
>
> Ah! Please note that I am doing something unusual here in that I am
> working with global variables, as opposed to the normal RCU practice of
> dynamically allocating memory. So "x" is just a global struct, not a
> pointer to a struct.
>

But let's look at a simple version of my original code anyway ;-)

Writer:

void add_op(struct myops *x) {
        /* x->next may be garbage here */
        x->next = global_p;
        smp_wmb();
        global_p = x;
}

Reader:

void read_op(void)
{
        struct myops *p = global_p;

        while (p != NULL) {
                p->func();
                p = p->next;
                /* if p->next is garbage we crash */
        }
}


Here, we are missing the read_barrier_depends(). Let's look at the Alpha
cache issue:


The reader reads the new version of global_p, and then reads the next
pointer. But since the next pointer is on a different cacheline than
global_p, the old contents of that cacheline may still be sitting in the
reader's cache. So it uses the old next pointer, which contains the garbage.

Is that correct?

But I have to admit that I can't see how an aggressive compiler could have
screwed this up, given that x is a parameter and the function add_op is not
in a header file.

-- Steve

2008-02-04 22:51:58

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

* Steven Rostedt ([email protected]) wrote:
>
> On Mon, 4 Feb 2008, Paul E. McKenney wrote:
> > OK, will see what I can do...
> >
> > > On Sat, 2 Feb 2008, Paul E. McKenney wrote:
> > >
> > > > Yep, you have dependencies, so something like the following:
> > > >
> > > > initial state:
> > > >
> > > > struct foo {
> > > > int a;
> > > > };
> > > > struct foo x = { 0 };
> > > > struct foo y = { 0 };
> > > > struct foo *global_p = &y;
> > > > /* other variables are appropriately declared auto variables */
> > > >
> > > > /* No kmalloc() or kfree(), hence no RCU grace periods. */
> > > > /* In the terminology of http://lwn.net/Articles/262464/, we */
> > > > /* are doing only publish-subscribe, nothing else. */
> > > >
> > > > writer:
> > > >
> > > > x.a = 1;
> > > > smp_wmb(); /* or smp_mb() */
> > > > global_p = &x;
> > > >
> > > > reader:
> > > >
> > > > p = global_p;
> > > > ta = p->a;
> > > >
> > > > Both Alpha and aggressive compiler optimizations can result in the reader
> > > > seeing the new value of the pointer (&x) but the old value of the field
> > > > (0). Strange but true. The fix is as follows:
> > > >
> > > > reader:
> > > >
> > > > p = global_p;
> > > > smp_read_barrier_depends(); /* or use rcu_dereference() */
> > > > ta = p->a;
> > > >
> > > > So how can this happen? First note that if smp_read_barrier_depends()
> > > > was unnecessary in this case, it would be unnecessary in all cases.
> > > >
> > > > Second, let's start with the compiler. Suppose that a highly optimizing
> > > > compiler notices that in almost all cases, the reader finds p==global_p.
> > > > Suppose that this compiler also notices that one of the registers (say
> > > > r1) almost always contains this expected value of global_p, and that
> > > > cache pressure ensures that an actual load from global_p almost always
> > > > generates an expensive cache miss. Such a compiler would be within its
> > > > rights (as defined by the C standard) to generate code assuming that r1
> > > > already had the right value, while also generating code to validate this
> > > > assumption, perhaps as follows:
> > > >
> > > > r2 = global_p; /* high latency, other things complete meanwhile */
> > > > ta == r1->a;
> > > > if (r1 != r2)
> > > > ta = r2->a;
> > > >
> > > > Now consider the following sequence of events on a superscalar CPU:
> > >
> > > I think you missed one step here (causing my confusion). I don't want to
> > > assume so I'll try to put in the missing step:
> > >
> > > writer: r1 = p; /* happens to use r1 to store parameter p */
> >
> > You lost me on this one... The writer has only the following three steps:
>
> You're right. I meant "writer: r1 = x;"
>
> >
> > writer:
> >
> > x.a = 1;
> > smp_wmb(); /* or smp_mb() */
> > global_p = &x;
> >
> > Where did the "r1 = p" come from? For that matter, where did "p" come
> > from?
> >
> > > > reader: r2 = global_p; /* issued, has not yet completed. */
> > > > reader: ta = r1->a; /* which gives zero. */
> > > > writer: x.a = 1;
> > > > writer: smp_wmb();
> > > > writer: global_p = &x;
> > > > reader: r2 = global_p; /* this instruction now completes */
> > > > reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */
> > >
> > > Is that the case?
> >
> > Ah! Please note that I am doing something unusual here in that I am
> > working with global variables, as opposed to the normal RCU practice of
> > dynamically allocating memory. So "x" is just a global struct, not a
> > pointer to a struct.
> >
>
> But lets look at a simple version of my original code anyway ;-)
>
> Writer:
>
> void add_op(struct myops *x) {
> /* x->next may be garbage here */
> x->next = global_p;
> smp_wmb();
> global_p = x;
> }
>
> Reader:
>
> void read_op(void)
> {
> struct myops *p = global_p;
>
> while (p != NULL) {
> p->func();
> p = next;
> /* if p->next is garbage we crash */
> }
> }
>
>
> Here, we are missing the read_barrier_depends(). Lets look at the Alpha
> cache issue:
>
>
> reader reads the new version of global_p, and then reads the next
> pointer. But since the next pointer is on a different cacheline than
> global_p, it may have somehow had that in it's cache still. So it uses the
> old next pointer which contains the garbage.
>
> Is that correct?
>
> But I will have to admit, that I can't see how an aggressive compiler
> might have screwed this up. Being that x is a parameter, and the function
> add_op is not in a header file.
>

Tell me if I am mistaken, but applying Paul's explanation to your
example would give (I unroll the loop for clarity):

Writer:

void add_op(struct myops *x) {
/* x->next may be garbage here */
x->next = global_p;
smp_wmb();
global_p = x;
}

Reader:

void read_op(void)
{
        struct myops *p = global_p;

        if (p != NULL) {
                p->func();
                p = p->next;
                /*
                 * Suppose the compiler expects that p->next is likely to be equal to
                 * p + sizeof(struct myops), uses r1 to store previous p, r2 to store the
                 * next p and r3 to store the expected value. Let's look at what the
                 * compiler could do for the next loop iteration.
                 */
                r2 = r1->next (1)
                r3 = r1 + sizeof(struct myops)
                r4 = r3->func (2)
                if (r3 == r2 && r3 != NULL)
                        call r4

                /* if p->next is garbage we crash */
        } else
                return;

        if (p != NULL) {
                p->func();
                p = p->next;
                /* if p->next is garbage we crash */
        } else
                return;
        .....
}

In this example, we would be reading the expected "r3->func" (2) before
reading the real r1->next (1) value if reads are issued out of order.

Paul, am I correct? And... does the specific loop optimization I just
described actually exist?

Thanks for your enlightenment :)

Mathieu

> -- Steve
>

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-02-05 06:57:31

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

On Mon, Feb 04, 2008 at 05:03:47PM -0500, Steven Rostedt wrote:
>
> On Mon, 4 Feb 2008, Paul E. McKenney wrote:
> > OK, will see what I can do...
> >
> > > On Sat, 2 Feb 2008, Paul E. McKenney wrote:
> > >
> > > > Yep, you have dependencies, so something like the following:
> > > >
> > > > initial state:
> > > >
> > > > struct foo {
> > > > int a;
> > > > };
> > > > struct foo x = { 0 };
> > > > struct foo y = { 0 };
> > > > struct foo *global_p = &y;
> > > > /* other variables are appropriately declared auto variables */
> > > >
> > > > /* No kmalloc() or kfree(), hence no RCU grace periods. */
> > > > /* In the terminology of http://lwn.net/Articles/262464/, we */
> > > > /* are doing only publish-subscribe, nothing else. */
> > > >
> > > > writer:
> > > >
> > > > x.a = 1;
> > > > smp_wmb(); /* or smp_mb() */
> > > > global_p = &x;
> > > >
> > > > reader:
> > > >
> > > > p = global_p;
> > > > ta = p->a;
> > > >
> > > > Both Alpha and aggressive compiler optimizations can result in the reader
> > > > seeing the new value of the pointer (&x) but the old value of the field
> > > > (0). Strange but true. The fix is as follows:
> > > >
> > > > reader:
> > > >
> > > > p = global_p;
> > > > smp_read_barrier_depends(); /* or use rcu_dereference() */
> > > > ta = p->a;
> > > >
> > > > So how can this happen? First note that if smp_read_barrier_depends()
> > > > was unnecessary in this case, it would be unnecessary in all cases.
> > > >
> > > > Second, let's start with the compiler. Suppose that a highly optimizing
> > > > compiler notices that in almost all cases, the reader finds p==global_p.
> > > > Suppose that this compiler also notices that one of the registers (say
> > > > r1) almost always contains this expected value of global_p, and that
> > > > cache pressure ensures that an actual load from global_p almost always
> > > > generates an expensive cache miss. Such a compiler would be within its
> > > > rights (as defined by the C standard) to generate code assuming that r1
> > > > already had the right value, while also generating code to validate this
> > > > assumption, perhaps as follows:
> > > >
> > > > r2 = global_p; /* high latency, other things complete meanwhile */
> > > > ta == r1->a;
> > > > if (r1 != r2)
> > > > ta = r2->a;
> > > >
> > > > Now consider the following sequence of events on a superscalar CPU:
> > >
> > > I think you missed one step here (causing my confusion). I don't want to
> > > assume so I'll try to put in the missing step:
> > >
> > > writer: r1 = p; /* happens to use r1 to store parameter p */
> >
> > You lost me on this one... The writer has only the following three steps:
>
> You're right. I meant "writer: r1 = x;"

OK, I understand. You are correct, it would make more sense at the machine
level for the writer to do something like:

writer:

r1 = &x;
r1->a = 1;
smp_wmb(); /* or smp_mb() */
global_p = r1;

> > writer:
> >
> > x.a = 1;
> > smp_wmb(); /* or smp_mb() */
> > global_p = &x;
> >
> > Where did the "r1 = p" come from? For that matter, where did "p" come
> > from?
> >
> > > > reader: r2 = global_p; /* issued, has not yet completed. */
> > > > reader: ta = r1->a; /* which gives zero. */
> > > > writer: x.a = 1;
> > > > writer: smp_wmb();
> > > > writer: global_p = &x;
> > > > reader: r2 = global_p; /* this instruction now completes */
> > > > reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */
> > >
> > > Is that the case?
> >
> > Ah! Please note that I am doing something unusual here in that I am
> > working with global variables, as opposed to the normal RCU practice of
> > dynamically allocating memory. So "x" is just a global struct, not a
> > pointer to a struct.
>
> But lets look at a simple version of my original code anyway ;-)

Fair enough! ;-)

> Writer:
>
> void add_op(struct myops *x) {
> /* x->next may be garbage here */
> x->next = global_p;
> smp_wmb();
> global_p = x;
> }
>
> Reader:
>
> void read_op(void)
> {
> struct myops *p = global_p;
>
> while (p != NULL) {
> p->func();
> p = next;
> /* if p->next is garbage we crash */
> }
> }
>
>
> Here, we are missing the read_barrier_depends(). Lets look at the Alpha
> cache issue:
>
>
> reader reads the new version of global_p, and then reads the next
> pointer. But since the next pointer is on a different cacheline than
> global_p, it may have somehow had that in it's cache still. So it uses the
> old next pointer which contains the garbage.
>
> Is that correct?

Indeed! Changing the reader to be as follows should fix it:

Reader:

void read_op(void)
{
        struct myops *p = global_p;

        while (p != NULL) {
                smp_read_barrier_depends();
                p->func();
                p = p->next;
                /* if p->next is garbage we crash */
        }
}
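
As an aside, the same writer/reader pair can be rendered as a compilable
userspace sketch, with C11 atomics standing in for the kernel primitives
(a release store for smp_wmb() plus the store, a consume load for the load
plus smp_read_barrier_depends()). The names below are illustrative only and
assume a single registering thread; this is not the actual mcount code.

/* Illustrative userspace analogue only; not kernel code. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct myops {
        void (*func)(void);
        struct myops *next;
};

static _Atomic(struct myops *) global_p;        /* head of the published list */

/* Writer: assumes only one thread ever registers ops. */
static void add_op(struct myops *x)
{
        /* x->next may be garbage here */
        x->next = atomic_load_explicit(&global_p, memory_order_relaxed);
        /* release store: smp_wmb() + the plain store of global_p */
        atomic_store_explicit(&global_p, x, memory_order_release);
}

/* Reader: the consume load orders the dependent accesses through p. */
static void read_op(void)
{
        struct myops *p = atomic_load_explicit(&global_p, memory_order_consume);

        while (p != NULL) {
                p->func();
                p = p->next;    /* dependent load, ordered by the consume above */
        }
}

static void hello(void)
{
        printf("hello\n");
}

int main(void)
{
        static struct myops op = { .func = hello };

        add_op(&op);
        read_op();      /* single-threaded smoke test only */
        return 0;
}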

> But I will have to admit, that I can't see how an aggressive compiler
> might have screwed this up. Being that x is a parameter, and the function
> add_op is not in a header file.

Ah...

Suppose that we have a compiler that uses profile-based feedback.
It compiles the kernel with profiling code that tracks the values of
pointers, whether function arguments or global variables. All it need
do is look for repeated values, and then track the fraction of time
that the value occurs. If a given value occurs (say) 99.999% of the
time, it might make sense for the compiler to simply guess the value
ahead of time. Even more to the point, if the compiler determines
that an existing register already has the correct value 99.999% of
the time, we can simply use that register, then check that the value
was correct.

This might appear as follows:

Reader:

void read_op(void)
{
        struct myops *p = global_p;

        while (p != NULL) {
                p->func();
                /* do stuff with r1 assuming r1==p->next. */
                r2 = p->next;
                if (r2 != r1)
                        /* compensate somehow for guessing wrong. */
        }
}

Insane? Probably so. But there are compiler guys who swear by it.
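
Rendered as ordinary C, the pattern being described has the same shape as the
r1/r2 check earlier in the thread. The function and parameter names below are
made up purely for illustration; a real compiler would emit this at the
instruction level rather than in the source:

/* Hypothetical hand-written rendering of compiler value speculation. */
struct foo {
        int a;
};

static int read_a_speculative(struct foo **global_pp,
                              struct foo *r1 /* profiled guess for *global_pp */)
{
        struct foo *r2;
        int ta;

        r2 = *global_pp;        /* the real load; on real hardware it may still be in flight */
        ta = r1->a;             /* speculative dereference using the guessed pointer */
        if (r1 != r2)
                ta = r2->a;     /* guessed wrong: redo the access with the real pointer */
        return ta;
}

int main(void)
{
        static struct foo x = { .a = 1 };
        struct foo *global_p = &x;

        /* single-threaded smoke test: the guess happens to be right here */
        return read_a_speculative(&global_p, &x) == 1 ? 0 : 1;
}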

Thanx, Paul

2008-02-05 06:57:45

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 02/22 -v7] Add basic support for gcc profiler instrumentation

On Mon, Feb 04, 2008 at 05:41:40PM -0500, Mathieu Desnoyers wrote:
> * Steven Rostedt ([email protected]) wrote:
> >
> > On Mon, 4 Feb 2008, Paul E. McKenney wrote:
> > > OK, will see what I can do...
> > >
> > > > On Sat, 2 Feb 2008, Paul E. McKenney wrote:
> > > >
> > > > > Yep, you have dependencies, so something like the following:
> > > > >
> > > > > initial state:
> > > > >
> > > > > struct foo {
> > > > > int a;
> > > > > };
> > > > > struct foo x = { 0 };
> > > > > struct foo y = { 0 };
> > > > > struct foo *global_p = &y;
> > > > > /* other variables are appropriately declared auto variables */
> > > > >
> > > > > /* No kmalloc() or kfree(), hence no RCU grace periods. */
> > > > > /* In the terminology of http://lwn.net/Articles/262464/, we */
> > > > > /* are doing only publish-subscribe, nothing else. */
> > > > >
> > > > > writer:
> > > > >
> > > > > x.a = 1;
> > > > > smp_wmb(); /* or smp_mb() */
> > > > > global_p = &x;
> > > > >
> > > > > reader:
> > > > >
> > > > > p = global_p;
> > > > > ta = p->a;
> > > > >
> > > > > Both Alpha and aggressive compiler optimizations can result in the reader
> > > > > seeing the new value of the pointer (&x) but the old value of the field
> > > > > (0). Strange but true. The fix is as follows:
> > > > >
> > > > > reader:
> > > > >
> > > > > p = global_p;
> > > > > smp_read_barrier_depends(); /* or use rcu_dereference() */
> > > > > ta = p->a;
> > > > >
> > > > > So how can this happen? First note that if smp_read_barrier_depends()
> > > > > was unnecessary in this case, it would be unnecessary in all cases.
> > > > >
> > > > > Second, let's start with the compiler. Suppose that a highly optimizing
> > > > > compiler notices that in almost all cases, the reader finds p==global_p.
> > > > > Suppose that this compiler also notices that one of the registers (say
> > > > > r1) almost always contains this expected value of global_p, and that
> > > > > cache pressure ensures that an actual load from global_p almost always
> > > > > generates an expensive cache miss. Such a compiler would be within its
> > > > > rights (as defined by the C standard) to generate code assuming that r1
> > > > > already had the right value, while also generating code to validate this
> > > > > assumption, perhaps as follows:
> > > > >
> > > > > r2 = global_p; /* high latency, other things complete meanwhile */
> > > > > ta == r1->a;
> > > > > if (r1 != r2)
> > > > > ta = r2->a;
> > > > >
> > > > > Now consider the following sequence of events on a superscalar CPU:
> > > >
> > > > I think you missed one step here (causing my confusion). I don't want to
> > > > assume so I'll try to put in the missing step:
> > > >
> > > > writer: r1 = p; /* happens to use r1 to store parameter p */
> > >
> > > You lost me on this one... The writer has only the following three steps:
> >
> > You're right. I meant "writer: r1 = x;"
> >
> > >
> > > writer:
> > >
> > > x.a = 1;
> > > smp_wmb(); /* or smp_mb() */
> > > global_p = &x;
> > >
> > > Where did the "r1 = p" come from? For that matter, where did "p" come
> > > from?
> > >
> > > > > reader: r2 = global_p; /* issued, has not yet completed. */
> > > > > reader: ta = r1->a; /* which gives zero. */
> > > > > writer: x.a = 1;
> > > > > writer: smp_wmb();
> > > > > writer: global_p = &x;
> > > > > reader: r2 = global_p; /* this instruction now completes */
> > > > > reader: if (r1 != r2) /* and these are equal, so we keep bad ta! */
> > > >
> > > > Is that the case?
> > >
> > > Ah! Please note that I am doing something unusual here in that I am
> > > working with global variables, as opposed to the normal RCU practice of
> > > dynamically allocating memory. So "x" is just a global struct, not a
> > > pointer to a struct.
> > >
> >
> > But lets look at a simple version of my original code anyway ;-)
> >
> > Writer:
> >
> > void add_op(struct myops *x) {
> > /* x->next may be garbage here */
> > x->next = global_p;
> > smp_wmb();
> > global_p = x;
> > }
> >
> > Reader:
> >
> > void read_op(void)
> > {
> > struct myops *p = global_p;
> >
> > while (p != NULL) {
> > p->func();
> > p = next;
> > /* if p->next is garbage we crash */
> > }
> > }
> >
> >
> > Here, we are missing the read_barrier_depends(). Lets look at the Alpha
> > cache issue:
> >
> >
> > reader reads the new version of global_p, and then reads the next
> > pointer. But since the next pointer is on a different cacheline than
> > global_p, it may have somehow had that in it's cache still. So it uses the
> > old next pointer which contains the garbage.
> >
> > Is that correct?
> >
> > But I will have to admit, that I can't see how an aggressive compiler
> > might have screwed this up. Being that x is a parameter, and the function
> > add_op is not in a header file.
> >
>
> Tell me if I am mistakened, but applying Paul's explanation to your
> example would give (I unroll the loop for clarity) :
>
> Writer:
>
> void add_op(struct myops *x) {
> /* x->next may be garbage here */
> x->next = global_p;
> smp_wmb();
> global_p = x;
> }
>
> Reader:
>
> void read_op(void)
> {
> struct myops *p = global_p;
>
> if (p != NULL) {
> p->func();
> p = p->next;
> /*
> * Suppose the compiler expects that p->next is likely to be equal to
> * p + sizeof(struct myops), uses r1 to store previous p, r2 to store the
> * next p and r3 to store the expected value. Let's look at what the
> * compiler could do for the next loop iteration.
> */
> r2 = r1->next (1)
> r3 = r1 + sizeof(struct myops)
> r4 = r3->func (2)
> if (r3 == r2 && r3 != NULL)
> call r4
>
> /* if p->next is garbage we crash */
> } else
> return;
>
> if (p != NULL) {
> p->func();
> p = p->next;
> /* if p->next is garbage we crash */
> } else
> return;
> .....
> }
>
> In this example, we would be reading the expected "r3->func" (2) before
> reading the real r1->next (1) value if reads are issued out of order.
>
> Paul, am I correct ? And.. does the specific loop optimization I just
> described actually exist ?

This is indeed another form of value prediction. Perhaps more common
in scientific applications, but one could imagine it occurring in the
kernel as well.

In some cases, the read from the real r1->next might be deferred until
after the computation so as to overlap the speculative computation with
the memory latency. Border-line insane, perhaps, but some compiler
folks like this sort of approach...

> Thanks for your enlightenment :)

;-)

Thanx, Paul