This is basic timer profiling for ppc, tested on the
2.5.62 linuxppc kernel. It's a port of the ppc64 code.
diff -Naurd orig62/arch/ppc/Kconfig hack62/arch/ppc/Kconfig
--- orig62/arch/ppc/Kconfig 2003-02-24 20:15:49.000000000 -0500
+++ hack62/arch/ppc/Kconfig 2003-03-07 00:22:41.000000000 -0500
@@ -1522,6 +1522,8 @@
source "lib/Kconfig"
+source "arch/ppc/oprofile/Kconfig"
+
menu "Kernel hacking"
config DEBUG_KERNEL
diff -Naurd orig62/arch/ppc/Makefile hack62/arch/ppc/Makefile
--- orig62/arch/ppc/Makefile 2003-02-24 20:12:40.000000000 -0500
+++ hack62/arch/ppc/Makefile 2003-03-07 00:22:41.000000000 -0500
@@ -49,6 +49,7 @@
core-$(CONFIG_MATH_EMULATION) += arch/ppc/math-emu/
core-$(CONFIG_XMON) += arch/ppc/xmon/
core-$(CONFIG_APUS) += arch/ppc/amiga/
+drivers-$(CONFIG_OPROFILE) += arch/ppc/oprofile/
drivers-$(CONFIG_8xx) += arch/ppc/8xx_io/
drivers-$(CONFIG_4xx) += arch/ppc/4xx_io/
drivers-$(CONFIG_8260) += arch/ppc/8260_io/
diff -Naurd orig62/arch/ppc/defconfig hack62/arch/ppc/defconfig
--- orig62/arch/ppc/defconfig 2003-02-24 20:15:14.000000000 -0500
+++ hack62/arch/ppc/defconfig 2003-03-07 00:22:41.000000000 -0500
@@ -1108,6 +1108,12 @@
# CONFIG_BT is not set
#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
# Library routines
#
# CONFIG_CRC32 is not set
diff -Naurd orig62/arch/ppc/kernel/time.c hack62/arch/ppc/kernel/time.c
--- orig62/arch/ppc/kernel/time.c 2003-02-24 20:15:00.000000000 -0500
+++ hack62/arch/ppc/kernel/time.c 2003-03-07 00:22:41.000000000 -0500
@@ -56,6 +56,7 @@
#include <linux/mc146818rtc.h>
#include <linux/time.h>
#include <linux/init.h>
+#include <linux/profile.h>
#include <asm/segment.h>
#include <asm/io.h>
@@ -105,17 +106,28 @@
return delta;
}
-extern unsigned long prof_cpu_mask;
-extern unsigned int * prof_buffer;
-extern unsigned long prof_len;
-extern unsigned long prof_shift;
-extern char _stext;
-
-static inline void ppc_do_profile (unsigned long nip)
+/*
+ * The profiling function is SMP safe. (nothing can mess
+ * around with "current", and the profiling counters are
+ * updated with atomic operations). This is especially
+ * useful with a profiling multiplier != 1
+ */
+static inline void ppc_do_profile(struct pt_regs *regs)
{
+ unsigned long nip;
+ extern unsigned long prof_cpu_mask;
+ extern char _stext;
+
+ profile_hook(regs);
+
+ if (user_mode(regs))
+ return;
+
if (!prof_buffer)
return;
+ nip = instruction_pointer(regs);
+
/*
* Only measure the CPUs specified by /proc/irq/prof_cpu_mask.
* (default is all CPUs.)
@@ -151,11 +163,10 @@
do_IRQ(regs);
irq_enter();
-
+
while ((next_dec = tb_ticks_per_jiffy - tb_delta(&jiffy_stamp)) < 0) {
jiffy_stamp += tb_ticks_per_jiffy;
- if (!user_mode(regs))
- ppc_do_profile(instruction_pointer(regs));
+ ppc_do_profile(regs);
if (smp_processor_id())
continue;
diff -Naurd orig62/arch/ppc/oprofile/Kconfig hack62/arch/ppc/oprofile/Kconfig
--- orig62/arch/ppc/oprofile/Kconfig 1969-12-31 19:00:00.000000000 -0500
+++ hack62/arch/ppc/oprofile/Kconfig 2003-03-07 00:22:41.000000000 -0500
@@ -0,0 +1,23 @@
+
+menu "Profiling support"
+ depends on EXPERIMENTAL
+
+config PROFILING
+ bool "Profiling support (EXPERIMENTAL)"
+ help
+ Say Y here to enable the extended profiling support mechanisms used
+ by profilers such as OProfile.
+
+
+config OPROFILE
+ tristate "OProfile system profiling (EXPERIMENTAL)"
+ depends on PROFILING
+ help
+ OProfile is a profiling system capable of profiling the
+ whole system, include the kernel, kernel modules, libraries,
+ and applications.
+
+ If unsure, say N.
+
+endmenu
+
diff -Naurd orig62/arch/ppc/oprofile/Makefile hack62/arch/ppc/oprofile/Makefile
--- orig62/arch/ppc/oprofile/Makefile 1969-12-31 19:00:00.000000000 -0500
+++ hack62/arch/ppc/oprofile/Makefile 2003-03-07 00:22:41.000000000 -0500
@@ -0,0 +1,8 @@
+obj-$(CONFIG_OPROFILE) += oprofile.o
+
+DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
+ oprof.o cpu_buffer.o buffer_sync.o \
+ event_buffer.o oprofile_files.o \
+ oprofilefs.o oprofile_stats.o )
+
+oprofile-y := $(DRIVER_OBJS) init.o timer_int.o
diff -Naurd orig62/arch/ppc/oprofile/init.c hack62/arch/ppc/oprofile/init.c
--- orig62/arch/ppc/oprofile/init.c 1969-12-31 19:00:00.000000000 -0500
+++ hack62/arch/ppc/oprofile/init.c 2003-03-07 00:22:41.000000000 -0500
@@ -0,0 +1,20 @@
+/**
+ * @file init.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <[email protected]>
+ */
+
+#include <linux/kernel.h>
+#include <linux/oprofile.h>
+#include <linux/init.h>
+
+extern void timer_init(struct oprofile_operations ** ops);
+
+int __init oprofile_arch_init(struct oprofile_operations ** ops)
+{
+ timer_init(ops);
+ return 0;
+}
diff -Naurd orig62/arch/ppc/oprofile/timer_int.c hack62/arch/ppc/oprofile/timer_int.c
--- orig62/arch/ppc/oprofile/timer_int.c 1969-12-31 19:00:00.000000000 -0500
+++ hack62/arch/ppc/oprofile/timer_int.c 2003-03-07 00:22:41.000000000 -0500
@@ -0,0 +1,59 @@
+/**
+ * @file timer_int.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <[email protected]>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/oprofile.h>
+#include <linux/profile.h>
+#include <asm/ptrace.h>
+
+static int timer_notify(struct notifier_block * self, unsigned long val, void * data)
+{
+ struct pt_regs * regs = (struct pt_regs *)data;
+ int cpu = smp_processor_id();
+ unsigned long pc = instruction_pointer(regs);
+ int is_kernel = !user_mode(regs);
+
+ oprofile_add_sample(pc, is_kernel, 0, cpu);
+ return 0;
+}
+
+
+static struct notifier_block timer_notifier = {
+ .notifier_call = timer_notify,
+};
+
+
+static int timer_start(void)
+{
+ return register_profile_notifier(&timer_notifier);
+}
+
+
+static void timer_stop(void)
+{
+ unregister_profile_notifier(&timer_notifier);
+}
+
+
+static struct oprofile_operations timer_ops = {
+ .start = timer_start,
+ .stop = timer_stop,
+ .cpu_type = "timer"
+};
+
+
+void __init timer_init(struct oprofile_operations ** ops)
+{
+ *ops = &timer_ops;
+ printk(KERN_INFO "oprofile: using timer interrupt.\n");
+}
On Fri, 2003-03-07 at 10:29, Albert D. Cahalan wrote:
> This is basic timer profiling for ppc, tested on the
> 2.5.62 linuxppc kernel. It's a port of the ppc64 code.
I'm sure I missed something... but I fail to see the the
interest in profiling based on sampling the instruction ptr
on a 100 Hz basis. This is way to slow to give any useful
results imho
Ben.
On Fri, 2003-03-07 at 05:13, Benjamin Herrenschmidt wrote:
> On Fri, 2003-03-07 at 10:29, Albert D. Cahalan wrote:
>> This is basic timer profiling for ppc, tested on the
>> 2.5.62 linuxppc kernel. It's a port of the ppc64 code.
>
> I'm sure I missed something... but I fail to see the the
> interest in profiling based on sampling the instruction ptr
> on a 100 Hz basis. This is way to slow to give any useful
> results imho
This is just the first part of the code. Please merge it
into any tree you have, unless it's obviously broken.
It is useful for long-running processes that don't do
much that is tied to the clock tick. (number crunching,
maybe X, web browsers without animations, /tmp cleaner...)
The i386 port is already using 1000 Hz in the kernel,
and has 100 Hz as a non-default option. I'd really like
to have this on my Mac; lots of things would improve.
I intend to allow sampling based on the performance counter
interrupt/trap/exception and the external interrupt signal.
On Fri, 2003-03-07 at 19:31, Albert Cahalan wrote:
> This is just the first part of the code. Please merge it
> into any tree you have, unless it's obviously broken.
> It is useful for long-running processes that don't do
> much that is tied to the clock tick. (number crunching,
> maybe X, web browsers without animations, /tmp cleaner...)
>
> The i386 port is already using 1000 Hz in the kernel,
> and has 100 Hz as a non-default option. I'd really like
> to have this on my Mac; lots of things would improve.
>
> I intend to allow sampling based on the performance counter
> interrupt/trap/exception and the external interrupt signal.
Ok. I'll ask paulus about merging this.
Beware though that some G4s have a nasty bug that prevents
using the performance counter interrupt (and the thermal interrupt
as well). The problem is that if any of those fall at the same
time as the DEC interrupt, the CPU messes up it's internal
state and you lose SRR0/SRR1, which means you can't recover
from the exception.
Note also that it should be relatively easy to have the DEC timer
run faster than HZ. The code in timer.c can deal with spurrious DEC
interrupts, so you may improve your results by just making it fire
at 1Khz or higher.
Ben.
On Sat, 2003-03-08 at 10:10, Benjamin Herrenschmidt wrote:
> On Fri, 2003-03-07 at 19:31, Albert Cahalan wrote:
>> This is just the first part of the code. Please merge it
>> into any tree you have, unless it's obviously broken.
>> It is useful for long-running processes that don't do
>> much that is tied to the clock tick. (number crunching,
>> maybe X, web browsers without animations, /tmp cleaner...)
>>
>> The i386 port is already using 1000 Hz in the kernel,
>> and has 100 Hz as a non-default option. I'd really like
>> to have this on my Mac; lots of things would improve.
>>
>> I intend to allow sampling based on the performance counter
>> interrupt/trap/exception and the external interrupt signal.
>
> Ok. I'll ask paulus about merging this.
>
> Beware though that some G4s have a nasty bug that prevents
> using the performance counter interrupt (and the thermal interrupt
> as well). The problem is that if any of those fall at the same
> time as the DEC interrupt, the CPU messes up it's internal
> state and you lose SRR0/SRR1, which means you can't recover
> from the exception.
Ouch. Motorola's description looks really suspicious.
The other interrupts "not affected by this errata"
might not suffer the 2-cycle MSR(EE) reset delay,
but they sure would interact with the broken ones.
MPC7400PNS.pdf doesn't list the bug; is a MPC7400 OK?
If not, perhaps you can send me some better info.
The decrementer isn't needed on systems with the
performance monitor. Simply require that one of the
PMCx registers count something like clock ticks, and
require that performance monitor interrupts be enabled.
Problem solved, eh?
> Note also that it should be relatively easy to have the
> DEC timer run faster than HZ. The code in timer.c can
> deal with spurrious DEC interrupts, so you may improve
> your results by just making it fire at 1Khz or higher.
How about this:
Bound the alarm clock (decrementer or an alternative)
setting such that it always fires between 10000 bus
cycles (safe number?) and 1/4000 second into the future.
Update jiffies purely based on the timebase register.
HZ is 1000. This ought to help with high-res timers.
If that special page at the top of user-space got
implemented (did it?), supply timebase frequency and
offset info there for non-SMP systems. (and for SMP
too if somebody cares to count the 60x/MPX bus cycles
involved in synchronizing timebase registers)
> Bound the alarm clock (decrementer or an alternative)
> setting such that it always fires between 10000 bus
> cycles (safe number?) and 1/4000 second into the future.
> Update jiffies purely based on the timebase register.
> HZ is 1000. This ought to help with high-res timers.
As I told you, current ppc timer.c should already bind on
timebase and so be safe with increased DEC irq frequency.
> If that special page at the top of user-space got
> implemented (did it?), supply timebase frequency and
> offset info there for non-SMP systems. (and for SMP
> too if somebody cares to count the 60x/MPX bus cycles
> involved in synchronizing timebase registers)
I don't think it is on PPC, though I'm still thinking about
it to also provide userspace with some stuff like atomic ops
etc... so that CPU specific workarounds like additional
sync's on 405gp can be left out of glibc.
I'll look for more detailed infos about the G4 bug, possibly
tomorrow or next week.
Ben.
Benjamin Herrenschmidt wrote:
> Beware though that some G4s have a nasty bug that prevents
> using the performance counter interrupt (and the thermal interrupt
> as well).
MPC7400 version 1.2 and lower have this problem.
> The problem is that if any of those fall at the same
> time as the DEC interrupt, the CPU messes up it's internal
> state and you lose SRR0/SRR1, which means you can't recover
> from the exception.
But the worst that happens is that you lose that process, isn't
it? Not all that big a problem, esp. since the window in which
this can happen is very small.
Segher
Benjamin Herrenschmidt wrote:
> I'll look for more detailed infos about the G4 bug, possibly
> tomorrow or next week.
It's mentioned in MPC7410CE.pdf, at least. I don't have the 7400
sheet, sorry.
Segher
On Sun, 2003-03-09 at 22:50, Segher Boessenkool wrote:
> Benjamin Herrenschmidt wrote:
>> Beware though that some G4s have a nasty bug that
>> prevents using the performance counter interrupt
>> (and the thermal interrupt as well).
>
> MPC7400 version 1.2 and lower have this problem.
MPC7410 you mean, right? Are those early revisions
even popular?
I'm wondering if the MPC7400 is also affected.
The MPC7400 has some significant differences.
The pipeline length changed.
>> The problem is that if any of those fall at the same
>> time as the DEC interrupt, the CPU messes up it's
>> internal state and you lose SRR0/SRR1, which means
>> you can't recover from the exception.
>
> But the worst that happens is that you lose that
> process, isn't it? Not all that big a problem,
> esp. since the window in which this can happen is
> very small.
I think you'd get an infinite loop of either
the decrementer or performance monitor. That's
mostly fixable by checking for the condition and
killing the affected process, but that process
could be one of the ones built into the kernel.
So the use of oprofile comes down to a choice:
a. Ignore the problem.
rare crashes
b. The decrementer goes much faster for profiling.
high overhead, awkwardness in non-time measurement
c. The performance monitor is used for clock ticks.
hard choices about sharing or frequency
Besides the obvious use of core cycles to generate
a clock tick out of the performance monitor, there
is the tbsel field in MMCR0. That has some strange
frequency choices. On a system with a 100 MHz bus,
it looks like one gets:
12.5 MHz, 49 kHz, 3 kHz, 191 Hz
So 3 kHz it is. That's 1526 Hz on a 50 MHz bus,
or 6104 Hz on a 200 MHz bus. This is enough to
get a 1000 Hz jiffies with reasonable jitter on
most machines, and a very good 100 Hz for user apps.
On Mon, 2003-03-10 at 04:50, Segher Boessenkool wrote:
> Benjamin Herrenschmidt wrote:
> > Beware though that some G4s have a nasty bug that prevents
> > using the performance counter interrupt (and the thermal interrupt
> > as well).
>
> MPC7400 version 1.2 and lower have this problem.
>
> > The problem is that if any of those fall at the same
> > time as the DEC interrupt, the CPU messes up it's internal
> > state and you lose SRR0/SRR1, which means you can't recover
> > from the exception.
>
> But the worst that happens is that you lose that process, isn't
> it? Not all that big a problem, esp. since the window in which
> this can happen is very small.
Well, you can also lose the kernel. We used to catch it with MOL
(since MOL increase the amount of DEC interrupts when running
within the virtual machine) quite easily.
Fortunately, the MOL kernel engine would then figure out that
something was wrong, bail out killing the emulator properly and
give useful error messages. So at that time, we could find the
problem before it beeing documented.
Ben.
On Mon, 2003-03-10 at 07:31, Albert Cahalan wrote:
> On Sun, 2003-03-09 at 22:50, Segher Boessenkool wrote:
> > Benjamin Herrenschmidt wrote:
>
> >> Beware though that some G4s have a nasty bug that
> >> prevents using the performance counter interrupt
> >> (and the thermal interrupt as well).
> >
> > MPC7400 version 1.2 and lower have this problem.
>
> MPC7410 you mean, right? Are those early revisions
> even popular?
>
> I'm wondering if the MPC7400 is also affected.
> The MPC7400 has some significant differences.
> The pipeline length changed.
7400 and 7410 are quite similar. I had the problem on
my old G4 which is a 7400 (I don't have it any more so
I can't tell you about the CPU rev).
> >> The problem is that if any of those fall at the same
> >> time as the DEC interrupt, the CPU messes up it's
> >> internal state and you lose SRR0/SRR1, which means
> >> you can't recover from the exception.
> >
> > But the worst that happens is that you lose that
> > process, isn't it? Not all that big a problem,
> > esp. since the window in which this can happen is
> > very small.
>
> I think you'd get an infinite loop of either
> the decrementer or performance monitor. That's
> mostly fixable by checking for the condition and
> killing the affected process, but that process
> could be one of the ones built into the kernel.
You can lose the kernel state as well
> So the use of oprofile comes down to a choice:
>
> a. Ignore the problem.
> rare crashes
Not that rare as soon as you increase the interrupt frequency
> b. The decrementer goes much faster for profiling.
> high overhead, awkwardness in non-time measurement
The overhead of a single DEC interrupt isn't _that_ high
> c. The performance monitor is used for clock ticks.
> hard choices about sharing or frequency
>
> Besides the obvious use of core cycles to generate
> a clock tick out of the performance monitor, there
> is the tbsel field in MMCR0. That has some strange
> frequency choices. On a system with a 100 MHz bus,
> it looks like one gets:
>
> 12.5 MHz, 49 kHz, 3 kHz, 191 Hz
>
> So 3 kHz it is. That's 1526 Hz on a 50 MHz bus,
> or 6104 Hz on a 200 MHz bus. This is enough to
> get a 1000 Hz jiffies with reasonable jitter on
> most machines, and a very good 100 Hz for user apps.
>
>
> ** Sent via the linuxppc-dev mail list. See http://lists.linuxppc.org/
--
Benjamin Herrenschmidt <[email protected]>
Albert Cahalan wrote:
> On Sun, 2003-03-09 at 22:50, Segher Boessenkool wrote:
>
>>Benjamin Herrenschmidt wrote:
>
>
>>>Beware though that some G4s have a nasty bug that
>>>prevents using the performance counter interrupt
>>>(and the thermal interrupt as well).
>>
>>MPC7400 version 1.2 and lower have this problem.
>
>
> MPC7410 you mean, right? Are those early revisions
> even popular?
7400 and 7410 core versions are identical, afaik. I don't
think any 7410 core lower than version 2.0 was ever used
in any consumer machines. ymmv.
> I'm wondering if the MPC7400 is also affected.
> The MPC7400 has some significant differences.
> The pipeline length changed.
Between 7400 and 7410? That's news to me...
>>>The problem is that if any of those fall at the same
>>>time as the DEC interrupt, the CPU messes up it's
>>>internal state and you lose SRR0/SRR1, which means
>>>you can't recover from the exception.
>>
>>But the worst that happens is that you lose that
>>process, isn't it? Not all that big a problem,
>>esp. since the window in which this can happen is
>>very small.
>
> I think you'd get an infinite loop of either
> the decrementer or performance monitor. That's
> mostly fixable by checking for the condition and
> killing the affected process, but that process
> could be one of the ones built into the kernel.
That would be a problem, yes :-(
> So the use of oprofile comes down to a choice:
>
> a. Ignore the problem.
> rare crashes
As long as its rare, that's not _too_ big of a problem,
really. Just document it ;)
> b. The decrementer goes much faster for profiling.
> high overhead, awkwardness in non-time measurement
Bad idea, I think.
> c. The performance monitor is used for clock ticks.
> hard choices about sharing or frequency
I'd go for this option.
Segher
On Monday, Mar 10, 2003, at 20:14 US/Central, Segher Boessenkool wrote:
> Albert Cahalan wrote:
>> On Sun, 2003-03-09 at 22:50, Segher Boessenkool wrote:
>>> Benjamin Herrenschmidt wrote:
>>>> Beware though that some G4s have a nasty bug that
>>>> prevents using the performance counter interrupt
>>>> (and the thermal interrupt as well).
>>>
>>> MPC7400 version 1.2 and lower have this problem.
>> MPC7410 you mean, right? Are those early revisions
>> even popular?
>
> 7400 and 7410 core versions are identical, afaik. I don't
> think any 7410 core lower than version 2.0 was ever used
> in any consumer machines. ymmv.
I've been looking into this, and all versions of the 7410 before 1.3
(where it was fixed) have this errata. And there is no version of the
7410 above 1.4. Some of the machines with 7410s, and all of the
machines with 7400s have this problem, I believe. If nothing else, it
is a security issue if user processes are allowed to configure the
counters (something that would be nice, in terms of useability).
>
>> I'm wondering if the MPC7400 is also affected.
>> The MPC7400 has some significant differences.
>> The pipeline length changed.
>
> Between 7400 and 7410? That's news to me...
There are no significant changes between the 7400 and 7410 pipelines,
the primary difference was the process in which it was fabricated. You
are probably thinking of the 7450 and its successors--the pipeline
changed in that model from 4 to 7 stages (depending on how one defines
"stage").
>
>>>> The problem is that if any of those fall at the same
>>>> time as the DEC interrupt, the CPU messes up it's
>>>> internal state and you lose SRR0/SRR1, which means
>>>> you can't recover from the exception.
>>>
>>> But the worst that happens is that you lose that
>>> process, isn't it? Not all that big a problem,
>>> esp. since the window in which this can happen is
>>> very small.
>> I think you'd get an infinite loop of either
>> the decrementer or performance monitor. That's
>> mostly fixable by checking for the condition and
>> killing the affected process, but that process
>> could be one of the ones built into the kernel.
>
> That would be a problem, yes :-(
>
>> So the use of oprofile comes down to a choice:
>> a. Ignore the problem.
>> rare crashes
>
> As long as its rare, that's not _too_ big of a problem,
> really. Just document it ;)
I suggest a modification of this behavior, which I'll describe at the
end of this email.
>
>> b. The decrementer goes much faster for profiling.
>> high overhead, awkwardness in non-time measurement
>
> Bad idea, I think.
>
>> c. The performance monitor is used for clock ticks.
>> hard choices about sharing or frequency
>
> I'd go for this option.
I don't think either of these are ideal. On most systems the
decrementer is used for generating timer interrupts used for
preemption, and other such fun. Messing around with this facility to
work around errata in the 7400 seems excessive. And locking down one
of the counters to only count cycles is undesireable: you would lose
the ability to count some events in most implementations of the
counters. As time goes on, the number of people wanting to tune
performance on 300-500MHz 7400/7410 processors will dwindle, but the
complications created by this workaround would haunt us forever.
As I see it, the problem is:
1) If the decrementer and perfmon interrupts occur one after the other
while a process is being profiled on some 7400/7410 processors, that
process's state (in terms of where it is in execution) will be lost.
This can be acceptable, since the PMI handler could detect such a
condition (a return address of 0x900 would be a good hint), and
terminate the offending program. Since nothing is harmed, you just try
again. As long as this behavior, and its cause, is documented (it
could even be detected by the module), this should be acceptable to
people with these processors.
2) If the same happens while in the kernel on one of those processors,
we have a kernel panic.
This is not, I think, acceptable behavior. Linux shouldn't crash.
However, this should only be a problem if the counters are on in
privileged space. If they don't increment when an interrupt occurs,
they can't cause a PMI. So the solution would be to disallow profiling
the kernel. However, some people want to profile the kernel, and those
processors should not be left out, if possible. What we can do,
though, is use timer based profiling for the kernel for only those
processors. The processors should be easy to detect. We just need to
make sure not to enable the PMI in the one condition (kernel is being
profiled one 7400/7410 processors before 7410 version 1.3).
Any thoughts on this solution?
Andy Fleming
PowerPC Software Enablement
Motorola, Inc
Note that my opinions are not Motorola's, even the good ones!
On Tue, 2003-03-11 at 16:54, Andrew Fleming wrote:
> On Monday, Mar 10, 2003, at 20:14 US/Central, Segher Boessenkool wrote:
>> Albert Cahalan wrote:
>>> On Sun, 2003-03-09 at 22:50, Segher Boessenkool wrote:
>>>> Benjamin Herrenschmidt wrote:
>>>>> Beware though that some G4s have a nasty bug that
>>>>> prevents using the performance counter interrupt
>>>>> (and the thermal interrupt as well).
>>>>
>>>> MPC7400 version 1.2 and lower have this problem.
>>>
>>> MPC7410 you mean, right? Are those early revisions
>>> even popular?
>>
>> 7400 and 7410 core versions are identical, afaik. I don't
>> think any 7410 core lower than version 2.0 was ever used
>> in any consumer machines. ymmv.
>
> I've been looking into this, and all versions of the 7410
> before 1.3 (where it was fixed) have this errata. And
> there is no version of the 7410 above 1.4. Some of the
> machines with 7410s, and all of the machines with 7400s
> have this problem, I believe. If nothing else, it is a
> security issue if user processes are allowed to configure
> the counters (something that would be nice, in terms of
> useability).
It would be nice if this bug were added to the notes
for the MPC7400 processor, if indeed it is present.
Even without this bug, I suspect oprofile is a major
security hazard. It lets you time things in the kernel.
Just set BAMR (to choose a kernel address) as desired,
and you can follow the jumps taken in crypto code, etc.
>>> I'm wondering if the MPC7400 is also affected.
>>> The MPC7400 has some significant differences.
>>> The pipeline length changed.
>>
>> Between 7400 and 7410? That's news to me...
>
> There are no significant changes between the 7400
> and 7410 pipelines, the primary difference was the
> process in which it was fabricated. You are probably
> thinking of the 7450 and its successors--the pipeline
> changed in that model from 4 to 7 stages (depending
> on how one defines "stage").
That's right. I keep thinking the MPC7410 got the
7-stage pipeline.
There's more than just a process difference though.
The version number is seriously different. It's not
just one bit changing to indicate a different process.
Here I am, with a version 2.9 chip:
cpu : 7400, altivec supported
temperature : 35-40 C (uncalibrated)
clock : 450MHz
revision : 2.9 (pvr 000c 0209)
>>> So the use of oprofile comes down to a choice:
>>> a. Ignore the problem.
>>> rare crashes
>>
>> As long as its rare, that's not _too_ big of
>> a problem, really. Just document it ;)
>
> I suggest a modification of this behavior, which
> I'll describe at the end of this email.
>
>>> b. The decrementer goes much faster for profiling.
>>> high overhead, awkwardness in non-time measurement
>>
>> Bad idea, I think.
>>
>>> c. The performance monitor is used for clock ticks.
>>> hard choices about sharing or frequency
>>
>> I'd go for this option.
>
> I don't think either of these are ideal. On most
> systems the decrementer is used for generating timer
> interrupts used for preemption, and other such fun.
> Messing around with this facility to work around
> errata in the 7400 seems excessive.
I have a 7400. I care deeply about this issue. :-)
If I could get a fanless (like the G4 Cube) system
with a newer processor, I might not care so much.
> And locking down one of the counters to only count
> cycles is undesireable: you would lose the ability
> to count some events in most implementations of the
> counters.
Any one of the counters would do; the event can be
moved around as needed. Also note the TBSEL bits in
MMCR0. TBSEL gives another way to get an interrupt,
without giving up any of the counters.
> As I see it, the problem is:
>
> 1) If the decrementer and perfmon interrupts occur
> one after the other while a process is being profiled
> on some 7400/7410 processors, that process's state
> (in terms of where it is in execution) will be lost.
>
> This can be acceptable, since the PMI handler could
> detect such a condition (a return address of 0x900
> would be a good hint), and terminate the offending
> program. Since nothing is harmed, you just try again.
> As long as this behavior, and its cause, is documented
> (it could even be detected by the module), this should
> be acceptable to people with these processors.
I'd really like to profile the kernel.
> 2) If the same happens while in the kernel on one
> of those processors, we have a kernel panic.
>
> This is not, I think, acceptable behavior. Linux
> shouldn't crash. However, this should only be a
> problem if the counters are on in privileged space.
> If they don't increment when an interrupt occurs,
> they can't cause a PMI.
Pardon me for being a pessimist. I have to imagine
that the counters don't turn off fast enough too.
Andrew Fleming writes:
>
> On Monday, Mar 10, 2003, at 20:14 US/Central, Segher Boessenkool wrote:
>
> > Albert Cahalan wrote:
> >> On Sun, 2003-03-09 at 22:50, Segher Boessenkool wrote:
> >>> Benjamin Herrenschmidt wrote:
> >>>> Beware though that some G4s have a nasty bug that
> >>>> prevents using the performance counter interrupt
> >>>> (and the thermal interrupt as well).
> >>>
> >>> MPC7400 version 1.2 and lower have this problem.
> >> MPC7410 you mean, right? Are those early revisions
> >> even popular?
> >
> > 7400 and 7410 core versions are identical, afaik. I don't
> > think any 7410 core lower than version 2.0 was ever used
> > in any consumer machines. ymmv.
>
> I've been looking into this, and all versions of the 7410 before 1.3
> (where it was fixed) have this errata. And there is no version of the
> 7410 above 1.4. Some of the machines with 7410s, and all of the
> machines with 7400s have this problem, I believe.
Is this bug restricted to 7400/7410 only, or does it affect the
750 (and relatives) and 604/604e too?
I'm thinking about ppc support for my perfctr driver, and whether
overflow interrupts are worth supporting or not given the errata.
/Mikael
On Tue, 2003-03-11 at 18:30, [email protected] wrote:
>>>>> Benjamin Herrenschmidt wrote:
>>>>>> Beware though that some G4s have a nasty bug that
>>>>>> prevents using the performance counter interrupt
>>>>>> (and the thermal interrupt as well).
...
> Is this bug restricted to 7400/7410 only, or does it
> affect the 750 (and relatives) and 604/604e too?
>
> I'm thinking about ppc support for my perfctr driver,
> and whether overflow interrupts are worth supporting
> or not given the errata.
604/604e doesn't even have performance monitoring AFAIK.
I've heard nothing to suggest that the 750 is affected.
I'll give you a hand; point me to the latest perfctr code
and explain how it is supposed to interact with oprofile.
>
> It would be nice if this bug were added to the notes
> for the MPC7400 processor, if indeed it is present.
>
> Even without this bug, I suspect oprofile is a major
> security hazard. It lets you time things in the kernel.
> Just set BAMR (to choose a kernel address) as desired,
> and you can follow the jumps taken in crypto code, etc.
Well, I can confirm that it is in the errata internally available in a
document marked 5/14/2001. I had difficulty finding any errata
listings for the 7400 on our external site, though.
>
> There's more than just a process difference though.
> The version number is seriously different. It's not
> just one bit changing to indicate a different process.
> Here I am, with a version 2.9 chip:
>
> cpu : 7400, altivec supported
> temperature : 35-40 C (uncalibrated)
> clock : 450MHz
> revision : 2.9 (pvr 000c 0209)
Yeah, the 7410 had some small microarchitectural and architectural
changes. Changes that mean it can't necessarily be used in the same
socket as the 7400. Votages were modified, and some memory interfacing
was changed. Also, it was a significantly different process (a major
shrink), so it gets a new name. The various versions of the 7400 are
all in the same sized process (with slightly different transistors, I
think).
>
> Any one of the counters would do; the event can be
> moved around as needed. Also note the TBSEL bits in
> MMCR0. TBSEL gives another way to get an interrupt,
> without giving up any of the counters.
True. But I'm going to have to think about this one more, before I
agree with you. :)
>
> Pardon me for being a pessimist. I have to imagine
> that the counters don't turn off fast enough too.
I'm fairly certain they do shut off the instant the interrupt happens
if they aren't supposed to count privileged events, but I'd have to do
some testing to prove it. The issue is that MSR[EE] takes 2 cycles to
be written.
>> Is this bug restricted to 7400/7410 only, or does it
>> affect the 750 (and relatives) and 604/604e too?
>>
>> I'm thinking about ppc support for my perfctr driver,
>> and whether overflow interrupts are worth supporting
>> or not given the errata.
>
> 604/604e doesn't even have performance monitoring AFAIK.
> I've heard nothing to suggest that the 750 is affected.
>
> I'll give you a hand; point me to the latest perfctr code
> and explain how it is supposed to interact with oprofile.
The 604 and 604e both have performance monitors. I can't find anything
which says that this is a bug in them. However, being as this bug
managed to creep into the 7400/7410, I'm not willing to say that the
bug didn't exist in all of those processors. More testing would be
necessary.
BTW, I am also interested in helping out with this code. Meanwhile,
I'll attempt to find the answers to those questions.
Andy Fleming
PowerPC Software Enablement
Motorola, Inc
Note that my opinions are not Motorola's, even the good ones!
Albert Cahalan writes:
> > Is this bug restricted to 7400/7410 only, or does it
> > affect the 750 (and relatives) and 604/604e too?
> >
> > I'm thinking about ppc support for my perfctr driver,
> > and whether overflow interrupts are worth supporting
> > or not given the errata.
>
> 604/604e doesn't even have performance monitoring AFAIK.
Yes they do. 604 has two counters, 604e has four.
> I've heard nothing to suggest that the 750 is affected.
I seem to recall hearing something about some temperature
monitoring interrupt interacting badly with the performance
monitor interupt due to an errata, but that may not have been
the 750.
> I'll give you a hand; point me to the latest perfctr code
> and explain how it is supposed to interact with oprofile.
They're not supposed to interact, but there is currently no
mechanism in place for preventing both from being activated
at the same time. What's needed is some form of kernel API
for reserving and releasing the performance counter hardware,
and updating oprofile to use that API.
On Fri, 2003-03-07 at 03:29, Albert D. Cahalan wrote:
> This is basic timer profiling for ppc, tested on the
> 2.5.62 linuxppc kernel. It's a port of the ppc64 code.
I can't seem to find profile_hook() anywhere in the kernel source. Did
it get implemented? It looks like the ppc64 code could be copied
directly. Without it, the kernel as patched doesn't build. Well, at
least the 2.5.59 kernel doesn't have it. Is it in the 2.5.62 kernel?
Andy Fleming