Due to restriction and specifics of Netburst PMU we need a separated
event for NMI watchdog. In particular every Netburst event consume not
just a counter and config register, but also an additional ESCR register.
Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
for some event there is no room for another event to enter the room until
it's released) we need to pick up "least" used ESCR (or most available)
for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
Note that on all other PMUs which support relocation of events between
counters this event is a simple alias for PERF_COUNT_HW_CPU_CYCLES.
v2: Add a comment about non-sleeping clockticks.
N.B: An attempts to make an alternate encodings for events didn't make
situation better because we would need to track how exactly we substitute
the particular event -- hw::config knows nothing from where the event came,
from user-space as a raw event or as pre-configured general event. If it
comes as raw event we have to track every single bit of ESCR mask and find
out if new event would count exactly the same thing as the former event
was supposed to. So I found such way inconvenient for users and adding a
single code snippet seems to be a way more clean approach.
Signed-off-by: Cyrill Gorcunov <[email protected]>
Acked-by: Don Zickus <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Lin Ming <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Arnaldo Carvalho de Melo <[email protected]>
CC: Frederic Weisbecker <[email protected]>
---
To PeterZ: Peter, I've tried various ways to implement an alternate encoding
(Don even tried one which didn't work because of ESCR conflict ;) but all them
introduced much code which makes the whole picture more complex I think and
there is no 1:1 map between even single event (initially I thought we have
something but eventually found they are not). So even new NMI-WATCHDOG event
is *not* the same as "power events" were before but they are not supposed to
be "exactly" precise in compare with cpu-clocks we use for perf top. So I think
it's acceptable trade off -- less precise events for nmi-watchdog and more
precise for perf top and friends.
Don, I put your Ack here because the only thing I've changed (in compare with
previous tested verion) is PERF_COUNT_HW_NMI_WATCHDOG = 8 (was 7 before), please
re-test it again, I've tested it already but still.
Comments are welcome as usuall ;)
arch/x86/kernel/cpu/perf_event_amd.c | 1 +
arch/x86/kernel/cpu/perf_event_intel.c | 1 +
arch/x86/kernel/cpu/perf_event_p4.c | 18 ++++++++++++++++++
arch/x86/kernel/cpu/perf_event_p6.c | 1 +
include/linux/perf_event.h | 1 +
kernel/watchdog.c | 2 +-
6 files changed, 23 insertions(+), 1 deletion(-)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
=====================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_amd.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
@@ -102,6 +102,7 @@ static const u64 amd_perfmon_event_map[]
[PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_NMI_WATCHDOG] = 0x0076,
};
static u64 amd_pmu_event_map(int hw_event)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
=====================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_intel.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
@@ -34,6 +34,7 @@ static u64 intel_perfmon_event_map[PERF_
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ [PERF_COUNT_HW_NMI_WATCHDOG] = 0x003c,
};
static struct event_constraint intel_core_event_constraints[] __read_mostly =
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
=====================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -607,6 +607,24 @@ static u64 p4_general_events[PERF_COUNT_
P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+
+ /*
+ * This is a specific way to count non-halted clockticks as SDM Vol.3B
+ * "30.11.2 Non-Sleep Clockticks" suggest. We set threshold and complement
+ * flag as result every tick is accounted and delivered to the counter.
+ */
+ [PERF_COUNT_HW_NMI_WATCHDOG] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
};
static struct p4_event_bind *p4_config_get_bind(u64 config)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
=====================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p6.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
@@ -12,6 +12,7 @@ static const u64 p6_perfmon_event_map[]
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
[PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+ [PERF_COUNT_HW_NMI_WATCHDOG] = 0x0079,
};
static u64 p6_pmu_event_map(int hw_event)
Index: linux-2.6.git/include/linux/perf_event.h
=====================================================================
--- linux-2.6.git.orig/include/linux/perf_event.h
+++ linux-2.6.git/include/linux/perf_event.h
@@ -53,6 +53,7 @@ enum perf_hw_id {
PERF_COUNT_HW_BRANCH_MISSES = 5,
PERF_COUNT_HW_BUS_CYCLES = 6,
PERF_COUNT_HW_STALLED_CYCLES = 7,
+ PERF_COUNT_HW_NMI_WATCHDOG = 8,
PERF_COUNT_HW_MAX, /* non-ABI */
};
Index: linux-2.6.git/kernel/watchdog.c
=====================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -191,7 +191,7 @@ static int is_softlockup(unsigned long t
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
+ .config = PERF_COUNT_HW_NMI_WATCHDOG,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
--
Cyrill
On Thu, Apr 28, 2011 at 07:37:25PM +0400, Cyrill Gorcunov wrote:
> Due to restriction and specifics of Netburst PMU we need a separated
> event for NMI watchdog. In particular every Netburst event consume not
> just a counter and config register, but also an additional ESCR register.
> Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
> for some event there is no room for another event to enter the room until
> it's released) we need to pick up "least" used ESCR (or most available)
> for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
>
> Note that on all other PMUs which support relocation of events between
> counters this event is a simple alias for PERF_COUNT_HW_CPU_CYCLES.
Cyril,
What ever happened with this patch?
Cheers,
Don
>
> v2: Add a comment about non-sleeping clockticks.
>
> N.B: An attempts to make an alternate encodings for events didn't make
> situation better because we would need to track how exactly we substitute
> the particular event -- hw::config knows nothing from where the event came,
> from user-space as a raw event or as pre-configured general event. If it
> comes as raw event we have to track every single bit of ESCR mask and find
> out if new event would count exactly the same thing as the former event
> was supposed to. So I found such way inconvenient for users and adding a
> single code snippet seems to be a way more clean approach.
>
> Signed-off-by: Cyrill Gorcunov <[email protected]>
> Acked-by: Don Zickus <[email protected]>
> CC: Ingo Molnar <[email protected]>
> CC: Lin Ming <[email protected]>
> CC: Peter Zijlstra <[email protected]>
> CC: Arnaldo Carvalho de Melo <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> ---
>
> To PeterZ: Peter, I've tried various ways to implement an alternate encoding
> (Don even tried one which didn't work because of ESCR conflict ;) but all them
> introduced much code which makes the whole picture more complex I think and
> there is no 1:1 map between even single event (initially I thought we have
> something but eventually found they are not). So even new NMI-WATCHDOG event
> is *not* the same as "power events" were before but they are not supposed to
> be "exactly" precise in compare with cpu-clocks we use for perf top. So I think
> it's acceptable trade off -- less precise events for nmi-watchdog and more
> precise for perf top and friends.
>
> Don, I put your Ack here because the only thing I've changed (in compare with
> previous tested verion) is PERF_COUNT_HW_NMI_WATCHDOG = 8 (was 7 before), please
> re-test it again, I've tested it already but still.
>
> Comments are welcome as usuall ;)
>
> arch/x86/kernel/cpu/perf_event_amd.c | 1 +
> arch/x86/kernel/cpu/perf_event_intel.c | 1 +
> arch/x86/kernel/cpu/perf_event_p4.c | 18 ++++++++++++++++++
> arch/x86/kernel/cpu/perf_event_p6.c | 1 +
> include/linux/perf_event.h | 1 +
> kernel/watchdog.c | 2 +-
> 6 files changed, 23 insertions(+), 1 deletion(-)
>
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
> =====================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_amd.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
> @@ -102,6 +102,7 @@ static const u64 amd_perfmon_event_map[]
> [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
> [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
> [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
> + [PERF_COUNT_HW_NMI_WATCHDOG] = 0x0076,
> };
>
> static u64 amd_pmu_event_map(int hw_event)
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
> =====================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_intel.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -34,6 +34,7 @@ static u64 intel_perfmon_event_map[PERF_
> [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
> [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
> [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
> + [PERF_COUNT_HW_NMI_WATCHDOG] = 0x003c,
> };
>
> static struct event_constraint intel_core_event_constraints[] __read_mostly =
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> =====================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> @@ -607,6 +607,24 @@ static u64 p4_general_events[PERF_COUNT_
> P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
> P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
> p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
> +
> + /*
> + * This is a specific way to count non-halted clockticks as SDM Vol.3B
> + * "30.11.2 Non-Sleep Clockticks" suggest. We set threshold and complement
> + * flag as result every tick is accounted and delivered to the counter.
> + */
> + [PERF_COUNT_HW_NMI_WATCHDOG] =
> + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
> + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
> + P4_CCCR_COMPARE),
> };
>
> static struct p4_event_bind *p4_config_get_bind(u64 config)
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
> =====================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p6.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
> @@ -12,6 +12,7 @@ static const u64 p6_perfmon_event_map[]
> [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
> [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
> [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
> + [PERF_COUNT_HW_NMI_WATCHDOG] = 0x0079,
> };
>
> static u64 p6_pmu_event_map(int hw_event)
> Index: linux-2.6.git/include/linux/perf_event.h
> =====================================================================
> --- linux-2.6.git.orig/include/linux/perf_event.h
> +++ linux-2.6.git/include/linux/perf_event.h
> @@ -53,6 +53,7 @@ enum perf_hw_id {
> PERF_COUNT_HW_BRANCH_MISSES = 5,
> PERF_COUNT_HW_BUS_CYCLES = 6,
> PERF_COUNT_HW_STALLED_CYCLES = 7,
> + PERF_COUNT_HW_NMI_WATCHDOG = 8,
>
> PERF_COUNT_HW_MAX, /* non-ABI */
> };
> Index: linux-2.6.git/kernel/watchdog.c
> =====================================================================
> --- linux-2.6.git.orig/kernel/watchdog.c
> +++ linux-2.6.git/kernel/watchdog.c
> @@ -191,7 +191,7 @@ static int is_softlockup(unsigned long t
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> static struct perf_event_attr wd_hw_attr = {
> .type = PERF_TYPE_HARDWARE,
> - .config = PERF_COUNT_HW_CPU_CYCLES,
> + .config = PERF_COUNT_HW_NMI_WATCHDOG,
> .size = sizeof(struct perf_event_attr),
> .pinned = 1,
> .disabled = 1,
>
> --
> Cyrill
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Tue, Jun 21, 2011 at 7:23 PM, Don Zickus <[email protected]> wrote:
> On Thu, Apr 28, 2011 at 07:37:25PM +0400, Cyrill Gorcunov wrote:
>> Due to restriction and specifics of Netburst PMU we need a separated
>> event for NMI watchdog. In particular every Netburst event consume not
>> just a counter and config register, but also an additional ESCR register.
>> Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
>> for some event there is no room for another event to enter the room until
>> it's released) we need to pick up "least" used ESCR (or most available)
>> for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
>>
>> Note that on all other PMUs which support relocation of events between
>> counters this event is a simple alias for PERF_COUNT_HW_CPU_CYCLES.
>
> Cyril,
>
> What ever happened with this patch?
>
> Cheers,
> Don
>
Well, it's flowing around, I didn't get any more Ack's other than your. Since it
introduces kind of abi I presume more ack's would be welcome before it
(possibly) get merged.
On Tue, 2011-06-21 at 19:40 +0400, Cyrill Gorcunov wrote:
> On Tue, Jun 21, 2011 at 7:23 PM, Don Zickus <[email protected]> wrote:
> > On Thu, Apr 28, 2011 at 07:37:25PM +0400, Cyrill Gorcunov wrote:
> >> Due to restriction and specifics of Netburst PMU we need a separated
> >> event for NMI watchdog. In particular every Netburst event consume not
> >> just a counter and config register, but also an additional ESCR register.
> >> Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
> >> for some event there is no room for another event to enter the room until
> >> it's released) we need to pick up "least" used ESCR (or most available)
> >> for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
> >>
> >> Note that on all other PMUs which support relocation of events between
> >> counters this event is a simple alias for PERF_COUNT_HW_CPU_CYCLES.
> >
> > Cyril,
> >
> > What ever happened with this patch?
> >
> > Cheers,
> > Don
> >
>
> Well, it's flowing around, I didn't get any more Ack's other than your. Since it
> introduces kind of abi I presume more ack's would be welcome before it
> (possibly) get merged.
we could avoid that abi thing by simply not making it an exposed event
Its not like userspace ever should be using it anyway
Ok, I'll take a look (sorry for top posting, ISP problem)
On Tuesday, June 21, 2011, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2011-06-21 at 19:40 +0400, Cyrill Gorcunov wrote:
>> On Tue, Jun 21, 2011 at 7:23 PM, Don Zickus <[email protected]> wrote:
>> > On Thu, Apr 28, 2011 at 07:37:25PM +0400, Cyrill Gorcunov wrote:
>> >> Due to restriction and specifics of Netburst PMU we need a separated
>> >> event for NMI watchdog. In particular every Netburst event consume not
>> >> just a counter and config register, but also an additional ESCR register.
>> >> Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
>> >> for some event there is no room for another event to enter the room until
>> >> it's released) we need to pick up "least" used ESCR (or most available)
>> >> for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
>> >>
>> >> Note that on all other PMUs which support relocation of events between
>> >> counters this event is a simple alias for PERF_COUNT_HW_CPU_CYCLES.
>> >
>> > Cyril,
>> >
>> > What ever happened with this patch?
>> >
>> > Cheers,
>> > Don
>> >
>>
>> Well, it's flowing around, I didn't get any more Ack's other than your. Since it
>> introduces kind of abi I presume more ack's would be welcome before it
>> (possibly) get merged.
>
> we could avoid that abi thing by simply not making it an exposed event
>
> Its not like userspace ever should be using it anyway
>
Cyrill,
[repost because of MIME crap]
I admit I don't quite understand how this patch works around
the limitation. In the end you are still going to program some cycle
event into a P4 PMU register. So how it is going to free more counters
for regular users?
On Tue, Jun 21, 2011 at 6:13 PM, Cyrill Gorcunov <[email protected]> wrote:
>
> Ok, I'll take a look (sorry for top posting, ISP problem)
>
> On Tuesday, June 21, 2011, Peter Zijlstra <[email protected]> wrote:
> > On Tue, 2011-06-21 at 19:40 +0400, Cyrill Gorcunov wrote:
> >> On Tue, Jun 21, 2011 at 7:23 PM, Don Zickus <[email protected]> wrote:
> >> > On Thu, Apr 28, 2011 at 07:37:25PM +0400, Cyrill Gorcunov wrote:
> >> >> Due to restriction and specifics of Netburst PMU we need a separated
> >> >> event for NMI watchdog. In particular every Netburst event consume not
> >> >> just a counter and config register, but also an additional ESCR register.
> >> >> Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
> >> >> for some event there is no room for another event to enter the room until
> >> >> it's released) we need to pick up "least" used ESCR (or most available)
> >> >> for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
> >> >>
> >> >> Note that on all other PMUs which support relocation of events between
> >> >> counters this event is a simple alias for PERF_COUNT_HW_CPU_CYCLES.
> >> >
> >> > Cyril,
> >> >
> >> > What ever happened with this patch?
> >> >
> >> > Cheers,
> >> > Don
> >> >
> >>
> >> Well, it's flowing around, I didn't get any more Ack's other than your. Since it
> >> introduces kind of abi I presume more ack's would be welcome before it
> >> (possibly) get merged.
> >
> > we could avoid that abi thing by simply not making it an exposed event
> >
> > Its not like userspace ever should be using it anyway
> >
On Tue, Jun 21, 2011 at 06:20:56PM +0200, Stephane Eranian wrote:
> Cyrill,
> [repost because of MIME crap]
I saw first as well, sorry for delay, my isp is broken today :/
>
> I admit I don't quite understand how this patch works around
> the limitation. In the end you are still going to program some cycle
> event into a P4 PMU register. So how it is going to free more counters
> for regular users?
>
>
The key here is that we use that named non-sleeping ticks (as oprofile
did) for nmi-watchdog and it allows us to free "cpu-cycles" counter for
user needs. Of course we pick up one counter for this but it doesn't intersect
with "cpu-cycles" counter (because counters are grouped and can count only
specified events in each group). Stepane, should I post more details?
Cyrill
On Tue, Jun 21, 2011 at 08:48:20PM +0400, Cyrill Gorcunov wrote:
...
>
> The key here is that we use that named non-sleeping ticks (as oprofile
> did) for nmi-watchdog and it allows us to free "cpu-cycles" counter for
> user needs. Of course we pick up one counter for this but it doesn't intersect
> with "cpu-cycles" counter (because counters are grouped and can count only
> specified events in each group). Stepane, should I post more details?
>
> Cyrill
Stephane, I think I have a shorter explanation -- the counter is still borrowed
(there is no magic ;) but this counter is less loaded than anything else (in
terms of restrictions) so since CPU cycles is a way more popular now it's allowed
to count them simultaneously with nmi-watchdog, but again -- this means
another set of events can't work and if (for some reason) user needs this counter
he has to disable nmi-watchdog.
Cyrill
On Tue, Jun 21, 2011 at 6:48 PM, Cyrill Gorcunov <[email protected]> wrote:
> On Tue, Jun 21, 2011 at 06:20:56PM +0200, Stephane Eranian wrote:
>> Cyrill,
>> [repost because of MIME crap]
>
> I saw first as well, sorry for delay, my isp is broken today :/
>
>>
>> I admit I don't quite understand how this patch works around
>> the limitation. In the end you are still going to program some cycle
>> event into a P4 PMU register. So how it is going to free more counters
>> for regular users?
>>
>>
>
> The key here is that we use that named non-sleeping ticks (as oprofile
> did) for nmi-watchdog and it allows us to free "cpu-cycles" counter for
> user needs. Of course we pick up one counter for this but it doesn't intersect
> with "cpu-cycles" counter (because counters are grouped and can count only
> specified events in each group). Stepane, should I post more details?
>
Ok, so you're using a different PMU event for the watchdog. But then, in this
case why not simply change kernel/watchdog.c to hardcode something specific
for P4, i.e, change the wd_hw_attr.config/type fields to match the
event you want.
You could create a arch specific callback to setup config/type.
I don't think it makes sense to expose yet another generic PMU event, especially
given the name you gave to it and what it actually does. People might wonder
what good is that for?
On Tue, Jun 21, 2011 at 07:10:05PM +0200, Stephane Eranian wrote:
...
> >
> > ?The key here is that we use that named non-sleeping ticks (as oprofile
> > did) for nmi-watchdog and it allows us to free "cpu-cycles" counter for
> > user needs. Of course we pick up one counter for this but it doesn't intersect
> > with "cpu-cycles" counter (because counters are grouped and can count only
> > specified events in each group). Stepane, should I post more details?
> >
> Ok, so you're using a different PMU event for the watchdog. But then, in this
> case why not simply change kernel/watchdog.c to hardcode something specific
> for P4, i.e, change the wd_hw_attr.config/type fields to match the
> event you want.
It doesn't change things much. Of course I can put some specifics into
.config but where is the guarantee some new generic event would not ever
intersect with it. I know (for example) we could reserve -1ULL for this
event but again where is the guarantee that it will never ever be used
system wide in future for some different event?
So I need some event-id in which I may be sure it will never be
used for different purpose.
> You could create a arch specific callback to setup config/type.
>
> I don't think it makes sense to expose yet another generic PMU event, especially
> given the name you gave to it and what it actually does. People might wonder
> what good is that for?
Cyrill
On Tue, Jun 21, 2011 at 7:23 PM, Cyrill Gorcunov <[email protected]> wrote:
> On Tue, Jun 21, 2011 at 07:10:05PM +0200, Stephane Eranian wrote:
> ...
>> >
>> > The key here is that we use that named non-sleeping ticks (as oprofile
>> > did) for nmi-watchdog and it allows us to free "cpu-cycles" counter for
>> > user needs. Of course we pick up one counter for this but it doesn't intersect
>> > with "cpu-cycles" counter (because counters are grouped and can count only
>> > specified events in each group). Stepane, should I post more details?
>> >
>> Ok, so you're using a different PMU event for the watchdog. But then, in this
>> case why not simply change kernel/watchdog.c to hardcode something specific
>> for P4, i.e, change the wd_hw_attr.config/type fields to match the
>> event you want.
>
> It doesn't change things much. Of course I can put some specifics into
> .config but where is the guarantee some new generic event would not ever
> intersect with it. I know (for example) we could reserve -1ULL for this
> event but again where is the guarantee that it will never ever be used
> system wide in future for some different event?
>
I am not talking about a new generic PMU event. I am talking about
you hardcoding a raw event in the callback: type = PERF_TYPE_RAW,
config=0x003c.
> So I need some event-id in which I may be sure it will never be
> used for different purpose.
>
>> You could create a arch specific callback to setup config/type.
>>
>> I don't think it makes sense to expose yet another generic PMU event, especially
>> given the name you gave to it and what it actually does. People might wonder
>> what good is that for?
>
> Cyrill
>
On Tue, Jun 21, 2011 at 07:26:16PM +0200, Stephane Eranian wrote:
...
> >
> > ?It doesn't change things much. Of course I can put some specifics into
> > .config but where is the guarantee some new generic event would not ever
> > intersect with it. I know (for example) we could reserve -1ULL for this
> > event but again where is the guarantee that it will never ever be used
> > system wide in future for some different event?
> >
> I am not talking about a new generic PMU event. I am talking about
> you hardcoding a raw event in the callback: type = PERF_TYPE_RAW,
> config=0x003c.
>
watchdog.c is system-wide and even if it's not used by archs other
than x86 doesn't mean it'll not in future ;) So we could encode
this watchdog event as you mention (and I would check for this
values and map it internally to bits I need, this btw will require me
to check if this bits do not intersect with some already valid bits
combination but it's a different problem) but than we _must_ be sure it
never bring problems as new arch or consumer appear. So, no, I don't
like this approach, it's fragile. But gimme some time -- I'll re-check
all bits, probably there is some workaround.
Cyrill
On Tue, Jun 21, 2011 at 7:54 PM, Cyrill Gorcunov <[email protected]> wrote:
> On Tue, Jun 21, 2011 at 07:26:16PM +0200, Stephane Eranian wrote:
> ...
>> >
>> > It doesn't change things much. Of course I can put some specifics into
>> > .config but where is the guarantee some new generic event would not ever
>> > intersect with it. I know (for example) we could reserve -1ULL for this
>> > event but again where is the guarantee that it will never ever be used
>> > system wide in future for some different event?
>> >
>> I am not talking about a new generic PMU event. I am talking about
>> you hardcoding a raw event in the callback: type = PERF_TYPE_RAW,
>> config=0x003c.
>>
>
> watchdog.c is system-wide and even if it's not used by archs other
> than x86 doesn't mean it'll not in future ;) So we could encode
> this watchdog event as you mention (and I would check for this
> values and map it internally to bits I need, this btw will require me
> to check if this bits do not intersect with some already valid bits
> combination but it's a different problem) but than we _must_ be sure it
> never bring problems as new arch or consumer appear. So, no, I don't
> like this approach, it's fragile. But gimme some time -- I'll re-check
> all bits, probably there is some workaround.
>
I don't understand what possible conflict bits you're talking about
If in arch/x86/kernel/cpu/perf_event.c I add a callback which
checks:
if (netburst_host) {
attr.type = PERF_TYPE_RAW
attr.config = 0x3c
} else {
attr.type = PERF_TYPE_HARDWARE
attr.config = PERF_COUNT_HW__CPU_CYCLES
}
I simply don't think PERF_COUNT_HW_NMI_WATCHDOG should
ever be exposed to users.
> Cyrill
>
On Tue, Jun 21, 2011 at 08:06:54PM +0200, Stephane Eranian wrote:
...
>
> If in arch/x86/kernel/cpu/perf_event.c I add a callback which
> checks:
>
> if (netburst_host) {
> attr.type = PERF_TYPE_RAW
> attr.config = 0x3c
> } else {
> attr.type = PERF_TYPE_HARDWARE
> attr.config = PERF_COUNT_HW__CPU_CYCLES
> }
>
> I simply don't think PERF_COUNT_HW_NMI_WATCHDOG should
> ever be exposed to users.
yes, and peterz pointed it out as well, i agree of course.
>
So, you mean to put it into x86_pmu_event_init, right? How would
we know if this event comes from watchdog.c or not? And what if
user wanted to pass _exactly_ this crazy values to RAW event but
I will threat it as a sign for nmi-watchdog event and will map
it internally to P4_EVENT_EXECUTION_EVENT?
Don't get me wrong, Stephane, this approach will work and it's
not a problem (actually I think I would put -1ULL into config
since it's kinda "icorrect" and can be treated as special one)
but I think it introduce more complexity, it should not be like
this.
Gimme some time, probably I invent something more convenient.
Cyrill
On Tue, Jun 21, 2011 at 10:32:27PM +0400, Cyrill Gorcunov wrote:
> On Tue, Jun 21, 2011 at 08:06:54PM +0200, Stephane Eranian wrote:
> ...
> >
> > If in arch/x86/kernel/cpu/perf_event.c I add a callback which
> > checks:
> >
> > if (netburst_host) {
> > attr.type = PERF_TYPE_RAW
> > attr.config = 0x3c
> > } else {
> > attr.type = PERF_TYPE_HARDWARE
> > attr.config = PERF_COUNT_HW__CPU_CYCLES
> > }
> >
> > I simply don't think PERF_COUNT_HW_NMI_WATCHDOG should
> > ever be exposed to users.
>
> yes, and peterz pointed it out as well, i agree of course.
>
What about something like below?
Cyrill
---
arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++
arch/x86/kernel/cpu/perf_event_amd.c | 1 +
arch/x86/kernel/cpu/perf_event_intel.c | 2 ++
arch/x86/kernel/cpu/perf_event_p4.c | 27 +++++++++++++++++++++++++++
arch/x86/kernel/cpu/perf_event_p6.c | 1 +
include/linux/perf_event.h | 6 ++++++
kernel/watchdog.c | 2 +-
7 files changed, 53 insertions(+), 1 deletion(-)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
@@ -233,6 +233,7 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*hw_watchdog_config)(struct perf_event *event);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -413,6 +414,17 @@ static int x86_pmu_extra_regs(u64 config
return 0;
}
+static void x86_pmu_hw_watchdog_config(struct perf_event *event)
+{
+ /*
+ * On most x86 architectures watchdog cycles
+ * are the same as cpu cycles.
+ */
+ if (event->attr.type == PERF_TYPE_HARDWARE &&
+ event->attr.config == PERF_COUNT_HW_NMI_WATCHDOG)
+ event->attr.config = PERF_COUNT_HW_CPU_CYCLES;
+}
+
static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
@@ -706,6 +718,9 @@ static int __x86_pmu_event_init(struct p
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
+ if (x86_pmu.hw_watchdog_config)
+ x86_pmu.hw_watchdog_config(event);
+
return x86_pmu.hw_config(event);
}
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_amd.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
@@ -372,6 +372,7 @@ static __initconst const struct x86_pmu
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_intel.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1213,6 +1213,7 @@ static __initconst const struct x86_pmu
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
@@ -1298,6 +1299,7 @@ static __initconst const struct x86_pmu
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -705,6 +705,32 @@ static int p4_validate_raw_event(struct
return 0;
}
+static void p4_hw_watchdog_config(struct perf_event *event)
+{
+ /*
+ * Watchdog ticks are special on Netburst, we use
+ * that named "non-sleeping" ticks as recommended
+ * by Intel SDM Vol3b.
+ */
+ if (event->attr.type != PERF_TYPE_HARDWARE ||
+ event->attr.config != PERF_COUNT_HW_NMI_WATCHDOG)
+ return;
+
+ event->attr.type = PERF_TYPE_RAW;
+ event->attr.config =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
+}
+
static int p4_hw_config(struct perf_event *event)
{
int cpu = get_cpu();
@@ -1179,6 +1205,7 @@ static __initconst const struct x86_pmu
.cntval_bits = ARCH_P4_CNTRVAL_BITS,
.cntval_mask = ARCH_P4_CNTRVAL_MASK,
.max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+ .hw_watchdog_config = p4_hw_watchdog_config,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
/*
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p6.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
@@ -91,6 +91,7 @@ static __initconst const struct x86_pmu
.enable_all = p6_pmu_enable_all,
.enable = p6_pmu_enable_event,
.disable = p6_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_P6_EVNTSEL0,
Index: linux-2.6.git/include/linux/perf_event.h
===================================================================
--- linux-2.6.git.orig/include/linux/perf_event.h
+++ linux-2.6.git/include/linux/perf_event.h
@@ -582,6 +582,12 @@ struct hw_perf_event {
};
/*
+ * Watchdog cycles are special on some architectures (such as Netburst)
+ * and because of that it must be unique among enum perf_hw_id.
+ */
+#define PERF_COUNT_HW_WATCHDOG_CYCLES (PERF_COUNT_HW_MAX + 1)
+
+/*
* hw_perf_event::state flags
*/
#define PERF_HES_STOPPED 0x01 /* the counter is stopped */
Index: linux-2.6.git/kernel/watchdog.c
===================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -202,7 +202,7 @@ static int is_softlockup(unsigned long t
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
+ .config = PERF_COUNT_HW_WATCHDOG_CYCLES,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
On Wed, Jun 22, 2011 at 12:27:56PM +0400, Cyrill Gorcunov wrote:
...
> >
> > yes, and peterz pointed it out as well, i agree of course.
> >
>
> What about something like below?
>
> Cyrill
...
Note that I've not tested it but rather need approval/rejecting
on idea in general.
On Wed, Jun 22, 2011 at 01:21:34PM +0400, Cyrill Gorcunov wrote:
...
> Note that I've not tested it but rather need approval/rejecting
> on idea in general.
The final version is below. Stephane, note that it's almost the
same idea as you proposed except it uses explicit namings to
mark out that watchdog cycles are special.
Cyrill
---
perf, x86: Add PERF_COUNT_HW_WATCHDOG_CYCLES event in a sake of nmi-watchdog v3
Due to restriction and specifics of Netburst PMU we need a separated
event for NMI watchdog. In particular every Netburst event consume not
just a counter and config register, but also an additional ESCR register.
Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
for some event there is no room for another event to enter until it's
released) we need to pick up "least" used ESCR (or most available)
for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.
v2: Add a comment about non-sleeping clockticks spotted by Ingo Molnar.
v3: Peter Zijlstra and Stephane Eranian pointed out that making new
event global visible (up to userspace) will bring problems supporting
this ABI in future. So now this event is x86 specific and hidden
from userspace.
N.B: An attempts to make an alternate encodings for events didn't make
situation better because we would need to track how exactly we substitute
the particular event -- hw::config knows nothing from where the event came,
from user-space as a raw event or as pre-configured general event. If it
comes as raw event we have to track every single bit of ESCR mask and find
out if new event would count exactly the same thing as the former event
was supposed to. So I found such way pretty inconvenient.
Signed-off-by: Cyrill Gorcunov <[email protected]>
CC: Don Zickus <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Lin Ming <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Arnaldo Carvalho de Melo <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Stephane Eranian <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++
arch/x86/kernel/cpu/perf_event_amd.c | 1 +
arch/x86/kernel/cpu/perf_event_intel.c | 2 ++
arch/x86/kernel/cpu/perf_event_p4.c | 27 +++++++++++++++++++++++++++
arch/x86/kernel/cpu/perf_event_p6.c | 1 +
include/linux/perf_event.h | 6 ++++++
kernel/watchdog.c | 2 +-
7 files changed, 53 insertions(+), 1 deletion(-)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
@@ -233,6 +233,7 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*hw_watchdog_config)(struct perf_event *event);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -413,6 +414,17 @@ static int x86_pmu_extra_regs(u64 config
return 0;
}
+static void x86_pmu_hw_watchdog_config(struct perf_event *event)
+{
+ /*
+ * On most x86 architectures watchdog cycles
+ * are the same as cpu cycles.
+ */
+ if (event->attr.type == PERF_TYPE_HARDWARE &&
+ event->attr.config == PERF_COUNT_HW_WATCHDOG_CYCLES)
+ event->attr.config = PERF_COUNT_HW_CPU_CYCLES;
+}
+
static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
@@ -706,6 +718,9 @@ static int __x86_pmu_event_init(struct p
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
+ if (x86_pmu.hw_watchdog_config)
+ x86_pmu.hw_watchdog_config(event);
+
return x86_pmu.hw_config(event);
}
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_amd.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_amd.c
@@ -372,6 +372,7 @@ static __initconst const struct x86_pmu
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_intel.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1213,6 +1213,7 @@ static __initconst const struct x86_pmu
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
@@ -1298,6 +1299,7 @@ static __initconst const struct x86_pmu
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -705,6 +705,32 @@ static int p4_validate_raw_event(struct
return 0;
}
+static void p4_hw_watchdog_config(struct perf_event *event)
+{
+ /*
+ * Watchdog ticks are special on Netburst, we use
+ * that named "non-sleeping" ticks as recommended
+ * by Intel SDM Vol3b.
+ */
+ if (event->attr.type != PERF_TYPE_HARDWARE ||
+ event->attr.config != PERF_COUNT_HW_WATCHDOG_CYCLES)
+ return;
+
+ event->attr.type = PERF_TYPE_RAW;
+ event->attr.config =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE);
+}
+
static int p4_hw_config(struct perf_event *event)
{
int cpu = get_cpu();
@@ -1179,6 +1205,7 @@ static __initconst const struct x86_pmu
.cntval_bits = ARCH_P4_CNTRVAL_BITS,
.cntval_mask = ARCH_P4_CNTRVAL_MASK,
.max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+ .hw_watchdog_config = p4_hw_watchdog_config,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
/*
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p6.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p6.c
@@ -91,6 +91,7 @@ static __initconst const struct x86_pmu
.enable_all = p6_pmu_enable_all,
.enable = p6_pmu_enable_event,
.disable = p6_pmu_disable_event,
+ .hw_watchdog_config = x86_pmu_hw_watchdog_config,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_P6_EVNTSEL0,
Index: linux-2.6.git/include/linux/perf_event.h
===================================================================
--- linux-2.6.git.orig/include/linux/perf_event.h
+++ linux-2.6.git/include/linux/perf_event.h
@@ -582,6 +582,12 @@ struct hw_perf_event {
};
/*
+ * Watchdog cycles are special on some architectures (such as Netburst)
+ * and because of that it must be unique among enum perf_hw_id.
+ */
+#define PERF_COUNT_HW_WATCHDOG_CYCLES (PERF_COUNT_HW_MAX + 1)
+
+/*
* hw_perf_event::state flags
*/
#define PERF_HES_STOPPED 0x01 /* the counter is stopped */
Index: linux-2.6.git/kernel/watchdog.c
===================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -202,7 +202,7 @@ static int is_softlockup(unsigned long t
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
+ .config = PERF_COUNT_HW_WATCHDOG_CYCLES,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
On Thu, Jun 23, 2011 at 8:48 AM, Cyrill Gorcunov <[email protected]> wrote:
> On Wed, Jun 22, 2011 at 01:21:34PM +0400, Cyrill Gorcunov wrote:
> ...
>> Note that I've not tested it but rather need approval/rejecting
>> on idea in general.
>
> The final version is below. Stephane, note that it's almost the
> same idea as you proposed except it uses explicit namings to
> mark out that watchdog cycles are special.
>
This looks okay.
The only alternative I see (This is wha I had in mind at the
beginning) that would
not require this new hidden generic event would be to have watchdog.c invoke an
arch-specific callback to fill out the attr.type, attr->config fields directly:
static int watchdog_nmi_enable(int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
goto out;
/* it is setup but not enabled */
if (event != NULL)
goto out_enable;
/* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
--> hw_nmi_get_event(wd_attr);
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback);
...
}
wd_attr would still be hardcoded with CPU_CYCLES, but could be overriden.
A default and empty hw_nmi_get_event() would be defined as __weak in watchdog.c.
Then for x86(), it would do what you have in your patch:
hw_nmi_get_event(struct perf_event *event)
{
if (x86_pmu.hw_watchdog_config)
x86_pmu.hw_watchdog_config(event);
}
For P4, hw_watchdog_config(), would be defined as in your patch:
+static void p4_hw_watchdog_config(struct perf_event *event)
+{
+ /*
+ * Watchdog ticks are special on Netburst, we use
+ * that named "non-sleeping" ticks as recommended
+ * by Intel SDM Vol3b.
+ */
+ event->attr.type = PERF_TYPE_RAW;
+ event->attr.config =
+
p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)
|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT,
BOGUS3)) |
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) |
P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE);
+}
+
No new hidden event, just a x86_pmu + a per-arch callbacks.
On Thu, Jun 23, 2011 at 11:40:39AM +0200, Stephane Eranian wrote:
> On Thu, Jun 23, 2011 at 8:48 AM, Cyrill Gorcunov <[email protected]> wrote:
> > On Wed, Jun 22, 2011 at 01:21:34PM +0400, Cyrill Gorcunov wrote:
> > ...
> >> Note that I've not tested it but rather need approval/rejecting
> >> on idea in general.
> >
> > The final version is below. Stephane, note that it's almost the
> > same idea as you proposed except it uses explicit namings to
> > mark out that watchdog cycles are special.
> >
> This looks okay.
>
> The only alternative I see (This is wha I had in mind at the
> beginning) that would
> not require this new hidden generic event would be to have watchdog.c invoke an
> arch-specific callback to fill out the attr.type, attr->config fields directly:
>
...
>
> No new hidden event, just a x86_pmu + a per-arch callbacks.
Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
Cyrill
On Thu, Jun 23, 2011 at 01:54:39PM +0400, Cyrill Gorcunov wrote:
...
> >
> > No new hidden event, just a x86_pmu + a per-arch callbacks.
>
> Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
>
> Cyrill
Since we are going to make __weak linking anyway maybe something like below
fit even beter? (untested)
Cyrill
---
arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++
kernel/watchdog.c | 3 +++
2 files changed, 29 insertions(+)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -705,6 +705,32 @@ static int p4_validate_raw_event(struct
return 0;
}
+void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+ /*
+ * Watchdog ticks are special on Netburst, we use
+ * that named "non-sleeping" ticks as recommended
+ * by Intel SDM Vol3b.
+ */
+ if (wd_attr->type != PERF_TYPE_HARDWARE ||
+ wd_attr->attr.config != PERF_COUNT_HW_CPU_CYCLES)
+ return;
+
+ wd_attr->type = PERF_TYPE_RAW;
+ wd_attr->config =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE);
+}
+
static int p4_hw_config(struct perf_event *event)
{
int cpu = get_cpu();
Index: linux-2.6.git/kernel/watchdog.c
===================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -355,6 +355,8 @@ static int watchdog(void *unused)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
+
static int watchdog_nmi_enable(int cpu)
{
struct perf_event_attr *wd_attr;
@@ -371,6 +373,7 @@ static int watchdog_nmi_enable(int cpu)
/* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+ hw_nmi_watchdog_set_attr(wd_attr);
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
On Thu, Jun 23, 2011 at 03:07:06PM +0400, Cyrill Gorcunov wrote:
> On Thu, Jun 23, 2011 at 01:54:39PM +0400, Cyrill Gorcunov wrote:
> ...
> > >
> > > No new hidden event, just a x86_pmu + a per-arch callbacks.
> >
> > Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
> >
> > Cyrill
>
> Since we are going to make __weak linking anyway maybe something like below
> fit even beter? (untested)
I don't think the compiler knows what platform you are running on and may
just blindly link your new p4 function for all x86s, which is probably not
what you want.
Cheers,
Don
>
> Cyrill
> ---
> arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++
> kernel/watchdog.c | 3 +++
> 2 files changed, 29 insertions(+)
>
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> ===================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> @@ -705,6 +705,32 @@ static int p4_validate_raw_event(struct
> return 0;
> }
>
> +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
> +{
> + /*
> + * Watchdog ticks are special on Netburst, we use
> + * that named "non-sleeping" ticks as recommended
> + * by Intel SDM Vol3b.
> + */
> + if (wd_attr->type != PERF_TYPE_HARDWARE ||
> + wd_attr->attr.config != PERF_COUNT_HW_CPU_CYCLES)
> + return;
> +
> + wd_attr->type = PERF_TYPE_RAW;
> + wd_attr->config =
> + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
> + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
> + P4_CCCR_COMPARE);
> +}
> +
> static int p4_hw_config(struct perf_event *event)
> {
> int cpu = get_cpu();
> Index: linux-2.6.git/kernel/watchdog.c
> ===================================================================
> --- linux-2.6.git.orig/kernel/watchdog.c
> +++ linux-2.6.git/kernel/watchdog.c
> @@ -355,6 +355,8 @@ static int watchdog(void *unused)
>
>
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> +void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
> +
> static int watchdog_nmi_enable(int cpu)
> {
> struct perf_event_attr *wd_attr;
> @@ -371,6 +373,7 @@ static int watchdog_nmi_enable(int cpu)
> /* Try to register using hardware perf events */
> wd_attr = &wd_hw_attr;
> wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
> + hw_nmi_watchdog_set_attr(wd_attr);
> event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
> if (!IS_ERR(event)) {
> printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
On Thu, Jun 23, 2011 at 1:40 PM, Don Zickus <[email protected]> wrote:
> On Thu, Jun 23, 2011 at 03:07:06PM +0400, Cyrill Gorcunov wrote:
>> On Thu, Jun 23, 2011 at 01:54:39PM +0400, Cyrill Gorcunov wrote:
>> ...
>> > >
>> > > No new hidden event, just a x86_pmu + a per-arch callbacks.
>> >
>> > Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
>> >
>> > Cyrill
>>
>> Since we are going to make __weak linking anyway maybe something like below
>> fit even beter? (untested)
>
> I don't think the compiler knows what platform you are running on and may
> just blindly link your new p4 function for all x86s, which is probably not
> what you want.
>
Don, is right. You need the level of indirection I had in my outline patch.
You also don't need the:
+ if (wd_attr->type != PERF_TYPE_HARDWARE ||
+ wd_attr->attr.config != PERF_COUNT_HW_CPU_CYCLES)
+ return;
In the p4 callback given you know your coming in for the watchdog.
> Cheers,
> Don
>
>>
>> Cyrill
>> ---
>> arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++
>> kernel/watchdog.c | 3 +++
>> 2 files changed, 29 insertions(+)
>>
>> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
>> ===================================================================
>> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
>> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
>> @@ -705,6 +705,32 @@ static int p4_validate_raw_event(struct
>> return 0;
>> }
>>
>> +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
>> +{
>> + /*
>> + * Watchdog ticks are special on Netburst, we use
>> + * that named "non-sleeping" ticks as recommended
>> + * by Intel SDM Vol3b.
>> + */
>> + if (wd_attr->type != PERF_TYPE_HARDWARE ||
>> + wd_attr->attr.config != PERF_COUNT_HW_CPU_CYCLES)
>> + return;
>> +
>> + wd_attr->type = PERF_TYPE_RAW;
>> + wd_attr->config =
>> + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
>> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
>> + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
>> + P4_CCCR_COMPARE);
>> +}
>> +
>> static int p4_hw_config(struct perf_event *event)
>> {
>> int cpu = get_cpu();
>> Index: linux-2.6.git/kernel/watchdog.c
>> ===================================================================
>> --- linux-2.6.git.orig/kernel/watchdog.c
>> +++ linux-2.6.git/kernel/watchdog.c
>> @@ -355,6 +355,8 @@ static int watchdog(void *unused)
>>
>>
>> #ifdef CONFIG_HARDLOCKUP_DETECTOR
>> +void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
>> +
>> static int watchdog_nmi_enable(int cpu)
>> {
>> struct perf_event_attr *wd_attr;
>> @@ -371,6 +373,7 @@ static int watchdog_nmi_enable(int cpu)
>> /* Try to register using hardware perf events */
>> wd_attr = &wd_hw_attr;
>> wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
>> + hw_nmi_watchdog_set_attr(wd_attr);
>> event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
>> if (!IS_ERR(event)) {
>> printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
>
On Thu, Jun 23, 2011 at 07:40:55AM -0400, Don Zickus wrote:
> On Thu, Jun 23, 2011 at 03:07:06PM +0400, Cyrill Gorcunov wrote:
> > On Thu, Jun 23, 2011 at 01:54:39PM +0400, Cyrill Gorcunov wrote:
> > ...
> > > >
> > > > No new hidden event, just a x86_pmu + a per-arch callbacks.
> > >
> > > Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
> > >
> > > Cyrill
> >
> > Since we are going to make __weak linking anyway maybe something like below
> > fit even beter? (untested)
>
> I don't think the compiler knows what platform you are running on and may
> just blindly link your new p4 function for all x86s, which is probably not
> what you want.
As only there will be second/third and so on implementation of hw_nmi_watchdog_set_attr
then indeed it become a problem, for a while it's not, only single
implementation. I personally don't like much __weak linking until there is
no other choise, so explicit general event for x86 in a sake of nmi-watchdog
(without any __weak functions) is my favorite but I'm fine with Stephane proposal
as well, so Don what is your choise?
Cyrill
On Thu, Jun 23, 2011 at 01:44:36PM +0200, Stephane Eranian wrote:
> On Thu, Jun 23, 2011 at 1:40 PM, Don Zickus <[email protected]> wrote:
> > On Thu, Jun 23, 2011 at 03:07:06PM +0400, Cyrill Gorcunov wrote:
> >> On Thu, Jun 23, 2011 at 01:54:39PM +0400, Cyrill Gorcunov wrote:
> >> ...
> >> > >
> >> > > No new hidden event, just a x86_pmu + a per-arch callbacks.
> >> >
> >> > Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
> >> >
> >> > ? ? Cyrill
> >>
> >> Since we are going to make __weak linking anyway maybe something like below
> >> fit even beter? (untested)
> >
> > I don't think the compiler knows what platform you are running on and may
> > just blindly link your new p4 function for all x86s, which is probably not
> > what you want.
> >
> Don, is right. You need the level of indirection I had in my outline patch.
>
> You also don't need the:
> + if (wd_attr->type != PERF_TYPE_HARDWARE ||
> + wd_attr->attr.config != PERF_COUNT_HW_CPU_CYCLES)
> + return;
>
> In the p4 callback given you know your coming in for the watchdog.
>
Yes, that is why in __weak implementation I dropped it. So guys,
what we stick with -- __weak with second level indirection?
Cyrill
On Thu, Jun 23, 2011 at 1:53 PM, Cyrill Gorcunov <[email protected]> wrote:
> On Thu, Jun 23, 2011 at 01:44:36PM +0200, Stephane Eranian wrote:
>> On Thu, Jun 23, 2011 at 1:40 PM, Don Zickus <[email protected]> wrote:
>> > On Thu, Jun 23, 2011 at 03:07:06PM +0400, Cyrill Gorcunov wrote:
>> >> On Thu, Jun 23, 2011 at 01:54:39PM +0400, Cyrill Gorcunov wrote:
>> >> ...
>> >> > >
>> >> > > No new hidden event, just a x86_pmu + a per-arch callbacks.
>> >> >
>> >> > Looks quite good for me, Don? (i'll cook some draft patch for review meanwhile).
>> >> >
>> >> > Cyrill
>> >>
>> >> Since we are going to make __weak linking anyway maybe something like below
>> >> fit even beter? (untested)
>> >
>> > I don't think the compiler knows what platform you are running on and may
>> > just blindly link your new p4 function for all x86s, which is probably not
>> > what you want.
>> >
>> Don, is right. You need the level of indirection I had in my outline patch.
>>
>> You also don't need the:
>> + if (wd_attr->type != PERF_TYPE_HARDWARE ||
>> + wd_attr->attr.config != PERF_COUNT_HW_CPU_CYCLES)
>> + return;
>>
>> In the p4 callback given you know your coming in for the watchdog.
>>
>
> Yes, that is why in __weak implementation I dropped it. So guys,
> what we stick with -- __weak with second level indirection?
>
No, first level in watchdog.c, the other callback has to be implemented
from x86_pmu as you had it.
> Cyrill
>
On Thu, Jun 23, 2011 at 02:03:50PM +0200, Stephane Eranian wrote:
...
> >
> No, first level in watchdog.c, the other callback has to be implemented
> from x86_pmu as you had it.
>
ok, will do
Cyrill
On Thu, Jun 23, 2011 at 04:07:32PM +0400, Cyrill Gorcunov wrote:
> On Thu, Jun 23, 2011 at 02:03:50PM +0200, Stephane Eranian wrote:
> ...
> > >
> > No, first level in watchdog.c, the other callback has to be implemented
> > from x86_pmu as you had it.
> >
Something like below I suppose
Cyrill
---
arch/x86/kernel/cpu/perf_event.c | 7 +++++++
arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++
kernel/watchdog.c | 6 +++++-
3 files changed, 38 insertions(+), 1 deletion(-)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
@@ -233,6 +233,7 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*hw_watchdog_set_attr)(struct perf_event_attr *attr);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
+void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+ if (x86_pmu.hw_watchdog_set_attr)
+ x86_pmu.hw_watchdog_set_attr(wd_attr);
+}
+
/*
* Propagate event elapsed time into the generic event.
* Can only be executed on the CPU where the event is active.
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct
return 0;
}
+static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+ /*
+ * Watchdog ticks are special on Netburst, we use
+ * that named "non-sleeping" ticks as recommended
+ * by Intel SDM Vol3b.
+ */
+ WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE ||
+ wd_attr->config != PERF_COUNT_HW_CPU_CYCLES);
+
+ wd_attr->type = PERF_TYPE_RAW;
+ wd_attr->config =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE);
+}
+
static int p4_hw_config(struct perf_event *event)
{
int cpu = get_cpu();
@@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu
.cntval_bits = ARCH_P4_CNTRVAL_BITS,
.cntval_mask = ARCH_P4_CNTRVAL_MASK,
.max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+ .hw_watchdog_set_attr = p4_hw_watchdog_set_attr,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
/*
Index: linux-2.6.git/kernel/watchdog.c
===================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -200,6 +200,8 @@ static int is_softlockup(unsigned long t
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
+
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -368,9 +370,11 @@ static int watchdog_nmi_enable(int cpu)
if (event != NULL)
goto out_enable;
- /* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+ hw_nmi_watchdog_set_attr(wd_attr);
+
+ /* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
Yes.
On Thu, Jun 23, 2011 at 2:26 PM, Cyrill Gorcunov <[email protected]> wrote:
> On Thu, Jun 23, 2011 at 04:07:32PM +0400, Cyrill Gorcunov wrote:
>> On Thu, Jun 23, 2011 at 02:03:50PM +0200, Stephane Eranian wrote:
>> ...
>> > >
>> > No, first level in watchdog.c, the other callback has to be implemented
>> > from x86_pmu as you had it.
>> >
> Something like below I suppose
>
> Cyrill
> ---
> arch/x86/kernel/cpu/perf_event.c | 7 +++++++
> arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++
> kernel/watchdog.c | 6 +++++-
> 3 files changed, 38 insertions(+), 1 deletion(-)
>
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
> ===================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
> @@ -233,6 +233,7 @@ struct x86_pmu {
> void (*enable_all)(int added);
> void (*enable)(struct perf_event *);
> void (*disable)(struct perf_event *);
> + void (*hw_watchdog_set_attr)(struct perf_event_attr *attr);
> int (*hw_config)(struct perf_event *event);
> int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
> unsigned eventsel;
> @@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_
> [PERF_COUNT_HW_CACHE_OP_MAX]
> [PERF_COUNT_HW_CACHE_RESULT_MAX];
>
> +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
> +{
> + if (x86_pmu.hw_watchdog_set_attr)
> + x86_pmu.hw_watchdog_set_attr(wd_attr);
> +}
> +
> /*
> * Propagate event elapsed time into the generic event.
> * Can only be executed on the CPU where the event is active.
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> ===================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> @@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct
> return 0;
> }
>
> +static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr)
> +{
> + /*
> + * Watchdog ticks are special on Netburst, we use
> + * that named "non-sleeping" ticks as recommended
> + * by Intel SDM Vol3b.
> + */
> + WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE ||
> + wd_attr->config != PERF_COUNT_HW_CPU_CYCLES);
> +
> + wd_attr->type = PERF_TYPE_RAW;
> + wd_attr->config =
> + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
> + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) |
> + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
> + P4_CCCR_COMPARE);
> +}
> +
> static int p4_hw_config(struct perf_event *event)
> {
> int cpu = get_cpu();
> @@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu
> .cntval_bits = ARCH_P4_CNTRVAL_BITS,
> .cntval_mask = ARCH_P4_CNTRVAL_MASK,
> .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
> + .hw_watchdog_set_attr = p4_hw_watchdog_set_attr,
> .hw_config = p4_hw_config,
> .schedule_events = p4_pmu_schedule_events,
> /*
> Index: linux-2.6.git/kernel/watchdog.c
> ===================================================================
> --- linux-2.6.git.orig/kernel/watchdog.c
> +++ linux-2.6.git/kernel/watchdog.c
> @@ -200,6 +200,8 @@ static int is_softlockup(unsigned long t
> }
>
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> +void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
> +
> static struct perf_event_attr wd_hw_attr = {
> .type = PERF_TYPE_HARDWARE,
> .config = PERF_COUNT_HW_CPU_CYCLES,
> @@ -368,9 +370,11 @@ static int watchdog_nmi_enable(int cpu)
> if (event != NULL)
> goto out_enable;
>
> - /* Try to register using hardware perf events */
> wd_attr = &wd_hw_attr;
> wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
> + hw_nmi_watchdog_set_attr(wd_attr);
> +
> + /* Try to register using hardware perf events */
> event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
> if (!IS_ERR(event)) {
> printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
>
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
On Thu, Jun 23, 2011 at 02:31:14PM +0200, Stephane Eranian wrote:
> Yes.
>
OK, I prepare a final version and send out as a new mail, since thread
is big enough already. Thanks!
Cyrill