From: Andy Lutomirski <[email protected]>
This patch adds a freq PMU to support time- and frequency-related counters,
including TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
The events are exposed in sysfs for use by perf stat and other tools.
The files are under /sys/devices/freq/events/
These events only support system-wide mode counting.
The PMU type (attr->type) is PERF_TYPE_FREQ.
Example:
To calculate the CPU%
CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
$ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
3164023,,ref-cycles,1048387386,100.00
2410812089,,freq/tsc/,1050022373,100.00
The CPU% for sleep is 0.13%.
$ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 busyloop
15662183572,,ref-cycles,6822637855,100.00
15667608992,,freq/tsc/,6823978523,100.00
The CPU% for busy loop is 99.9%.
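For reference, the arithmetic behind the two percentages can be reproduced
with a trivial standalone program (illustrative only, not part of this patch):

#include <stdio.h>

int main(void)
{
	/* Counts taken from the two perf stat runs above. */
	double sleep_ref_cycles = 3164023.0, sleep_tsc = 2410812089.0;
	double busy_ref_cycles = 15662183572.0, busy_tsc = 15667608992.0;

	printf("sleep CPU%%: %.2f%%\n", 100.0 * sleep_ref_cycles / sleep_tsc);
	printf("busy  CPU%%: %.2f%%\n", 100.0 * busy_ref_cycles / busy_tsc);
	return 0;
}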
Signed-off-by: Andy Lutomirski <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
---
The patch is based on the patch "x86, perf: Add an aperfmperf driver"
from Andy Lutomirski (https://lkml.org/lkml/2015/4/28/757)
Changes from V1
- Change the name from aperfmperf to freq
- Add TSC and PPERF support
- Change to perf_sw_context
- Assign new PERF_TYPE name
- Split stop_or_del to two functions
- Update state of hw_perf_event
- Update changelog
arch/x86/kernel/cpu/Makefile | 2 +
arch/x86/kernel/cpu/perf_event_freq.c | 207 ++++++++++++++++++++++++++++++++++
include/uapi/linux/perf_event.h | 1 +
3 files changed, 210 insertions(+)
create mode 100644 arch/x86/kernel/cpu/perf_event_freq.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 9bff687..f621dba 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_freq.o
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_freq.o
endif
diff --git a/arch/x86/kernel/cpu/perf_event_freq.c b/arch/x86/kernel/cpu/perf_event_freq.c
new file mode 100644
index 0000000..9389b3b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_freq.c
@@ -0,0 +1,207 @@
+#include <linux/perf_event.h>
+
+enum perf_freq_id {
+ /*
+ * freq events, generalized by the kernel:
+ */
+ PERF_FREQ_TSC = 0,
+ PERF_FREQ_APERF = 1,
+ PERF_FREQ_MPERF = 2,
+ PERF_FREQ_PPERF = 3,
+
+ PERF_FREQ_EVENT_MAX, /* non-ABI */
+};
+
+struct perf_freq_msr {
+ int id;
+ u64 msr;
+};
+
+static struct perf_freq_msr freq_msr[] = {
+ { PERF_FREQ_TSC, 0 },
+ { PERF_FREQ_APERF, MSR_IA32_APERF },
+ { PERF_FREQ_MPERF, MSR_IA32_MPERF },
+ { PERF_FREQ_PPERF, MSR_PPERF },
+};
+
+
+
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+
+static struct attribute *events_attrs[PERF_FREQ_EVENT_MAX + 1] = {
+ &evattr_tsc.attr.attr,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static int freq_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (cfg >= PERF_FREQ_EVENT_MAX)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = freq_msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 freq_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrl(event->hw.event_base, now);
+ else
+ now = rdtsc();
+
+ return now;
+}
+static void freq_event_update(struct perf_event *event)
+{
+ u64 prev;
+ u64 now;
+
+ /* Assume counters are 64bit */
+ now = freq_read_counter(event);
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
+}
+
+static void freq_event_start(struct perf_event *event, int flags)
+{
+ u64 now;
+
+ now = freq_read_counter(event);
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void freq_event_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED))
+ hwc->state |= PERF_HES_STOPPED;
+
+ /* check if update of sw counter is necessary */
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ freq_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static void freq_event_del(struct perf_event *event, int flags)
+{
+ freq_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int freq_event_add(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_START)
+ freq_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_freq = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = freq_event_init,
+ .add = freq_event_add,
+ .del = freq_event_del,
+ .start = freq_event_start,
+ .stop = freq_event_stop,
+ .read = freq_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init intel_freq_specific_init(int idx)
+{
+ /* Skylake has pperf support */
+ if ((boot_cpu_data.x86_model == 78) ||
+ (boot_cpu_data.x86_model == 94))
+ events_attrs[idx++] = &evattr_pperf.attr.attr;
+
+ events_attrs[idx++] = NULL;
+
+ return 0;
+}
+
+static int __init amd_freq_specific_init(int idx)
+{
+ return 0;
+}
+
+static int __init freq_init(void)
+{
+ int err;
+ int idx = 1;
+
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
+ events_attrs[idx++] = &evattr_aperf.attr.attr;
+ events_attrs[idx++] = &evattr_mperf.attr.attr;
+ }
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_freq_specific_init(idx);
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_freq_specific_init(idx);
+ break;
+ default:
+ err = -ENOTSUPP;
+ }
+
+ if (err != 0) {
+ pr_cont("no freq PMU driver.\n");
+ return 0;
+ }
+
+ perf_pmu_register(&pmu_freq, "freq", PERF_TYPE_FREQ);
+
+ return 0;
+}
+device_initcall(freq_init);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d97f84c..1f8c400 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_BREAKPOINT = 5,
+ PERF_TYPE_FREQ = 6,
PERF_TYPE_MAX, /* non-ABI */
};
--
1.8.3.1
On Mon, Jul 20, 2015 at 8:49 AM, Kan Liang <[email protected]> wrote:
> From: Andy Lutomirski <[email protected]>
>
> This patch adds freq PMU to support time and freq related counters
> includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
>
> The events are exposed in sysfs for use by perf stat and other tools.
> The files are under /sys/devices/freq/events/
>
> These events only support system-wide mode counting.
>
> The PMU type (attr->type) is PERF_TYPE_FREQ.
>
> Example:
>
> To caculate the CPU%
> CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
>
> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
> 3164023,,ref-cycles,1048387386,100.00
> 2410812089,,freq/tsc/,1050022373,100.00
> The CPU% for sleep is 0.13%.
>
This event is system-wide only. Thus, the kernel should return
an error when you try to use it in per-thread mode. That would
be more consistent with RAPL and uncore events.
> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 busyloop
> 15662183572,,ref-cycles,6822637855,100.00
> 15667608992,,freq/tsc/,6823978523,100.00
> The CPU% for busy loop is 99.9%.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Kan Liang <[email protected]>
> ---
>
> The patch is based on the patch "x86, perf: Add an aperfmperf driver"
> from Andy Lutomirski (https://lkml.org/lkml/2015/4/28/757)
>
> Changes from V1
> - Change the name from aperfmperf to freq
> - Add TSC and PPERF support
> - Change to perf_sw_context
> - Assign new PERF_TYPE name
> - Split stop_or_del to two functions
> - Update state of hw_perf_event
> - Update changlog
>
> arch/x86/kernel/cpu/Makefile | 2 +
> arch/x86/kernel/cpu/perf_event_freq.c | 207 ++++++++++++++++++++++++++++++++++
> include/uapi/linux/perf_event.h | 1 +
> 3 files changed, 210 insertions(+)
> create mode 100644 arch/x86/kernel/cpu/perf_event_freq.c
>
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 9bff687..f621dba 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
> perf_event_intel_uncore_snb.o \
> perf_event_intel_uncore_snbep.o \
> perf_event_intel_uncore_nhmex.o
> +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_freq.o
> +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_freq.o
> endif
>
>
> diff --git a/arch/x86/kernel/cpu/perf_event_freq.c b/arch/x86/kernel/cpu/perf_event_freq.c
> new file mode 100644
> index 0000000..9389b3b
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/perf_event_freq.c
> @@ -0,0 +1,207 @@
> +#include <linux/perf_event.h>
> +
> +enum perf_freq_id {
> + /*
> + * freq events, generalized by the kernel:
> + */
> + PERF_FREQ_TSC = 0,
> + PERF_FREQ_APERF = 1,
> + PERF_FREQ_MPERF = 2,
> + PERF_FREQ_PPERF = 3,
> +
> + PERF_FREQ_EVENT_MAX, /* non-ABI */
> +};
> +
> +struct perf_freq_msr {
> + int id;
> + u64 msr;
> +};
> +
> +static struct perf_freq_msr freq_msr[] = {
> + { PERF_FREQ_TSC, 0 },
> + { PERF_FREQ_APERF, MSR_IA32_APERF },
> + { PERF_FREQ_MPERF, MSR_IA32_MPERF },
> + { PERF_FREQ_PPERF, MSR_PPERF },
> +};
> +
> +
> +
> +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
> +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
> +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
> +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
> +
> +static struct attribute *events_attrs[PERF_FREQ_EVENT_MAX + 1] = {
> + &evattr_tsc.attr.attr,
> +};
> +
> +static struct attribute_group events_attr_group = {
> + .name = "events",
> + .attrs = events_attrs,
> +};
> +
> +PMU_FORMAT_ATTR(event, "config:0-63");
> +static struct attribute *format_attrs[] = {
> + &format_attr_event.attr,
> + NULL,
> +};
> +static struct attribute_group format_attr_group = {
> + .name = "format",
> + .attrs = format_attrs,
> +};
> +
> +static const struct attribute_group *attr_groups[] = {
> + &events_attr_group,
> + &format_attr_group,
> + NULL,
> +};
> +
> +static int freq_event_init(struct perf_event *event)
> +{
> + u64 cfg = event->attr.config;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + if (cfg >= PERF_FREQ_EVENT_MAX)
> + return -EINVAL;
> +
> + /* unsupported modes and filters */
> + if (event->attr.exclude_user ||
> + event->attr.exclude_kernel ||
> + event->attr.exclude_hv ||
> + event->attr.exclude_idle ||
> + event->attr.exclude_host ||
> + event->attr.exclude_guest ||
> + event->attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + event->hw.idx = -1;
> + event->hw.event_base = freq_msr[cfg].msr;
> + event->hw.config = cfg;
> +
> + return 0;
> +}
> +
> +static inline u64 freq_read_counter(struct perf_event *event)
> +{
> + u64 now;
> +
> + if (event->hw.event_base)
> + rdmsrl(event->hw.event_base, now);
> + else
> + now = rdtsc();
> +
> + return now;
> +}
> +static void freq_event_update(struct perf_event *event)
> +{
> + u64 prev;
> + u64 now;
> +
> + /* Assume counters are 64bit */
> + now = freq_read_counter(event);
> + prev = local64_xchg(&event->hw.prev_count, now);
> + local64_add(now - prev, &event->count);
> +}
> +
> +static void freq_event_start(struct perf_event *event, int flags)
> +{
> + u64 now;
> +
> + now = freq_read_counter(event);
> + local64_set(&event->hw.prev_count, now);
> +}
> +
> +static void freq_event_stop(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + /* mark event as deactivated and stopped */
> + if (!(hwc->state & PERF_HES_STOPPED))
> + hwc->state |= PERF_HES_STOPPED;
> +
> + /* check if update of sw counter is necessary */
> + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
> + freq_event_update(event);
> + hwc->state |= PERF_HES_UPTODATE;
> + }
> +}
> +
> +static void freq_event_del(struct perf_event *event, int flags)
> +{
> + freq_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static int freq_event_add(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
> +
> + if (flags & PERF_EF_START)
> + freq_event_start(event, flags);
> +
> + return 0;
> +}
> +
> +static struct pmu pmu_freq = {
> + .task_ctx_nr = perf_sw_context,
> + .attr_groups = attr_groups,
> + .event_init = freq_event_init,
> + .add = freq_event_add,
> + .del = freq_event_del,
> + .start = freq_event_start,
> + .stop = freq_event_stop,
> + .read = freq_event_update,
> + .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
> +};
> +
> +static int __init intel_freq_specific_init(int idx)
> +{
> + /* Skylake has pperf support */
> + if ((boot_cpu_data.x86_model == 78) ||
> + (boot_cpu_data.x86_model == 94))
> + events_attrs[idx++] = &evattr_pperf.attr.attr;
> +
> + events_attrs[idx++] = NULL;
> +
> + return 0;
> +}
> +
> +static int __init amd_freq_specific_init(int idx)
> +{
> + return 0;
> +}
> +
> +static int __init freq_init(void)
> +{
> + int err;
> + int idx = 1;
> +
> + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
> + events_attrs[idx++] = &evattr_aperf.attr.attr;
> + events_attrs[idx++] = &evattr_mperf.attr.attr;
> + }
> +
> + switch (boot_cpu_data.x86_vendor) {
> + case X86_VENDOR_INTEL:
> + err = intel_freq_specific_init(idx);
> + break;
> + case X86_VENDOR_AMD:
> + err = amd_freq_specific_init(idx);
> + break;
> + default:
> + err = -ENOTSUPP;
> + }
> +
> + if (err != 0) {
> + pr_cont("no freq PMU driver.\n");
> + return 0;
> + }
> +
> + perf_pmu_register(&pmu_freq, "freq", PERF_TYPE_FREQ);
> +
> + return 0;
> +}
> +device_initcall(freq_init);
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index d97f84c..1f8c400 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -32,6 +32,7 @@ enum perf_type_id {
> PERF_TYPE_HW_CACHE = 3,
> PERF_TYPE_RAW = 4,
> PERF_TYPE_BREAKPOINT = 5,
> + PERF_TYPE_FREQ = 6,
>
What's the point of this when we can dynamically register a PMU?
There has been no modification to perf_type_id in a very long time,
so I am not sure why this proposal qualifies for one.
The perf tool can find the PMU type from sysfs.
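For instance, user space can pick up the dynamically assigned type at run
time instead of relying on a fixed PERF_TYPE_* value (illustrative sketch;
the path assumes the PMU is registered under the name "freq"):

#include <stdio.h>

/* Read the dynamically assigned PMU type for use in perf_event_attr.type. */
static int read_pmu_type(const char *pmu)
{
	char path[64];
	int type = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &type) != 1)
		type = -1;
	fclose(f);
	return type;
}

int main(void)
{
	printf("freq PMU type: %d\n", read_pmu_type("freq"));
	return 0;
}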
> PERF_TYPE_MAX, /* non-ABI */
> };
> --
> 1.8.3.1
>
On Mon, Jul 20, 2015 at 8:10 PM, Stephane Eranian <[email protected]> wrote:
> On Mon, Jul 20, 2015 at 8:49 AM, Kan Liang <[email protected]> wrote:
>> From: Andy Lutomirski <[email protected]>
>>
>> This patch adds freq PMU to support time and freq related counters
>> includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
>>
>> The events are exposed in sysfs for use by perf stat and other tools.
>> The files are under /sys/devices/freq/events/
>>
>> These events only support system-wide mode counting.
>>
>> The PMU type (attr->type) is PERF_TYPE_FREQ.
>>
>> Example:
>>
>> To caculate the CPU%
>> CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
>>
>> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
>> 3164023,,ref-cycles,1048387386,100.00
>> 2410812089,,freq/tsc/,1050022373,100.00
>> The CPU% for sleep is 0.13%.
>>
> This event is system-wide only. Thus, the kernel should return
> an error when you try to use it in per-thread mode. That would
> be more consistent with RAPL and uncore events.
I don't know enough about perf's inner workings to be sure, but can't
perf context switch free-running counters like this?
--Andy
On Mon, Jul 20, 2015 at 11:01 PM, Andy Lutomirski <[email protected]> wrote:
> On Mon, Jul 20, 2015 at 8:10 PM, Stephane Eranian <[email protected]> wrote:
>> On Mon, Jul 20, 2015 at 8:49 AM, Kan Liang <[email protected]> wrote:
>>> From: Andy Lutomirski <[email protected]>
>>>
>>> This patch adds freq PMU to support time and freq related counters
>>> includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
>>>
>>> The events are exposed in sysfs for use by perf stat and other tools.
>>> The files are under /sys/devices/freq/events/
>>>
>>> These events only support system-wide mode counting.
>>>
>>> The PMU type (attr->type) is PERF_TYPE_FREQ.
>>>
>>> Example:
>>>
>>> To caculate the CPU%
>>> CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
>>>
>>> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
>>> 3164023,,ref-cycles,1048387386,100.00
>>> 2410812089,,freq/tsc/,1050022373,100.00
>>> The CPU% for sleep is 0.13%.
>>>
>> This event is system-wide only. Thus, the kernel should return
>> an error when you try to use it in per-thread mode. That would
>> be more consistent with RAPL and uncore events.
>
> I don't know enough about perf's inner workings to be sure, but can't
> perf context switch free-running counters like this?
>
It should be able to, but your patch does not seem to do this. I get a zero
count when I try.
On Mon, Jul 20, 2015 at 11:21 PM, Stephane Eranian <[email protected]> wrote:
> On Mon, Jul 20, 2015 at 11:01 PM, Andy Lutomirski <[email protected]> wrote:
>> On Mon, Jul 20, 2015 at 8:10 PM, Stephane Eranian <[email protected]> wrote:
>>> On Mon, Jul 20, 2015 at 8:49 AM, Kan Liang <[email protected]> wrote:
>>>> From: Andy Lutomirski <[email protected]>
>>>>
>>>> This patch adds freq PMU to support time and freq related counters
>>>> includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
>>>>
>>>> The events are exposed in sysfs for use by perf stat and other tools.
>>>> The files are under /sys/devices/freq/events/
>>>>
>>>> These events only support system-wide mode counting.
>>>>
>>>> The PMU type (attr->type) is PERF_TYPE_FREQ.
>>>>
>>>> Example:
>>>>
>>>> To caculate the CPU%
>>>> CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
>>>>
>>>> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
>>>> 3164023,,ref-cycles,1048387386,100.00
>>>> 2410812089,,freq/tsc/,1050022373,100.00
>>>> The CPU% for sleep is 0.13%.
>>>>
>>> This event is system-wide only. Thus, the kernel should return
>>> an error when you try to use it in per-thread mode. That would
>>> be more consistent with RAPL and uncore events.
>>
>> I don't know enough about perf's inner workings to be sure, but can't
>> perf context switch free-running counters like this?
>>
> It should be able to but your patch does not seem to do this. I get zero
> count when I try.
Peter, help? I'm guessing the fix is a line or two of code, but I
have no idea which line or two.
--Andy
--
Andy Lutomirski
AMA Capital Management, LLC
On Mon, Jul 20, 2015 at 11:49:06AM -0400, Kan Liang wrote:
> +static void freq_event_update(struct perf_event *event)
> +{
> + u64 prev;
> + u64 now;
> +
> + /* Assume counters are 64bit */
> + now = freq_read_counter(event);
> + prev = local64_xchg(&event->hw.prev_count, now);
> + local64_add(now - prev, &event->count);
> +}
That really wants to be that cmpxchg loop; esp. since you want to be
able to combine these events with hardware sampling events.
In which case the update from NMI context can interfere with an update
from whatever other context.
> +static void freq_event_stop(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + /* mark event as deactivated and stopped */
> + if (!(hwc->state & PERF_HES_STOPPED))
> + hwc->state |= PERF_HES_STOPPED;
This is pointless, nobody will clear the bit.
> + /* check if update of sw counter is necessary */
> + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
> + freq_event_update(event);
> + hwc->state |= PERF_HES_UPTODATE;
> + }
This should be an unconditional update.
> +}
> +
> +static void freq_event_del(struct perf_event *event, int flags)
> +{
> + freq_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static int freq_event_add(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
And that again is pointless.
> + if (flags & PERF_EF_START)
> + freq_event_start(event, flags);
> +
> + return 0;
> +}
Other than that, I did a big rename to MSR, and added SMI_COUNT.
This seems to work fine for per-task events:
$ perf stat -e '{ref-cycles,msr/tsc/}' -e '{msr/aperf/,msr/mperf/}' -e 'msr/smi/' -- ls > /dev/null
Performance counter stats for 'ls':
8,657,404 ref-cycles
8,943,754 msr/tsc/
3,784,505 msr/aperf/
8,651,964 msr/mperf/
0 msr/smi/
0.004307333 seconds time elapsed
---
Subject: x86, perf: Add an MSR pmu driver
From: Andy Lutomirski <[email protected]>
Date: Mon, 20 Jul 2015 11:49:06 -0400
This patch adds an MSR PMU to support free-running MSR counters, such as
the time- and frequency-related counters TSC, IA32_APERF, IA32_MPERF and
IA32_PPERF, but also SMI_COUNT.
The events are exposed in sysfs for use by perf stat and other tools.
The files are under /sys/devices/msr/events/
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Andy Lutomirski <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
[peterz: s/freq/msr/, added SMI_COUNT, fixed bugs]
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
arch/x86/kernel/cpu/Makefile | 2
arch/x86/kernel/cpu/perf_event_msr.c | 242 +++++++++++++++++++++++++++++++++++
2 files changed, 244 insertions(+)
create mode 100644 arch/x86/kernel/cpu/perf_event_msr.c
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) +
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o
endif
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -0,0 +1,242 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+ PERF_MSR_TSC = 0,
+ PERF_MSR_APERF = 1,
+ PERF_MSR_MPERF = 2,
+ PERF_MSR_PPERF = 3,
+ PERF_MSR_SMI = 4,
+
+ PERF_MSR_EVENT_MAX,
+};
+
+struct perf_msr {
+ int id;
+ u64 msr;
+};
+
+static struct perf_msr msr[] = {
+ { PERF_MSR_TSC, 0 },
+ { PERF_MSR_APERF, MSR_IA32_APERF },
+ { PERF_MSR_MPERF, MSR_IA32_MPERF },
+ { PERF_MSR_PPERF, MSR_PPERF },
+ { PERF_MSR_SMI, MSR_SMI_COUNT },
+};
+
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+ &evattr_tsc.attr.attr,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrl(event->hw.event_base, now);
+ else
+ now = rdtsc();
+
+ return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+ u64 prev, now;
+ s64 delta;
+
+ /* Careful, an NMI might modify the previous event value. */
+again:
+ prev = local64_read(&event->hw.prev_count);
+ now = msr_read_counter(event);
+
+ if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+ goto again;
+
+ delta = now - prev;
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+ delta <<= 32;
+ delta >>= 32; /* sign extend */
+ }
+ local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+ u64 now;
+
+ now = msr_read_counter(event);
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+ msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+ msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ msr_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_msr = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = msr_event_init,
+ .add = msr_event_add,
+ .del = msr_event_del,
+ .start = msr_event_start,
+ .stop = msr_event_stop,
+ .read = msr_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init intel_msr_init(int idx)
+{
+ if (boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+ events_attrs[idx++] = &evattr_smi.attr.attr;
+ break;
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ events_attrs[idx++] = &evattr_pperf.attr.attr;
+ events_attrs[idx++] = &evattr_smi.attr.attr;
+ break;
+
+ case 55: /* 22nm Atom "Silvermont" */
+ case 76: /* 14nm Atom "Airmont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ events_attrs[idx++] = &evattr_smi.attr.attr;
+ break;
+ }
+
+ events_attrs[idx] = NULL;
+
+ return 0;
+}
+
+static int __init amd_msr_init(int idx)
+{
+ return 0;
+}
+
+static int __init msr_init(void)
+{
+ int err;
+ int idx = 1;
+
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
+ events_attrs[idx++] = &evattr_aperf.attr.attr;
+ events_attrs[idx++] = &evattr_mperf.attr.attr;
+ events_attrs[idx] = NULL;
+ }
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_msr_init(idx);
+ break;
+
+ case X86_VENDOR_AMD:
+ err = amd_msr_init(idx);
+ break;
+
+ default:
+ err = -ENOTSUPP;
+ }
+
+ if (err != 0) {
+ pr_cont("no msr PMU driver.\n");
+ return 0;
+ }
+
+ perf_pmu_register(&pmu_msr, "msr", -1);
+
+ return 0;
+}
+device_initcall(msr_init);
On Mon, Jul 20, 2015 at 11:23:36PM -0700, Andy Lutomirski wrote:
> Peter, help? I'm guessing the fix is a line of two of code, but I
> have no idea what line or two.
Yep, the HES flag confusion.
On Tue, Jul 21, 2015 at 2:55 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, Jul 20, 2015 at 11:49:06AM -0400, Kan Liang wrote:
>> +static void freq_event_update(struct perf_event *event)
>> +{
>> + u64 prev;
>> + u64 now;
>> +
>> + /* Assume counters are 64bit */
>> + now = freq_read_counter(event);
>> + prev = local64_xchg(&event->hw.prev_count, now);
>> + local64_add(now - prev, &event->count);
>> +}
>
> That really wants to be that cmpxchg loop; esp. since you want to be
> able to combine these events with hardware sampling events.
>
> In which case the update from NMI context can interfere with an update
> from whatever other context.
>
>> +static void freq_event_stop(struct perf_event *event, int flags)
>> +{
>> + struct hw_perf_event *hwc = &event->hw;
>> +
>> + /* mark event as deactivated and stopped */
>> + if (!(hwc->state & PERF_HES_STOPPED))
>> + hwc->state |= PERF_HES_STOPPED;
>
> This is pointless, nobody will clear the bit.
>
>> + /* check if update of sw counter is necessary */
>> + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
>> + freq_event_update(event);
>> + hwc->state |= PERF_HES_UPTODATE;
>> + }
>
> This should be an unconditional update.
>
>> +}
>> +
>> +static void freq_event_del(struct perf_event *event, int flags)
>> +{
>> + freq_event_stop(event, PERF_EF_UPDATE);
>> +}
>> +
>> +static int freq_event_add(struct perf_event *event, int flags)
>> +{
>> + struct hw_perf_event *hwc = &event->hw;
>> +
>> + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
>
> And that again is pointless.
>
>> + if (flags & PERF_EF_START)
>> + freq_event_start(event, flags);
>> +
>> + return 0;
>> +}
>
> Other than that, I did a big rename to MSR, and added SMI_COUNT.
>
> This seems to work fine for per-task events:
>
> $ perf stat -e '{ref-cycles,msr/tsc/}' -e '{msr/aperf/,msr/mperf/}' -e 'msr/smi/' -- ls > /dev/null
>
> Performance counter stats for 'ls':
>
> 8,657,404 ref-cycles
> 8,943,754 msr/tsc/
> 3,784,505 msr/aperf/
> 8,651,964 msr/mperf/
> 0 msr/smi/
>
I understand the value of the tsc and smi events. It is not
clear to me what aperf/mperf buys you over cycles and ref-cycles:
$ perf stat -a -e msr/aperf/,msr/mperf/,cycles,ref-cycles -C 1 -I 1000 sleep 10
# time counts unit events
2.000361718 14,826,353 msr/aperf/
2.000361718 11,865,170 msr/mperf/
2.000361718 17,170,101 cycles
2.000361718 13,629,675 ref-cycles
Only the ratio aperf/mperf is defined; here it is 1.25, and the ratio
cycles/ref-cycles is 1.25 as well. So what is a situation where
aperf/mperf provides better info than cycles/ref-cycles?
The SDM also says aperf/mperf is only defined when running in C0 mode.
Thanks.
> Subject: x86, perf: Add an MSR pmu driver
> From: Andy Lutomirski <[email protected]>
> Date: Mon, 20 Jul 2015 11:49:06 -0400
>
> This patch adds an MSR PMU to support free running MSR counters. Such as
> time and freq related counters includes TSC, IA32_APERF, IA32_MPERF and
> IA32_PPERF, but also SMI_COUNT.
>
> The events are exposed in sysfs for use by perf stat and other tools.
> The files are under /sys/devices/msr/events/
>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Kan Liang <[email protected]>
> [peterz: s/freq/msr/, added SMI_COUNT, fixed bugs]
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Link: http://lkml.kernel.org/r/[email protected]
> ---
> arch/x86/kernel/cpu/Makefile | 2
> arch/x86/kernel/cpu/perf_event_msr.c | 242 +++++++++++++++++++++++++++++++++++
> 2 files changed, 244 insertions(+)
> create mode 100644 arch/x86/kernel/cpu/perf_event_msr.c
>
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) +
> perf_event_intel_uncore_snb.o \
> perf_event_intel_uncore_snbep.o \
> perf_event_intel_uncore_nhmex.o
> +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o
> +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o
> endif
>
>
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/perf_event_msr.c
> @@ -0,0 +1,242 @@
> +#include <linux/perf_event.h>
> +
> +enum perf_msr_id {
> + PERF_MSR_TSC = 0,
> + PERF_MSR_APERF = 1,
> + PERF_MSR_MPERF = 2,
> + PERF_MSR_PPERF = 3,
> + PERF_MSR_SMI = 4,
> +
> + PERF_MSR_EVENT_MAX,
> +};
> +
> +struct perf_msr {
> + int id;
> + u64 msr;
> +};
> +
> +static struct perf_msr msr[] = {
> + { PERF_MSR_TSC, 0 },
> + { PERF_MSR_APERF, MSR_IA32_APERF },
> + { PERF_MSR_MPERF, MSR_IA32_MPERF },
> + { PERF_MSR_PPERF, MSR_PPERF },
> + { PERF_MSR_SMI, MSR_SMI_COUNT },
> +};
> +
> +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
> +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
> +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
> +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
> +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
> +
> +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
> + &evattr_tsc.attr.attr,
> +};
> +
> +static struct attribute_group events_attr_group = {
> + .name = "events",
> + .attrs = events_attrs,
> +};
> +
> +PMU_FORMAT_ATTR(event, "config:0-63");
> +static struct attribute *format_attrs[] = {
> + &format_attr_event.attr,
> + NULL,
> +};
> +static struct attribute_group format_attr_group = {
> + .name = "format",
> + .attrs = format_attrs,
> +};
> +
> +static const struct attribute_group *attr_groups[] = {
> + &events_attr_group,
> + &format_attr_group,
> + NULL,
> +};
> +
> +static int msr_event_init(struct perf_event *event)
> +{
> + u64 cfg = event->attr.config;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + if (cfg >= PERF_MSR_EVENT_MAX)
> + return -EINVAL;
> +
> + /* unsupported modes and filters */
> + if (event->attr.exclude_user ||
> + event->attr.exclude_kernel ||
> + event->attr.exclude_hv ||
> + event->attr.exclude_idle ||
> + event->attr.exclude_host ||
> + event->attr.exclude_guest ||
> + event->attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + event->hw.idx = -1;
> + event->hw.event_base = msr[cfg].msr;
> + event->hw.config = cfg;
> +
> + return 0;
> +}
> +
> +static inline u64 msr_read_counter(struct perf_event *event)
> +{
> + u64 now;
> +
> + if (event->hw.event_base)
> + rdmsrl(event->hw.event_base, now);
> + else
> + now = rdtsc();
> +
> + return now;
> +}
> +static void msr_event_update(struct perf_event *event)
> +{
> + u64 prev, now;
> + s64 delta;
> +
> + /* Careful, an NMI might modify the previous event value. */
> +again:
> + prev = local64_read(&event->hw.prev_count);
> + now = msr_read_counter(event);
> +
> + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
> + goto again;
> +
> + delta = now - prev;
> + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
> + delta <<= 32;
> + delta >>= 32; /* sign extend */
> + }
> + local64_add(now - prev, &event->count);
> +}
> +
> +static void msr_event_start(struct perf_event *event, int flags)
> +{
> + u64 now;
> +
> + now = msr_read_counter(event);
> + local64_set(&event->hw.prev_count, now);
> +}
> +
> +static void msr_event_stop(struct perf_event *event, int flags)
> +{
> + msr_event_update(event);
> +}
> +
> +static void msr_event_del(struct perf_event *event, int flags)
> +{
> + msr_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static int msr_event_add(struct perf_event *event, int flags)
> +{
> + if (flags & PERF_EF_START)
> + msr_event_start(event, flags);
> +
> + return 0;
> +}
> +
> +static struct pmu pmu_msr = {
> + .task_ctx_nr = perf_sw_context,
> + .attr_groups = attr_groups,
> + .event_init = msr_event_init,
> + .add = msr_event_add,
> + .del = msr_event_del,
> + .start = msr_event_start,
> + .stop = msr_event_stop,
> + .read = msr_event_update,
> + .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
> +};
> +
> +static int __init intel_msr_init(int idx)
> +{
> + if (boot_cpu_data.x86 != 6)
> + return 0;
> +
> + switch (boot_cpu_data.x86_model) {
> + case 30: /* 45nm Nehalem */
> + case 26: /* 45nm Nehalem-EP */
> + case 46: /* 45nm Nehalem-EX */
> +
> + case 37: /* 32nm Westmere */
> + case 44: /* 32nm Westmere-EP */
> + case 47: /* 32nm Westmere-EX */
> +
> + case 42: /* 32nm SandyBridge */
> + case 45: /* 32nm SandyBridge-E/EN/EP */
> +
> + case 58: /* 22nm IvyBridge */
> + case 62: /* 22nm IvyBridge-EP/EX */
> +
> + case 60: /* 22nm Haswell Core */
> + case 63: /* 22nm Haswell Server */
> + case 69: /* 22nm Haswell ULT */
> + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
> +
> + case 61: /* 14nm Broadwell Core-M */
> + case 86: /* 14nm Broadwell Xeon D */
> + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
> + case 79: /* 14nm Broadwell Server */
> + events_attrs[idx++] = &evattr_smi.attr.attr;
> + break;
> +
> + case 78: /* 14nm Skylake Mobile */
> + case 94: /* 14nm Skylake Desktop */
> + events_attrs[idx++] = &evattr_pperf.attr.attr;
> + events_attrs[idx++] = &evattr_smi.attr.attr;
> + break;
> +
> + case 55: /* 22nm Atom "Silvermont" */
> + case 76: /* 14nm Atom "Airmont" */
> + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
> + events_attrs[idx++] = &evattr_smi.attr.attr;
> + break;
> + }
> +
> + events_attrs[idx] = NULL;
> +
> + return 0;
> +}
> +
> +static int __init amd_msr_init(int idx)
> +{
> + return 0;
> +}
> +
> +static int __init msr_init(void)
> +{
> + int err;
> + int idx = 1;
> +
> + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
> + events_attrs[idx++] = &evattr_aperf.attr.attr;
> + events_attrs[idx++] = &evattr_mperf.attr.attr;
> + events_attrs[idx] = NULL;
> + }
> +
> + switch (boot_cpu_data.x86_vendor) {
> + case X86_VENDOR_INTEL:
> + err = intel_msr_init(idx);
> + break;
> +
> + case X86_VENDOR_AMD:
> + err = amd_msr_init(idx);
> + break;
> +
> + default:
> + err = -ENOTSUPP;
> + }
> +
> + if (err != 0) {
> + pr_cont("no msr PMU driver.\n");
> + return 0;
> + }
> +
> + perf_pmu_register(&pmu_msr, "msr", -1);
> +
> + return 0;
> +}
> +device_initcall(msr_init);
On Thu, Jul 23, 2015 at 8:44 AM, Stephane Eranian <[email protected]> wrote:
> I understand the value of the tsc and smi events. It is not
> clear to me what aperf/mperf buys you over cycles and ref-cycles:
>
> $ perf stat -a -e msr/aperf/,msr/mperf/,cycles,ref-cycles -C 1 -I 1000 sleep 10
> # time counts unit events
> 2.000361718 14,826,353 msr/aperf/
> 2.000361718 11,865,170 msr/mperf/
> 2.000361718 17,170,101 cycles
> 2.000361718 13,629,675 ref-cycles
>
> Only the ratio aperf/mperf is defined, here 1.25 and the ratio
> cycles/ref-cycles is 1.25 as well. So what is a situation where
> aperf/mperf provides better info than cycles/ref-cycles?
> The SDM also says aperf/mperf only defined when running in C0 mode.
They're free-running and always on, which means that you can never
fail to schedule them.
--Andy
On Thu, Jul 23, 2015 at 8:52 AM, Andy Lutomirski <[email protected]> wrote:
> On Thu, Jul 23, 2015 at 8:44 AM, Stephane Eranian <[email protected]> wrote:
>> I understand the value of the tsc and smi events. It is not
>> clear to me what aperf/mperf buys you over cycles and ref-cycles:
>>
>> $ perf stat -a -e msr/aperf/,msr/mperf/,cycles,ref-cycles -C 1 -I 1000 sleep 10
>> # time counts unit events
>> 2.000361718 14,826,353 msr/aperf/
>> 2.000361718 11,865,170 msr/mperf/
>> 2.000361718 17,170,101 cycles
>> 2.000361718 13,629,675 ref-cycles
>>
>> Only the ratio aperf/mperf is defined, here 1.25 and the ratio
>> cycles/ref-cycles is 1.25 as well. So what is a situation where
>> aperf/mperf provides better info than cycles/ref-cycles?
>> The SDM also says aperf/mperf only defined when running in C0 mode.
>
> They're free-running and always on, which means that you can never
> fail to schedule them.
>
You get the same with cycles and ref-cycles. They can both run on
fixed counters, so you can always schedule them. If you cannot, then it
means you are already measuring them.
The only case I can see where there is a benefit is if you have
competing system-wide and per-thread sessions, the former already using
all the generic counters plus the fixed ones, and you come in with a
per-thread event to measure cycles or ref-cycles. That would be rejected,
but aperf/mperf would not. And that would only work if you are counting;
there would be no benefit in sampling mode.
> I understand the value of the tsc and smi events. It is not
> clear to me what aperf/mperf buys you over cycles and ref-cycles:
>
> $ perf stat -a -e msr/aperf/,msr/mperf/,cycles,ref-cycles -C 1 -I 1000 sleep 10
> # time counts unit events
> 2.000361718 14,826,353 msr/aperf/
> 2.000361718 11,865,170 msr/mperf/
> 2.000361718 17,170,101 cycles
> 2.000361718 13,629,675 ref-cycles
>
> Only the ratio aperf/mperf is defined, here 1.25 and the ratio
> cycles/ref-cycles is 1.25 as well. So what is a situation where
> aperf/mperf provides better info than cycles/ref-cycles?
> The SDM also says aperf/mperf only defined when running in C0 mode.
aperf/mperf excludes idle and it accounts for throttling and p-state
coordination. So it's an indicator of how busy the CPU is while running.
That's why it is used as input to the p-state algorithm.
cycles/ref-cycles is just the core busy indicator related to idle.
That's a different metric.
Also we have PPERF now on some parts (productive performance count),
but it also needs a ratio, so MPERF is needed to get the ratio.
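As a concrete illustration, using the sample counts quoted earlier (the base
frequency here is a made-up value, purely for the sake of the example):

#include <stdio.h>

int main(void)
{
	/* APERF/MPERF counts from the perf stat sample above. */
	double aperf = 14826353.0;
	double mperf = 11865170.0;
	/* Assumed nominal (TSC) frequency; illustrative value only. */
	double base_ghz = 2.0;

	/* The ratio is only meaningful for the time spent in C0. */
	printf("APERF/MPERF ratio: %.2f\n", aperf / mperf);
	printf("average non-idle frequency: %.2f GHz\n",
	       base_ghz * aperf / mperf);
	return 0;
}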
BTW, with the table-driven scheme adding new MSRs is very cheap
(usually just a table entry), so there is little reason not to add them.
-Andi
On Mon, Jul 20, 2015 at 11:49:06AM -0400, Kan Liang wrote:
> From: Andy Lutomirski <[email protected]>
>
> This patch adds freq PMU to support time and freq related counters
> includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
>
> The events are exposed in sysfs for use by perf stat and other tools.
> The files are under /sys/devices/freq/events/
>
> These events only support system-wide mode counting.
>
> The PMU type (attr->type) is PERF_TYPE_FREQ.
>
> Example:
>
> To caculate the CPU%
> CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
>
> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
> 3164023,,ref-cycles,1048387386,100.00
> 2410812089,,freq/tsc/,1050022373,100.00
> The CPU% for sleep is 0.13%.
>
> $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 busyloop
> 15662183572,,ref-cycles,6822637855,100.00
> 15667608992,,freq/tsc/,6823978523,100.00
> The CPU% for busy loop is 99.9%.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Kan Liang <[email protected]>
SNIP
> diff --git a/arch/x86/kernel/cpu/perf_event_freq.c b/arch/x86/kernel/cpu/perf_event_freq.c
> new file mode 100644
> index 0000000..9389b3b
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/perf_event_freq.c
> @@ -0,0 +1,207 @@
> +#include <linux/perf_event.h>
> +
> +enum perf_freq_id {
> + /*
> + * freq events, generalized by the kernel:
> + */
> + PERF_FREQ_TSC = 0,
> + PERF_FREQ_APERF = 1,
> + PERF_FREQ_MPERF = 2,
> + PERF_FREQ_PPERF = 3,
> +
> + PERF_FREQ_EVENT_MAX, /* non-ABI */
should this be in uapi header?
jirka
On Jul 26, 2015 9:31 AM, "Jiri Olsa" <[email protected]> wrote:
>
> On Mon, Jul 20, 2015 at 11:49:06AM -0400, Kan Liang wrote:
> > From: Andy Lutomirski <[email protected]>
> >
> > This patch adds freq PMU to support time and freq related counters
> > includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF.
> >
> > The events are exposed in sysfs for use by perf stat and other tools.
> > The files are under /sys/devices/freq/events/
> >
> > These events only support system-wide mode counting.
> >
> > The PMU type (attr->type) is PERF_TYPE_FREQ.
> >
> > Example:
> >
> > To caculate the CPU%
> > CPU_Utilization = CPU_CLK_UNHALTED.REF_TSC / TSC
> >
> > $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 sleep 1
> > 3164023,,ref-cycles,1048387386,100.00
> > 2410812089,,freq/tsc/,1050022373,100.00
> > The CPU% for sleep is 0.13%.
> >
> > $ perf stat -e '{ref-cycles,freq/tsc/}' -C0 -- taskset -c 0 busyloop
> > 15662183572,,ref-cycles,6822637855,100.00
> > 15667608992,,freq/tsc/,6823978523,100.00
> > The CPU% for busy loop is 99.9%.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > Signed-off-by: Kan Liang <[email protected]>
>
> SNIP
>
> > diff --git a/arch/x86/kernel/cpu/perf_event_freq.c b/arch/x86/kernel/cpu/perf_event_freq.c
> > new file mode 100644
> > index 0000000..9389b3b
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/perf_event_freq.c
> > @@ -0,0 +1,207 @@
> > +#include <linux/perf_event.h>
> > +
> > +enum perf_freq_id {
> > + /*
> > + * freq events, generalized by the kernel:
> > + */
> > + PERF_FREQ_TSC = 0,
> > + PERF_FREQ_APERF = 1,
> > + PERF_FREQ_MPERF = 2,
> > + PERF_FREQ_PPERF = 3,
> > +
> > + PERF_FREQ_EVENT_MAX, /* non-ABI */
>
> should this be in uapi header?
Possibly not, since user code can fish them out of sysfs.
Commit-ID: b7b7c7821d932ba18ef6c8eafc8536066b4c2ef4
Gitweb: http://git.kernel.org/tip/b7b7c7821d932ba18ef6c8eafc8536066b4c2ef4
Author: Andy Lutomirski <[email protected]>
AuthorDate: Mon, 20 Jul 2015 11:49:06 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:17:00 +0200
perf/x86: Add an MSR PMU driver
This patch adds an MSR PMU to support free-running MSR counters, such as
the time- and frequency-related counters TSC, IA32_APERF, IA32_MPERF and
IA32_PPERF, but also SMI_COUNT.
The events are exposed in sysfs for use by perf stat and other tools.
The files are under /sys/devices/msr/events/
Signed-off-by: Andy Lutomirski <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
[ s/freq/msr/, added SMI_COUNT, fixed bugs. ]
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/Makefile | 2 +
arch/x86/kernel/cpu/perf_event_msr.c | 242 +++++++++++++++++++++++++++++++++++
2 files changed, 244 insertions(+)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 9bff687..4eb065c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o
endif
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
new file mode 100644
index 0000000..af216e9
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -0,0 +1,242 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+ PERF_MSR_TSC = 0,
+ PERF_MSR_APERF = 1,
+ PERF_MSR_MPERF = 2,
+ PERF_MSR_PPERF = 3,
+ PERF_MSR_SMI = 4,
+
+ PERF_MSR_EVENT_MAX,
+};
+
+struct perf_msr {
+ int id;
+ u64 msr;
+};
+
+static struct perf_msr msr[] = {
+ { PERF_MSR_TSC, 0 },
+ { PERF_MSR_APERF, MSR_IA32_APERF },
+ { PERF_MSR_MPERF, MSR_IA32_MPERF },
+ { PERF_MSR_PPERF, MSR_PPERF },
+ { PERF_MSR_SMI, MSR_SMI_COUNT },
+};
+
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+ &evattr_tsc.attr.attr,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrl(event->hw.event_base, now);
+ else
+ now = rdtsc();
+
+ return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+ u64 prev, now;
+ s64 delta;
+
+ /* Careful, an NMI might modify the previous event value. */
+again:
+ prev = local64_read(&event->hw.prev_count);
+ now = msr_read_counter(event);
+
+ if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+ goto again;
+
+ delta = now - prev;
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+ delta <<= 32;
+ delta >>= 32; /* sign extend */
+ }
+ local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+ u64 now;
+
+ now = msr_read_counter(event);
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+ msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+ msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ msr_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_msr = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = msr_event_init,
+ .add = msr_event_add,
+ .del = msr_event_del,
+ .start = msr_event_start,
+ .stop = msr_event_stop,
+ .read = msr_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init intel_msr_init(int idx)
+{
+ if (boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+ events_attrs[idx++] = &evattr_smi.attr.attr;
+ break;
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ events_attrs[idx++] = &evattr_pperf.attr.attr;
+ events_attrs[idx++] = &evattr_smi.attr.attr;
+ break;
+
+ case 55: /* 22nm Atom "Silvermont" */
+ case 76: /* 14nm Atom "Airmont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ events_attrs[idx++] = &evattr_smi.attr.attr;
+ break;
+ }
+
+ events_attrs[idx] = NULL;
+
+ return 0;
+}
+
+static int __init amd_msr_init(int idx)
+{
+ return 0;
+}
+
+static int __init msr_init(void)
+{
+ int err;
+ int idx = 1;
+
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
+ events_attrs[idx++] = &evattr_aperf.attr.attr;
+ events_attrs[idx++] = &evattr_mperf.attr.attr;
+ events_attrs[idx] = NULL;
+ }
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_msr_init(idx);
+ break;
+
+ case X86_VENDOR_AMD:
+ err = amd_msr_init(idx);
+ break;
+
+ default:
+ err = -ENOTSUPP;
+ }
+
+ if (err != 0) {
+ pr_cont("no msr PMU driver.\n");
+ return 0;
+ }
+
+ perf_pmu_register(&pmu_msr, "msr", -1);
+
+ return 0;
+}
+device_initcall(msr_init);
On Tue, Aug 4, 2015 at 2:01 AM, tip-bot for Andy Lutomirski
<[email protected]> wrote:
> Commit-ID: b7b7c7821d932ba18ef6c8eafc8536066b4c2ef4
> Gitweb: http://git.kernel.org/tip/b7b7c7821d932ba18ef6c8eafc8536066b4c2ef4
> Author: Andy Lutomirski <[email protected]>
I think I'm slightly late to the party reviewing what appears to be my
own code :)
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/perf_event_msr.c
> @@ -0,0 +1,242 @@
> +#include <linux/perf_event.h>
> +
> +enum perf_msr_id {
> + PERF_MSR_TSC = 0,
> + PERF_MSR_APERF = 1,
> + PERF_MSR_MPERF = 2,
> + PERF_MSR_PPERF = 3,
> + PERF_MSR_SMI = 4,
> +
> + PERF_MSR_EVENT_MAX,
> +};
> +
> +struct perf_msr {
> + int id;
> + u64 msr;
> +};
> +
> +static struct perf_msr msr[] = {
> + { PERF_MSR_TSC, 0 },
> + { PERF_MSR_APERF, MSR_IA32_APERF },
> + { PERF_MSR_MPERF, MSR_IA32_MPERF },
> + { PERF_MSR_PPERF, MSR_PPERF },
> + { PERF_MSR_SMI, MSR_SMI_COUNT },
> +};
I think this could be easier to work with if it were [PERF_MSR_TSC] =
{...}, etc. No big deal, though, until the list gets long. However,
it might make fixing the apparent issue below easier...
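Something along these lines, purely as an illustration of the
designated-initializer form (same entries as above):

static struct perf_msr msr[] = {
	[PERF_MSR_TSC]   = { PERF_MSR_TSC,   0              },
	[PERF_MSR_APERF] = { PERF_MSR_APERF, MSR_IA32_APERF },
	[PERF_MSR_MPERF] = { PERF_MSR_MPERF, MSR_IA32_MPERF },
	[PERF_MSR_PPERF] = { PERF_MSR_PPERF, MSR_PPERF      },
	[PERF_MSR_SMI]   = { PERF_MSR_SMI,   MSR_SMI_COUNT  },
};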
> +static int msr_event_init(struct perf_event *event)
> +{
> + u64 cfg = event->attr.config;
...
> + event->hw.event_base = msr[cfg].msr;
Shouldn't this verify that the fancy enumeration code actually
believes that msr[cfg] exists on this system? Otherwise we might have
a very short wait until the perf fuzzer oopses this thing :)
--Andy
> > +
> > +enum perf_msr_id {
> > + PERF_MSR_TSC = 0,
> > + PERF_MSR_APERF = 1,
> > + PERF_MSR_MPERF = 2,
> > + PERF_MSR_PPERF = 3,
> > + PERF_MSR_SMI = 4,
> > +
> > + PERF_MSR_EVENT_MAX,
> > +};
> > +
> > +struct perf_msr {
> > + int id;
> > + u64 msr;
> > +};
> > +
> > +static struct perf_msr msr[] = {
> > + { PERF_MSR_TSC, 0 },
> > + { PERF_MSR_APERF, MSR_IA32_APERF },
> > + { PERF_MSR_MPERF, MSR_IA32_MPERF },
> > + { PERF_MSR_PPERF, MSR_PPERF },
> > + { PERF_MSR_SMI, MSR_SMI_COUNT }, };
>
> I think this could be easier to work with if it were [PERF_MSR_TSC] = {...},
> etc. No big deal, though, until the list gets long. However, it might make
> fixing the apparent issue below easier...
>
> > +static int msr_event_init(struct perf_event *event) {
> > + u64 cfg = event->attr.config;
>
> ...
>
> > + event->hw.event_base = msr[cfg].msr;
>
> Shouldn't this verify that the fancy enumeration code actually believes that
> msr[cfg] exists on this system? Otherwise we might have a very short wait
> until the perf fuzzer oopses this thing :)
>
I think we already did the check before using msr[cfg].
> +
> + if (cfg >= PERF_MSR_EVENT_MAX)
> + return -EINVAL;
> +
Kan
On Tue, Aug 4, 2015 at 8:03 AM, Liang, Kan <[email protected]> wrote:
>
>> > +
>> > +enum perf_msr_id {
>> > + PERF_MSR_TSC = 0,
>> > + PERF_MSR_APERF = 1,
>> > + PERF_MSR_MPERF = 2,
>> > + PERF_MSR_PPERF = 3,
>> > + PERF_MSR_SMI = 4,
>> > +
>> > + PERF_MSR_EVENT_MAX,
>> > +};
>> > +
>> > +struct perf_msr {
>> > + int id;
>> > + u64 msr;
>> > +};
>> > +
>> > +static struct perf_msr msr[] = {
>> > + { PERF_MSR_TSC, 0 },
>> > + { PERF_MSR_APERF, MSR_IA32_APERF },
>> > + { PERF_MSR_MPERF, MSR_IA32_MPERF },
>> > + { PERF_MSR_PPERF, MSR_PPERF },
>> > + { PERF_MSR_SMI, MSR_SMI_COUNT }, };
>>
>> I think this could be easier to work with if it were [PERF_MSR_TSC] = {...},
>> etc. No big deal, though, until the list gets long. However, it might make
>> fixing the apparent issue below easier...
>>
>> > +static int msr_event_init(struct perf_event *event) {
>> > + u64 cfg = event->attr.config;
>>
>> ...
>>
>> > + event->hw.event_base = msr[cfg].msr;
>>
>> Shouldn't this verify that the fancy enumeration code actually believes that
>> msr[cfg] exists on this system? Otherwise we might have a very short wait
>> until the perf fuzzer oopses this thing :)
>>
>
> I think we already did the check before using msr[cfg].
Where? All I see is:
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
--Andy
>
> On Tue, Aug 4, 2015 at 8:03 AM, Liang, Kan <[email protected]> wrote:
> >
> >> > +
> >> > +enum perf_msr_id {
> >> > + PERF_MSR_TSC = 0,
> >> > + PERF_MSR_APERF = 1,
> >> > + PERF_MSR_MPERF = 2,
> >> > + PERF_MSR_PPERF = 3,
> >> > + PERF_MSR_SMI = 4,
> >> > +
> >> > + PERF_MSR_EVENT_MAX,
> >> > +};
> >> > +
> >> > +struct perf_msr {
> >> > + int id;
> >> > + u64 msr;
> >> > +};
> >> > +
> >> > +static struct perf_msr msr[] = {
> >> > + { PERF_MSR_TSC, 0 },
> >> > + { PERF_MSR_APERF, MSR_IA32_APERF },
> >> > + { PERF_MSR_MPERF, MSR_IA32_MPERF },
> >> > + { PERF_MSR_PPERF, MSR_PPERF },
> >> > + { PERF_MSR_SMI, MSR_SMI_COUNT },
> >> > +};
> >>
> >> I think this could be easier to work with if it were [PERF_MSR_TSC] =
> >> {...}, etc. No big deal, though, until the list gets long. However,
> >> it might make fixing the apparent issue below easier...
> >>
> >> > +static int msr_event_init(struct perf_event *event)
> >> > +{
> >> > + u64 cfg = event->attr.config;
> >>
> >> ...
> >>
> >> > + event->hw.event_base = msr[cfg].msr;
> >>
> >> Shouldn't this verify that the fancy enumeration code actually
> >> believes that msr[cfg] exists on this system? Otherwise we might have
> >> a very short wait until the perf fuzzer oopses this thing :)
> >>
> >
> > I think we already did the check before using msr[cfg].
>
> Where? All I see is:
>
> + if (cfg >= PERF_MSR_EVENT_MAX)
> + return -EINVAL;
Yes, we check cfg here. So msr[cfg] should be always available.
On Tue, Aug 4, 2015 at 11:11 AM, Liang, Kan <[email protected]> wrote:
>
>
>>
>> On Tue, Aug 4, 2015 at 8:03 AM, Liang, Kan <[email protected]> wrote:
>> >
>> >> > +
>> >> > +enum perf_msr_id {
>> >> > + PERF_MSR_TSC = 0,
>> >> > + PERF_MSR_APERF = 1,
>> >> > + PERF_MSR_MPERF = 2,
>> >> > + PERF_MSR_PPERF = 3,
>> >> > + PERF_MSR_SMI = 4,
>> >> > +
>> >> > + PERF_MSR_EVENT_MAX,
>> >> > +};
>> >> > +
>> >> > +struct perf_msr {
>> >> > + int id;
>> >> > + u64 msr;
>> >> > +};
>> >> > +
>> >> > +static struct perf_msr msr[] = {
>> >> > + { PERF_MSR_TSC, 0 },
>> >> > + { PERF_MSR_APERF, MSR_IA32_APERF },
>> >> > + { PERF_MSR_MPERF, MSR_IA32_MPERF },
>> >> > + { PERF_MSR_PPERF, MSR_PPERF },
>> >> > + { PERF_MSR_SMI, MSR_SMI_COUNT },
>> >> > +};
>> >>
>> >> I think this could be easier to work with if it were [PERF_MSR_TSC] =
>> >> {...}, etc. No big deal, though, until the list gets long. However,
>> >> it might make fixing the apparent issue below easier...
>> >>
>> >> > +static int msr_event_init(struct perf_event *event)
>> >> > +{
>> >> > + u64 cfg = event->attr.config;
>> >>
>> >> ...
>> >>
>> >> > + event->hw.event_base = msr[cfg].msr;
>> >>
>> >> Shouldn't this verify that the fancy enumeration code actually
>> >> believes that msr[cfg] exists on this system? Otherwise we might have
>> >> a very short wait until the perf fuzzer oopses this thing :)
>> >>
>> >
>> > I think we already did the check before using msr[cfg].
>>
>> Where? All I see is:
>>
>> + if (cfg >= PERF_MSR_EVENT_MAX)
>> + return -EINVAL;
>
> Yes, we check cfg here. So msr[cfg] should be always available.
>
PERF_MSR_EVENT_MAX is a constant. If I run this thing on an AMD CPU
that supports TSC, APERF, MPERF, and nothing else, and someone asks
for PPERF, then the check will succeed and we'll oops, right?
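(To make that concrete: pperf would not be listed in sysfs on such a box, but a
raw config value still reaches it, so something like the following would pass
the cfg bound check at init time and later fault in rdmsrl():)

$ perf stat -e 'msr/event=0x03/' -a sleep 1      # PPERF on a CPU without it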
--Andy
> On Tue, Aug 4, 2015 at 11:11 AM, Liang, Kan <[email protected]> wrote:
> >
> >
> >>
> >> On Tue, Aug 4, 2015 at 8:03 AM, Liang, Kan <[email protected]> wrote:
> >> >
> >> >> > +
> >> >> > +enum perf_msr_id {
> >> >> > + PERF_MSR_TSC = 0,
> >> >> > + PERF_MSR_APERF = 1,
> >> >> > + PERF_MSR_MPERF = 2,
> >> >> > + PERF_MSR_PPERF = 3,
> >> >> > + PERF_MSR_SMI = 4,
> >> >> > +
> >> >> > + PERF_MSR_EVENT_MAX,
> >> >> > +};
> >> >> > +
> >> >> > +struct perf_msr {
> >> >> > + int id;
> >> >> > + u64 msr;
> >> >> > +};
> >> >> > +
> >> >> > +static struct perf_msr msr[] = {
> >> >> > + { PERF_MSR_TSC, 0 },
> >> >> > + { PERF_MSR_APERF, MSR_IA32_APERF },
> >> >> > + { PERF_MSR_MPERF, MSR_IA32_MPERF },
> >> >> > + { PERF_MSR_PPERF, MSR_PPERF },
> >> >> > + { PERF_MSR_SMI, MSR_SMI_COUNT },
> >> >> > +};
> >> >>
> >> >> I think this could be easier to work with if it were [PERF_MSR_TSC] =
> >> >> {...}, etc. No big deal, though, until the list gets long. However,
> >> >> it might make fixing the apparent issue below easier...
> >> >>
> >> >> > +static int msr_event_init(struct perf_event *event)
> >> >> > +{
> >> >> > + u64 cfg = event->attr.config;
> >> >>
> >> >> ...
> >> >>
> >> >> > + event->hw.event_base = msr[cfg].msr;
> >> >>
> >> >> Shouldn't this verify that the fancy enumeration code actually
> >> >> believes that msr[cfg] exists on this system? Otherwise we might
> >> >> have a very short wait until the perf fuzzer oopses this thing :)
> >> >>
> >> >
> >> > I think we already did the check before using msr[cfg].
> >>
> >> Where? All I see is:
> >>
> >> + if (cfg >= PERF_MSR_EVENT_MAX)
> >> + return -EINVAL;
> >
> > Yes, we check cfg here. So msr[cfg] should be always available.
> >
>
> PERF_MSR_EVENT_MAX is a constant. If I run this thing on an AMD CPU
> that supports TSC, APERF, MPERF, and nothing else, and someone asks for
> PPERF, then the check will succeed and we'll oops, right?
>
Right, it could be a problem.
How about the patch as below?
---
From 0217ffc9a0d2fac6417552b9f66fafc538ef9068 Mon Sep 17 00:00:00 2001
Date: Tue, 4 Aug 2015 08:27:19 -0400
Subject: [PATCH 1/1] perf/x86/msr: Fix issue of accessing unsupported MSR
events
Most platforms only support a subset of the MSR events. If an unsupported
MSR event is accessed by mistake, it may cause an oops.
Introduce .available to mark the supported MSR events, and check it at
event init.
Reported-by: Andy Lutomirski <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
---
arch/x86/kernel/cpu/perf_event_msr.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index af216e9..c174242 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -13,14 +13,15 @@ enum perf_msr_id {
struct perf_msr {
int id;
u64 msr;
+ bool available;
};
static struct perf_msr msr[] = {
- { PERF_MSR_TSC, 0 },
- { PERF_MSR_APERF, MSR_IA32_APERF },
- { PERF_MSR_MPERF, MSR_IA32_MPERF },
- { PERF_MSR_PPERF, MSR_PPERF },
- { PERF_MSR_SMI, MSR_SMI_COUNT },
+ { PERF_MSR_TSC, 0, true },
+ { PERF_MSR_APERF, MSR_IA32_APERF, false },
+ { PERF_MSR_MPERF, MSR_IA32_MPERF, false },
+ { PERF_MSR_PPERF, MSR_PPERF, false },
+ { PERF_MSR_SMI, MSR_SMI_COUNT, false },
};
PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
@@ -74,6 +75,10 @@ static int msr_event_init(struct perf_event *event)
event->attr.sample_period) /* no sampling */
return -EINVAL;
+ /* The platform may not support all events */
+ if (!msr[cfg].available)
+ return -EINVAL;
+
event->hw.idx = -1;
event->hw.event_base = msr[cfg].msr;
event->hw.config = cfg;
@@ -181,18 +186,22 @@ static int __init intel_msr_init(int idx)
case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
case 79: /* 14nm Broadwell Server */
events_attrs[idx++] = &evattr_smi.attr.attr;
+ msr[PERF_MSR_SMI].available = true;
break;
case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
events_attrs[idx++] = &evattr_pperf.attr.attr;
events_attrs[idx++] = &evattr_smi.attr.attr;
+ msr[PERF_MSR_PPERF].available = true;
+ msr[PERF_MSR_SMI].available = true;
break;
case 55: /* 22nm Atom "Silvermont" */
case 76: /* 14nm Atom "Airmont" */
case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
events_attrs[idx++] = &evattr_smi.attr.attr;
+ msr[PERF_MSR_SMI].available = true;
break;
}
@@ -215,6 +224,8 @@ static int __init msr_init(void)
events_attrs[idx++] = &evattr_aperf.attr.attr;
events_attrs[idx++] = &evattr_mperf.attr.attr;
events_attrs[idx] = NULL;
+ msr[PERF_MSR_APERF].available = true;
+ msr[PERF_MSR_MPERF].available = true;
}
switch (boot_cpu_data.x86_vendor) {
--
On Tue, Aug 04, 2015 at 08:39:27PM +0000, Liang, Kan wrote:
> Right, it could be a problem.
> How about the patch as below?
X-MS-Has-Attach:
X-MS-TNEF-Correlator:
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Your outlook wrecked it, please use a less broken MUA.
How about we do the below instead? Its also virt proof by virtue of
probing if we can read the MSRs.
---
arch/x86/kernel/cpu/perf_event_msr.c | 122 +++++++++++------------------------
1 file changed, 39 insertions(+), 83 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index af216e9223e8..e9b6a0058110 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -10,17 +10,21 @@ enum perf_msr_id {
PERF_MSR_EVENT_MAX,
};
+bool test_aperfmperf(void)
+{
+ return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+bool test_intel(void)
+{
+ return boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6;
+}
+
struct perf_msr {
- int id;
u64 msr;
-};
-
-static struct perf_msr msr[] = {
- { PERF_MSR_TSC, 0 },
- { PERF_MSR_APERF, MSR_IA32_APERF },
- { PERF_MSR_MPERF, MSR_IA32_MPERF },
- { PERF_MSR_PPERF, MSR_PPERF },
- { PERF_MSR_SMI, MSR_SMI_COUNT },
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(void);
};
PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
@@ -29,8 +33,17 @@ PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+static struct perf_msr msr[] = {
+ [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+};
+
static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
&evattr_tsc.attr.attr,
+ NULL,
};
static struct attribute_group events_attr_group = {
@@ -74,6 +87,9 @@ static int msr_event_init(struct perf_event *event)
event->attr.sample_period) /* no sampling */
return -EINVAL;
+ if (!msr[cfg].attr)
+ return -EINVAL;
+
event->hw.idx = -1;
event->hw.event_base = msr[cfg].msr;
event->hw.config = cfg;
@@ -151,89 +167,29 @@ static struct pmu pmu_msr = {
.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
};
-static int __init intel_msr_init(int idx)
-{
- if (boot_cpu_data.x86 != 6)
- return 0;
-
- switch (boot_cpu_data.x86_model) {
- case 30: /* 45nm Nehalem */
- case 26: /* 45nm Nehalem-EP */
- case 46: /* 45nm Nehalem-EX */
-
- case 37: /* 32nm Westmere */
- case 44: /* 32nm Westmere-EP */
- case 47: /* 32nm Westmere-EX */
-
- case 42: /* 32nm SandyBridge */
- case 45: /* 32nm SandyBridge-E/EN/EP */
-
- case 58: /* 22nm IvyBridge */
- case 62: /* 22nm IvyBridge-EP/EX */
-
- case 60: /* 22nm Haswell Core */
- case 63: /* 22nm Haswell Server */
- case 69: /* 22nm Haswell ULT */
- case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
- case 61: /* 14nm Broadwell Core-M */
- case 86: /* 14nm Broadwell Xeon D */
- case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
- case 79: /* 14nm Broadwell Server */
- events_attrs[idx++] = &evattr_smi.attr.attr;
- break;
-
- case 78: /* 14nm Skylake Mobile */
- case 94: /* 14nm Skylake Desktop */
- events_attrs[idx++] = &evattr_pperf.attr.attr;
- events_attrs[idx++] = &evattr_smi.attr.attr;
- break;
-
- case 55: /* 22nm Atom "Silvermont" */
- case 76: /* 14nm Atom "Airmont" */
- case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
- events_attrs[idx++] = &evattr_smi.attr.attr;
- break;
- }
-
- events_attrs[idx] = NULL;
-
- return 0;
-}
-
-static int __init amd_msr_init(int idx)
-{
- return 0;
-}
-
static int __init msr_init(void)
{
- int err;
- int idx = 1;
+ int i, j = 1;
- if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
- events_attrs[idx++] = &evattr_aperf.attr.attr;
- events_attrs[idx++] = &evattr_mperf.attr.attr;
- events_attrs[idx] = NULL;
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ pr_cont("no MSR PMU driver.\n");
+ return 0;
}
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- err = intel_msr_init(idx);
- break;
-
- case X86_VENDOR_AMD:
- err = amd_msr_init(idx);
- break;
+ /* Probe the MSRs. */
+ for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+ u64 val;
- default:
- err = -ENOTSUPP;
+ if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
}
- if (err != 0) {
- pr_cont("no msr PMU driver.\n");
- return 0;
+ /* List remaining MSRs in the sysfs attrs. */
+ for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
}
+ events_attrs[j] = NULL;
perf_pmu_register(&pmu_msr, "msr", -1);
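(Usage note, for illustration: with a probing approach like this, only the
events that survive show up under /sys/devices/msr/events/; exactly which
ones depends on the CPU. They can then be counted system-wide, e.g.:)

$ perf stat -e 'msr/tsc/,msr/aperf/,msr/mperf/' -a -- sleep 1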
On Thu, Aug 6, 2015 at 8:21 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Aug 04, 2015 at 08:39:27PM +0000, Liang, Kan wrote:
> - default:
> - err = -ENOTSUPP;
> + if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
> + msr[i].attr = NULL;
IIRC rdmsrl_safe literally never fails under QEMU TCG, and I'm not
entirely sure what happens under KVM if emulation kicks in. It might
pay to keep the model check for the non-architectural stuff, or at
least check for a nonzero return value.
--Andy
On Thu, Aug 06, 2015 at 08:30:08AM -0700, Andy Lutomirski wrote:
> On Thu, Aug 6, 2015 at 8:21 AM, Peter Zijlstra <[email protected]> wrote:
> > On Tue, Aug 04, 2015 at 08:39:27PM +0000, Liang, Kan wrote:
> > - default:
> > - err = -ENOTSUPP;
> > + if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
> > + msr[i].attr = NULL;
>
> IIRC rdmsrl_safe literally never fails under QEMU TCG, and I'm not
*sigh* the borkage never stops does it :-(
> entirely sure what happens under KVM if emulation kicks in. It might
> pay to keep the model check for the non-architectural stuff, or at
> least check for a nonzero return value.
Of course, 0 might be a valid value.. Esp. for the SMI counter.
> + /* Probe the MSRs. */
> + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
> + u64 val;
>
> - default:
> - err = -ENOTSUPP;
> + if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
> + msr[i].attr = NULL;
> }
>
> - if (err != 0) {
> - pr_cont("no msr PMU driver.\n");
> - return 0;
> + /* List remaining MSRs in the sysfs attrs. */
> + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
i should start from PERF_MSR_TSC + 1. The tsc has already been
inserted into events_attrs by default.
> + if (msr[i].attr)
> + events_attrs[j++] = &msr[i].attr->attr.attr;
> }
> + events_attrs[j] = NULL;
>
> perf_pmu_register(&pmu_msr, "msr", -1);
>
On Thu, Aug 06, 2015 at 05:59:43PM +0200, Peter Zijlstra wrote:
> On Thu, Aug 06, 2015 at 08:30:08AM -0700, Andy Lutomirski wrote:
> > On Thu, Aug 6, 2015 at 8:21 AM, Peter Zijlstra <[email protected]> wrote:
> > > On Tue, Aug 04, 2015 at 08:39:27PM +0000, Liang, Kan wrote:
> > > - default:
> > > - err = -ENOTSUPP;
> > > + if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
> > > + msr[i].attr = NULL;
> >
> > IIRC rdmsrl_safe literally never fails under QEMU TCG, and I'm not
>
> *sigh* the borkage never stops does it :-(
>
> > entirely sure what happens under KVM if emulation kicks in. It might
> > pay to keep the model check for the non-architectural stuff, or at
> > least check for a nonzero return value.
>
> Of course, 0 might be a valid value.. Esp. for the SMI counter.
This then..
---
arch/x86/kernel/cpu/perf_event_msr.c | 170 +++++++++++++++++------------------
1 file changed, 85 insertions(+), 85 deletions(-)
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -10,17 +10,63 @@ enum perf_msr_id {
PERF_MSR_EVENT_MAX,
};
+bool test_aperfmperf(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+bool test_intel(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_MSR_SMI)
+ return true;
+ break;
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_MSR_SMI || idx = PERF_MSR_PPERF)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
struct perf_msr {
- int id;
u64 msr;
-};
-
-static struct perf_msr msr[] = {
- { PERF_MSR_TSC, 0 },
- { PERF_MSR_APERF, MSR_IA32_APERF },
- { PERF_MSR_MPERF, MSR_IA32_MPERF },
- { PERF_MSR_PPERF, MSR_PPERF },
- { PERF_MSR_SMI, MSR_SMI_COUNT },
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(int idx);
};
PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
@@ -29,8 +75,16 @@ PMU_EVENT_ATTR_STRING(mperf, evattr_mper
PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+static struct perf_msr msr[] = {
+ [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+};
+
static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
- &evattr_tsc.attr.attr,
+ NULL,
};
static struct attribute_group events_attr_group = {
@@ -74,6 +128,9 @@ static int msr_event_init(struct perf_ev
event->attr.sample_period) /* no sampling */
return -EINVAL;
+ if (!msr[cfg].attr)
+ return -EINVAL;
+
event->hw.idx = -1;
event->hw.event_base = msr[cfg].msr;
event->hw.config = cfg;
@@ -151,89 +208,32 @@ static struct pmu pmu_msr = {
.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
};
-static int __init intel_msr_init(int idx)
-{
- if (boot_cpu_data.x86 != 6)
- return 0;
-
- switch (boot_cpu_data.x86_model) {
- case 30: /* 45nm Nehalem */
- case 26: /* 45nm Nehalem-EP */
- case 46: /* 45nm Nehalem-EX */
-
- case 37: /* 32nm Westmere */
- case 44: /* 32nm Westmere-EP */
- case 47: /* 32nm Westmere-EX */
-
- case 42: /* 32nm SandyBridge */
- case 45: /* 32nm SandyBridge-E/EN/EP */
-
- case 58: /* 22nm IvyBridge */
- case 62: /* 22nm IvyBridge-EP/EX */
-
- case 60: /* 22nm Haswell Core */
- case 63: /* 22nm Haswell Server */
- case 69: /* 22nm Haswell ULT */
- case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
- case 61: /* 14nm Broadwell Core-M */
- case 86: /* 14nm Broadwell Xeon D */
- case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
- case 79: /* 14nm Broadwell Server */
- events_attrs[idx++] = &evattr_smi.attr.attr;
- break;
-
- case 78: /* 14nm Skylake Mobile */
- case 94: /* 14nm Skylake Desktop */
- events_attrs[idx++] = &evattr_pperf.attr.attr;
- events_attrs[idx++] = &evattr_smi.attr.attr;
- break;
-
- case 55: /* 22nm Atom "Silvermont" */
- case 76: /* 14nm Atom "Airmont" */
- case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
- events_attrs[idx++] = &evattr_smi.attr.attr;
- break;
- }
-
- events_attrs[idx] = NULL;
-
- return 0;
-}
-
-static int __init amd_msr_init(int idx)
-{
- return 0;
-}
-
static int __init msr_init(void)
{
- int err;
- int idx = 1;
+ int i, j = 0;
- if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
- events_attrs[idx++] = &evattr_aperf.attr.attr;
- events_attrs[idx++] = &evattr_mperf.attr.attr;
- events_attrs[idx] = NULL;
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ pr_cont("no MSR PMU driver.\n");
+ return 0;
}
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- err = intel_msr_init(idx);
- break;
-
- case X86_VENDOR_AMD:
- err = amd_msr_init(idx);
- break;
-
- default:
- err = -ENOTSUPP;
+ /* Probe the MSRs. */
+ for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+ u64 val;
+
+ /*
+ * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+ */
+ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
}
- if (err != 0) {
- pr_cont("no msr PMU driver.\n");
- return 0;
+ /* List remaining MSRs in the sysfs attrs. */
+ for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
}
+ events_attrs[j] = NULL;
perf_pmu_register(&pmu_msr, "msr", -1);
On Fri, Aug 7, 2015 at 1:34 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Aug 06, 2015 at 05:59:43PM +0200, Peter Zijlstra wrote:
>> On Thu, Aug 06, 2015 at 08:30:08AM -0700, Andy Lutomirski wrote:
>> > On Thu, Aug 6, 2015 at 8:21 AM, Peter Zijlstra <[email protected]> wrote:
>> > > On Tue, Aug 04, 2015 at 08:39:27PM +0000, Liang, Kan wrote:
>> > > - default:
>> > > - err = -ENOTSUPP;
>> > > + if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
>> > > + msr[i].attr = NULL;
>> >
>> > IIRC rdmsrl_safe literally never fails under QEMU TCG, and I'm not
>>
>> *sigh* the borkage never stops does it :-(
>>
>> > entirely sure what happens under KVM if emulation kicks in. It might
>> > pay to keep the model check for the non-architectural stuff, or at
>> > least check for a nonzero return value.
>>
>> Of course, 0 might be a valid value.. Esp. for the SMI counter.
>
> This then..
>
LGTM on brief inspection.
--Andy
On Tue, Aug 04, 2015 at 02:01:20AM -0700, tip-bot for Andy Lutomirski wrote:
SNIP
> + /* unsupported modes and filters */
> + if (event->attr.exclude_user ||
> + event->attr.exclude_kernel ||
> + event->attr.exclude_hv ||
> + event->attr.exclude_idle ||
> + event->attr.exclude_host ||
> + event->attr.exclude_guest ||
> + event->attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + event->hw.idx = -1;
> + event->hw.event_base = msr[cfg].msr;
> + event->hw.config = cfg;
> +
> + return 0;
> +}
> +
> +static inline u64 msr_read_counter(struct perf_event *event)
> +{
> + u64 now;
> +
> + if (event->hw.event_base)
> + rdmsrl(event->hw.event_base, now);
> + else
> + now = rdtsc();
not sure if there's a newer version, but I needed the attached change
to make it compile on latest Arnaldo's tree
jirka
---
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index af216e9..dfa75e5 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -88,7 +88,7 @@ static inline u64 msr_read_counter(struct perf_event *event)
if (event->hw.event_base)
rdmsrl(event->hw.event_base, now);
else
- now = rdtsc();
+ rdtscll(now);
return now;
}
On Mon, Aug 10, 2015 at 01:19:36PM +0200, Jiri Olsa wrote:
> On Tue, Aug 04, 2015 at 02:01:20AM -0700, tip-bot for Andy Lutomirski wrote:
>
> SNIP
>
> > + /* unsupported modes and filters */
> > + if (event->attr.exclude_user ||
> > + event->attr.exclude_kernel ||
> > + event->attr.exclude_hv ||
> > + event->attr.exclude_idle ||
> > + event->attr.exclude_host ||
> > + event->attr.exclude_guest ||
> > + event->attr.sample_period) /* no sampling */
> > + return -EINVAL;
> > +
> > + event->hw.idx = -1;
> > + event->hw.event_base = msr[cfg].msr;
> > + event->hw.config = cfg;
> > +
> > + return 0;
> > +}
> > +
> > +static inline u64 msr_read_counter(struct perf_event *event)
> > +{
> > + u64 now;
> > +
> > + if (event->hw.event_base)
> > + rdmsrl(event->hw.event_base, now);
> > + else
> > + now = rdtsc();
>
> not sure if there's a newer version, but I needed the attached change
> to make it compile on latest Arnaldo's tree
You need tip/x86/asm merged in.
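(Side note, for clarity: the two TSC helpers involved do the same thing and
differ only in form; which one builds depends on whether the tip/x86/asm
rework is merged. Roughly:)

	u64 now;

	now = rdtsc();		/* newer helper: returns the TSC value */
	rdtscll(now);		/* older macro: stores the TSC value in its argument */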
> On Thu, Aug 06, 2015 at 05:59:43PM +0200, Peter Zijlstra wrote:
> > On Thu, Aug 06, 2015 at 08:30:08AM -0700, Andy Lutomirski wrote:
> > > On Thu, Aug 6, 2015 at 8:21 AM, Peter Zijlstra <[email protected]> wrote:
> > > > On Tue, Aug 04, 2015 at 08:39:27PM +0000, Liang, Kan wrote:
> > > > - default:
> > > > - err = -ENOTSUPP;
> > > > + if (!msr[i].test() || rdmsrl_safe(msr[i].msr, &val))
> > > > + msr[i].attr = NULL;
> > >
> > > IIRC rdmsrl_safe literally never fails under QEMU TCG, and I'm not
> >
> > *sigh* the borkage never stops does it :-(
> >
> > > entirely sure what happens under KVM if emulation kicks in. It
> > > might pay to keep the model check for the non-architectural stuff,
> > > or at least check for a nonzero return value.
> >
> > Of course, 0 might be a valid value.. Esp. for the SMI counter.
>
> This then..
>
> ---
> arch/x86/kernel/cpu/perf_event_msr.c | 170 +++++++++++------------------
> 1 file changed, 85 insertions(+), 85 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event_msr.c
> +++ b/arch/x86/kernel/cpu/perf_event_msr.c
> @@ -10,17 +10,63 @@ enum perf_msr_id {
> PERF_MSR_EVENT_MAX,
> };
>
> +bool test_aperfmperf(int idx)
> +{
> + return boot_cpu_has(X86_FEATURE_APERFMPERF);
> +}
> +
> +bool test_intel(int idx)
> +{
> + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
> + boot_cpu_data.x86 != 6)
> + return false;
> +
> + switch (boot_cpu_data.x86_model) {
> + case 30: /* 45nm Nehalem */
> + case 26: /* 45nm Nehalem-EP */
> + case 46: /* 45nm Nehalem-EX */
> +
> + case 37: /* 32nm Westmere */
> + case 44: /* 32nm Westmere-EP */
> + case 47: /* 32nm Westmere-EX */
> +
> + case 42: /* 32nm SandyBridge */
> + case 45: /* 32nm SandyBridge-E/EN/EP */
> +
> + case 58: /* 22nm IvyBridge */
> + case 62: /* 22nm IvyBridge-EP/EX */
> +
> + case 60: /* 22nm Haswell Core */
> + case 63: /* 22nm Haswell Server */
> + case 69: /* 22nm Haswell ULT */
> + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
> +
> + case 61: /* 14nm Broadwell Core-M */
> + case 86: /* 14nm Broadwell Xeon D */
> + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
> + case 79: /* 14nm Broadwell Server */
> +
> + case 55: /* 22nm Atom "Silvermont" */
> + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
> + case 76: /* 14nm Atom "Airmont" */
> + if (idx == PERF_MSR_SMI)
> + return true;
> + break;
> +
> + case 78: /* 14nm Skylake Mobile */
> + case 94: /* 14nm Skylake Desktop */
> + if (idx == PERF_MSR_SMI || idx = PERF_MSR_PPERF)
Here is a typo. The rest looks OK to me.
I did some simple tests on HSX.
Tested-by: Kan Liang <[email protected]>
> + return true;
> + break;
> + }
> +
> + return false;
> +}
> +
> struct perf_msr {
> - int id;
> u64 msr;
> -};
> -
> -static struct perf_msr msr[] = {
> - { PERF_MSR_TSC, 0 },
> - { PERF_MSR_APERF, MSR_IA32_APERF },
> - { PERF_MSR_MPERF, MSR_IA32_MPERF },
> - { PERF_MSR_PPERF, MSR_PPERF },
> - { PERF_MSR_SMI, MSR_SMI_COUNT },
> + struct perf_pmu_events_attr *attr;
> + bool (*test)(int idx);
> };
>
> PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
> @@ -29,8 +75,16 @@ PMU_EVENT_ATTR_STRING(mperf, evattr_mper
> PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
> PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
>
> +static struct perf_msr msr[] = {
> + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
> + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
> + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
> + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
> + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
> +};
> +
> static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
> - &evattr_tsc.attr.attr,
> + NULL,
> };
>
> static struct attribute_group events_attr_group = {
> @@ -74,6 +128,9 @@ static int msr_event_init(struct perf_ev
> event->attr.sample_period) /* no sampling */
> return -EINVAL;
>
> + if (!msr[cfg].attr)
> + return -EINVAL;
> +
> event->hw.idx = -1;
> event->hw.event_base = msr[cfg].msr;
> event->hw.config = cfg;
> @@ -151,89 +208,32 @@ static struct pmu pmu_msr = {
> .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
> };
>
> -static int __init intel_msr_init(int idx)
> -{
> - if (boot_cpu_data.x86 != 6)
> - return 0;
> -
> - switch (boot_cpu_data.x86_model) {
> - case 30: /* 45nm Nehalem */
> - case 26: /* 45nm Nehalem-EP */
> - case 46: /* 45nm Nehalem-EX */
> -
> - case 37: /* 32nm Westmere */
> - case 44: /* 32nm Westmere-EP */
> - case 47: /* 32nm Westmere-EX */
> -
> - case 42: /* 32nm SandyBridge */
> - case 45: /* 32nm SandyBridge-E/EN/EP */
> -
> - case 58: /* 22nm IvyBridge */
> - case 62: /* 22nm IvyBridge-EP/EX */
> -
> - case 60: /* 22nm Haswell Core */
> - case 63: /* 22nm Haswell Server */
> - case 69: /* 22nm Haswell ULT */
> - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
> -
> - case 61: /* 14nm Broadwell Core-M */
> - case 86: /* 14nm Broadwell Xeon D */
> - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
> - case 79: /* 14nm Broadwell Server */
> - events_attrs[idx++] = &evattr_smi.attr.attr;
> - break;
> -
> - case 78: /* 14nm Skylake Mobile */
> - case 94: /* 14nm Skylake Desktop */
> - events_attrs[idx++] = &evattr_pperf.attr.attr;
> - events_attrs[idx++] = &evattr_smi.attr.attr;
> - break;
> -
> - case 55: /* 22nm Atom "Silvermont" */
> - case 76: /* 14nm Atom "Airmont" */
> - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
> - events_attrs[idx++] = &evattr_smi.attr.attr;
> - break;
> - }
> -
> - events_attrs[idx] = NULL;
> -
> - return 0;
> -}
> -
> -static int __init amd_msr_init(int idx)
> -{
> - return 0;
> -}
> -
> static int __init msr_init(void)
> {
> - int err;
> - int idx = 1;
> + int i, j = 0;
>
> - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) {
> - events_attrs[idx++] = &evattr_aperf.attr.attr;
> - events_attrs[idx++] = &evattr_mperf.attr.attr;
> - events_attrs[idx] = NULL;
> + if (!boot_cpu_has(X86_FEATURE_TSC)) {
> + pr_cont("no MSR PMU driver.\n");
> + return 0;
> }
>
> - switch (boot_cpu_data.x86_vendor) {
> - case X86_VENDOR_INTEL:
> - err = intel_msr_init(idx);
> - break;
> -
> - case X86_VENDOR_AMD:
> - err = amd_msr_init(idx);
> - break;
> -
> - default:
> - err = -ENOTSUPP;
> + /* Probe the MSRs. */
> + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
> + u64 val;
> +
> + /*
> + * Virt sucks arse; you cannot tell if a R/O MSR is present :/
> + */
> + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
> + msr[i].attr = NULL;
> }
>
> - if (err != 0) {
> - pr_cont("no msr PMU driver.\n");
> - return 0;
> + /* List remaining MSRs in the sysfs attrs. */
> + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
> + if (msr[i].attr)
> + events_attrs[j++] = &msr[i].attr->attr.attr;
> }
> + events_attrs[j] = NULL;
>
> perf_pmu_register(&pmu_msr, "msr", -1);
>
On Mon, Aug 10, 2015 at 02:02:17PM +0000, Liang, Kan wrote:
> > + case 94: /* 14nm Skylake Desktop */
> > + if (idx == PERF_MSR_SMI || idx = PERF_MSR_PPERF)
>
> Here is a typo. The rest is OK for me.
> I did some simple test on HSX.
Yep, fixed that already.
> Tested-by: Kan Liang <[email protected]>
Thanks!