MIME-Version: 1.0
In-Reply-To: <20150806154424.GR19282@twins.programming.kicks-ass.net>
References: <1437986776-8438-1-git-send-email-kan.liang@intel.com>
	<20150806154424.GR19282@twins.programming.kicks-ass.net>
Date: Thu, 6 Aug 2015 11:14:17 -0700
Message-ID: <CABPqkBR91SjU6UMVbdZ0+CpDw5gFOyFREAOk-qPCvxOkuWGWdA@mail.gmail.com>
Subject: Re: [PATCH V2 1/1] perf/x86: Add Intel power cstate PMUs support
From: Stephane Eranian <eranian@google.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: "Liang, Kan" <kan.liang@intel.com>, "mingo@redhat.com" <mingo@redhat.com>,
        Arnaldo Carvalho de Melo <acme@kernel.org>,
        "ak@linux.intel.com" <ak@linux.intel.com>,
        LKML <linux-kernel@vger.kernel.org>
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 5776
Lines: 161

On Thu, Aug 6, 2015 at 8:44 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jul 27, 2015 at 04:46:16AM -0400, kan.liang@intel.com wrote:
>
> As a general comment; this thing is unreadable. Far too much macro foo
> to instantiate the different PMUs.
>
>> +struct perf_power_cstate_event_msr {
>> +     int     id;
>> +     u64     msr;
>> +};
>> +
>> +enum perf_power_cstate_id {
>> +     /*
>> +      * power_cstate events, generalized by the kernel:
>> +      */
>> +     PERF_POWER_CORE_C1_RES = 0,
>> +     PERF_POWER_CORE_C3_RES,
>> +     PERF_POWER_CORE_C6_RES,
>> +     PERF_POWER_CORE_C7_RES,
>
> These are two different PMUs, why are they in the same enum space?
>
>> +     PERF_POWER_PKG_C2_RES,
>> +     PERF_POWER_PKG_C3_RES,
>> +     PERF_POWER_PKG_C6_RES,
>> +     PERF_POWER_PKG_C7_RES,
>> +     PERF_POWER_PKG_C8_RES,
>> +     PERF_POWER_PKG_C9_RES,
>> +     PERF_POWER_PKG_C10_RES,
>> +
>> +     PERF_POWER_CSTATE_EVENT_MAX,       /* non-ABI */
>> +};
>> +
>> +struct power_cstate_pmu {
>> +     struct intel_power_cstate_type  *power_cstate_type;
>> +     struct pmu                      *pmu;
>> +};
>> +
>> +
>> +static struct intel_power_cstate_type *empty_power_cstate[] = { NULL, };
>> +struct intel_power_cstate_type **power_cstate = empty_power_cstate;
>> +
>> +static struct perf_power_cstate_event_msr power_cstate_events[] = {
>> +     { PERF_POWER_CORE_C1_RES, MSR_CORE_C1_RES },
>> +     { PERF_POWER_CORE_C3_RES, MSR_CORE_C3_RESIDENCY },
>> +     { PERF_POWER_CORE_C6_RES, MSR_CORE_C6_RESIDENCY },
>> +     { PERF_POWER_CORE_C7_RES, MSR_CORE_C7_RESIDENCY },
>> +     { PERF_POWER_PKG_C2_RES, MSR_PKG_C2_RESIDENCY },
>> +     { PERF_POWER_PKG_C3_RES, MSR_PKG_C3_RESIDENCY },
>> +     { PERF_POWER_PKG_C6_RES, MSR_PKG_C6_RESIDENCY },
>> +     { PERF_POWER_PKG_C7_RES, MSR_PKG_C7_RESIDENCY },
>> +     { PERF_POWER_PKG_C8_RES, MSR_PKG_C8_RESIDENCY },
>> +     { PERF_POWER_PKG_C9_RES, MSR_PKG_C9_RESIDENCY },
>> +     { PERF_POWER_PKG_C10_RES, MSR_PKG_C10_RESIDENCY },
>> +};
>
> What's the point of the first entry? You already know which index it is,
> no point in storing that again.
>
>> +
>> +EVENT_ATTR_STR(c1-residency, power_core_c1_res, "event=0x00");

I would not use an event code of 0. Start at 0x1

>> +EVENT_ATTR_STR(c3-residency, power_core_c3_res, "event=0x01");
>> +EVENT_ATTR_STR(c6-residency, power_core_c6_res, "event=0x02");
>> +EVENT_ATTR_STR(c7-residency, power_core_c7_res, "event=0x03");
>
>> +EVENT_ATTR_STR(c2-residency, power_pkg_c2_res, "event=0x04");
>
> I would very much expect this thing to start counting at 0 again.
>
>> +EVENT_ATTR_STR(c3-residency, power_pkg_c3_res, "event=0x05");
>> +EVENT_ATTR_STR(c6-residency, power_pkg_c6_res, "event=0x06");
>> +EVENT_ATTR_STR(c7-residency, power_pkg_c7_res, "event=0x07");
>> +EVENT_ATTR_STR(c8-residency, power_pkg_c8_res, "event=0x08");
>> +EVENT_ATTR_STR(c9-residency, power_pkg_c9_res, "event=0x09");
>> +EVENT_ATTR_STR(c10-residency, power_pkg_c10_res, "event=0x0a");
>> +
>> +static cpumask_t power_cstate_core_cpu_mask;
>
> That one typically does not need a cpumask.
>
You need to pick one CPU out of the multi-core. But it is for client parts thus
there is only one socket. At least this is my understanding.

>> +static cpumask_t power_cstate_pkg_cpu_mask;
>
>> +static int power_cstate_pmu_event_init(struct perf_event *event)
>> +{
>> +     u64 cfg = event->attr.config;
>> +     int ret = 0;
>> +
>> +     if (event->attr.type != event->pmu->type)
>> +             return -ENOENT;
>> +
>> +     /*
>> +      * check event is known (determines counter)
>> +      */
>> +     if (cfg >= PERF_POWER_CSTATE_EVENT_MAX)
>> +             return -EINVAL;
>> +
>> +     /* unsupported modes and filters */
>> +     if (event->attr.exclude_user   ||
>> +         event->attr.exclude_kernel ||
>> +         event->attr.exclude_hv     ||
>> +         event->attr.exclude_idle   ||
>> +         event->attr.exclude_host   ||
>> +         event->attr.exclude_guest  ||
>> +         event->attr.sample_period) /* no sampling */
>> +             return -EINVAL;
>
> Same as with the MSR thing, but worse. What happens if we hand a config
> value for the 'wrong' PMU type?
>
I think that's taken care of by the first test in the function.

> What happens if we hand a pkg c10 config to one that doesn't have c10?
>
>> +     /* must be done before validate_group */
>> +     event->hw.event_base = power_cstate_events[cfg].msr;
>> +     event->hw.config = cfg;
>> +     event->hw.idx = -1;
>> +
>> +     return ret;
>> +}
>> +
>> +static inline u64 power_cstate_pmu_read_counter(struct perf_event *event)
>> +{
>> +     u64 val;
>> +
>> +     rdmsrl_safe(event->hw.event_base, &val);
>
> No, at this point you had better know if that MSR exists or not. If it
> does not the event should not exist.
>
>> +     return val;
>> +}
>
>
I understand that these metrics are useful and needed however if I
look at the broader
picture I see many PMUs doing similar things or appearing different
when they are actually
very close. It would be nice to have a more unified approach. You have
RAPL (client, server)
which appears as the power PMU. You have the PCU uncore on servers
which also provides
C-state residency info. Yet, all these appear differently and expose
events with different names.
I think we could benefit from a more unifie approach here such that
you would be able to do

$    perf stat -a -e power/c6-residency/, power/energy-pkg/

on client and server without having to change the pmu name of the event names.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/