Date: Thu, 6 Aug 2015 17:44:24 +0200
From: Peter Zijlstra <peterz@infradead.org>
To: kan.liang@intel.com
Cc: mingo@redhat.com, acme@kernel.org, eranian@google.com, ak@linux.intel.com,
        linux-kernel@vger.kernel.org
Subject: Re: [PATCH V2 1/1] perf/x86: Add Intel power cstate PMUs support
Message-ID: <20150806154424.GR19282@twins.programming.kicks-ass.net>
References: <1437986776-8438-1-git-send-email-kan.liang@intel.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <1437986776-8438-1-git-send-email-kan.liang@intel.com>
User-Agent: Mutt/1.5.21 (2012-12-30)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 4336
Lines: 136

On Mon, Jul 27, 2015 at 04:46:16AM -0400, kan.liang@intel.com wrote:

As a general comment; this thing is unreadable. Far too much macro foo
to instantiate the different PMUs.

> +struct perf_power_cstate_event_msr {
> +	int	id;
> +	u64	msr;
> +};
> +
> +enum perf_power_cstate_id {
> +	/*
> +	 * power_cstate events, generalized by the kernel:
> +	 */
> +	PERF_POWER_CORE_C1_RES = 0,
> +	PERF_POWER_CORE_C3_RES,
> +	PERF_POWER_CORE_C6_RES,
> +	PERF_POWER_CORE_C7_RES,

These are two different PMUs, why are they in the same enum space?

> +	PERF_POWER_PKG_C2_RES,
> +	PERF_POWER_PKG_C3_RES,
> +	PERF_POWER_PKG_C6_RES,
> +	PERF_POWER_PKG_C7_RES,
> +	PERF_POWER_PKG_C8_RES,
> +	PERF_POWER_PKG_C9_RES,
> +	PERF_POWER_PKG_C10_RES,
> +
> +	PERF_POWER_CSTATE_EVENT_MAX,       /* non-ABI */
> +};
> +
> +struct power_cstate_pmu {
> +	struct intel_power_cstate_type	*power_cstate_type;
> +	struct pmu			*pmu;
> +};
> +
> +
> +static struct intel_power_cstate_type *empty_power_cstate[] = { NULL, };
> +struct intel_power_cstate_type **power_cstate = empty_power_cstate;
> +
> +static struct perf_power_cstate_event_msr power_cstate_events[] = {
> +	{ PERF_POWER_CORE_C1_RES, MSR_CORE_C1_RES },
> +	{ PERF_POWER_CORE_C3_RES, MSR_CORE_C3_RESIDENCY },
> +	{ PERF_POWER_CORE_C6_RES, MSR_CORE_C6_RESIDENCY },
> +	{ PERF_POWER_CORE_C7_RES, MSR_CORE_C7_RESIDENCY },
> +	{ PERF_POWER_PKG_C2_RES, MSR_PKG_C2_RESIDENCY },
> +	{ PERF_POWER_PKG_C3_RES, MSR_PKG_C3_RESIDENCY },
> +	{ PERF_POWER_PKG_C6_RES, MSR_PKG_C6_RESIDENCY },
> +	{ PERF_POWER_PKG_C7_RES, MSR_PKG_C7_RESIDENCY },
> +	{ PERF_POWER_PKG_C8_RES, MSR_PKG_C8_RESIDENCY },
> +	{ PERF_POWER_PKG_C9_RES, MSR_PKG_C9_RESIDENCY },
> +	{ PERF_POWER_PKG_C10_RES, MSR_PKG_C10_RESIDENCY },
> +};

What's the point of the first entry? You already know which index it is,
no point in storing that again.

> +
> +EVENT_ATTR_STR(c1-residency, power_core_c1_res, "event=0x00");
> +EVENT_ATTR_STR(c3-residency, power_core_c3_res, "event=0x01");
> +EVENT_ATTR_STR(c6-residency, power_core_c6_res, "event=0x02");
> +EVENT_ATTR_STR(c7-residency, power_core_c7_res, "event=0x03");

> +EVENT_ATTR_STR(c2-residency, power_pkg_c2_res, "event=0x04");

I would very much expect this thing to start counting at 0 again.

> +EVENT_ATTR_STR(c3-residency, power_pkg_c3_res, "event=0x05");
> +EVENT_ATTR_STR(c6-residency, power_pkg_c6_res, "event=0x06");
> +EVENT_ATTR_STR(c7-residency, power_pkg_c7_res, "event=0x07");
> +EVENT_ATTR_STR(c8-residency, power_pkg_c8_res, "event=0x08");
> +EVENT_ATTR_STR(c9-residency, power_pkg_c9_res, "event=0x09");
> +EVENT_ATTR_STR(c10-residency, power_pkg_c10_res, "event=0x0a");
> +
> +static cpumask_t power_cstate_core_cpu_mask;

That one typically does not need a cpumask.

> +static cpumask_t power_cstate_pkg_cpu_mask;

> +static int power_cstate_pmu_event_init(struct perf_event *event)
> +{
> +	u64 cfg = event->attr.config;
> +	int ret = 0;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/*
> +	 * check event is known (determines counter)
> +	 */
> +	if (cfg >= PERF_POWER_CSTATE_EVENT_MAX)
> +		return -EINVAL;
> +
> +	/* unsupported modes and filters */
> +	if (event->attr.exclude_user   ||
> +	    event->attr.exclude_kernel ||
> +	    event->attr.exclude_hv     ||
> +	    event->attr.exclude_idle   ||
> +	    event->attr.exclude_host   ||
> +	    event->attr.exclude_guest  ||
> +	    event->attr.sample_period) /* no sampling */
> +		return -EINVAL;

Same as with the MSR thing, but worse. What happens if we hand a config
value for the 'wrong' PMU type?

What happens if we hand a pkg c10 config to one that doesn't have c10?

> +	/* must be done before validate_group */
> +	event->hw.event_base = power_cstate_events[cfg].msr;
> +	event->hw.config = cfg;
> +	event->hw.idx = -1;
> +
> +	return ret;
> +}
> +
> +static inline u64 power_cstate_pmu_read_counter(struct perf_event *event)
> +{
> +	u64 val;
> +
> +	rdmsrl_safe(event->hw.event_base, &val);

No, at this point you had better know if that MSR exists or not. If it
does not the event should not exist.

> +	return val;
> +}

 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/