Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753263AbbDFPWn (ORCPT ); Mon, 6 Apr 2015 11:22:43 -0400 Received: from mga03.intel.com ([134.134.136.65]:47671 "EHLO mga03.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753047AbbDFPWk (ORCPT ); Mon, 6 Apr 2015 11:22:40 -0400 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.11,532,1422950400"; d="scan'208";a="551614500" Date: Mon, 6 Apr 2015 08:21:47 -0700 From: Jacob Pan To: Jacob Pan Cc: LKML , Peter Zijlstra , "H. Peter Anvin" , Stephane Eranian , Paul Mackerras , Andi Kleen , Thomas Gleixner , Arnaldo Carvalho de Melo , Vince Weaver Subject: Re: [PATCH v3] perf/rapl: support per domain energy unit Message-ID: <20150406082147.5d183773@icelake> In-Reply-To: <1427405325-780-1-git-send-email-jacob.jun.pan@linux.intel.com> References: <1427405325-780-1-git-send-email-jacob.jun.pan@linux.intel.com> Organization: OTC X-Mailer: Claws Mail 3.9.3 (GTK+ 2.24.23; x86_64-pc-linux-gnu) MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 7900 Lines: 272 On Thu, 26 Mar 2015 14:28:45 -0700 Jacob Pan wrote: > RAPL energy hardware unit can vary within a single CPU package, e.g. > HSW server DRAM has a fixed energy unit of 15.3 uJ (2^-16) whereas > the unit on other domains can be enumerated from power unit MSR. > There might be other variations in the future, this patch adds > per cpu model quirk to allow special handling of certain cpus. > > hw_unit is also removed from per cpu data since it is not per cpu > and the sampling rate for energy counter is typically not high. > > Without this patch, DRAM domain on HSW servers will be counted > 4x higher than the real energy counter. > are there any more comments about this patch? Thanks, Jacob > Signed-off-by: Jacob Pan > --- > arch/x86/kernel/cpu/perf_event_intel_rapl.c | 94 > ++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 21 > deletions(-) > > diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c > b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index c4bb8b8..999289b9 > 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c > +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c > @@ -62,6 +62,14 @@ > #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ > #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding > */ > +#define NR_RAPL_DOMAINS 0x4 > +static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { > + "pp0-core", > + "package", > + "dram", > + "pp1-gpu", > +}; > + > /* Clients have PP0, PKG */ > #define RAPL_IDX_CLN (1< 1< @@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v > = { \ > struct rapl_pmu { > spinlock_t lock; > - int hw_unit; /* 1/2^hw_unit Joule */ > int n_active; /* number of active events */ > struct list_head active_list; > struct pmu *pmu; /* pointer to rapl_pmu_class */ > @@ -120,6 +127,7 @@ struct rapl_pmu { > struct hrtimer hrtimer; > }; > > +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* > 1/2^hw_unit Joule */ static struct pmu rapl_pmu_class; > static cpumask_t rapl_cpu_mask; > static int rapl_cntr_mask; > @@ -127,6 +135,7 @@ static int rapl_cntr_mask; > static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); > static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); > > +static struct x86_pmu_quirk *rapl_quirks; > static inline u64 rapl_read_counter(struct perf_event *event) > { > u64 raw; > @@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct > perf_event *event) return raw; > } > > -static inline u64 rapl_scale(u64 v) > +#define > rapl_add_quirk(func_) > \ +do > { > \ > + static struct x86_pmu_quirk __quirk __initdata = > { \ > + .func = > func_, \ > + }; > \ > + __quirk.next = > rapl_quirks; \ > + rapl_quirks = > &__quirk; \ +} while > (0) + > +static inline u64 rapl_scale(u64 v, int cfg) > { > + if (cfg > NR_RAPL_DOMAINS) { > + pr_warn("invalid domain %d, failed to scale data\n", > cfg); > + return v; > + } > /* > * scale delta to smallest unit (1/2^32) > * users must then scale back: count * 1/(1e9*2^32) to get > Joules > * or use ldexp(count, -32). > * Watts = Joules/Time delta > */ > - return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); > + return v << (32 - rapl_hw_unit[cfg - 1]); > } > > static u64 rapl_event_update(struct perf_event *event) > @@ -173,7 +195,7 @@ again: > delta = (new_raw_count << shift) - (prev_raw_count << shift); > delta >>= shift; > > - sdelta = rapl_scale(delta); > + sdelta = rapl_scale(delta, event->hw.config); > > local64_add(sdelta, &event->count); > > @@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu) > cpumask_set_cpu(cpu, &rapl_cpu_mask); > } > > +static __init void rapl_hsw_server_quirk(void) > +{ > + /* > + * DRAM domain on HSW server has fixed energy unit which can > be > + * different than the unit from power unit MSR. > + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product > Families, V2 > + * of 2. Datasheet, September 2014, Reference Number: > 330784-001 " > + */ > + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; > +} > + > static int rapl_cpu_prepare(int cpu) > { > struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); > int phys_id = topology_physical_package_id(cpu); > u64 ms; > - u64 msr_rapl_power_unit_bits; > > if (pmu) > return 0; > @@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu) > if (phys_id < 0) > return -1; > > - /* protect rdmsrl() to handle virtualization */ > - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, > &msr_rapl_power_unit_bits)) > - return -1; > - > pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, > cpu_to_node(cpu)); if (!pmu) > return -1; > - > spin_lock_init(&pmu->lock); > > INIT_LIST_HEAD(&pmu->active_list); > > - /* > - * grab power unit as: 1/2^unit Joules > - * > - * we cache in local PMU instance > - */ > - pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; > pmu->pmu = &rapl_pmu_class; > > /* > @@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu) > * divide interval by 2 to avoid lockstep (2 * 100) > * if hw unit is 32, then we use 2 ms 1/200/2 > */ > - if (pmu->hw_unit < 32) > - ms = (1000 / (2 * 100)) * (1ULL << (32 - > pmu->hw_unit - 1)); > + if (rapl_hw_unit[0] < 32) > + ms = (1000 / (2 * 100)) * (1ULL << (32 - > rapl_hw_unit[0] - 1)); else > ms = 2; > > @@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct > notifier_block *self, return NOTIFY_OK; > } > > +static int rapl_check_hw_unit(void) > +{ > + u64 msr_rapl_power_unit_bits; > + int i; > + > + /* protect rdmsrl() to handle virtualization */ > + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, > &msr_rapl_power_unit_bits)) > + return -1; > + for (i = 0; i < NR_RAPL_DOMAINS; i++) > + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & > 0x1FULL; + > + return 0; > +} > + > static const struct x86_cpu_id rapl_cpu_match[] = { > [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, > [1] = {}, > @@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void) > { > struct rapl_pmu *pmu; > int cpu, ret; > + struct x86_pmu_quirk *quirk; > + int i; > > /* > * check for Intel processor family 6 > @@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void) > rapl_cntr_mask = RAPL_IDX_CLN; > rapl_pmu_events_group.attrs = rapl_events_cln_attr; > break; > + case 63: /* Haswell-Server */ > + rapl_add_quirk(rapl_hsw_server_quirk); > + rapl_cntr_mask = RAPL_IDX_SRV; > + rapl_pmu_events_group.attrs = rapl_events_srv_attr; > + break; > case 60: /* Haswell */ > case 69: /* Haswell-Celeron */ > rapl_cntr_mask = RAPL_IDX_HSW; > @@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void) > /* unsupported */ > return 0; > } > + ret = rapl_check_hw_unit(); > + if (ret) > + return ret; > > + /* run cpu model quirks */ > + for (quirk = rapl_quirks; quirk; quirk = quirk->next) > + quirk->func(); > cpu_notifier_register_begin(); > > for_each_online_cpu(cpu) { > @@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void) > > pmu = __this_cpu_read(rapl_pmu); > > - pr_info("RAPL PMU detected, hw unit 2^-%d Joules," > + pr_info("RAPL PMU detected," > " API unit is 2^-32 Joules," > " %d fixed counters" > " %llu ms ovfl timer\n", > - pmu->hw_unit, > hweight32(rapl_cntr_mask), > ktime_to_ms(pmu->timer_interval)); > - > + for (i = 0; i < NR_RAPL_DOMAINS; i++) { > + if (rapl_cntr_mask & (1 << i)) { > + pr_info("hw unit of domain %s 2^-%d > Joules\n", > + rapl_domain_names[i], > rapl_hw_unit[i]); > + } > + } > out: > cpu_notifier_register_done(); > [Jacob Pan] -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/