Currently, the energy-cores event in the power PMU aggregates energy
consumption data at the package level. On the other hand, the core energy
RAPL counter in AMD CPUs has core scope (which means the energy
consumption is recorded separately for each core). Earlier efforts to add
the core event to the power PMU failed [1] due to the difference in
scope between these two events. Hence, there is a need for a new
core-scope PMU.
This patchset adds a new "power_per_core" PMU alongside the existing
"power" PMU, which will be responsible for collecting the new
"energy-per-core" event.
Tested the package level and core level PMU counters with workloads
pinned to different CPUs.
Results with workload pinned to CPU 1 in Core 1 on an AMD Zen4 Genoa
machine:
$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
Performance counter stats for 'system wide':
S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/
S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/
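(Rough sanity check: 5.72 Joules over the ~1 second measurement window
corresponds to about 5.7 W on the loaded core, versus roughly 0.02 W on
each idle core.)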
[1]: https://lore.kernel.org/lkml/[email protected]/
This patchset applies cleanly on top of v6.10-rc3 as well as latest
tip/master.
Dhananjay Ugwekar (6):
perf/x86/rapl: Fix the energy-pkg event for AMD CPUs
perf/x86/rapl: Rename rapl_pmu variables
perf/x86/rapl: Make rapl_model struct global
perf/x86/rapl: Move cpumask variable to rapl_pmus struct
perf/x86/rapl: Add wrapper for online/offline functions
perf/x86/rapl: Add per-core energy counter support for AMD CPUs
arch/x86/events/rapl.c | 311 ++++++++++++++++++++++++++++++-----------
1 file changed, 233 insertions(+), 78 deletions(-)
--
2.34.1
After commit 63edbaa48a57 ("x86/cpu/topology: Add support for the AMD
0x80000026 leaf"), on AMD processors that support the extended CPUID leaf
0x80000026, the topology_die_cpumask() and topology_logical_die_id()
macros no longer return the package cpumask and package id; instead they
return the CCD (Core Complex Die) mask and id respectively. This changes
the scope of the energy-pkg event from package to CCD.
Replacing these macros with their package counterparts fixes the
energy-pkg event for AMD CPUs.
However, due to the difference in the scope of the energy-pkg event
between Intel and AMD CPUs, these macros have to be replaced
conditionally, only for AMD CPUs.
On a 12 CCD 1 Package AMD Zen4 Genoa machine:
Before:
$ cat /sys/devices/power/cpumask
0,8,16,24,32,40,48,56,64,72,80,88.
The expected cpumask here is just "0": as this is a package-scope event,
only one CPU collects the event for all the CPUs in the package.
After:
$ cat /sys/devices/power/cpumask
0
Signed-off-by: Dhananjay Ugwekar <[email protected]>
Fixes: 63edbaa48a57 ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf")
---
PS: This patch was sent separately earlier (link below) and has not been
merged yet. It is necessary for this patchset to work properly, and it
also fixes the pre-existing energy-pkg event.
https://lore.kernel.org/linux-perf-users/[email protected]/
---
arch/x86/events/rapl.c | 30 ++++++++++++++++++++++++++----
1 file changed, 26 insertions(+), 4 deletions(-)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index b985ca79cf97..73be25e1f4b4 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -103,6 +103,10 @@ static struct perf_pmu_events_attr event_attr_##v = { \
.event_str = str, \
};
+#define rapl_pmu_is_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
struct rapl_pmu {
raw_spinlock_t lock;
int n_active;
@@ -140,9 +144,21 @@ static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+static inline unsigned int get_rapl_pmu_idx(int cpu)
+{
+ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+ topology_logical_die_id(cpu);
+}
+
+static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
+{
+ return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
+ topology_die_cpumask(cpu);
+}
+
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
@@ -543,6 +559,7 @@ static struct perf_msr amd_rapl_msrs[] = {
static int rapl_cpu_offline(unsigned int cpu)
{
+ const struct cpumask *rapl_pmu_cpumask = get_rapl_pmu_cpumask(cpu);
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
@@ -552,7 +569,7 @@ static int rapl_cpu_offline(unsigned int cpu)
pmu->cpu = -1;
/* Find a new cpu to collect rapl events */
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+ target = cpumask_any_but(rapl_pmu_cpumask, cpu);
/* Migrate rapl events to the new target */
if (target < nr_cpu_ids) {
@@ -565,6 +582,8 @@ static int rapl_cpu_offline(unsigned int cpu)
static int rapl_cpu_online(unsigned int cpu)
{
+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
+ const struct cpumask *rapl_pmu_cpumask = get_rapl_pmu_cpumask(cpu);
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
@@ -579,14 +598,14 @@ static int rapl_cpu_online(unsigned int cpu)
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+ rapl_pmus->pmus[rapl_pmu_idx] = pmu;
}
/*
* Check if there is an online cpu in the package which collects rapl
* events already.
*/
- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
+ target = cpumask_any_and(&rapl_cpu_mask, rapl_pmu_cpumask);
if (target < nr_cpu_ids)
return 0;
@@ -677,6 +696,9 @@ static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+ if (rapl_pmu_is_pkg_scope())
+ nr_rapl_pmu = topology_max_packages();
+
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
--
2.34.1
Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to avoid
confusion between variables of the two different structs, pmu and
rapl_pmu. struct rapl_pmu also contains a pointer to struct pmu, which
leads to expressions like pmu->pmu that are needlessly confusing. With
this change they become the much more readable rapl_pmu->pmu.
Also rename the "pmus" member in struct rapl_pmus, for the same reason.
No functional change.
Signed-off-by: Dhananjay Ugwekar <[email protected]>
---
arch/x86/events/rapl.c | 104 ++++++++++++++++++++---------------------
1 file changed, 52 insertions(+), 52 deletions(-)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 73be25e1f4b4..b4e2073a178e 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -120,7 +120,7 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
+ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
enum rapl_unit_quirk {
@@ -164,7 +164,7 @@ static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
*/
- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
+ return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->rapl_pmu[rapl_pmu_idx] : NULL;
}
static inline u64 rapl_read_counter(struct perf_event *event)
@@ -228,34 +228,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
- if (!pmu->n_active)
+ if (!rapl_pmu->n_active)
return HRTIMER_NORESTART;
- raw_spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
- list_for_each_entry(event, &pmu->active_list, active_entry)
+ list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
rapl_event_update(event);
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
- hrtimer_forward_now(hrtimer, pmu->timer_interval);
+ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
return HRTIMER_RESTART;
}
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
- struct hrtimer *hr = &pmu->hrtimer;
+ struct hrtimer *hr = &rapl_pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@@ -263,39 +263,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
event->hw.state = 0;
- list_add_tail(&event->active_entry, &pmu->active_list);
+ list_add_tail(&event->active_entry, &rapl_pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
- pmu->n_active++;
- if (pmu->n_active == 1)
- rapl_start_hrtimer(pmu);
+ rapl_pmu->n_active++;
+ if (rapl_pmu->n_active == 1)
+ rapl_start_hrtimer(rapl_pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = event->pmu_private;
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
unsigned long flags;
- raw_spin_lock_irqsave(&pmu->lock, flags);
- __rapl_pmu_event_start(pmu, event);
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+ __rapl_pmu_event_start(rapl_pmu, event);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = event->pmu_private;
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
- raw_spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
- WARN_ON_ONCE(pmu->n_active <= 0);
- pmu->n_active--;
- if (pmu->n_active == 0)
- hrtimer_cancel(&pmu->hrtimer);
+ WARN_ON_ONCE(rapl_pmu->n_active <= 0);
+ rapl_pmu->n_active--;
+ if (rapl_pmu->n_active == 0)
+ hrtimer_cancel(&rapl_pmu->hrtimer);
list_del(&event->active_entry);
@@ -313,23 +313,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE;
}
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = event->pmu_private;
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
- raw_spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
- __rapl_pmu_event_start(pmu, event);
+ __rapl_pmu_event_start(rapl_pmu, event);
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
return 0;
}
@@ -343,7 +343,7 @@ static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0;
- struct rapl_pmu *pmu;
+ struct rapl_pmu *rapl_pmu;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
@@ -373,11 +373,11 @@ static int rapl_pmu_event_init(struct perf_event *event)
return -EINVAL;
/* must be done before validate_group */
- pmu = cpu_to_rapl_pmu(event->cpu);
- if (!pmu)
+ rapl_pmu = cpu_to_rapl_pmu(event->cpu);
+ if (!rapl_pmu)
return -EINVAL;
- event->cpu = pmu->cpu;
- event->pmu_private = pmu;
+ event->cpu = rapl_pmu->cpu;
+ event->pmu_private = rapl_pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->hw.config = cfg;
event->hw.idx = bit;
@@ -560,22 +560,22 @@ static struct perf_msr amd_rapl_msrs[] = {
static int rapl_cpu_offline(unsigned int cpu)
{
const struct cpumask *rapl_pmu_cpumask = get_rapl_pmu_cpumask(cpu);
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+ struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
int target;
/* Check if exiting cpu is used for collecting rapl events */
if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
return 0;
- pmu->cpu = -1;
+ rapl_pmu->cpu = -1;
/* Find a new cpu to collect rapl events */
target = cpumask_any_but(rapl_pmu_cpumask, cpu);
/* Migrate rapl events to the new target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &rapl_cpu_mask);
- pmu->cpu = target;
- perf_pmu_migrate_context(pmu->pmu, cpu, target);
+ rapl_pmu->cpu = target;
+ perf_pmu_migrate_context(rapl_pmu->pmu, cpu, target);
}
return 0;
}
@@ -584,21 +584,21 @@ static int rapl_cpu_online(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
const struct cpumask *rapl_pmu_cpumask = get_rapl_pmu_cpumask(cpu);
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+ struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
int target;
- if (!pmu) {
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
- if (!pmu)
+ if (!rapl_pmu) {
+ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!rapl_pmu)
return -ENOMEM;
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
- rapl_hrtimer_init(pmu);
+ raw_spin_lock_init(&rapl_pmu->lock);
+ INIT_LIST_HEAD(&rapl_pmu->active_list);
+ rapl_pmu->pmu = &rapl_pmus->pmu;
+ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(rapl_pmu);
- rapl_pmus->pmus[rapl_pmu_idx] = pmu;
+ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
}
/*
@@ -610,7 +610,7 @@ static int rapl_cpu_online(unsigned int cpu)
return 0;
cpumask_set_cpu(cpu, &rapl_cpu_mask);
- pmu->cpu = cpu;
+ rapl_pmu->cpu = cpu;
return 0;
}
@@ -679,7 +679,7 @@ static void cleanup_rapl_pmus(void)
int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
- kfree(rapl_pmus->pmus[i]);
+ kfree(rapl_pmus->rapl_pmu[i]);
kfree(rapl_pmus);
}
@@ -699,7 +699,7 @@ static int __init init_rapl_pmus(void)
if (rapl_pmu_is_pkg_scope())
nr_rapl_pmu = topology_max_packages();
- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
--
2.34.1
To support AMD's per-core RAPL counter, the per_core capability of the
current rapl_model will need to be checked multiple times in the
rapl_cpu_online/offline and init_rapl_pmus functions. Cache the matched
rapl_model in a global variable to avoid calling x86_match_cpu()
repeatedly.
No functional change.
Signed-off-by: Dhananjay Ugwekar <[email protected]>
---
arch/x86/events/rapl.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index b4e2073a178e..e5e878146542 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -143,6 +143,7 @@ static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+static struct rapl_model *rapl_model;
static inline unsigned int get_rapl_pmu_idx(int cpu)
{
@@ -614,18 +615,18 @@ static int rapl_cpu_online(unsigned int cpu)
return 0;
}
-static int rapl_check_hw_unit(struct rapl_model *rm)
+static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
int i;
/* protect rdmsrl() to handle virtualization */
- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
+ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
- switch (rm->unit_quirk) {
+ switch (rapl_model->unit_quirk) {
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
@@ -839,21 +840,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
- struct rapl_model *rm;
int ret;
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
- rm = (struct rapl_model *) id->driver_data;
+ rapl_model = (struct rapl_model *) id->driver_data;
- rapl_msrs = rm->rapl_msrs;
+ rapl_msrs = rapl_model->rapl_msrs;
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
- false, (void *) &rm->events);
+ false, (void *) &rapl_model->events);
- ret = rapl_check_hw_unit(rm);
+ ret = rapl_check_hw_unit();
if (ret)
return ret;
--
2.34.1
This patch is in preparation for the addition of per-core energy counter
support for AMD CPUs.
The per-core energy counter PMU will need a separate cpumask. Adding the
cpumask inside the rapl_pmus struct is a better approach than creating
another global cpumask variable for the per-core PMU. This way, if a new
PMU with a different scope (e.g. CCD) is needed in the future, a new
global cpumask variable won't be necessary.
No functional change.
Signed-off-by: Dhananjay Ugwekar <[email protected]>
---
arch/x86/events/rapl.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index e5e878146542..be139e9f9ee0 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -119,6 +119,7 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
+ cpumask_t cpumask;
unsigned int nr_rapl_pmu;
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
@@ -139,7 +140,6 @@ struct rapl_model {
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
-static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
@@ -394,7 +394,7 @@ static void rapl_pmu_event_read(struct perf_event *event)
static ssize_t rapl_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
+ return cpumap_print_to_pagebuf(true, buf, &rapl_pmus->cpumask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
@@ -565,7 +565,7 @@ static int rapl_cpu_offline(unsigned int cpu)
int target;
/* Check if exiting cpu is used for collecting rapl events */
- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+ if (!cpumask_test_and_clear_cpu(cpu, &rapl_pmus->cpumask))
return 0;
rapl_pmu->cpu = -1;
@@ -574,7 +574,7 @@ static int rapl_cpu_offline(unsigned int cpu)
/* Migrate rapl events to the new target */
if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &rapl_cpu_mask);
+ cpumask_set_cpu(target, &rapl_pmus->cpumask);
rapl_pmu->cpu = target;
perf_pmu_migrate_context(rapl_pmu->pmu, cpu, target);
}
@@ -606,11 +606,11 @@ static int rapl_cpu_online(unsigned int cpu)
* Check if there is an online cpu in the package which collects rapl
* events already.
*/
- target = cpumask_any_and(&rapl_cpu_mask, rapl_pmu_cpumask);
+ target = cpumask_any_and(&rapl_pmus->cpumask, rapl_pmu_cpumask);
if (target < nr_cpu_ids)
return 0;
- cpumask_set_cpu(cpu, &rapl_cpu_mask);
+ cpumask_set_cpu(cpu, &rapl_pmus->cpumask);
rapl_pmu->cpu = cpu;
return 0;
}
--
2.34.1
This is in preparation for the addition of per-core RAPL counter support
for AMD CPUs.
The CPU online and offline functions will need to handle setting up and
migrating the new per-core PMU as well. The wrapper functions added below
make it easier to pass the corresponding arguments for both PMUs.
No functional change.
Signed-off-by: Dhananjay Ugwekar <[email protected]>
---
arch/x86/events/rapl.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index be139e9f9ee0..70c7b35fb4d2 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -558,10 +558,10 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
-static int rapl_cpu_offline(unsigned int cpu)
+static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
+ const struct cpumask *event_cpumask, unsigned int cpu)
{
- const struct cpumask *rapl_pmu_cpumask = get_rapl_pmu_cpumask(cpu);
- struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
+ struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
int target;
/* Check if exiting cpu is used for collecting rapl events */
@@ -570,7 +570,7 @@ static int rapl_cpu_offline(unsigned int cpu)
rapl_pmu->cpu = -1;
/* Find a new cpu to collect rapl events */
- target = cpumask_any_but(rapl_pmu_cpumask, cpu);
+ target = cpumask_any_but(event_cpumask, cpu);
/* Migrate rapl events to the new target */
if (target < nr_cpu_ids) {
@@ -581,11 +581,16 @@ static int rapl_cpu_offline(unsigned int cpu)
return 0;
}
-static int rapl_cpu_online(unsigned int cpu)
+static int rapl_cpu_offline(unsigned int cpu)
{
- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
- const struct cpumask *rapl_pmu_cpumask = get_rapl_pmu_cpumask(cpu);
- struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
+ return __rapl_cpu_offline(rapl_pmus, get_rapl_pmu_idx(cpu),
+ get_rapl_pmu_cpumask(cpu), cpu);
+}
+
+static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
+ const struct cpumask *event_cpumask, unsigned int cpu)
+{
+ struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
int target;
if (!rapl_pmu) {
@@ -606,7 +611,7 @@ static int rapl_cpu_online(unsigned int cpu)
* Check if there is an online cpu in the package which collects rapl
* events already.
*/
- target = cpumask_any_and(&rapl_pmus->cpumask, rapl_pmu_cpumask);
+ target = cpumask_any_and(&rapl_pmus->cpumask, event_cpumask);
if (target < nr_cpu_ids)
return 0;
@@ -615,6 +620,13 @@ static int rapl_cpu_online(unsigned int cpu)
return 0;
}
+static int rapl_cpu_online(unsigned int cpu)
+{
+ return __rapl_cpu_online(rapl_pmus, get_rapl_pmu_idx(cpu),
+ get_rapl_pmu_cpumask(cpu), cpu);
+}
+
+
static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
--
2.34.1
Add a new "power_per_core" PMU and "energy-per-core" event for
monitoring energy consumption by each core. The existing energy-cores
event aggregates the energy consumption at the package level.
This new event aligns with the AMD's per_core energy counters.
Tested the package level and core level PMU counters with workloads
pinned to different CPUs.
Results with workload pinned to CPU 1 in core 1 on a AMD Zen4 Genoa
machine:
$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
Performance counter stats for 'system wide':
S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/
S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/
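For reference, a hedged user-space sketch (not part of the patch) of
opening the new event with perf_event_open(). open_energy_per_core() is a
hypothetical helper; the sysfs "type" file and the config=0x6 encoding
follow from the attributes added above, and scaling the raw count by the
exported events/energy-per-core.scale attribute (Joules) is omitted for
brevity.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int open_energy_per_core(int cpu)
{
	struct perf_event_attr attr;
	FILE *f;
	int type;

	/* Dynamic PMUs expose their type id in sysfs */
	f = fopen("/sys/bus/event_source/devices/power_per_core/type", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &type) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.size = sizeof(attr);
	attr.config = 0x6;		/* matches "event=0x06" above */

	/* System-wide, per-CPU event: pid == -1, cpu == target CPU */
	return syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0);
}

The returned fd can then be read() to get the raw 64-bit count, which is
what the perf tool does for the output above.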
Signed-off-by: Dhananjay Ugwekar <[email protected]>
---
arch/x86/events/rapl.c | 155 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 138 insertions(+), 17 deletions(-)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 70c7b35fb4d2..967ecb98748a 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -39,6 +39,10 @@
* event: rapl_energy_psys
* perf code: 0x5
*
+ * per_core counter: consumption of a single physical core
+ * event: rapl_energy_per_core
+ * perf code: 0x6
+ *
* We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat.
*
@@ -76,6 +80,7 @@ enum perf_rapl_events {
PERF_RAPL_RAM, /* DRAM */
PERF_RAPL_PP1, /* gpu */
PERF_RAPL_PSYS, /* psys */
+ PERF_RAPL_PERCORE, /* per-core */
PERF_RAPL_MAX,
NR_RAPL_DOMAINS = PERF_RAPL_MAX,
@@ -87,6 +92,7 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"dram",
"pp1-gpu",
"psys",
+ "per-core",
};
/*
@@ -135,11 +141,13 @@ struct rapl_model {
unsigned long events;
unsigned int msr_power_unit;
enum rapl_unit_quirk unit_quirk;
+ bool per_core;
};
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
+static struct rapl_pmus *rapl_pmus_per_core;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
@@ -345,9 +353,14 @@ static int rapl_pmu_event_init(struct perf_event *event)
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0;
struct rapl_pmu *rapl_pmu;
+ struct rapl_pmus *curr_rapl_pmus;
/* only look at RAPL events */
- if (event->attr.type != rapl_pmus->pmu.type)
+ if (event->attr.type == rapl_pmus->pmu.type)
+ curr_rapl_pmus = rapl_pmus;
+ else if (rapl_pmus_per_core && event->attr.type == rapl_pmus_per_core->pmu.type)
+ curr_rapl_pmus = rapl_pmus_per_core;
+ else
return -ENOENT;
/* check only supported bits are set */
@@ -374,9 +387,14 @@ static int rapl_pmu_event_init(struct perf_event *event)
return -EINVAL;
/* must be done before validate_group */
- rapl_pmu = cpu_to_rapl_pmu(event->cpu);
+ if (curr_rapl_pmus == rapl_pmus_per_core)
+ rapl_pmu = curr_rapl_pmus->rapl_pmu[topology_core_id(event->cpu)];
+ else
+ rapl_pmu = curr_rapl_pmus->rapl_pmu[get_rapl_pmu_idx(event->cpu)];
+
if (!rapl_pmu)
return -EINVAL;
+
event->cpu = rapl_pmu->cpu;
event->pmu_private = rapl_pmu;
event->hw.event_base = rapl_msrs[bit].msr;
@@ -408,17 +426,38 @@ static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
+static ssize_t rapl_get_attr_per_core_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &rapl_pmus_per_core->cpumask);
+}
+
+static struct device_attribute dev_attr_per_core_cpumask = __ATTR(cpumask, 0444,
+ rapl_get_attr_per_core_cpumask,
+ NULL);
+
+static struct attribute *rapl_pmu_per_core_attrs[] = {
+ &dev_attr_per_core_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_per_core_attr_group = {
+ .attrs = rapl_pmu_per_core_attrs,
+};
+
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x06");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-per-core.unit, rapl_per_core_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
@@ -428,6 +467,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-per-core.scale, rapl_per_core_scale, "2.3283064365386962890625e-10");
/*
* There are no default events, but we need to create
@@ -461,6 +501,13 @@ static const struct attribute_group *rapl_attr_groups[] = {
NULL,
};
+static const struct attribute_group *rapl_per_core_attr_groups[] = {
+ &rapl_pmu_per_core_attr_group,
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
static struct attribute *rapl_events_cores[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_cores_unit),
@@ -521,6 +568,18 @@ static struct attribute_group rapl_events_psys_group = {
.attrs = rapl_events_psys,
};
+static struct attribute *rapl_events_per_core[] = {
+ EVENT_PTR(rapl_per_core),
+ EVENT_PTR(rapl_per_core_unit),
+ EVENT_PTR(rapl_per_core_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_per_core_group = {
+ .name = "events",
+ .attrs = rapl_events_per_core,
+};
+
static bool test_msr(int idx, void *data)
{
return test_bit(idx, (unsigned long *) data);
@@ -535,6 +594,7 @@ static struct perf_msr intel_rapl_msrs[] = {
[PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PERCORE] = { 0, &rapl_events_per_core_group, NULL, false, 0 },
};
static struct perf_msr intel_rapl_spr_msrs[] = {
@@ -543,6 +603,7 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
[PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
+ [PERF_RAPL_PERCORE] = { 0, &rapl_events_per_core_group, NULL, false, 0 },
};
/*
@@ -556,6 +617,7 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
[PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, NULL, false, 0 },
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
+ [PERF_RAPL_PERCORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group, test_msr, false, RAPL_MSR_MASK },
};
static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
@@ -583,8 +645,16 @@ static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu
static int rapl_cpu_offline(unsigned int cpu)
{
- return __rapl_cpu_offline(rapl_pmus, get_rapl_pmu_idx(cpu),
- get_rapl_pmu_cpumask(cpu), cpu);
+ int ret;
+
+ ret = __rapl_cpu_offline(rapl_pmus, get_rapl_pmu_idx(cpu),
+ get_rapl_pmu_cpumask(cpu), cpu);
+
+ if (ret == 0 && rapl_model->per_core)
+ ret = __rapl_cpu_offline(rapl_pmus_per_core, topology_core_id(cpu),
+ topology_sibling_cpumask(cpu), cpu);
+
+ return ret;
}
static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
@@ -622,10 +692,17 @@ static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_
static int rapl_cpu_online(unsigned int cpu)
{
- return __rapl_cpu_online(rapl_pmus, get_rapl_pmu_idx(cpu),
- get_rapl_pmu_cpumask(cpu), cpu);
-}
+ int ret;
+
+ ret = __rapl_cpu_online(rapl_pmus, get_rapl_pmu_idx(cpu),
+ get_rapl_pmu_cpumask(cpu), cpu);
+ if (ret == 0 && rapl_model->per_core)
+ ret = __rapl_cpu_online(rapl_pmus_per_core, topology_core_id(cpu),
+ topology_sibling_cpumask(cpu), cpu);
+
+ return ret;
+}
static int rapl_check_hw_unit(void)
{
@@ -687,7 +764,7 @@ static void __init rapl_advertise(void)
}
}
-static void cleanup_rapl_pmus(void)
+static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{
int i;
@@ -705,12 +782,15 @@ static const struct attribute_group *rapl_attr_update[] = {
NULL,
};
-static int __init init_rapl_pmus(void)
-{
- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+static const struct attribute_group *rapl_per_core_attr_update[] = {
+ &rapl_events_per_core_group,
+};
- if (rapl_pmu_is_pkg_scope())
- nr_rapl_pmu = topology_max_packages();
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu,
+ const struct attribute_group **rapl_attr_groups,
+ const struct attribute_group **rapl_attr_update)
+{
+ struct rapl_pmus *rapl_pmus;
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -728,6 +808,9 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+
+ *rapl_pmus_ptr = rapl_pmus;
+
return 0;
}
@@ -794,9 +877,11 @@ static struct rapl_model model_spr = {
};
static struct rapl_model model_amd_hygon = {
- .events = BIT(PERF_RAPL_PKG),
+ .events = BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_PERCORE),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs,
+ .per_core = true,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@@ -853,6 +938,11 @@ static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
int ret;
+ int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+ int nr_cores = topology_max_packages() * topology_num_cores_per_package();
+
+ if (rapl_pmu_is_pkg_scope())
+ nr_rapl_pmu = topology_max_packages();
id = x86_match_cpu(rapl_model_match);
if (!id)
@@ -869,10 +959,23 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
- ret = init_rapl_pmus();
+ ret = init_rapl_pmus(&rapl_pmus, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
if (ret)
return ret;
+ if (rapl_model->per_core) {
+ ret = init_rapl_pmus(&rapl_pmus_per_core, nr_cores,
+ rapl_per_core_attr_groups, rapl_per_core_attr_update);
+ if (ret) {
+ /*
+ * If initialization of per_core PMU fails, reset per_core
+ * flag, and continue with power PMU initialization.
+ */
+ pr_warn("Per-core PMU initialization failed (%d)\n", ret);
+ rapl_model->per_core = false;
+ }
+ }
+
/*
* Install callbacks. Core will call them for each online cpu.
*/
@@ -886,14 +989,28 @@ static int __init rapl_pmu_init(void)
if (ret)
goto out1;
+ if (rapl_model->per_core) {
+ ret = perf_pmu_register(&rapl_pmus_per_core->pmu, "power_per_core", -1);
+ if (ret) {
+ /*
+ * If registration of per_core PMU fails, cleanup per_core PMU
+ * variables, reset the per_core flag and keep the
+ * power PMU untouched.
+ */
+ pr_warn("Per-core PMU registration failed (%d)\n", ret);
+ cleanup_rapl_pmus(rapl_pmus_per_core);
+ rapl_model->per_core = false;
+ }
+ }
rapl_advertise();
+
return 0;
out1:
cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
- cleanup_rapl_pmus();
+ cleanup_rapl_pmus(rapl_pmus);
return ret;
}
module_init(rapl_pmu_init);
@@ -902,6 +1019,10 @@ static void __exit intel_rapl_exit(void)
{
cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
perf_pmu_unregister(&rapl_pmus->pmu);
- cleanup_rapl_pmus();
+ cleanup_rapl_pmus(rapl_pmus);
+ if (rapl_model->per_core) {
+ perf_pmu_unregister(&rapl_pmus_per_core->pmu);
+ cleanup_rapl_pmus(rapl_pmus_per_core);
+ }
}
module_exit(intel_rapl_exit);
--
2.34.1
Hello.
On Monday, June 10th, 2024 at 12:07:45 CEST, Dhananjay Ugwekar wrote:
> Currently the energy-cores event in the power PMU aggregates energy
> consumption data at a package level. On the other hand the core energy
> RAPL counter in AMD CPUs has a core scope (which means the energy
> consumption is recorded separately for each core). Earlier efforts to add
> the core event in the power PMU had failed [1], due to the difference in
> the scope of these two events. Hence, there is a need for a new core scope
> PMU.
>
> This patchset adds a new "power_per_core" PMU alongside the existing
> "power" PMU, which will be responsible for collecting the new
> "energy-per-core" event.
>
> Tested the package level and core level PMU counters with workloads
> pinned to different CPUs.
>
> Results with workload pinned to CPU 1 in Core 1 on an AMD Zen4 Genoa
> machine:
>
> $ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
>
> Performance counter stats for 'system wide':
>
> S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/
> S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/
> S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/
>
> [1]: https://lore.kernel.org/lkml/[email protected]/
>
> This patchset applies cleanly on top of v6.10-rc3 as well as latest
> tip/master.
>
> Dhananjay Ugwekar (6):
> perf/x86/rapl: Fix the energy-pkg event for AMD CPUs
> perf/x86/rapl: Rename rapl_pmu variables
> perf/x86/rapl: Make rapl_model struct global
> perf/x86/rapl: Move cpumask variable to rapl_pmus struct
> perf/x86/rapl: Add wrapper for online/offline functions
> perf/x86/rapl: Add per-core energy counter support for AMD CPUs
>
> arch/x86/events/rapl.c | 311 ++++++++++++++++++++++++++++++-----------
> 1 file changed, 233 insertions(+), 78 deletions(-)
>
>
With my CPU:
Model name: AMD Ryzen 9 5950X 16-Core Processor
and this workload:
$ taskset -c 1 dd if=/dev/zero of=/dev/null
I get the following result:
$ sudo perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
Performance counter stats for 'system wide':
S0-D0-C0 1 1,70 Joules power_per_core/energy-per-core/
S0-D0-C1 1 8,83 Joules power_per_core/energy-per-core/
S0-D0-C2 1 0,17 Joules power_per_core/energy-per-core/
S0-D0-C3 1 0,33 Joules power_per_core/energy-per-core/
S0-D0-C4 1 0,14 Joules power_per_core/energy-per-core/
S0-D0-C5 1 0,33 Joules power_per_core/energy-per-core/
S0-D0-C6 1 0,25 Joules power_per_core/energy-per-core/
S0-D0-C7 1 0,19 Joules power_per_core/energy-per-core/
S0-D0-C8 1 0,66 Joules power_per_core/energy-per-core/
S0-D0-C9 1 1,71 Joules power_per_core/energy-per-core/
S0-D0-C10 1 0,38 Joules power_per_core/energy-per-core/
S0-D0-C11 1 1,69 Joules power_per_core/energy-per-core/
S0-D0-C12 1 0,22 Joules power_per_core/energy-per-core/
S0-D0-C13 1 0,11 Joules power_per_core/energy-per-core/
S0-D0-C14 1 0,49 Joules power_per_core/energy-per-core/
S0-D0-C15 1 0,37 Joules power_per_core/energy-per-core/
1,002409590 seconds time elapsed
If it is as expected, please add my:
Tested-by: Oleksandr Natalenko <[email protected]>
Thank you.
--
Oleksandr Natalenko (post-factum)
Hello Oleksandr,
On 6/10/2024 7:58 PM, Oleksandr Natalenko wrote:
> Hello.
>
> On Monday, June 10th, 2024 at 12:07:45 CEST, Dhananjay Ugwekar wrote:
>> Currently the energy-cores event in the power PMU aggregates energy
>> consumption data at a package level. On the other hand the core energy
>> RAPL counter in AMD CPUs has a core scope (which means the energy
>> consumption is recorded separately for each core). Earlier efforts to add
>> the core event in the power PMU had failed [1], due to the difference in
>> the scope of these two events. Hence, there is a need for a new core scope
>> PMU.
>>
>> This patchset adds a new "power_per_core" PMU alongside the existing
>> "power" PMU, which will be responsible for collecting the new
>> "energy-per-core" event.
>>
>> Tested the package level and core level PMU counters with workloads
>> pinned to different CPUs.
>>
>> Results with workload pinned to CPU 1 in Core 1 on an AMD Zen4 Genoa
>> machine:
>>
>> $ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
>>
>> Performance counter stats for 'system wide':
>>
>> S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/
>> S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/
>> S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/
>>
>> [1]: https://lore.kernel.org/lkml/[email protected]/
>>
>> This patchset applies cleanly on top of v6.10-rc3 as well as latest
>> tip/master.
>>
>> Dhananjay Ugwekar (6):
>> perf/x86/rapl: Fix the energy-pkg event for AMD CPUs
>> perf/x86/rapl: Rename rapl_pmu variables
>> perf/x86/rapl: Make rapl_model struct global
>> perf/x86/rapl: Move cpumask variable to rapl_pmus struct
>> perf/x86/rapl: Add wrapper for online/offline functions
>> perf/x86/rapl: Add per-core energy counter support for AMD CPUs
>>
>> arch/x86/events/rapl.c | 311 ++++++++++++++++++++++++++++++-----------
>> 1 file changed, 233 insertions(+), 78 deletions(-)
>>
>>
>
> With my CPU:
>
> Model name: AMD Ryzen 9 5950X 16-Core Processor
>
> and this workload:
>
> $ taskset -c 1 dd if=/dev/zero of=/dev/null
>
> the following result is got:
>
> $ sudo perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
>
> Performance counter stats for 'system wide':
>
> S0-D0-C0 1 1,70 Joules power_per_core/energy-per-core/
> S0-D0-C1 1 8,83 Joules power_per_core/energy-per-core/
> S0-D0-C2 1 0,17 Joules power_per_core/energy-per-core/
> S0-D0-C3 1 0,33 Joules power_per_core/energy-per-core/
> S0-D0-C4 1 0,14 Joules power_per_core/energy-per-core/
> S0-D0-C5 1 0,33 Joules power_per_core/energy-per-core/
> S0-D0-C6 1 0,25 Joules power_per_core/energy-per-core/
> S0-D0-C7 1 0,19 Joules power_per_core/energy-per-core/
> S0-D0-C8 1 0,66 Joules power_per_core/energy-per-core/
> S0-D0-C9 1 1,71 Joules power_per_core/energy-per-core/
> S0-D0-C10 1 0,38 Joules power_per_core/energy-per-core/
> S0-D0-C11 1 1,69 Joules power_per_core/energy-per-core/
> S0-D0-C12 1 0,22 Joules power_per_core/energy-per-core/
> S0-D0-C13 1 0,11 Joules power_per_core/energy-per-core/
> S0-D0-C14 1 0,49 Joules power_per_core/energy-per-core/
> S0-D0-C15 1 0,37 Joules power_per_core/energy-per-core/
>
> 1,002409590 seconds time elapsed
>
> If it is as expected, please add my:
>
> Tested-by: Oleksandr Natalenko <[email protected]>
We can see that after you pinned the workload to CPU 1, the energy
consumption of core 1 is considerably higher than that of the other
cores, which is as expected. I will add your Tested-by in the next
version.
P.S.: I'm assuming here that CPU 1 is part of core 1 on your system;
please let me know if that assumption is wrong.
Thanks for testing the patch!
Regards,
Dhananjay
>
> Thank you.
>
On Monday, June 10th, 2024 at 17:17:42 CEST, Dhananjay Ugwekar wrote:
> Hello Oleksandr,
>
> On 6/10/2024 7:58 PM, Oleksandr Natalenko wrote:
> > Hello.
> >
> > On Monday, June 10th, 2024 at 12:07:45 CEST, Dhananjay Ugwekar wrote:
> >> Currently the energy-cores event in the power PMU aggregates energy
> >> consumption data at a package level. On the other hand the core energy
> >> RAPL counter in AMD CPUs has a core scope (which means the energy
> >> consumption is recorded separately for each core). Earlier efforts to add
> >> the core event in the power PMU had failed [1], due to the difference in
> >> the scope of these two events. Hence, there is a need for a new core scope
> >> PMU.
> >>
> >> This patchset adds a new "power_per_core" PMU alongside the existing
> >> "power" PMU, which will be responsible for collecting the new
> >> "energy-per-core" event.
> >>
> >> Tested the package level and core level PMU counters with workloads
> >> pinned to different CPUs.
> >>
> >> Results with workload pinned to CPU 1 in Core 1 on an AMD Zen4 Genoa
> >> machine:
> >>
> >> $ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
> >>
> >> Performance counter stats for 'system wide':
> >>
> >> S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/
> >> S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/
> >> S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/
> >>
> >> [1]: https://lore.kernel.org/lkml/[email protected]/
> >>
> >> This patchset applies cleanly on top of v6.10-rc3 as well as latest
> >> tip/master.
> >>
> >> Dhananjay Ugwekar (6):
> >> perf/x86/rapl: Fix the energy-pkg event for AMD CPUs
> >> perf/x86/rapl: Rename rapl_pmu variables
> >> perf/x86/rapl: Make rapl_model struct global
> >> perf/x86/rapl: Move cpumask variable to rapl_pmus struct
> >> perf/x86/rapl: Add wrapper for online/offline functions
> >> perf/x86/rapl: Add per-core energy counter support for AMD CPUs
> >>
> >> arch/x86/events/rapl.c | 311 ++++++++++++++++++++++++++++++-----------
> >> 1 file changed, 233 insertions(+), 78 deletions(-)
> >>
> >>
> >
> > With my CPU:
> >
> > Model name: AMD Ryzen 9 5950X 16-Core Processor
> >
> > and this workload:
> >
> > $ taskset -c 1 dd if=/dev/zero of=/dev/null
> >
> > the following result is got:
> >
> > $ sudo perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
> >
> > Performance counter stats for 'system wide':
> >
> > S0-D0-C0 1 1,70 Joules power_per_core/energy-per-core/
> > S0-D0-C1 1 8,83 Joules power_per_core/energy-per-core/
> > S0-D0-C2 1 0,17 Joules power_per_core/energy-per-core/
> > S0-D0-C3 1 0,33 Joules power_per_core/energy-per-core/
> > S0-D0-C4 1 0,14 Joules power_per_core/energy-per-core/
> > S0-D0-C5 1 0,33 Joules power_per_core/energy-per-core/
> > S0-D0-C6 1 0,25 Joules power_per_core/energy-per-core/
> > S0-D0-C7 1 0,19 Joules power_per_core/energy-per-core/
> > S0-D0-C8 1 0,66 Joules power_per_core/energy-per-core/
> > S0-D0-C9 1 1,71 Joules power_per_core/energy-per-core/
> > S0-D0-C10 1 0,38 Joules power_per_core/energy-per-core/
> > S0-D0-C11 1 1,69 Joules power_per_core/energy-per-core/
> > S0-D0-C12 1 0,22 Joules power_per_core/energy-per-core/
> > S0-D0-C13 1 0,11 Joules power_per_core/energy-per-core/
> > S0-D0-C14 1 0,49 Joules power_per_core/energy-per-core/
> > S0-D0-C15 1 0,37 Joules power_per_core/energy-per-core/
> >
> > 1,002409590 seconds time elapsed
> >
> > If it is as expected, please add my:
> >
> > Tested-by: Oleksandr Natalenko <[email protected]>
>
> We can see that after you affined the workload to cpu 1, energy
> consumption of core 1 is considerably higher than the other cores,
> which is as expected, will add your tested-by in next version.
>
> P.S: I'm assuming here that cpu 1 is part of core 1 in your system,
> please let me know if that assumption is wrong.
Your assumption should be correct:
$ cat /sys/devices/system/cpu/cpu1/topology/core_id
1
> Thanks for testing the patch!
>
> Regards,
> Dhananjay
>
> >
> > Thank you.
> >
>
--
Oleksandr Natalenko (post-factum)
On Mon, 2024-06-10 at 10:07 +0000, Dhananjay Ugwekar wrote:
> After commit ("x86/cpu/topology: Add support for the AMD 0x80000026
> leaf"),
> on AMD processors that support extended CPUID leaf 0x80000026, the
> topology_die_cpumask() and topology_logical_die_id() macros, no
> longer
> return the package cpumask and package id, instead they return the
> CCD
> (Core Complex Die) mask and id respectively. This leads to the
> energy-pkg
> event scope to be modified to CCD instead of package.
>
> Replacing these macros with their package counterparts fixes the
> energy-pkg event for AMD CPUs.
>
> However due to the difference between the scope of energy-pkg event
> for
> Intel and AMD CPUs,
Actually, on Intel platforms the energy-pkg event is also package scope,
except on one platform, and that one is the only multi-die system with
RAPL MSRs available.
That is why the die/pkg logic was introduced in the first place.
I like the macro/helpers in this patch. The logic inside them may need
to be optimized for Intel platforms, but that is a separate task.
> we have to replace these macros conditionally only for
> AMD CPUs.
>
> On a 12 CCD 1 Package AMD Zen4 Genoa machine:
>
> Before:
> $ cat /sys/devices/power/cpumask
> 0,8,16,24,32,40,48,56,64,72,80,88.
>
> The expected cpumask here is supposed to be just "0", as it is a
> package
> scope event, only one CPU will be collecting the event for all the
> CPUs in
> the package.
>
> After:
> $ cat /sys/devices/power/cpumask
> 0
>
> Signed-off-by: Dhananjay Ugwekar <[email protected]>
> Fixes: 63edbaa48a57 ("x86/cpu/topology: Add support for the AMD
> 0x80000026 leaf")
Reviewed-by: Zhang Rui <[email protected]>
thanks,
rui
> ---
> PS: This patch was earlier sent separately(link below), it has not
> been
> merged yet, it is necessary for this patchset to work properly, also
> it
> fixes the pre-existing energy-pkg event.
> https://lore.kernel.org/linux-perf-users/[email protected]/
> ---
> arch/x86/events/rapl.c | 30 ++++++++++++++++++++++++++----
> 1 file changed, 26 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
> index b985ca79cf97..73be25e1f4b4 100644
> --- a/arch/x86/events/rapl.c
> +++ b/arch/x86/events/rapl.c
> @@ -103,6 +103,10 @@ static struct perf_pmu_events_attr
> event_attr_##v = { \
> .event_str =
> str, \
> };
>
> +#define rapl_pmu_is_pkg_scope() \
> + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
> + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
> +
> struct rapl_pmu {
> raw_spinlock_t lock;
> int n_active;
> @@ -140,9 +144,21 @@ static unsigned int rapl_cntr_mask;
> static u64 rapl_timer_ms;
> static struct perf_msr *rapl_msrs;
>
> +static inline unsigned int get_rapl_pmu_idx(int cpu)
> +{
> + return rapl_pmu_is_pkg_scope() ?
> topology_logical_package_id(cpu) :
> +
> topology_logical_die_id(cpu);
> +}
> +
> +static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
> +{
> + return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
> + topology_die_cpumask(cpu);
> +}
> +
> static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
> {
> - unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
> + unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
>
> /*
> * The unsigned check also catches the '-1' return value for
> non
> @@ -543,6 +559,7 @@ static struct perf_msr amd_rapl_msrs[] = {
>
> static int rapl_cpu_offline(unsigned int cpu)
> {
> + const struct cpumask *rapl_pmu_cpumask =
> get_rapl_pmu_cpumask(cpu);
> struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
> int target;
>
> @@ -552,7 +569,7 @@ static int rapl_cpu_offline(unsigned int cpu)
>
> pmu->cpu = -1;
> /* Find a new cpu to collect rapl events */
> - target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
> + target = cpumask_any_but(rapl_pmu_cpumask, cpu);
>
> /* Migrate rapl events to the new target */
> if (target < nr_cpu_ids) {
> @@ -565,6 +582,8 @@ static int rapl_cpu_offline(unsigned int cpu)
>
> static int rapl_cpu_online(unsigned int cpu)
> {
> + unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
> + const struct cpumask *rapl_pmu_cpumask =
> get_rapl_pmu_cpumask(cpu);
> struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
> int target;
>
> @@ -579,14 +598,14 @@ static int rapl_cpu_online(unsigned int cpu)
> pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
> rapl_hrtimer_init(pmu);
>
> - rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
> + rapl_pmus->pmus[rapl_pmu_idx] = pmu;
> }
>
> /*
> * Check if there is an online cpu in the package which
> collects rapl
> * events already.
> */
> - target = cpumask_any_and(&rapl_cpu_mask,
> topology_die_cpumask(cpu));
> + target = cpumask_any_and(&rapl_cpu_mask, rapl_pmu_cpumask);
> if (target < nr_cpu_ids)
> return 0;
>
> @@ -677,6 +696,9 @@ static int __init init_rapl_pmus(void)
> {
> int nr_rapl_pmu = topology_max_packages() *
> topology_max_dies_per_package();
>
> + if (rapl_pmu_is_pkg_scope())
> + nr_rapl_pmu = topology_max_packages();
> +
> rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus,
> nr_rapl_pmu), GFP_KERNEL);
> if (!rapl_pmus)
> return -ENOMEM;
On Mon, 2024-06-10 at 10:07 +0000, Dhananjay Ugwekar wrote:
> Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to
> avoid any confusion between the variables of two different
> structs pmu and rapl_pmu.
> As rapl_pmu also contains a pointer to
> struct pmu, which leads to situations in code like pmu->pmu,
> which is needlessly confusing. Above scenario is replaced with
> much more readable rapl_pmu->pmu with this change.
>
> Also rename "pmus" member in rapl_pmus struct, for same reason.
>
As you are adding a new per_core PMU, can we just rename the current
rapl_pmu to something like rapl_pkg_pmus (as I mentioned in the previous
email, we can consider the current RAPL MSRs as package scope on Intel
platforms as well), and name the new one rapl_core_pmus?
IMO, "rapl_pmus" + "rapl_pmus_per_core" is still confusing.
thanks,
rui
> No functional change.
>
> Signed-off-by: Dhananjay Ugwekar <[email protected]>
> ---
> arch/x86/events/rapl.c | 104 ++++++++++++++++++++-------------------
> --
> 1 file changed, 52 insertions(+), 52 deletions(-)
>
> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
> index 73be25e1f4b4..b4e2073a178e 100644
> --- a/arch/x86/events/rapl.c
> +++ b/arch/x86/events/rapl.c
> @@ -120,7 +120,7 @@ struct rapl_pmu {
> struct rapl_pmus {
> struct pmu pmu;
> unsigned int nr_rapl_pmu;
> - struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
> + struct rapl_pmu *rapl_pmu[]
> __counted_by(nr_rapl_pmu);
> };
>
> enum rapl_unit_quirk {
> @@ -164,7 +164,7 @@ static inline struct rapl_pmu
> *cpu_to_rapl_pmu(unsigned int cpu)
> * The unsigned check also catches the '-1' return value for
> non
> * existent mappings in the topology map.
> */
> - return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus-
> >pmus[rapl_pmu_idx] : NULL;
> + return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus-
> >rapl_pmu[rapl_pmu_idx] : NULL;
> }
>
> static inline u64 rapl_read_counter(struct perf_event *event)
> @@ -228,34 +228,34 @@ static void rapl_start_hrtimer(struct rapl_pmu
> *pmu)
>
> static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer
> *hrtimer)
> {
> - struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu,
> hrtimer);
> + struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct
> rapl_pmu, hrtimer);
> struct perf_event *event;
> unsigned long flags;
>
> - if (!pmu->n_active)
> + if (!rapl_pmu->n_active)
> return HRTIMER_NORESTART;
>
> - raw_spin_lock_irqsave(&pmu->lock, flags);
> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>
> - list_for_each_entry(event, &pmu->active_list, active_entry)
> + list_for_each_entry(event, &rapl_pmu->active_list,
> active_entry)
> rapl_event_update(event);
>
> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
>
> - hrtimer_forward_now(hrtimer, pmu->timer_interval);
> + hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
>
> return HRTIMER_RESTART;
> }
>
> -static void rapl_hrtimer_init(struct rapl_pmu *pmu)
> +static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
> {
> - struct hrtimer *hr = &pmu->hrtimer;
> + struct hrtimer *hr = &rapl_pmu->hrtimer;
>
> hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> hr->function = rapl_hrtimer_handle;
> }
>
> -static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
> +static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
> struct perf_event *event)
> {
> if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> @@ -263,39 +263,39 @@ static void __rapl_pmu_event_start(struct
> rapl_pmu *pmu,
>
> event->hw.state = 0;
>
> - list_add_tail(&event->active_entry, &pmu->active_list);
> + list_add_tail(&event->active_entry, &rapl_pmu->active_list);
>
> local64_set(&event->hw.prev_count, rapl_read_counter(event));
>
> - pmu->n_active++;
> - if (pmu->n_active == 1)
> - rapl_start_hrtimer(pmu);
> + rapl_pmu->n_active++;
> + if (rapl_pmu->n_active == 1)
> + rapl_start_hrtimer(rapl_pmu);
> }
>
> static void rapl_pmu_event_start(struct perf_event *event, int mode)
> {
> - struct rapl_pmu *pmu = event->pmu_private;
> + struct rapl_pmu *rapl_pmu = event->pmu_private;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&pmu->lock, flags);
> - __rapl_pmu_event_start(pmu, event);
> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
> + __rapl_pmu_event_start(rapl_pmu, event);
> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
> }
>
> static void rapl_pmu_event_stop(struct perf_event *event, int mode)
> {
> - struct rapl_pmu *pmu = event->pmu_private;
> + struct rapl_pmu *rapl_pmu = event->pmu_private;
> struct hw_perf_event *hwc = &event->hw;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&pmu->lock, flags);
> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>
> /* mark event as deactivated and stopped */
> if (!(hwc->state & PERF_HES_STOPPED)) {
> - WARN_ON_ONCE(pmu->n_active <= 0);
> - pmu->n_active--;
> - if (pmu->n_active == 0)
> - hrtimer_cancel(&pmu->hrtimer);
> + WARN_ON_ONCE(rapl_pmu->n_active <= 0);
> + rapl_pmu->n_active--;
> + if (rapl_pmu->n_active == 0)
> + hrtimer_cancel(&rapl_pmu->hrtimer);
>
> list_del(&event->active_entry);
>
> @@ -313,23 +313,23 @@ static void rapl_pmu_event_stop(struct
> perf_event *event, int mode)
> hwc->state |= PERF_HES_UPTODATE;
> }
>
> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
> }
>
> static int rapl_pmu_event_add(struct perf_event *event, int mode)
> {
> - struct rapl_pmu *pmu = event->pmu_private;
> + struct rapl_pmu *rapl_pmu = event->pmu_private;
> struct hw_perf_event *hwc = &event->hw;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&pmu->lock, flags);
> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>
> hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
>
> if (mode & PERF_EF_START)
> - __rapl_pmu_event_start(pmu, event);
> + __rapl_pmu_event_start(rapl_pmu, event);
>
> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
>
> return 0;
> }
> @@ -343,7 +343,7 @@ static int rapl_pmu_event_init(struct perf_event
> *event)
> {
> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
> int bit, ret = 0;
> - struct rapl_pmu *pmu;
> + struct rapl_pmu *rapl_pmu;
>
> /* only look at RAPL events */
> if (event->attr.type != rapl_pmus->pmu.type)
> @@ -373,11 +373,11 @@ static int rapl_pmu_event_init(struct
> perf_event *event)
> return -EINVAL;
>
> /* must be done before validate_group */
> - pmu = cpu_to_rapl_pmu(event->cpu);
> - if (!pmu)
> + rapl_pmu = cpu_to_rapl_pmu(event->cpu);
> + if (!rapl_pmu)
> return -EINVAL;
> - event->cpu = pmu->cpu;
> - event->pmu_private = pmu;
> + event->cpu = rapl_pmu->cpu;
> + event->pmu_private = rapl_pmu;
> event->hw.event_base = rapl_msrs[bit].msr;
> event->hw.config = cfg;
> event->hw.idx = bit;
> @@ -560,22 +560,22 @@ static struct perf_msr amd_rapl_msrs[] = {
> static int rapl_cpu_offline(unsigned int cpu)
> {
> const struct cpumask *rapl_pmu_cpumask =
> get_rapl_pmu_cpumask(cpu);
> - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
> + struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
> int target;
>
> /* Check if exiting cpu is used for collecting rapl events */
> if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
> return 0;
>
> - pmu->cpu = -1;
> + rapl_pmu->cpu = -1;
> /* Find a new cpu to collect rapl events */
> target = cpumask_any_but(rapl_pmu_cpumask, cpu);
>
> /* Migrate rapl events to the new target */
> if (target < nr_cpu_ids) {
> cpumask_set_cpu(target, &rapl_cpu_mask);
> - pmu->cpu = target;
> - perf_pmu_migrate_context(pmu->pmu, cpu, target);
> + rapl_pmu->cpu = target;
> + perf_pmu_migrate_context(rapl_pmu->pmu, cpu, target);
> }
> return 0;
> }
> @@ -584,21 +584,21 @@ static int rapl_cpu_online(unsigned int cpu)
> {
> unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
> const struct cpumask *rapl_pmu_cpumask =
> get_rapl_pmu_cpumask(cpu);
> - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
> + struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
> int target;
>
> - if (!pmu) {
> - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL,
> cpu_to_node(cpu));
> - if (!pmu)
> + if (!rapl_pmu) {
> + rapl_pmu = kzalloc_node(sizeof(*rapl_pmu),
> GFP_KERNEL, cpu_to_node(cpu));
> + if (!rapl_pmu)
> return -ENOMEM;
>
> - raw_spin_lock_init(&pmu->lock);
> - INIT_LIST_HEAD(&pmu->active_list);
> - pmu->pmu = &rapl_pmus->pmu;
> - pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
> - rapl_hrtimer_init(pmu);
> + raw_spin_lock_init(&rapl_pmu->lock);
> + INIT_LIST_HEAD(&rapl_pmu->active_list);
> + rapl_pmu->pmu = &rapl_pmus->pmu;
> + rapl_pmu->timer_interval =
> ms_to_ktime(rapl_timer_ms);
> + rapl_hrtimer_init(rapl_pmu);
>
> - rapl_pmus->pmus[rapl_pmu_idx] = pmu;
> + rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
> }
>
> /*
> @@ -610,7 +610,7 @@ static int rapl_cpu_online(unsigned int cpu)
> return 0;
>
> cpumask_set_cpu(cpu, &rapl_cpu_mask);
> - pmu->cpu = cpu;
> + rapl_pmu->cpu = cpu;
> return 0;
> }
>
> @@ -679,7 +679,7 @@ static void cleanup_rapl_pmus(void)
> int i;
>
> for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
> - kfree(rapl_pmus->pmus[i]);
> + kfree(rapl_pmus->rapl_pmu[i]);
> kfree(rapl_pmus);
> }
>
> @@ -699,7 +699,7 @@ static int __init init_rapl_pmus(void)
> if (rapl_pmu_is_pkg_scope())
> nr_rapl_pmu = topology_max_packages();
>
> - rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus,
> nr_rapl_pmu), GFP_KERNEL);
> + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu,
> nr_rapl_pmu), GFP_KERNEL);
> if (!rapl_pmus)
> return -ENOMEM;
>
> @@ -345,9 +353,14 @@ static int rapl_pmu_event_init(struct perf_event
> *event)
> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
> int bit, ret = 0;
> struct rapl_pmu *rapl_pmu;
> + struct rapl_pmus *curr_rapl_pmus;
>
> /* only look at RAPL events */
> - if (event->attr.type != rapl_pmus->pmu.type)
> + if (event->attr.type == rapl_pmus->pmu.type)
> + curr_rapl_pmus = rapl_pmus;
> + else if (rapl_pmus_per_core && event->attr.type ==
> rapl_pmus_per_core->pmu.type)
> + curr_rapl_pmus = rapl_pmus_per_core;
> + else
> return -ENOENT;
can we use container_of(event->pmu, struct rapl_pmus, pmu)?
>
> /* check only supported bits are set */
> @@ -374,9 +387,14 @@ static int rapl_pmu_event_init(struct perf_event
> *event)
> return -EINVAL;
>
> /* must be done before validate_group */
> - rapl_pmu = cpu_to_rapl_pmu(event->cpu);
> + if (curr_rapl_pmus == rapl_pmus_per_core)
> + rapl_pmu = curr_rapl_pmus-
> >rapl_pmu[topology_core_id(event->cpu)];
> + else
> + rapl_pmu = curr_rapl_pmus-
> >rapl_pmu[get_rapl_pmu_idx(event->cpu)];
> +
> if (!rapl_pmu)
> return -EINVAL;
Current code has PERF_EV_CAP_READ_ACTIVE_PKG flag set.
Can you help me understand why it does not affect the new per-core pmu?
> +
> event->cpu = rapl_pmu->cpu;
> event->pmu_private = rapl_pmu;
> event->hw.event_base = rapl_msrs[bit].msr;
> @@ -408,17 +426,38 @@ static struct attribute_group
> rapl_pmu_attr_group = {
> .attrs = rapl_pmu_attrs,
> };
>
> +static ssize_t rapl_get_attr_per_core_cpumask(struct device *dev,
> + struct device_attribute
> *attr, char *buf)
> +{
> + return cpumap_print_to_pagebuf(true, buf,
> &rapl_pmus_per_core->cpumask);
> +}
> +
> +static struct device_attribute dev_attr_per_core_cpumask =
> __ATTR(cpumask, 0444,
> +
> rapl_get_attr_per_core_cpumask,
> +
> NULL);
DEVICE_ATTR
> +
> +static struct attribute *rapl_pmu_per_core_attrs[] = {
> + &dev_attr_per_core_cpumask.attr,
> + NULL,
> +};
> +
> +static struct attribute_group rapl_pmu_per_core_attr_group = {
> + .attrs = rapl_pmu_per_core_attrs,
> +};
> +
> RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
> RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
> RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
> RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
> RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
> +RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x06");
energy-per-core is for a separate pmu, so the event id does not need to
be 6. The same applies to PERF_RAPL_PERCORE.
>
> static struct rapl_model model_amd_hygon = {
> - .events = BIT(PERF_RAPL_PKG),
> + .events = BIT(PERF_RAPL_PKG) |
> + BIT(PERF_RAPL_PERCORE),
> .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
> .rapl_msrs = amd_rapl_msrs,
> + .per_core = true,
> };
Can we use the PERF_RAPL_PERCORE bit to check for per-core PMU support?
Just FYI, arch/x86/events/intel/cstate.c handles package/module/core
scope cstate pmus. It uses a different approach in the probing part,
which IMO is clearer.
thanks,
rui
Hello Rui,
On 6/11/2024 11:13 AM, Zhang, Rui wrote:
> On Mon, 2024-06-10 at 10:07 +0000, Dhananjay Ugwekar wrote:
>> Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to
>> avoid any confusion between the variables of two different
>> structs pmu and rapl_pmu.
>> As rapl_pmu also contains a pointer to
>> struct pmu, which leads to situations in code like pmu->pmu,
>> which is needlessly confusing. Above scenario is replaced with
>> much more readable rapl_pmu->pmu with this change.
>>
>> Also rename "pmus" member in rapl_pmus struct, for same reason.
>>
>
> As you are adding a new per_core pmu, can we just rename the current
> rapl_pmu to something like rapl_pkg_pmus (as I mentioned in the
> previous email, we can consider the current RAPL MSRs as package scope
> on Intel platforms as well), and name the new one as rapl_core_pmus?
Sure, this makes sense; I will modify the original "rapl_pmus" variable name.
But please note that the renaming I refer to in this patch is limited to the
local "struct rapl_pmu" variables used inside functions, not the global
variable name you mention.
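For the globals, something along the lines of the naming you suggest could
then be used in a later revision (just a sketch, not part of this patch):

        static struct rapl_pmus *rapl_pkg_pmus;         /* existing package(/die) scope PMUs */
        static struct rapl_pmus *rapl_core_pmus;        /* new per-core scope PMUs */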
Regards,
Dhananjay
>
> IMO, "rapl_pmus" + "rapl_pmus_per_core" is still confusing.
>
> thanks,
> rui
>
>> No functional change.
>>
>> Signed-off-by: Dhananjay Ugwekar <[email protected]>
>> ---
>> arch/x86/events/rapl.c | 104 ++++++++++++++++++++-------------------
>> --
>> 1 file changed, 52 insertions(+), 52 deletions(-)
>>
>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>> index 73be25e1f4b4..b4e2073a178e 100644
>> --- a/arch/x86/events/rapl.c
>> +++ b/arch/x86/events/rapl.c
>> @@ -120,7 +120,7 @@ struct rapl_pmu {
>> struct rapl_pmus {
>> struct pmu pmu;
>> unsigned int nr_rapl_pmu;
>> - struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
>> + struct rapl_pmu *rapl_pmu[]
>> __counted_by(nr_rapl_pmu);
>> };
>>
>> enum rapl_unit_quirk {
>> @@ -164,7 +164,7 @@ static inline struct rapl_pmu
>> *cpu_to_rapl_pmu(unsigned int cpu)
>> * The unsigned check also catches the '-1' return value for
>> non
>> * existent mappings in the topology map.
>> */
>> - return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus-
>>> pmus[rapl_pmu_idx] : NULL;
>> + return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus-
>>> rapl_pmu[rapl_pmu_idx] : NULL;
>> }
>>
>> static inline u64 rapl_read_counter(struct perf_event *event)
>> @@ -228,34 +228,34 @@ static void rapl_start_hrtimer(struct rapl_pmu
>> *pmu)
>>
>> static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer
>> *hrtimer)
>> {
>> - struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu,
>> hrtimer);
>> + struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct
>> rapl_pmu, hrtimer);
>> struct perf_event *event;
>> unsigned long flags;
>>
>> - if (!pmu->n_active)
>> + if (!rapl_pmu->n_active)
>> return HRTIMER_NORESTART;
>>
>> - raw_spin_lock_irqsave(&pmu->lock, flags);
>> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>>
>> - list_for_each_entry(event, &pmu->active_list, active_entry)
>> + list_for_each_entry(event, &rapl_pmu->active_list,
>> active_entry)
>> rapl_event_update(event);
>>
>> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
>> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
>>
>> - hrtimer_forward_now(hrtimer, pmu->timer_interval);
>> + hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
>>
>> return HRTIMER_RESTART;
>> }
>>
>> -static void rapl_hrtimer_init(struct rapl_pmu *pmu)
>> +static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
>> {
>> - struct hrtimer *hr = &pmu->hrtimer;
>> + struct hrtimer *hr = &rapl_pmu->hrtimer;
>>
>> hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> hr->function = rapl_hrtimer_handle;
>> }
>>
>> -static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
>> +static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
>> struct perf_event *event)
>> {
>> if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
>> @@ -263,39 +263,39 @@ static void __rapl_pmu_event_start(struct
>> rapl_pmu *pmu,
>>
>> event->hw.state = 0;
>>
>> - list_add_tail(&event->active_entry, &pmu->active_list);
>> + list_add_tail(&event->active_entry, &rapl_pmu->active_list);
>>
>> local64_set(&event->hw.prev_count, rapl_read_counter(event));
>>
>> - pmu->n_active++;
>> - if (pmu->n_active == 1)
>> - rapl_start_hrtimer(pmu);
>> + rapl_pmu->n_active++;
>> + if (rapl_pmu->n_active == 1)
>> + rapl_start_hrtimer(rapl_pmu);
>> }
>>
>> static void rapl_pmu_event_start(struct perf_event *event, int mode)
>> {
>> - struct rapl_pmu *pmu = event->pmu_private;
>> + struct rapl_pmu *rapl_pmu = event->pmu_private;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&pmu->lock, flags);
>> - __rapl_pmu_event_start(pmu, event);
>> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
>> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>> + __rapl_pmu_event_start(rapl_pmu, event);
>> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
>> }
>>
>> static void rapl_pmu_event_stop(struct perf_event *event, int mode)
>> {
>> - struct rapl_pmu *pmu = event->pmu_private;
>> + struct rapl_pmu *rapl_pmu = event->pmu_private;
>> struct hw_perf_event *hwc = &event->hw;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&pmu->lock, flags);
>> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>>
>> /* mark event as deactivated and stopped */
>> if (!(hwc->state & PERF_HES_STOPPED)) {
>> - WARN_ON_ONCE(pmu->n_active <= 0);
>> - pmu->n_active--;
>> - if (pmu->n_active == 0)
>> - hrtimer_cancel(&pmu->hrtimer);
>> + WARN_ON_ONCE(rapl_pmu->n_active <= 0);
>> + rapl_pmu->n_active--;
>> + if (rapl_pmu->n_active == 0)
>> + hrtimer_cancel(&rapl_pmu->hrtimer);
>>
>> list_del(&event->active_entry);
>>
>> @@ -313,23 +313,23 @@ static void rapl_pmu_event_stop(struct
>> perf_event *event, int mode)
>> hwc->state |= PERF_HES_UPTODATE;
>> }
>>
>> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
>> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
>> }
>>
>> static int rapl_pmu_event_add(struct perf_event *event, int mode)
>> {
>> - struct rapl_pmu *pmu = event->pmu_private;
>> + struct rapl_pmu *rapl_pmu = event->pmu_private;
>> struct hw_perf_event *hwc = &event->hw;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&pmu->lock, flags);
>> + raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
>>
>> hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
>>
>> if (mode & PERF_EF_START)
>> - __rapl_pmu_event_start(pmu, event);
>> + __rapl_pmu_event_start(rapl_pmu, event);
>>
>> - raw_spin_unlock_irqrestore(&pmu->lock, flags);
>> + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
>>
>> return 0;
>> }
>> @@ -343,7 +343,7 @@ static int rapl_pmu_event_init(struct perf_event
>> *event)
>> {
>> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
>> int bit, ret = 0;
>> - struct rapl_pmu *pmu;
>> + struct rapl_pmu *rapl_pmu;
>>
>> /* only look at RAPL events */
>> if (event->attr.type != rapl_pmus->pmu.type)
>> @@ -373,11 +373,11 @@ static int rapl_pmu_event_init(struct
>> perf_event *event)
>> return -EINVAL;
>>
>> /* must be done before validate_group */
>> - pmu = cpu_to_rapl_pmu(event->cpu);
>> - if (!pmu)
>> + rapl_pmu = cpu_to_rapl_pmu(event->cpu);
>> + if (!rapl_pmu)
>> return -EINVAL;
>> - event->cpu = pmu->cpu;
>> - event->pmu_private = pmu;
>> + event->cpu = rapl_pmu->cpu;
>> + event->pmu_private = rapl_pmu;
>> event->hw.event_base = rapl_msrs[bit].msr;
>> event->hw.config = cfg;
>> event->hw.idx = bit;
>> @@ -560,22 +560,22 @@ static struct perf_msr amd_rapl_msrs[] = {
>> static int rapl_cpu_offline(unsigned int cpu)
>> {
>> const struct cpumask *rapl_pmu_cpumask =
>> get_rapl_pmu_cpumask(cpu);
>> - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
>> + struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
>> int target;
>>
>> /* Check if exiting cpu is used for collecting rapl events */
>> if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
>> return 0;
>>
>> - pmu->cpu = -1;
>> + rapl_pmu->cpu = -1;
>> /* Find a new cpu to collect rapl events */
>> target = cpumask_any_but(rapl_pmu_cpumask, cpu);
>>
>> /* Migrate rapl events to the new target */
>> if (target < nr_cpu_ids) {
>> cpumask_set_cpu(target, &rapl_cpu_mask);
>> - pmu->cpu = target;
>> - perf_pmu_migrate_context(pmu->pmu, cpu, target);
>> + rapl_pmu->cpu = target;
>> + perf_pmu_migrate_context(rapl_pmu->pmu, cpu, target);
>> }
>> return 0;
>> }
>> @@ -584,21 +584,21 @@ static int rapl_cpu_online(unsigned int cpu)
>> {
>> unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
>> const struct cpumask *rapl_pmu_cpumask =
>> get_rapl_pmu_cpumask(cpu);
>> - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
>> + struct rapl_pmu *rapl_pmu = cpu_to_rapl_pmu(cpu);
>> int target;
>>
>> - if (!pmu) {
>> - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL,
>> cpu_to_node(cpu));
>> - if (!pmu)
>> + if (!rapl_pmu) {
>> + rapl_pmu = kzalloc_node(sizeof(*rapl_pmu),
>> GFP_KERNEL, cpu_to_node(cpu));
>> + if (!rapl_pmu)
>> return -ENOMEM;
>>
>> - raw_spin_lock_init(&pmu->lock);
>> - INIT_LIST_HEAD(&pmu->active_list);
>> - pmu->pmu = &rapl_pmus->pmu;
>> - pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
>> - rapl_hrtimer_init(pmu);
>> + raw_spin_lock_init(&rapl_pmu->lock);
>> + INIT_LIST_HEAD(&rapl_pmu->active_list);
>> + rapl_pmu->pmu = &rapl_pmus->pmu;
>> + rapl_pmu->timer_interval =
>> ms_to_ktime(rapl_timer_ms);
>> + rapl_hrtimer_init(rapl_pmu);
>>
>> - rapl_pmus->pmus[rapl_pmu_idx] = pmu;
>> + rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
>> }
>>
>> /*
>> @@ -610,7 +610,7 @@ static int rapl_cpu_online(unsigned int cpu)
>> return 0;
>>
>> cpumask_set_cpu(cpu, &rapl_cpu_mask);
>> - pmu->cpu = cpu;
>> + rapl_pmu->cpu = cpu;
>> return 0;
>> }
>>
>> @@ -679,7 +679,7 @@ static void cleanup_rapl_pmus(void)
>> int i;
>>
>> for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
>> - kfree(rapl_pmus->pmus[i]);
>> + kfree(rapl_pmus->rapl_pmu[i]);
>> kfree(rapl_pmus);
>> }
>>
>> @@ -699,7 +699,7 @@ static int __init init_rapl_pmus(void)
>> if (rapl_pmu_is_pkg_scope())
>> nr_rapl_pmu = topology_max_packages();
>>
>> - rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus,
>> nr_rapl_pmu), GFP_KERNEL);
>> + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu,
>> nr_rapl_pmu), GFP_KERNEL);
>> if (!rapl_pmus)
>> return -ENOMEM;
>>
>
Hello Rui,
On 6/11/2024 11:05 AM, Zhang, Rui wrote:
> On Mon, 2024-06-10 at 10:07 +0000, Dhananjay Ugwekar wrote:
>> After commit ("x86/cpu/topology: Add support for the AMD 0x80000026
>> leaf"),
>> on AMD processors that support extended CPUID leaf 0x80000026, the
>> topology_die_cpumask() and topology_logical_die_id() macros, no
>> longer
>> return the package cpumask and package id, instead they return the
>> CCD
>> (Core Complex Die) mask and id respectively. This leads to the
>> energy-pkg
>> event scope to be modified to CCD instead of package.
>>
>> Replacing these macros with their package counterparts fixes the
>> energy-pkg event for AMD CPUs.
>>
>> However due to the difference between the scope of energy-pkg event
>> for
>> Intel and AMD CPUs,
>
> Actually, on Intel platforms, the energy-pkg event is also package
> scope except one platform, and that one is the only multi-die system
> with RAPL MSRs available.
> So that is why the die/pkg logic was introduced in the first place.
>
> I like the macro/helpers in this patch. The logic inside them may need
> to be optimized for Intel platforms, but that is a separate task.
Yes, that is correct. However, the die macros return the package values on a
single-die system anyway, so I preferred to use them for all Intel systems
(single- or multi-die).
We can certainly update the macro name or logic in the future to make it more
semantically correct.
Thanks for the review!
Regards,
Dhananjay
>
>> we have to replace these macros conditionally only for
>> AMD CPUs.
>>
>> On a 12 CCD 1 Package AMD Zen4 Genoa machine:
>>
>> Before:
>> $ cat /sys/devices/power/cpumask
>> 0,8,16,24,32,40,48,56,64,72,80,88.
>>
>> The expected cpumask here is supposed to be just "0", as it is a
>> package
>> scope event, only one CPU will be collecting the event for all the
>> CPUs in
>> the package.
>>
>> After:
>> $ cat /sys/devices/power/cpumask
>> 0
>>
>> Signed-off-by: Dhananjay Ugwekar <[email protected]>
>> Fixes: 63edbaa48a57 ("x86/cpu/topology: Add support for the AMD
>> 0x80000026 leaf")
>
> Reviewed-by: Zhang Rui <[email protected]>
>
> thanks,
> rui
>> ---
>> PS: This patch was earlier sent separately(link below), it has not
>> been
>> merged yet, it is necessary for this patchset to work properly, also
>> it
>> fixes the pre-existing energy-pkg event.
>> https://lore.kernel.org/linux-perf-users/[email protected]/
>> ---
>> arch/x86/events/rapl.c | 30 ++++++++++++++++++++++++++----
>> 1 file changed, 26 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>> index b985ca79cf97..73be25e1f4b4 100644
>> --- a/arch/x86/events/rapl.c
>> +++ b/arch/x86/events/rapl.c
>> @@ -103,6 +103,10 @@ static struct perf_pmu_events_attr
>> event_attr_##v = { \
>> .event_str =
>> str, \
>> };
>>
>> +#define rapl_pmu_is_pkg_scope() \
>> + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
>> + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
>> +
>> struct rapl_pmu {
>> raw_spinlock_t lock;
>> int n_active;
>> @@ -140,9 +144,21 @@ static unsigned int rapl_cntr_mask;
>> static u64 rapl_timer_ms;
>> static struct perf_msr *rapl_msrs;
>>
>> +static inline unsigned int get_rapl_pmu_idx(int cpu)
>> +{
>> + return rapl_pmu_is_pkg_scope() ?
>> topology_logical_package_id(cpu) :
>> +
>> topology_logical_die_id(cpu);
>> +}
>> +
>> +static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
>> +{
>> + return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
>> + topology_die_cpumask(cpu);
>> +}
>> +
>> static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
>> {
>> - unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
>> + unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
>>
>> /*
>> * The unsigned check also catches the '-1' return value for
>> non
>> @@ -543,6 +559,7 @@ static struct perf_msr amd_rapl_msrs[] = {
>>
>> static int rapl_cpu_offline(unsigned int cpu)
>> {
>> + const struct cpumask *rapl_pmu_cpumask =
>> get_rapl_pmu_cpumask(cpu);
>> struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
>> int target;
>>
>> @@ -552,7 +569,7 @@ static int rapl_cpu_offline(unsigned int cpu)
>>
>> pmu->cpu = -1;
>> /* Find a new cpu to collect rapl events */
>> - target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
>> + target = cpumask_any_but(rapl_pmu_cpumask, cpu);
>>
>> /* Migrate rapl events to the new target */
>> if (target < nr_cpu_ids) {
>> @@ -565,6 +582,8 @@ static int rapl_cpu_offline(unsigned int cpu)
>>
>> static int rapl_cpu_online(unsigned int cpu)
>> {
>> + unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
>> + const struct cpumask *rapl_pmu_cpumask =
>> get_rapl_pmu_cpumask(cpu);
>> struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
>> int target;
>>
>> @@ -579,14 +598,14 @@ static int rapl_cpu_online(unsigned int cpu)
>> pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
>> rapl_hrtimer_init(pmu);
>>
>> - rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
>> + rapl_pmus->pmus[rapl_pmu_idx] = pmu;
>> }
>>
>> /*
>> * Check if there is an online cpu in the package which
>> collects rapl
>> * events already.
>> */
>> - target = cpumask_any_and(&rapl_cpu_mask,
>> topology_die_cpumask(cpu));
>> + target = cpumask_any_and(&rapl_cpu_mask, rapl_pmu_cpumask);
>> if (target < nr_cpu_ids)
>> return 0;
>>
>> @@ -677,6 +696,9 @@ static int __init init_rapl_pmus(void)
>> {
>> int nr_rapl_pmu = topology_max_packages() *
>> topology_max_dies_per_package();
>>
>> + if (rapl_pmu_is_pkg_scope())
>> + nr_rapl_pmu = topology_max_packages();
>> +
>> rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus,
>> nr_rapl_pmu), GFP_KERNEL);
>> if (!rapl_pmus)
>> return -ENOMEM;
>
Hi Rui,
On 6/11/2024 2:00 PM, Zhang, Rui wrote:
>> @@ -345,9 +353,14 @@ static int rapl_pmu_event_init(struct perf_event
>> *event)
>> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
>> int bit, ret = 0;
>> struct rapl_pmu *rapl_pmu;
>> + struct rapl_pmus *curr_rapl_pmus;
>>
>> /* only look at RAPL events */
>> - if (event->attr.type != rapl_pmus->pmu.type)
>> + if (event->attr.type == rapl_pmus->pmu.type)
>> + curr_rapl_pmus = rapl_pmus;
>> + else if (rapl_pmus_per_core && event->attr.type ==
>> rapl_pmus_per_core->pmu.type)
>> + curr_rapl_pmus = rapl_pmus_per_core;
>> + else
>> return -ENOENT;
>
> can we use container_of(event->pmu, struct rapl_pmus, pmu)?
Yes! That would be cleaner; I will add it in the next version.
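Roughly, the check would then become something like this (a sketch only,
using the variable names from this series):

        /* only look at RAPL events */
        if (event->attr.type != rapl_pmus->pmu.type &&
            (!rapl_pmus_per_core || event->attr.type != rapl_pmus_per_core->pmu.type))
                return -ENOENT;

        /*
         * struct pmu is embedded in struct rapl_pmus, so the enclosing
         * rapl_pmus can be recovered directly from event->pmu
         */
        curr_rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);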
>
>>
>> /* check only supported bits are set */
>> @@ -374,9 +387,14 @@ static int rapl_pmu_event_init(struct perf_event
>> *event)
>> return -EINVAL;
>>
>> /* must be done before validate_group */
>> - rapl_pmu = cpu_to_rapl_pmu(event->cpu);
>> + if (curr_rapl_pmus == rapl_pmus_per_core)
>> + rapl_pmu = curr_rapl_pmus-
>>> rapl_pmu[topology_core_id(event->cpu)];
>> + else
>> + rapl_pmu = curr_rapl_pmus-
>>> rapl_pmu[get_rapl_pmu_idx(event->cpu)];
>> +
>> if (!rapl_pmu)
>> return -EINVAL;
>
> Current code has PERF_EV_CAP_READ_ACTIVE_PKG flag set.
> Can you help me understand why it does not affect the new per-core pmu?
Good question. I went back and looked through the code, and it turns out that
we never go through the code path that checks this flag and decides whether to
run on the local CPU (the CPU on which perf is running) or on event->cpu.
So having or not having this flag makes no difference here; I ran a small
experiment to confirm this.
On a single-package system, any core should be able to read the energy-pkg
RAPL MSR and return the value, so there would be no need for an SMP call to
event->cpu. However, the ftrace log below shows that only core 0 executes the
PMU event even though we launched perf stat for core 1.
--------------------------------------------------------------------------
root@shatadru:/sys/kernel/tracing# perf stat -C 1 -e power/energy-pkg/ -- dd if=/dev/zero of=/dev/null bs=1M count=100000
100000+0 records in
100000+0 records out
104857600000 bytes (105 GB, 98 GiB) copied, 2.03295 s, 51.6 GB/s
Performance counter stats for 'CPU(s) 1':
231.59 Joules power/energy-pkg/
2.033916467 seconds time elapsed
root@shatadru:/sys/kernel/tracing# echo 0 > tracing_on
root@shatadru:/sys/kernel/tracing# cat trace
# tracer: function
#
# entries-in-buffer/entries-written: 12/12 #P:192
#
# _-----=> irqs-off/BH-disabled
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / _-=> migrate-disable
# |||| / delay
# TASK-PID CPU# ||||| TIMESTAMP FUNCTION
# | | | ||||| | |
perf-3309 [096] ...1. 3422.558183: rapl_get_attr_cpumask <-dev_attr_show
perf-3309 [001] ...1. 3422.559436: rapl_pmu_event_init <-perf_try_init_event
perf-3309 [001] ...1. 3422.559441: rapl_pmu_event_init <-perf_try_init_event
perf-3309 [001] ...1. 3422.559449: rapl_pmu_event_init <-perf_try_init_event
perf-3309 [001] ...1. 3422.559537: smp_call_function_single <-event_function_call <-- smp call to the event owner cpu(i.e. CPU0)
<idle>-0 [000] d.h3. 3422.559544: rapl_pmu_event_add <-event_sched_in <-- CPU# column changed to 0
<idle>-0 [000] d.h4. 3422.559545: __rapl_pmu_event_start <-rapl_pmu_event_add
perf-3309 [001] ...1. 3424.593398: smp_call_function_single <-event_function_call <-- smp call to the event owner cpu(i.e. CPU0)
<idle>-0 [000] d.h3. 3424.593403: rapl_pmu_event_del <-event_sched_out <-- CPU# column changed to 0
<idle>-0 [000] d.h3. 3424.593403: rapl_pmu_event_stop <-rapl_pmu_event_del
<idle>-0 [000] d.h4. 3424.593404: rapl_event_update.isra.0 <-rapl_pmu_event_stop
perf-3309 [001] ...1. 3424.593514: smp_call_function_single <-event_function_call
--------------------------------------------------------------------------
So, since we always use event->cpu to run the event, the per-core PMU is not
affected by this flag.
In the next version, I will enable this flag only for package-scope events.
But we will need to look into fixing this ineffective flag.
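Concretely, I have something like this in mind for rapl_pmu_event_init()
(a sketch; curr_rapl_pmus as in the hunk above):

        /*
         * PERF_EV_CAP_READ_ACTIVE_PKG is only meaningful for package-scope
         * events, so do not set it for events of the per-core PMU
         */
        if (curr_rapl_pmus != rapl_pmus_per_core)
                event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;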
>
>> +
>> event->cpu = rapl_pmu->cpu;
>> event->pmu_private = rapl_pmu;
>> event->hw.event_base = rapl_msrs[bit].msr;
>> @@ -408,17 +426,38 @@ static struct attribute_group
>> rapl_pmu_attr_group = {
>> .attrs = rapl_pmu_attrs,
>> };
>>
>> +static ssize_t rapl_get_attr_per_core_cpumask(struct device *dev,
>> + struct device_attribute
>> *attr, char *buf)
>> +{
>> + return cpumap_print_to_pagebuf(true, buf,
>> &rapl_pmus_per_core->cpumask);
>> +}
>> +
>> +static struct device_attribute dev_attr_per_core_cpumask =
>> __ATTR(cpumask, 0444,
>> +
>> rapl_get_attr_per_core_cpumask,
>> +
>> NULL);
>
> DEVICE_ATTR
I was not able to use DEVICE_ATTR() here, because it derives the variable name
from the attribute name, and a "struct device_attribute dev_attr_cpumask" is
already created for the package PMU cpumask via DEVICE_ATTR().
So I had to define "struct device_attribute dev_attr_per_core_cpumask"
manually with __ATTR() to avoid the variable name clash.
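To illustrate the clash (the first definition is the existing package PMU
attribute in rapl.c):

        /* existing package PMU cpumask attribute, expands to "dev_attr_cpumask" */
        static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

        /*
         * a second DEVICE_ATTR(cpumask, ...) would redefine dev_attr_cpumask,
         * hence the open-coded __ATTR() with a distinct variable name
         */
        static struct device_attribute dev_attr_per_core_cpumask =
                __ATTR(cpumask, 0444, rapl_get_attr_per_core_cpumask, NULL);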
>
>> +
>> +static struct attribute *rapl_pmu_per_core_attrs[] = {
>> + &dev_attr_per_core_cpumask.attr,
>> + NULL,
>> +};
>> +
>> +static struct attribute_group rapl_pmu_per_core_attr_group = {
>> + .attrs = rapl_pmu_per_core_attrs,
>> +};
>> +
>> RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
>> RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
>> RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
>> RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
>> RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
>> +RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x06");
>
> energy-per-core is for a separate pmu, so the event id does not need to
> be 6. The same applies to PERF_RAPL_PERCORE.
Correct, I will fix this in the next version.
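Since the per-core PMU has its own event namespace in sysfs, the next version
can simply start from 0x1, e.g. (a sketch, assuming the per-core PMU also gets
its own small perf_msr array instead of indexing into amd_rapl_msrs):

        RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x01");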
>
>>
>> static struct rapl_model model_amd_hygon = {
>> - .events = BIT(PERF_RAPL_PKG),
>> + .events = BIT(PERF_RAPL_PKG) |
>> + BIT(PERF_RAPL_PERCORE),
>> .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
>> .rapl_msrs = amd_rapl_msrs,
>> + .per_core = true,
>> };
>
> Can we use the PERF_RAPL_PERCORE bit to check for per-core PMU support?
Makes sense, will modify.
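i.e. something like this at probe/init time, instead of the extra flag
(sketch):

        /*
         * the events bitmap already encodes per-core counter support,
         * so no separate rapl_model->per_core flag is needed
         */
        if (rapl_model->events & BIT(PERF_RAPL_PERCORE)) {
                /* register the per-core ("power_per_core") PMU */
        }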
>
> Just FYI, arch/x86/events/intel/cstate.c handles package/module/core
> scope cstate pmus. It uses a different approach in the probing part,
> which IMO is clearer.
Yes, I went through it. I see that separate variables are used to mark the
valid events for package and core scope, and that a wrapper function around
perf_msr_probe() is created. I will see whether that approach makes sense here
as well.
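A rough sketch of how that could look here (rapl_core_cntr_mask,
amd_rapl_core_msrs and PERF_RAPL_CORE_MAX are placeholder names, not code from
this series):

        /* package/die scope events, probed as today */
        rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false,
                                        (void *) &rapl_model->events);

        /*
         * per-core scope events probed separately with their own msr array
         * and counter mask, similar to how cstate.c keeps the core/module/
         * package scope events apart
         */
        if (rapl_model->events & BIT(PERF_RAPL_PERCORE))
                rapl_core_cntr_mask = perf_msr_probe(amd_rapl_core_msrs,
                                                     PERF_RAPL_CORE_MAX, false,
                                                     (void *) &rapl_model->events);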
Thanks for the review,
Dhananjay
>
> thanks,
> rui
>