RAPL energy counter MSRs are exposed via perf PMU. But this is done by
separate code which is not part of RAPL framework, and it cannot be
reused by other RAPL Interface drivers like TPMI RAPL.
Introduce two new APIs for PMU support in RAPL framework. This allows
TPMI RAPL PMU support and also makes it possible for future cleanups of
MSR RAPL PMU code.
Changes since V1:
- remove the MSR RAPL PMU conversion because it is a separate work that
can be done later.
- instead of using a flag to indicate the need of PMU support, introduce
two APIs for the RAPL Interface driver to invoke explicitly.
- minor code/comments/changelog improvements.
thanks,
rui
----------------------------------------------------------------
Zhang Rui (3):
powercap: intel_rapl: Sort header files
powercap: intel_rapl: Introduce APIs for PMU support
powercap: intel_rapl_tpmi: Enable PMU support
drivers/powercap/intel_rapl_common.c | 561 ++++++++++++++++++++++++++++++++++-
drivers/powercap/intel_rapl_tpmi.c | 3 +
include/linux/intel_rapl.h | 23 ++
3 files changed, 575 insertions(+), 12 deletions(-)
Sort header files alphabetically.
Signed-off-by: Zhang Rui <[email protected]>
---
drivers/powercap/intel_rapl_common.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index a28d54fd5222..1f4a7aa12d77 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -5,27 +5,27 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/bitmap.h>
#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/intel_rapl.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/slab.h>
#include <linux/log2.h>
-#include <linux/bitmap.h>
-#include <linux/delay.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
#include <linux/powercap.h>
-#include <linux/suspend.h>
-#include <linux/intel_rapl.h>
#include <linux/processor.h>
-#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
-#include <asm/iosf_mbi.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/iosf_mbi.h>
/* bitmasks for RAPL MSRs, used by primitive access functions */
#define ENERGY_STATUS_MASK 0xffffffff
--
2.34.1
Enable RAPL PMU support for TPMI RAPL driver.
Signed-off-by: Zhang Rui <[email protected]>
---
drivers/powercap/intel_rapl_tpmi.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c
index f6b7f085977c..947544e4d229 100644
--- a/drivers/powercap/intel_rapl_tpmi.c
+++ b/drivers/powercap/intel_rapl_tpmi.c
@@ -302,6 +302,8 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev,
goto err;
}
+ rapl_package_add_pmu(trp->rp);
+
auxiliary_set_drvdata(auxdev, trp);
return 0;
@@ -314,6 +316,7 @@ static void intel_rapl_tpmi_remove(struct auxiliary_device *auxdev)
{
struct tpmi_rapl_package *trp = auxiliary_get_drvdata(auxdev);
+ rapl_package_remove_pmu(trp->rp);
rapl_remove_package(trp->rp);
trp_release(trp);
}
--
2.34.1
Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu().
RAPL driver can invoke these APIs to expose its supported energy
counters via perf PMU. The new RAPL PMU is fully compatible with current
MSR RAPL PMU, including using the same PMU name and events
name/id/unit/scale, etc.
For example, use below command
perf stat -e power/energy-pkg/ -e power/energy-ram/ -e power/energy-cores/ FOO
to get the energy consumption for events in the "perf list" output.
This does not introduce any conflict because TPMI RAPL is the only user
of these APIs currently, and it never co-exists with MSR RAPL.
Note that, TPMI RAPL is probed dynamically, and the events supported by
each TPMI RAPL device can be different. Thus, for a new RAPL Package
device that wants PMU support,
1. if PMU has not been registered yet, register the PMU with events
supported by current RAPL Package device
2. if PMU has been registered and no new events support is needed, do
nothing
2. or else, unregister current PMU and re-register the PMU with events
supported by all probed RAPL Package devices that need PMU support.
For example, on a dual-package system using TPMI RAPL, it is possible
that Package 1 behaves as TPMI domain root and supports Psys domain.
In this case, register PMU without Psys event when probing Package 0,
and re-register the PMU with Psys event when probing Package 1.
Signed-off-by: Zhang Rui <[email protected]>
---
drivers/powercap/intel_rapl_common.c | 537 +++++++++++++++++++++++++++
include/linux/intel_rapl.h | 23 ++
2 files changed, 560 insertions(+)
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 1f4a7aa12d77..f51012f023b0 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -15,6 +15,8 @@
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <linux/powercap.h>
#include <linux/processor.h>
@@ -1506,6 +1508,541 @@ static int rapl_detect_domains(struct rapl_package *rp)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
+
+/* Support for RAPL PMU */
+
+struct rapl_pmu {
+ struct pmu pmu;
+ u64 timer_ms;
+ cpumask_t cpu_mask;
+ unsigned long domain_map; /* Events supported by all registered RAPL packages */
+ bool registered;
+};
+
+static struct rapl_pmu rapl_pmu;
+
+/* PMU helpers */
+static int get_pmu_cpu(struct rapl_package *rp)
+{
+ int cpu;
+
+ if (!rp->has_pmu)
+ return nr_cpu_ids;
+
+ if (rp->lead_cpu >= 0)
+ return rp->lead_cpu;
+
+ /* TPMI RAPL uses any CPU in the package for PMU */
+ for_each_online_cpu(cpu)
+ if (topology_physical_package_id(cpu) == rp->id)
+ return cpu;
+
+ return nr_cpu_ids;
+}
+
+static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
+{
+ if (!rp->has_pmu)
+ return false;
+
+ if (rp->lead_cpu >= 0)
+ return cpu == rp->lead_cpu;
+
+ /* TPMI RAPL uses any CPU in the package for PMU */
+ return topology_physical_package_id(cpu) == rp->id;
+}
+
+static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
+{
+ struct rapl_package *rp = event->pmu_private;
+
+ return &rp->pmu_data;
+}
+
+/* PMU event callbacks */
+
+/* Return 0 for unsupported events or failed read */
+static u64 event_read_counter(struct perf_event *event)
+{
+ struct rapl_package *rp = event->pmu_private;
+ u64 val;
+ int ret;
+
+ if (event->hw.idx < 0)
+ return 0;
+
+ ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val);
+ return ret ? 0 : val;
+}
+
+static void __rapl_pmu_event_start(struct perf_event *event)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &data->active_list);
+
+ local64_set(&event->hw.prev_count, event_read_counter(event));
+ if (++data->n_active == 1)
+ hrtimer_start(&data->hrtimer, data->timer_interval,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+ __rapl_pmu_event_start(event);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ new_raw_count = event_read_counter(event);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ */
+ delta = new_raw_count - prev_raw_count;
+
+ /*
+ * Scale delta to smallest unit (2^-32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ sdelta = delta * data->scale[event->hw.flags];
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ /* Mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(data->n_active <= 0);
+ if (--data->n_active == 0)
+ hrtimer_cancel(&data->hrtimer);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* Check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(event);
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+/*
+ * RAPL energy status counters
+ */
+enum perf_rapl_events {
+ PERF_RAPL_PP0 = 0, /* all cores */
+ PERF_RAPL_PKG, /* entire package */
+ PERF_RAPL_RAM, /* DRAM */
+ PERF_RAPL_PP1, /* gpu */
+ PERF_RAPL_PSYS, /* psys */
+};
+#define RAPL_EVENT_MASK GENMASK(7, 0)
+
+static int event_to_domain(int event)
+{
+ switch (event) {
+ case PERF_RAPL_PP0:
+ return RAPL_DOMAIN_PP0;
+ case PERF_RAPL_PKG:
+ return RAPL_DOMAIN_PACKAGE;
+ case PERF_RAPL_RAM:
+ return RAPL_DOMAIN_DRAM;
+ case PERF_RAPL_PP1:
+ return RAPL_DOMAIN_PP1;
+ case PERF_RAPL_PSYS:
+ return RAPL_DOMAIN_PLATFORM;
+ default:
+ return RAPL_DOMAIN_MAX;
+ }
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ struct rapl_package *pos, *rp = NULL;
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int domain, idx;
+
+ /* Only look at RAPL events */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* Check only supported bits are set */
+ if (event->attr.config & ~RAPL_EVENT_MASK)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* Find out which Package the event belongs to */
+ list_for_each_entry(pos, &rapl_packages, plist) {
+ if (is_rp_pmu_cpu(pos, event->cpu)) {
+ rp = pos;
+ break;
+ }
+ }
+ if (!rp)
+ return -ENODEV;
+
+ /* Find out which RAPL Domain the event belongs to */
+ domain = event_to_domain(cfg - 1);
+ if (domain >= RAPL_DOMAIN_MAX)
+ return -EINVAL;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+ event->pmu_private = rp; /* Which package */
+ event->hw.flags = domain; /* Which domain */
+
+ event->hw.idx = -1;
+ /* Find out the index in rp->domains[] to get domain pointer */
+ for (idx = 0; idx < rp->nr_domains; idx++) {
+ if (rp->domains[idx].id == domain) {
+ event->hw.idx = idx;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_package_pmu_data *data =
+ container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!data->n_active)
+ return HRTIMER_NORESTART;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ list_for_each_entry(event, &data->active_list, active_entry)
+ rapl_event_update(event);
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+
+ hrtimer_forward_now(hrtimer, data->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+/* PMU sysfs attributes */
+
+/*
+ * There are no default events, but we need to create "events" group (with
+ * empty attrs) before updating it with detected events.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = attrs_empty,
+};
+
+static ssize_t cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rapl_package *rp;
+ int cpu;
+
+ cpus_read_lock();
+ cpumask_clear(&rapl_pmu.cpu_mask);
+
+ /* Choose a cpu for each RAPL Package */
+ list_for_each_entry(rp, &rapl_packages, plist) {
+ cpu = get_pmu_cpu(rp);
+ if (cpu < nr_cpu_ids)
+ cpumask_set_cpu(cpu, &rapl_pmu.cpu_mask);
+ }
+ cpus_read_unlock();
+
+ return cpumap_print_to_pagebuf(true, buf, &rapl_pmu.cpu_mask);
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static struct attribute_group pmu_cpumask_group = {
+ .attrs = pmu_cpumask_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *pmu_format_attr[] = {
+ &format_attr_event.attr,
+ NULL
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = pmu_format_attr,
+};
+
+static const struct attribute_group *pmu_attr_groups[] = {
+ &pmu_events_group,
+ &pmu_cpumask_group,
+ &pmu_format_group,
+ NULL
+};
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .event_str = str, \
+}
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules");
+
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10");
+
+#define RAPL_EVENT_GROUP(_name, domain) \
+static struct attribute *pmu_attr_##_name[] = { \
+ &event_attr_rapl_##_name.attr.attr, \
+ &event_attr_rapl_unit_##_name.attr.attr, \
+ &event_attr_rapl_scale_##_name.attr.attr, \
+ NULL \
+}; \
+static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \
+{ \
+ return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \
+} \
+static struct attribute_group pmu_group_##_name = { \
+ .name = "events", \
+ .attrs = pmu_attr_##_name, \
+ .is_visible = is_visible_##_name, \
+}
+
+RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0);
+RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE);
+RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM);
+RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1);
+RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM);
+
+static const struct attribute_group *pmu_attr_update[] = {
+ &pmu_group_cores,
+ &pmu_group_pkg,
+ &pmu_group_ram,
+ &pmu_group_gpu,
+ &pmu_group_psys,
+ NULL
+};
+
+static int rapl_pmu_update(struct rapl_package *rp)
+{
+ int ret;
+
+ /* Return if PMU already covers all events supported by current RAPL Package */
+ if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
+ return 0;
+
+ /* Unregister previous registered PMU */
+ if (rapl_pmu.registered) {
+ perf_pmu_unregister(&rapl_pmu.pmu);
+ memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
+ }
+
+ rapl_pmu.domain_map |= rp->domain_map;
+
+ memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
+ rapl_pmu.pmu.attr_groups = pmu_attr_groups;
+ rapl_pmu.pmu.attr_update = pmu_attr_update;
+ rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
+ rapl_pmu.pmu.event_init = rapl_pmu_event_init;
+ rapl_pmu.pmu.add = rapl_pmu_event_add;
+ rapl_pmu.pmu.del = rapl_pmu_event_del;
+ rapl_pmu.pmu.start = rapl_pmu_event_start;
+ rapl_pmu.pmu.stop = rapl_pmu_event_stop;
+ rapl_pmu.pmu.read = rapl_pmu_event_read;
+ rapl_pmu.pmu.module = THIS_MODULE;
+ rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
+ ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
+ if (ret)
+ pr_warn("Failed to register PMU\n");
+
+ rapl_pmu.registered = !ret;
+
+ return ret;
+}
+
+int rapl_package_add_pmu(struct rapl_package *rp)
+{
+ struct rapl_package_pmu_data *data = &rp->pmu_data;
+ int idx;
+ int ret;
+
+ if (rp->has_pmu)
+ return -EEXIST;
+
+ guard(cpus_read_lock)();
+
+ for (idx = 0; idx < rp->nr_domains; idx++) {
+ struct rapl_domain *rd = &rp->domains[idx];
+ int domain = rd->id;
+ u64 val;
+
+ if (!test_bit(domain, &rp->domain_map))
+ continue;
+
+ /*
+ * The RAPL PMU granularity is 2^-32 Joules
+ * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
+ */
+ val = rd->energy_unit * (1ULL << 32);
+ do_div(val, ENERGY_UNIT_SCALE * 1000000);
+ data->scale[domain] = val;
+
+ if (!rapl_pmu.timer_ms) {
+ struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);
+
+ /*
+ * Calculate the timer rate:
+ * Use reference of 200W for scaling the timeout to avoid counter
+ * overflows.
+ *
+ * max_count = rpi->mask >> rpi->shift + 1
+ * max_energy_pj = max_count * rd->energy_unit
+ * max_time_sec = (max_energy_pj / 1000000000) / 200w
+ *
+ * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
+ */
+ val = (rpi->mask >> rpi->shift) + 1;
+ val *= rd->energy_unit;
+ do_div(val, 1000000 * 200 * 2);
+ rapl_pmu.timer_ms = val;
+
+ pr_info("%llu ms ovfl timer\n", rapl_pmu.timer_ms);
+ }
+
+ pr_info("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);
+ }
+
+ /* Initialize per package PMU data */
+ raw_spin_lock_init(&data->lock);
+ INIT_LIST_HEAD(&data->active_list);
+ data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
+ hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ data->hrtimer.function = rapl_hrtimer_handle;
+
+ ret = rapl_pmu_update(rp);
+ if (!ret)
+ rp->has_pmu = true;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
+
+void rapl_package_remove_pmu(struct rapl_package *rp)
+{
+ struct rapl_package *pos;
+
+ if (!rp->has_pmu)
+ return;
+
+ guard(cpus_read_lock)();
+
+ list_for_each_entry(pos, &rapl_packages, plist) {
+ /* PMU is still needed */
+ if (pos->has_pmu)
+ return;
+ }
+
+ perf_pmu_unregister(&rapl_pmu.pmu);
+ memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
+}
+EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
+#endif
+
/* called from CPU hotplug notifier, hotplug lock held */
void rapl_remove_package_cpuslocked(struct rapl_package *rp)
{
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index f3196f82fd8a..38e43dbb3dac 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -158,6 +158,17 @@ struct rapl_if_priv {
void *rpi;
};
+#ifdef CONFIG_PERF_EVENTS
+struct rapl_package_pmu_data {
+ u64 scale[RAPL_DOMAIN_MAX];
+ raw_spinlock_t lock;
+ int n_active;
+ struct list_head active_list;
+ ktime_t timer_interval;
+ struct hrtimer hrtimer;
+};
+#endif
+
/* maximum rapl package domain name: package-%d-die-%d */
#define PACKAGE_DOMAIN_NAME_LENGTH 30
@@ -176,6 +187,10 @@ struct rapl_package {
struct cpumask cpumask;
char name[PACKAGE_DOMAIN_NAME_LENGTH];
struct rapl_if_priv *priv;
+#ifdef CONFIG_PERF_EVENTS
+ bool has_pmu;
+ struct rapl_package_pmu_data pmu_data;
+#endif
};
struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
@@ -188,4 +203,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv,
struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu);
void rapl_remove_package(struct rapl_package *rp);
+#ifdef CONFIG_PERF_EVENTS
+int rapl_package_add_pmu(struct rapl_package *rp);
+void rapl_package_remove_pmu(struct rapl_package *rp);
+#else
+static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; }
+static inline void rapl_package_remove_pmu(struct rapl_package *rp) { }
+#endif
+
#endif /* __INTEL_RAPL_H__ */
--
2.34.1
On Mon, Apr 8, 2024 at 5:52 AM Zhang Rui <[email protected]> wrote:
>
> Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu().
>
> RAPL driver can invoke these APIs to expose its supported energy
> counters via perf PMU. The new RAPL PMU is fully compatible with current
> MSR RAPL PMU, including using the same PMU name and events
> name/id/unit/scale, etc.
>
> For example, use below command
> perf stat -e power/energy-pkg/ -e power/energy-ram/ -e power/energy-cores/ FOO
> to get the energy consumption for events in the "perf list" output.
>
> This does not introduce any conflict because TPMI RAPL is the only user
> of these APIs currently, and it never co-exists with MSR RAPL.
>
> Note that, TPMI RAPL is probed dynamically, and the events supported by
> each TPMI RAPL device can be different. Thus, for a new RAPL Package
> device that wants PMU support,
> 1. if PMU has not been registered yet, register the PMU with events
> supported by current RAPL Package device
> 2. if PMU has been registered and no new events support is needed, do
> nothing
> 2. or else, unregister current PMU and re-register the PMU with events
> supported by all probed RAPL Package devices that need PMU support.
> For example, on a dual-package system using TPMI RAPL, it is possible
> that Package 1 behaves as TPMI domain root and supports Psys domain.
> In this case, register PMU without Psys event when probing Package 0,
> and re-register the PMU with Psys event when probing Package 1.
>
> Signed-off-by: Zhang Rui <[email protected]>
> ---
> drivers/powercap/intel_rapl_common.c | 537 +++++++++++++++++++++++++++
> include/linux/intel_rapl.h | 23 ++
> 2 files changed, 560 insertions(+)
>
> diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
> index 1f4a7aa12d77..f51012f023b0 100644
> --- a/drivers/powercap/intel_rapl_common.c
> +++ b/drivers/powercap/intel_rapl_common.c
> @@ -15,6 +15,8 @@
> #include <linux/list.h>
> #include <linux/log2.h>
> #include <linux/module.h>
> +#include <linux/nospec.h>
> +#include <linux/perf_event.h>
> #include <linux/platform_device.h>
> #include <linux/powercap.h>
> #include <linux/processor.h>
> @@ -1506,6 +1508,541 @@ static int rapl_detect_domains(struct rapl_package *rp)
> return 0;
> }
>
> +#ifdef CONFIG_PERF_EVENTS
> +
> +/* Support for RAPL PMU */
It would be good to add some details regarding the implementation etc.
to this comment.
> +
> +struct rapl_pmu {
> + struct pmu pmu;
> + u64 timer_ms;
> + cpumask_t cpu_mask;
> + unsigned long domain_map; /* Events supported by all registered RAPL packages */
> + bool registered;
> +};
The structure fields need to be documented.
> +
> +static struct rapl_pmu rapl_pmu;
> +
> +/* PMU helpers */
A blank like here?
> +static int get_pmu_cpu(struct rapl_package *rp)
> +{
> + int cpu;
> +
> + if (!rp->has_pmu)
> + return nr_cpu_ids;
> +
> + if (rp->lead_cpu >= 0)
> + return rp->lead_cpu;
> +
> + /* TPMI RAPL uses any CPU in the package for PMU */
> + for_each_online_cpu(cpu)
> + if (topology_physical_package_id(cpu) == rp->id)
> + return cpu;
> +
> + return nr_cpu_ids;
> +}
> +
> +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
> +{
> + if (!rp->has_pmu)
> + return false;
> +
> + if (rp->lead_cpu >= 0)
> + return cpu == rp->lead_cpu;
So if the given CPU is not the lead CPU, but it is located in the same
package as the lead CPU, the function will return 'false'. TBH, this
is somewhat confusing.
> +
> + /* TPMI RAPL uses any CPU in the package for PMU */
> + return topology_physical_package_id(cpu) == rp->id;
> +}
> +
> +static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
> +{
> + struct rapl_package *rp = event->pmu_private;
> +
> + return &rp->pmu_data;
> +}
> +
> +/* PMU event callbacks */
> +
> +/* Return 0 for unsupported events or failed read */
Can you put this comment into the function body?
> +static u64 event_read_counter(struct perf_event *event)
> +{
> + struct rapl_package *rp = event->pmu_private;
> + u64 val;
> + int ret;
> +
> + if (event->hw.idx < 0)
> + return 0;
> +
> + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val);
> + return ret ? 0 : val;
I would prefer
if (ret)
return 0;
return val;
> +}
> +
> +static void __rapl_pmu_event_start(struct perf_event *event)
> +{
> + struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> +
> + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> + return;
> +
> + event->hw.state = 0;
> +
> + list_add_tail(&event->active_entry, &data->active_list);
> +
> + local64_set(&event->hw.prev_count, event_read_counter(event));
> + if (++data->n_active == 1)
> + hrtimer_start(&data->hrtimer, data->timer_interval,
> + HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static void rapl_pmu_event_start(struct perf_event *event, int mode)
> +{
> + struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&data->lock, flags);
> + __rapl_pmu_event_start(event);
> + raw_spin_unlock_irqrestore(&data->lock, flags);
Why does it need to be raw_spin_lock_?
What exactly is protected by data->lock?
> +}
> +
> +static u64 rapl_event_update(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> + u64 prev_raw_count, new_raw_count;
> + s64 delta, sdelta;
> +
> +again:
> + prev_raw_count = local64_read(&hwc->prev_count);
> + new_raw_count = event_read_counter(event);
> +
> + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
> + new_raw_count) != prev_raw_count)
> + goto again;
It would be somewhat easier to follow if an additional u64 variable were used:
do {
prev_raw_count = local64_read(&hwc->prev_count);
new_raw_count = event_read_counter(event);
tmp = (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count);
} while (tmp != prev_raw_count);
> +
> + /*
> + * Now we have the new raw value and have updated the prev
> + * timestamp already. We can now calculate the elapsed delta
> + * (event-)time and add that to the generic event.
> + */
> + delta = new_raw_count - prev_raw_count;
> +
> + /*
> + * Scale delta to smallest unit (2^-32)
> + * users must then scale back: count * 1/(1e9*2^32) to get Joules
> + * or use ldexp(count, -32).
> + * Watts = Joules/Time delta
> + */
> + sdelta = delta * data->scale[event->hw.flags];
> +
> + local64_add(sdelta, &event->count);
> +
> + return new_raw_count;
> +}
> +
> +static void rapl_pmu_event_stop(struct perf_event *event, int mode)
> +{
> + struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> + struct hw_perf_event *hwc = &event->hw;
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&data->lock, flags);
> +
> + /* Mark event as deactivated and stopped */
> + if (!(hwc->state & PERF_HES_STOPPED)) {
> + WARN_ON_ONCE(data->n_active <= 0);
> + if (--data->n_active == 0)
> + hrtimer_cancel(&data->hrtimer);
> +
> + list_del(&event->active_entry);
> +
> + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
> + hwc->state |= PERF_HES_STOPPED;
> + }
> +
> + /* Check if update of sw counter is necessary */
> + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
> + /*
> + * Drain the remaining delta count out of a event
> + * that we are disabling:
> + */
> + rapl_event_update(event);
> + hwc->state |= PERF_HES_UPTODATE;
> + }
> +
> + raw_spin_unlock_irqrestore(&data->lock, flags);
> +}
> +
> +static int rapl_pmu_event_add(struct perf_event *event, int mode)
> +{
> + struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> + struct hw_perf_event *hwc = &event->hw;
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&data->lock, flags);
> +
> + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
> +
> + if (mode & PERF_EF_START)
> + __rapl_pmu_event_start(event);
> +
> + raw_spin_unlock_irqrestore(&data->lock, flags);
> +
> + return 0;
> +}
> +
> +static void rapl_pmu_event_del(struct perf_event *event, int flags)
> +{
> + rapl_pmu_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +/*
> + * RAPL energy status counters
> + */
> +enum perf_rapl_events {
> + PERF_RAPL_PP0 = 0, /* all cores */
> + PERF_RAPL_PKG, /* entire package */
> + PERF_RAPL_RAM, /* DRAM */
> + PERF_RAPL_PP1, /* gpu */
> + PERF_RAPL_PSYS, /* psys */
> +};
> +#define RAPL_EVENT_MASK GENMASK(7, 0)
> +
> +static int event_to_domain(int event)
> +{
> + switch (event) {
> + case PERF_RAPL_PP0:
> + return RAPL_DOMAIN_PP0;
> + case PERF_RAPL_PKG:
> + return RAPL_DOMAIN_PACKAGE;
> + case PERF_RAPL_RAM:
> + return RAPL_DOMAIN_DRAM;
> + case PERF_RAPL_PP1:
> + return RAPL_DOMAIN_PP1;
> + case PERF_RAPL_PSYS:
> + return RAPL_DOMAIN_PLATFORM;
> + default:
> + return RAPL_DOMAIN_MAX;
> + }
> +}
IMV it would be somewhat cleaner to use an array instead of this
wrapper around switch ().
> +
> +static int rapl_pmu_event_init(struct perf_event *event)
> +{
> + struct rapl_package *pos, *rp = NULL;
> + u64 cfg = event->attr.config & RAPL_EVENT_MASK;
> + int domain, idx;
> +
> + /* Only look at RAPL events */
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + /* Check only supported bits are set */
> + if (event->attr.config & ~RAPL_EVENT_MASK)
> + return -EINVAL;
> +
> + if (event->cpu < 0)
> + return -EINVAL;
> +
> + /* Find out which Package the event belongs to */
> + list_for_each_entry(pos, &rapl_packages, plist) {
> + if (is_rp_pmu_cpu(pos, event->cpu)) {
> + rp = pos;
> + break;
> + }
> + }
> + if (!rp)
> + return -ENODEV;
> +
> + /* Find out which RAPL Domain the event belongs to */
> + domain = event_to_domain(cfg - 1);
> + if (domain >= RAPL_DOMAIN_MAX)
> + return -EINVAL;
> +
> + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
> + event->pmu_private = rp; /* Which package */
> + event->hw.flags = domain; /* Which domain */
> +
> + event->hw.idx = -1;
> + /* Find out the index in rp->domains[] to get domain pointer */
> + for (idx = 0; idx < rp->nr_domains; idx++) {
> + if (rp->domains[idx].id == domain) {
> + event->hw.idx = idx;
> + break;
> + }
> + }
> +
> + return 0;
> +}
> +
> +static void rapl_pmu_event_read(struct perf_event *event)
> +{
> + rapl_event_update(event);
> +}
> +
> +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
> +{
> + struct rapl_package_pmu_data *data =
> + container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
> + struct perf_event *event;
> + unsigned long flags;
> +
> + if (!data->n_active)
> + return HRTIMER_NORESTART;
> +
> + raw_spin_lock_irqsave(&data->lock, flags);
> +
> + list_for_each_entry(event, &data->active_list, active_entry)
> + rapl_event_update(event);
> +
> + raw_spin_unlock_irqrestore(&data->lock, flags);
> +
> + hrtimer_forward_now(hrtimer, data->timer_interval);
> +
> + return HRTIMER_RESTART;
> +}
> +
> +/* PMU sysfs attributes */
> +
> +/*
> + * There are no default events, but we need to create "events" group (with
> + * empty attrs) before updating it with detected events.
> + */
> +static struct attribute *attrs_empty[] = {
> + NULL,
> +};
> +
> +static struct attribute_group pmu_events_group = {
> + .name = "events",
> + .attrs = attrs_empty,
> +};
> +
> +static ssize_t cpumask_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct rapl_package *rp;
> + int cpu;
> +
> + cpus_read_lock();
Is rapl_packages protected by this?
> + cpumask_clear(&rapl_pmu.cpu_mask);
It doesn't look like rapl_pmu.cpu_mask is used outside this function,
so why is it global?
> +
> + /* Choose a cpu for each RAPL Package */
> + list_for_each_entry(rp, &rapl_packages, plist) {
> + cpu = get_pmu_cpu(rp);
> + if (cpu < nr_cpu_ids)
> + cpumask_set_cpu(cpu, &rapl_pmu.cpu_mask);
> + }
> + cpus_read_unlock();
> +
> + return cpumap_print_to_pagebuf(true, buf, &rapl_pmu.cpu_mask);
> +}
> +
> +static DEVICE_ATTR_RO(cpumask);
> +
> +static struct attribute *pmu_cpumask_attrs[] = {
> + &dev_attr_cpumask.attr,
> + NULL
> +};
> +
> +static struct attribute_group pmu_cpumask_group = {
> + .attrs = pmu_cpumask_attrs,
> +};
> +
> +PMU_FORMAT_ATTR(event, "config:0-7");
> +static struct attribute *pmu_format_attr[] = {
> + &format_attr_event.attr,
> + NULL
> +};
> +
> +static struct attribute_group pmu_format_group = {
> + .name = "format",
> + .attrs = pmu_format_attr,
> +};
> +
> +static const struct attribute_group *pmu_attr_groups[] = {
> + &pmu_events_group,
> + &pmu_cpumask_group,
> + &pmu_format_group,
> + NULL
> +};
> +
> +#define RAPL_EVENT_ATTR_STR(_name, v, str) \
> +static struct perf_pmu_events_attr event_attr_##v = { \
> + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
> + .event_str = str, \
> +}
> +
> +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
> +RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
> +RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
> +RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
> +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
> +
> +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules");
> +RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules");
> +RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules");
> +RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules");
> +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules");
> +
> +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10");
> +
> +#define RAPL_EVENT_GROUP(_name, domain) \
> +static struct attribute *pmu_attr_##_name[] = { \
> + &event_attr_rapl_##_name.attr.attr, \
> + &event_attr_rapl_unit_##_name.attr.attr, \
> + &event_attr_rapl_scale_##_name.attr.attr, \
> + NULL \
> +}; \
> +static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \
> +{ \
> + return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \
> +} \
> +static struct attribute_group pmu_group_##_name = { \
> + .name = "events", \
> + .attrs = pmu_attr_##_name, \
> + .is_visible = is_visible_##_name, \
> +}
> +
> +RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0);
> +RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE);
> +RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM);
> +RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1);
> +RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM);
> +
> +static const struct attribute_group *pmu_attr_update[] = {
> + &pmu_group_cores,
> + &pmu_group_pkg,
> + &pmu_group_ram,
> + &pmu_group_gpu,
> + &pmu_group_psys,
> + NULL
> +};
> +
> +static int rapl_pmu_update(struct rapl_package *rp)
> +{
> + int ret;
> +
> + /* Return if PMU already covers all events supported by current RAPL Package */
> + if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
> + return 0;
> +
> + /* Unregister previous registered PMU */
> + if (rapl_pmu.registered) {
> + perf_pmu_unregister(&rapl_pmu.pmu);
> + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> + }
> +
> + rapl_pmu.domain_map |= rp->domain_map;
> +
> + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> + rapl_pmu.pmu.attr_groups = pmu_attr_groups;
> + rapl_pmu.pmu.attr_update = pmu_attr_update;
> + rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
> + rapl_pmu.pmu.event_init = rapl_pmu_event_init;
> + rapl_pmu.pmu.add = rapl_pmu_event_add;
> + rapl_pmu.pmu.del = rapl_pmu_event_del;
> + rapl_pmu.pmu.start = rapl_pmu_event_start;
> + rapl_pmu.pmu.stop = rapl_pmu_event_stop;
> + rapl_pmu.pmu.read = rapl_pmu_event_read;
> + rapl_pmu.pmu.module = THIS_MODULE;
> + rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
> + ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
> + if (ret)
> + pr_warn("Failed to register PMU\n");
> +
> + rapl_pmu.registered = !ret;
Why don't you set rp->has_pmu here?
> +
> + return ret;
It looks like this could be rearranged overall for more clarity:
ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
if (ret) {
pr_warn("Failed to register PMU\n");
return ret;
}
rapl_pmu.registered = true;
rp->has_pmu = true;
return 0;
Also, the "Failed to register PMU\n" message is not particularly
useful AFAICS. It would be good to add some more context to it and
maybe make it pr_info()?
> +}
> +
> +int rapl_package_add_pmu(struct rapl_package *rp)
> +{
> + struct rapl_package_pmu_data *data = &rp->pmu_data;
> + int idx;
> + int ret;
> +
> + if (rp->has_pmu)
> + return -EEXIST;
> +
> + guard(cpus_read_lock)();
Why does this lock need to be held around the entire code below?
> +
> + for (idx = 0; idx < rp->nr_domains; idx++) {
> + struct rapl_domain *rd = &rp->domains[idx];
> + int domain = rd->id;
> + u64 val;
> +
> + if (!test_bit(domain, &rp->domain_map))
> + continue;
> +
> + /*
> + * The RAPL PMU granularity is 2^-32 Joules
> + * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
> + */
> + val = rd->energy_unit * (1ULL << 32);
> + do_div(val, ENERGY_UNIT_SCALE * 1000000);
> + data->scale[domain] = val;
> +
> + if (!rapl_pmu.timer_ms) {
> + struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);
> +
> + /*
> + * Calculate the timer rate:
> + * Use reference of 200W for scaling the timeout to avoid counter
> + * overflows.
> + *
> + * max_count = rpi->mask >> rpi->shift + 1
> + * max_energy_pj = max_count * rd->energy_unit
> + * max_time_sec = (max_energy_pj / 1000000000) / 200w
> + *
> + * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
> + */
> + val = (rpi->mask >> rpi->shift) + 1;
> + val *= rd->energy_unit;
> + do_div(val, 1000000 * 200 * 2);
> + rapl_pmu.timer_ms = val;
> +
> + pr_info("%llu ms ovfl timer\n", rapl_pmu.timer_ms);
s/ovfl/overflow/
And use pr_debug()?
> + }
> +
> + pr_info("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);
pr_debug() here too?
> + }
> +
> + /* Initialize per package PMU data */
> + raw_spin_lock_init(&data->lock);
> + INIT_LIST_HEAD(&data->active_list);
> + data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
> + hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + data->hrtimer.function = rapl_hrtimer_handle;
> +
> + ret = rapl_pmu_update(rp);
> + if (!ret)
> + rp->has_pmu = true;
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
> +
> +void rapl_package_remove_pmu(struct rapl_package *rp)
> +{
> + struct rapl_package *pos;
> +
> + if (!rp->has_pmu)
> + return;
> +
> + guard(cpus_read_lock)();
> +
> + list_for_each_entry(pos, &rapl_packages, plist) {
rp can be used for walking this list, can't it?
> + /* PMU is still needed */
> + if (pos->has_pmu)
> + return;
> + }
> +
> + perf_pmu_unregister(&rapl_pmu.pmu);
> + memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
> +}
> +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
> +#endif
> +
> /* called from CPU hotplug notifier, hotplug lock held */
> void rapl_remove_package_cpuslocked(struct rapl_package *rp)
> {
> diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
> index f3196f82fd8a..38e43dbb3dac 100644
> --- a/include/linux/intel_rapl.h
> +++ b/include/linux/intel_rapl.h
> @@ -158,6 +158,17 @@ struct rapl_if_priv {
> void *rpi;
> };
>
> +#ifdef CONFIG_PERF_EVENTS
Structure fields below need to be documented.
> +struct rapl_package_pmu_data {
> + u64 scale[RAPL_DOMAIN_MAX];
> + raw_spinlock_t lock;
> + int n_active;
> + struct list_head active_list;
> + ktime_t timer_interval;
> + struct hrtimer hrtimer;
> +};
> +#endif
> +
> /* maximum rapl package domain name: package-%d-die-%d */
> #define PACKAGE_DOMAIN_NAME_LENGTH 30
>
> @@ -176,6 +187,10 @@ struct rapl_package {
> struct cpumask cpumask;
> char name[PACKAGE_DOMAIN_NAME_LENGTH];
> struct rapl_if_priv *priv;
> +#ifdef CONFIG_PERF_EVENTS
> + bool has_pmu;
> + struct rapl_package_pmu_data pmu_data;
> +#endif
> };
>
> struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
> @@ -188,4 +203,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv,
> struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu);
> void rapl_remove_package(struct rapl_package *rp);
>
> +#ifdef CONFIG_PERF_EVENTS
> +int rapl_package_add_pmu(struct rapl_package *rp);
> +void rapl_package_remove_pmu(struct rapl_package *rp);
> +#else
> +static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; }
> +static inline void rapl_package_remove_pmu(struct rapl_package *rp) { }
> +#endif
> +
> #endif /* __INTEL_RAPL_H__ */
> --
Thanks!
On Mon, Apr 8, 2024 at 5:51 AM Zhang Rui <[email protected]> wrote:
>
> Sort header files alphabetically.
>
> Signed-off-by: Zhang Rui <[email protected]>
> ---
> drivers/powercap/intel_rapl_common.c | 24 ++++++++++++------------
> 1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
> index a28d54fd5222..1f4a7aa12d77 100644
> --- a/drivers/powercap/intel_rapl_common.c
> +++ b/drivers/powercap/intel_rapl_common.c
> @@ -5,27 +5,27 @@
> */
> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>
> +#include <linux/bitmap.h>
> #include <linux/cleanup.h>
> +#include <linux/cpu.h>
> +#include <linux/delay.h>
> +#include <linux/device.h>
> +#include <linux/intel_rapl.h>
> #include <linux/kernel.h>
> -#include <linux/module.h>
> #include <linux/list.h>
> -#include <linux/types.h>
> -#include <linux/device.h>
> -#include <linux/slab.h>
> #include <linux/log2.h>
> -#include <linux/bitmap.h>
> -#include <linux/delay.h>
> -#include <linux/sysfs.h>
> -#include <linux/cpu.h>
> +#include <linux/module.h>
> +#include <linux/platform_device.h>
> #include <linux/powercap.h>
> -#include <linux/suspend.h>
> -#include <linux/intel_rapl.h>
> #include <linux/processor.h>
> -#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +#include <linux/suspend.h>
> +#include <linux/sysfs.h>
> +#include <linux/types.h>
>
> -#include <asm/iosf_mbi.h>
> #include <asm/cpu_device_id.h>
> #include <asm/intel-family.h>
> +#include <asm/iosf_mbi.h>
>
> /* bitmasks for RAPL MSRs, used by primitive access functions */
> #define ENERGY_STATUS_MASK 0xffffffff
> --
I can apply this cleanup right away, so do you want me to do that?
On Tue, 2024-04-16 at 16:01 +0200, Rafael J. Wysocki wrote:
> On Mon, Apr 8, 2024 at 5:51 AM Zhang Rui <[email protected]> wrote:
> >
> > Sort header files alphabetically.
> >
> > Signed-off-by: Zhang Rui <[email protected]>
> > ---
> > drivers/powercap/intel_rapl_common.c | 24 ++++++++++++------------
> > 1 file changed, 12 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/powercap/intel_rapl_common.c
> > b/drivers/powercap/intel_rapl_common.c
> > index a28d54fd5222..1f4a7aa12d77 100644
> > --- a/drivers/powercap/intel_rapl_common.c
> > +++ b/drivers/powercap/intel_rapl_common.c
> > @@ -5,27 +5,27 @@
> > */
> > #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> >
> > +#include <linux/bitmap.h>
> > #include <linux/cleanup.h>
> > +#include <linux/cpu.h>
> > +#include <linux/delay.h>
> > +#include <linux/device.h>
> > +#include <linux/intel_rapl.h>
> > #include <linux/kernel.h>
> > -#include <linux/module.h>
> > #include <linux/list.h>
> > -#include <linux/types.h>
> > -#include <linux/device.h>
> > -#include <linux/slab.h>
> > #include <linux/log2.h>
> > -#include <linux/bitmap.h>
> > -#include <linux/delay.h>
> > -#include <linux/sysfs.h>
> > -#include <linux/cpu.h>
> > +#include <linux/module.h>
> > +#include <linux/platform_device.h>
> > #include <linux/powercap.h>
> > -#include <linux/suspend.h>
> > -#include <linux/intel_rapl.h>
> > #include <linux/processor.h>
> > -#include <linux/platform_device.h>
> > +#include <linux/slab.h>
> > +#include <linux/suspend.h>
> > +#include <linux/sysfs.h>
> > +#include <linux/types.h>
> >
> > -#include <asm/iosf_mbi.h>
> > #include <asm/cpu_device_id.h>
> > #include <asm/intel-family.h>
> > +#include <asm/iosf_mbi.h>
> >
> > /* bitmasks for RAPL MSRs, used by primitive access functions */
> > #define ENERGY_STATUS_MASK 0xffffffff
> > --
>
> I can apply this cleanup right away, so do you want me to do that?
yes, please. thanks!
-rui
Hi, Rafael,
Thanks for reviewing.
Will refresh the patch based on your feedback, just a few coments
below.
> > +
> > +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
> > +{
> > + if (!rp->has_pmu)
> > + return false;
> > +
> > + if (rp->lead_cpu >= 0)
> > + return cpu == rp->lead_cpu;
>
> So if the given CPU is not the lead CPU, but it is located in the
> same
> package as the lead CPU, the function will return 'false'. TBH, this
> is somewhat confusing.
>
The above code actually applies to MSR RAPL because TPMI RAPL has
lead_cpu < 0.
Instead, I can use something like below to avoid the confusion.
if (rp->priv->type != RAPL_IF_TPMI)
return false;
and do future improvements when adding support for MSR RAPL.
> > +static void __rapl_pmu_event_start(struct perf_event *event)
> > +{
> > + struct rapl_package_pmu_data *data =
> > event_to_pmu_data(event);
> > +
> > + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> > + return;
> > +
> > + event->hw.state = 0;
> > +
> > + list_add_tail(&event->active_entry, &data->active_list);
> > +
> > + local64_set(&event->hw.prev_count,
> > event_read_counter(event));
> > + if (++data->n_active == 1)
> > + hrtimer_start(&data->hrtimer, data->timer_interval,
> > + HRTIMER_MODE_REL_PINNED);
> > +}
> > +
> > +static void rapl_pmu_event_start(struct perf_event *event, int
> > mode)
> > +{
> > + struct rapl_package_pmu_data *data =
> > event_to_pmu_data(event);
> > + unsigned long flags;
> > +
> > + raw_spin_lock_irqsave(&data->lock, flags);
> > + __rapl_pmu_event_start(event);
> > + raw_spin_unlock_irqrestore(&data->lock, flags);
>
> Why does it need to be raw_spin_lock_?
>
> What exactly is protected by data->lock?
>
This is copied from MSR RAPL PMU, which exists from day 1 of the code.
Let me double check.
>
> > +
> > +static ssize_t cpumask_show(struct device *dev,
> > + struct device_attribute *attr, char
> > *buf)
> > +{
> > + struct rapl_package *rp;
> > + int cpu;
> > +
> > + cpus_read_lock();
>
> Is rapl_packages protected by this?
yes.
>
> > + cpumask_clear(&rapl_pmu.cpu_mask);
>
> It doesn't look like rapl_pmu.cpu_mask is used outside this function,
> so why is it global?
Good catch, will fix it.
>
> > +static int rapl_pmu_update(struct rapl_package *rp)
> > +{
> > + int ret;
> > +
> > + /* Return if PMU already covers all events supported by
> > current RAPL Package */
> > + if (rapl_pmu.registered && !(rp->domain_map &
> > (~rapl_pmu.domain_map)))
> > + return 0;
> > +
> > + /* Unregister previous registered PMU */
> > + if (rapl_pmu.registered) {
> > + perf_pmu_unregister(&rapl_pmu.pmu);
> > + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> > + }
> > +
> > + rapl_pmu.domain_map |= rp->domain_map;
> > +
> > + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> > + rapl_pmu.pmu.attr_groups = pmu_attr_groups;
> > + rapl_pmu.pmu.attr_update = pmu_attr_update;
> > + rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
> > + rapl_pmu.pmu.event_init = rapl_pmu_event_init;
> > + rapl_pmu.pmu.add = rapl_pmu_event_add;
> > + rapl_pmu.pmu.del = rapl_pmu_event_del;
> > + rapl_pmu.pmu.start = rapl_pmu_event_start;
> > + rapl_pmu.pmu.stop = rapl_pmu_event_stop;
> > + rapl_pmu.pmu.read = rapl_pmu_event_read;
> > + rapl_pmu.pmu.module = THIS_MODULE;
> > + rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE |
> > PERF_PMU_CAP_NO_INTERRUPT;
> > + ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
> > + if (ret)
> > + pr_warn("Failed to register PMU\n");
> > +
> > + rapl_pmu.registered = !ret;
>
> Why don't you set rp->has_pmu here?
>
> > +
> > + return ret;
>
> It looks like this could be rearranged overall for more clarity:
>
> ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
> if (ret) {
> pr_warn("Failed to register PMU\n");
> return ret;
> }
>
> rapl_pmu.registered = true;
> rp->has_pmu = true;
>
> return 0;
>
Sure.
In my previous design,
rapl_pmu_update() updates generic RAPL PMU.
rapl_package_add_pmu() updates a given RAPL package.
that is why I put
rp->has_pmu = true;
in rapl_package_add_pmu().
> Also, the "Failed to register PMU\n" message is not particularly
> useful AFAICS. It would be good to add some more context to it and
> maybe make it pr_info()?
>
sure.
> > +}
> > +
> > +int rapl_package_add_pmu(struct rapl_package *rp)
> > +{
> > + struct rapl_package_pmu_data *data = &rp->pmu_data;
> > + int idx;
> > + int ret;
> > +
> > + if (rp->has_pmu)
> > + return -EEXIST;
> > +
> > + guard(cpus_read_lock)();
>
> Why does this lock need to be held around the entire code below?
>
This guaranteed that the RAPL Package is always valid and rapl_pmu
global variable is protected when updating the PMU.
> > +
> > + for (idx = 0; idx < rp->nr_domains; idx++) {
> > + struct rapl_domain *rd = &rp->domains[idx];
> > + int domain = rd->id;
> > + u64 val;
> > +
> > + if (!test_bit(domain, &rp->domain_map))
> > + continue;
> > +
> > + /*
> > + * The RAPL PMU granularity is 2^-32 Joules
> > + * data->scale[]: times of 2^-32 Joules for each
> > ENERGY COUNTER increase
> > + */
> > + val = rd->energy_unit * (1ULL << 32);
> > + do_div(val, ENERGY_UNIT_SCALE * 1000000);
> > + data->scale[domain] = val;
> > +
> > + if (!rapl_pmu.timer_ms) {
> > + struct rapl_primitive_info *rpi =
> > get_rpi(rp, ENERGY_COUNTER);
> > +
> > + /*
> > + * Calculate the timer rate:
> > + * Use reference of 200W for scaling the
> > timeout to avoid counter
> > + * overflows.
> > + *
> > + * max_count = rpi->mask >> rpi->shift + 1
> > + * max_energy_pj = max_count * rd-
> > >energy_unit
> > + * max_time_sec = (max_energy_pj /
> > 1000000000) / 200w
> > + *
> > + * rapl_pmu.timer_ms = max_time_sec * 1000
> > / 2
> > + */
> > + val = (rpi->mask >> rpi->shift) + 1;
> > + val *= rd->energy_unit;
> > + do_div(val, 1000000 * 200 * 2);
> > + rapl_pmu.timer_ms = val;
> > +
> > + pr_info("%llu ms ovfl timer\n",
> > rapl_pmu.timer_ms);
>
> s/ovfl/overflow/
>
> And use pr_debug()?
>
> > + }
> > +
> > + pr_info("Domain %s: hw unit %lld * 2^-32 Joules\n",
> > rd->name, data->scale[domain]);
>
> pr_debug() here too?
These all follow the MSR RAPL PMU code, so that we see the same output
no matter using MSR RAPL or TPMI RAPL. I can change them to pr_debug().
Thanks,
rui
On Wed, Apr 17, 2024 at 3:57 AM Zhang, Rui <[email protected]> wrote:
>
> On Tue, 2024-04-16 at 16:01 +0200, Rafael J. Wysocki wrote:
> > On Mon, Apr 8, 2024 at 5:51 AM Zhang Rui <[email protected]> wrote:
> > >
> > > Sort header files alphabetically.
> > >
> > > Signed-off-by: Zhang Rui <[email protected]>
> > > ---
> > > drivers/powercap/intel_rapl_common.c | 24 ++++++++++++------------
> > > 1 file changed, 12 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/drivers/powercap/intel_rapl_common.c
> > > b/drivers/powercap/intel_rapl_common.c
> > > index a28d54fd5222..1f4a7aa12d77 100644
> > > --- a/drivers/powercap/intel_rapl_common.c
> > > +++ b/drivers/powercap/intel_rapl_common.c
> > > @@ -5,27 +5,27 @@
> > > */
> > > #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > >
> > > +#include <linux/bitmap.h>
> > > #include <linux/cleanup.h>
> > > +#include <linux/cpu.h>
> > > +#include <linux/delay.h>
> > > +#include <linux/device.h>
> > > +#include <linux/intel_rapl.h>
> > > #include <linux/kernel.h>
> > > -#include <linux/module.h>
> > > #include <linux/list.h>
> > > -#include <linux/types.h>
> > > -#include <linux/device.h>
> > > -#include <linux/slab.h>
> > > #include <linux/log2.h>
> > > -#include <linux/bitmap.h>
> > > -#include <linux/delay.h>
> > > -#include <linux/sysfs.h>
> > > -#include <linux/cpu.h>
> > > +#include <linux/module.h>
> > > +#include <linux/platform_device.h>
> > > #include <linux/powercap.h>
> > > -#include <linux/suspend.h>
> > > -#include <linux/intel_rapl.h>
> > > #include <linux/processor.h>
> > > -#include <linux/platform_device.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/suspend.h>
> > > +#include <linux/sysfs.h>
> > > +#include <linux/types.h>
> > >
> > > -#include <asm/iosf_mbi.h>
> > > #include <asm/cpu_device_id.h>
> > > #include <asm/intel-family.h>
> > > +#include <asm/iosf_mbi.h>
> > >
> > > /* bitmasks for RAPL MSRs, used by primitive access functions */
> > > #define ENERGY_STATUS_MASK 0xffffffff
> > > --
> >
> > I can apply this cleanup right away, so do you want me to do that?
>
> yes, please. thanks!
Now applied as 6.10 material, thanks!
>
> > > +static void __rapl_pmu_event_start(struct perf_event *event)
> > > +{
> > > + struct rapl_package_pmu_data *data =
> > > event_to_pmu_data(event);
> > > +
> > > + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> > > + return;
> > > +
> > > + event->hw.state = 0;
> > > +
> > > + list_add_tail(&event->active_entry, &data->active_list);
> > > +
> > > + local64_set(&event->hw.prev_count,
> > > event_read_counter(event));
> > > + if (++data->n_active == 1)
> > > + hrtimer_start(&data->hrtimer, data-
> > > >timer_interval,
> > > + HRTIMER_MODE_REL_PINNED);
> > > +}
> > > +
> > > +static void rapl_pmu_event_start(struct perf_event *event, int
> > > mode)
> > > +{
> > > + struct rapl_package_pmu_data *data =
> > > event_to_pmu_data(event);
> > > + unsigned long flags;
> > > +
> > > + raw_spin_lock_irqsave(&data->lock, flags);
> > > + __rapl_pmu_event_start(event);
> > > + raw_spin_unlock_irqrestore(&data->lock, flags);
> >
> > Why does it need to be raw_spin_lock_?
> >
> > What exactly is protected by data->lock?
> >
> This is copied from MSR RAPL PMU, which exists from day 1 of the
> code.
>
> Let me double check.
>
RAPL Package supports multiple events, and rapl_pmu_event_start(),
rapl_pmu_event_add() and rapl_pmu_event_stop() can be invoked for each
event.
And this lock is used to protect the per package data->active_list and
data->n_active from concurrent access.
And we need to do this in the hrtimer handler (rapl_hrtimer_handle()).
Thanks for reviewing and patch V3 has been updated and submitted.
thanks,
rui