2021-10-18 15:33:35

by Bhaskara Budiredla

[permalink] [raw]
Subject: [PATCH v6 0/2] drivers: perf: Add Marvell CN10K LLC-TAD pmu driver

This series introduces performance monitor driver to Last-level-cache
tag-and-data (LLC-TAD) PMU which is an integral part of Marvell CN10K SoCs.
The configuration and functionality of the TAD PMU is covered in patch 1.
The device tree bindings are dealt with in patch 2.

v6:
- add COMPILE_TEST dependency (Bharat Bhushan)
- replace sprintf with sysfs_emit (Bharat Bhushan)

v5:
- add prefix, type, description for vendor specific properties
in DT bindings (Rob Herring)

v4:
- rebased on kernel v5.14-rc7
- eliminate yamllint errors (Rob Herring)

v3:
- rebased on kernel v5.14-rc5
- disable sampling events via PERF_PMU_CAP_NO_INTERRUPT (Will Deacon)
- convert tad pmu bindings to schema (Will Deacon)
- replace tighter semantics with *_relaxed() accesses (Will Deacon)
- use PMU_EVENT_ATTR_ID generic macro (Will Deacon)
- allow cleanup of allocations through devm_kcalloc() (Will Deacon)

v2:
- rebased on kernel v5.13-rc3

Bhaskara Budiredla (2):
drivers: perf: Add LLC-TAD perf counter support
dt-bindings: perf: Add YAML schemas for Marvell CN10K LLC-TAD pmu
bindings

.../bindings/perf/marvell-cn10k-tad.yaml | 63 +++
drivers/perf/Kconfig | 7 +
drivers/perf/Makefile | 1 +
drivers/perf/marvell_cn10k_tad_pmu.c | 429 ++++++++++++++++++
4 files changed, 500 insertions(+)
create mode 100644 Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
create mode 100644 drivers/perf/marvell_cn10k_tad_pmu.c

--
2.17.1


2021-10-18 15:33:53

by Bhaskara Budiredla

[permalink] [raw]
Subject: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support

This driver adds support for Last-level cache tag-and-data unit
(LLC-TAD) PMU that is featured in some of the Marvell's CN10K
infrastructure silicons.

The LLC is divided into 2N slices distributed across N Mesh tiles
in a single-socket configuration. The driver always configures the
same counter for all of the TADs. The user would end up effectively
reserving one of eight counters in every TAD to look across all TADs.
The occurrences of events are aggregated and presented to the user
at the end of an application run. The driver does not provide a way
for the user to partition TADs so that different TADs are used for
different applications.

The event counters are zeroed to start event counting to avoid any
rollover issues. TAD perf counters are 64-bit, so it's not currently
possible to overflow event counters at current mesh and core
frequencies.

To measure tad pmu events use perf tool stat command. For instance:

perf stat -e tad_dat_msh_in_dss,tad_req_msh_out_any <application>
perf stat -e tad_alloc_any,tad_hit_any,tad_tag_rd <application>

Signed-off-by: Bhaskara Budiredla <[email protected]>
---
drivers/perf/Kconfig | 7 +
drivers/perf/Makefile | 1 +
drivers/perf/marvell_cn10k_tad_pmu.c | 429 +++++++++++++++++++++++++++
3 files changed, 437 insertions(+)
create mode 100644 drivers/perf/marvell_cn10k_tad_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 77522e5efe11..2ce46f7ad639 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -137,6 +137,13 @@ config ARM_DMC620_PMU
Support for PMU events monitoring on the ARM DMC-620 memory
controller.

+config MARVELL_CN10K_TAD_PMU
+	tristate "Marvell CN10K LLC-TAD PMU"
+	# The driver uses readq()/writeq(), so compile-testing needs a 64-bit target.
+	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	help
+	  Provides support for Last-Level cache Tag-and-data Units (LLC-TAD)
+	  performance monitors on CN10K family silicons.
+
source "drivers/perf/hisilicon/Kconfig"

endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 5260b116c7da..2db5418d5b0a 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
+obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
new file mode 100644
index 000000000000..aebb1a0028dc
--- /dev/null
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell CN10K LLC-TAD perf driver
+ *
+ * Copyright (C) 2021 Marvell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "tad_pmu: " fmt
+
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/cpuhotplug.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+/* Register layout of one TAD PMU window: one 8-byte register per counter.
+ * TAD_PFC(n) is the count register and TAD_PRF(n) the control register of
+ * counter n; the low byte of TAD_PRF(n) (CNTSEL) selects the event.
+ * The macro argument is parenthesised so that expression arguments such as
+ * TAD_PFC(idx + 1) bind correctly against the shift.
+ */
+#define TAD_PFC_OFFSET		0x0
+#define TAD_PFC(counter)	(TAD_PFC_OFFSET | ((counter) << 3))
+#define TAD_PRF_OFFSET		0x100
+#define TAD_PRF(counter)	(TAD_PRF_OFFSET | ((counter) << 3))
+#define TAD_PRF_CNTSEL_MASK	0xFF
+#define TAD_MAX_COUNTERS	8
+
+#define to_tad_pmu(p) (container_of(p, struct tad_pmu, pmu))
+
+struct tad_region {
+ void __iomem *base;
+};
+
+struct tad_pmu {
+ struct pmu pmu;
+ struct tad_region *regions;
+ u32 region_cnt;
+ unsigned int cpu;
+ struct hlist_node node;
+ struct perf_event *events[TAD_MAX_COUNTERS];
+ DECLARE_BITMAP(counters_map, TAD_MAX_COUNTERS);
+};
+
+static int tad_pmu_cpuhp_state;
+
+static void tad_pmu_event_counter_read(struct perf_event *event)
+{
+ struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ u32 counter_idx = hwc->idx;
+ u64 delta, prev, new;
+ int i;
+
+ do {
+ prev = local64_read(&hwc->prev_count);
+ for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
+ new += readq(tad_pmu->regions[i].base +
+ TAD_PFC(counter_idx));
+ } while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
+
+ delta = (new - prev) & GENMASK_ULL(63, 0);
+ local64_add(delta, &event->count);
+}
+
+static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
+{
+ struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ u32 counter_idx = hwc->idx;
+ int i;
+
+ /* TAD()_PFC() stop counting on the write
+ * which sets TAD()_PRF()[CNTSEL] == 0
+ */
+ for (i = 0; i < tad_pmu->region_cnt; i++)
+ writeq_relaxed(0, tad_pmu->regions[i].base +
+ TAD_PRF(counter_idx));
+
+ tad_pmu_event_counter_read(event);
+ hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+/* Start counting @event on the counter held in event->hw.idx, on every TAD.
+ *
+ * The count registers are zeroed before the event is selected, so the
+ * software baseline (hwc->prev_count) is reset to zero as well; otherwise
+ * the first read after a stop/start cycle would compute a delta against
+ * the stale pre-stop value.
+ *
+ * @flags is accepted for the struct pmu ->start() signature and is unused.
+ */
+static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_idx = event->attr.config;
+	u32 counter_idx = hwc->idx;
+	u64 reg_val;
+	int i;
+
+	hwc->state = 0;
+
+	/* Typically TAD_PFC() are zeroed to start counting */
+	for (i = 0; i < tad_pmu->region_cnt; i++)
+		writeq_relaxed(0, tad_pmu->regions[i].base +
+			       TAD_PFC(counter_idx));
+
+	/* Keep the software baseline in step with the zeroed hardware */
+	local64_set(&hwc->prev_count, 0);
+
+	/* TAD()_PFC() start counting on the write
+	 * which sets TAD()_PRF()[CNTSEL] != 0
+	 */
+	for (i = 0; i < tad_pmu->region_cnt; i++) {
+		reg_val = readq_relaxed(tad_pmu->regions[i].base +
+					TAD_PRF(counter_idx));
+		/* Replace CNTSEL instead of OR-ing onto a stale selection */
+		reg_val &= ~(u64)TAD_PRF_CNTSEL_MASK;
+		reg_val |= event_idx & TAD_PRF_CNTSEL_MASK;
+		writeq_relaxed(reg_val, tad_pmu->regions[i].base +
+			       TAD_PRF(counter_idx));
+	}
+}
+
+static void tad_pmu_event_counter_del(struct perf_event *event, int flags)
+{
+ struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ tad_pmu_event_counter_stop(event, flags | PERF_EF_UPDATE);
+ tad_pmu->events[idx] = NULL;
+ clear_bit(idx, tad_pmu->counters_map);
+}
+
+static int tad_pmu_event_counter_add(struct perf_event *event, int flags)
+{
+ struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ /* Get a free counter for this event */
+ idx = find_first_zero_bit(tad_pmu->counters_map, TAD_MAX_COUNTERS);
+ if (idx == TAD_MAX_COUNTERS)
+ return -EAGAIN;
+
+ set_bit(idx, tad_pmu->counters_map);
+
+ hwc->idx = idx;
+ hwc->state = PERF_HES_STOPPED;
+ tad_pmu->events[idx] = event;
+
+ if (flags & PERF_EF_START)
+ tad_pmu_event_counter_start(event, flags);
+
+ return 0;
+}
+
+/* Validate a new event and bind it to the CPU that owns this PMU.
+ *
+ * Returns -ENOENT when the event belongs to a different PMU type, -EINVAL
+ * when the event is not created disabled/off, and 0 on success.
+ */
+static int tad_pmu_event_init(struct perf_event *event)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+
+	/* The type check must come first: events meant for another PMU have
+	 * to be declined with -ENOENT before any other attribute is judged.
+	 */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (!event->attr.disabled)
+		return -EINVAL;
+
+	if (event->state != PERF_EVENT_STATE_OFF)
+		return -EINVAL;
+
+	event->cpu = tad_pmu->cpu;
+	event->hw.idx = -1;
+	event->hw.config_base = event->attr.config;
+
+	return 0;
+}
+
+static ssize_t tad_pmu_event_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+ return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define TAD_PMU_EVENT_ATTR(name, config) \
+ PMU_EVENT_ATTR_ID(name, tad_pmu_event_show, config)
+
+static struct attribute *tad_pmu_event_attrs[] = {
+ TAD_PMU_EVENT_ATTR(tad_none, 0x0),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_in_any, 0x1),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_in_mn, 0x2),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_in_exlmn, 0x3),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_any, 0x4),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_mn, 0x5),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_exlmn, 0x6),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_dss, 0x7),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_retry_dss, 0x8),
+ TAD_PMU_EVENT_ATTR(tad_dat_msh_in_any, 0x9),
+ TAD_PMU_EVENT_ATTR(tad_dat_msh_in_dss, 0xa),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_out_any, 0xb),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_rd, 0xc),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_wr, 0xd),
+ TAD_PMU_EVENT_ATTR(tad_req_msh_out_evict, 0xe),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_any, 0xf),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_exlmn, 0x10),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_mn, 0x11),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_exlmn, 0x12),
+ TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_mn, 0x13),
+ TAD_PMU_EVENT_ATTR(tad_snp_msh_out_any, 0x14),
+ TAD_PMU_EVENT_ATTR(tad_snp_msh_out_mn, 0x15),
+ TAD_PMU_EVENT_ATTR(tad_snp_msh_out_exlmn, 0x16),
+ TAD_PMU_EVENT_ATTR(tad_dat_msh_out_any, 0x17),
+ TAD_PMU_EVENT_ATTR(tad_dat_msh_out_fill, 0x18),
+ TAD_PMU_EVENT_ATTR(tad_dat_msh_out_dss, 0x19),
+ TAD_PMU_EVENT_ATTR(tad_alloc_dtg, 0x1a),
+ TAD_PMU_EVENT_ATTR(tad_alloc_ltg, 0x1b),
+ TAD_PMU_EVENT_ATTR(tad_alloc_any, 0x1c),
+ TAD_PMU_EVENT_ATTR(tad_hit_dtg, 0x1d),
+ TAD_PMU_EVENT_ATTR(tad_hit_ltg, 0x1e),
+ TAD_PMU_EVENT_ATTR(tad_hit_any, 0x1f),
+ TAD_PMU_EVENT_ATTR(tad_tag_rd, 0x20),
+ TAD_PMU_EVENT_ATTR(tad_dat_rd, 0x21),
+ TAD_PMU_EVENT_ATTR(tad_dat_rd_byp, 0x22),
+ TAD_PMU_EVENT_ATTR(tad_ifb_occ, 0x23),
+ TAD_PMU_EVENT_ATTR(tad_req_occ, 0x24),
+ NULL
+};
+
+static const struct attribute_group tad_pmu_events_attr_group = {
+ .name = "events",
+ .attrs = tad_pmu_event_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *tad_pmu_format_attrs[] = {
+ &format_attr_event.attr,
+ NULL
+};
+
+static struct attribute_group tad_pmu_format_attr_group = {
+ .name = "format",
+ .attrs = tad_pmu_format_attrs,
+};
+
+static ssize_t tad_pmu_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct tad_pmu *tad_pmu = to_tad_pmu(dev_get_drvdata(dev));
+
+ return cpumap_print_to_pagebuf(true, buf, cpumask_of(tad_pmu->cpu));
+}
+
+static DEVICE_ATTR(cpumask, 0444, tad_pmu_cpumask_show, NULL);
+
+static struct attribute *tad_pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static struct attribute_group tad_pmu_cpumask_attr_group = {
+ .attrs = tad_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *tad_pmu_attr_groups[] = {
+ &tad_pmu_events_attr_group,
+ &tad_pmu_format_attr_group,
+ &tad_pmu_cpumask_attr_group,
+ NULL
+};
+
+/* Probe one TAD PMU platform device.
+ *
+ * Reads the TAD geometry from the device tree ("marvell,tad-cnt",
+ * "marvell,tad-page-size", "marvell,tad-pmu-page-size"), maps one PMU
+ * window per TAD inside memory resource 0, then adds the cpu hotplug
+ * instance and registers the perf PMU under the name "tad".
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int tad_pmu_probe(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	struct tad_region *regions;
+	struct tad_pmu *tad_pmu;
+	resource_size_t region_start;
+	struct resource *res;
+	u32 tad_pmu_page_size;
+	u32 tad_page_size;
+	u32 tad_cnt;
+	int i, ret;
+
+	tad_pmu = devm_kzalloc(&pdev->dev, sizeof(*tad_pmu), GFP_KERNEL);
+	if (!tad_pmu)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, tad_pmu);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "Mem resource not found\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-page-size",
+				   &tad_page_size);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-page-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-pmu-page-size",
+				   &tad_pmu_page_size);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-pmu-page-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-cnt", &tad_cnt);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-cnt property\n");
+		return ret;
+	}
+
+	regions = devm_kcalloc(&pdev->dev, tad_cnt,
+			       sizeof(*regions), GFP_KERNEL);
+	if (!regions)
+		return -ENOMEM;
+
+	/* ioremap the distributed TAD pmu regions.
+	 * Walk a local cursor so the device's resource is left unmodified.
+	 */
+	region_start = res->start;
+	for (i = 0; i < tad_cnt && region_start < res->end; i++) {
+		regions[i].base = devm_ioremap(&pdev->dev,
+					       region_start,
+					       tad_pmu_page_size);
+		/* devm_ioremap() reports failure with NULL, not ERR_PTR() */
+		if (!regions[i].base) {
+			dev_err(&pdev->dev, "TAD%d ioremap fail\n", i);
+			return -ENOMEM;
+		}
+		region_start += tad_page_size;
+	}
+
+	tad_pmu->regions = regions;
+	/* Only count the windows actually mapped; the loop may stop before
+	 * tad_cnt when the resource is exhausted.
+	 */
+	tad_pmu->region_cnt = i;
+
+	tad_pmu->pmu = (struct pmu) {
+
+		.module		= THIS_MODULE,
+		.attr_groups	= tad_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+				  PERF_PMU_CAP_NO_INTERRUPT,
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= tad_pmu_event_init,
+		.add		= tad_pmu_event_counter_add,
+		.del		= tad_pmu_event_counter_del,
+		.start		= tad_pmu_event_counter_start,
+		.stop		= tad_pmu_event_counter_stop,
+		.read		= tad_pmu_event_counter_read,
+	};
+
+	tad_pmu->cpu = raw_smp_processor_id();
+
+	/* Register pmu instance for cpu hotplug */
+	ret = cpuhp_state_add_instance_nocalls(tad_pmu_cpuhp_state,
+					       &tad_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	ret = perf_pmu_register(&tad_pmu->pmu, "tad", -1);
+	if (ret)
+		cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
+						    &tad_pmu->node);
+
+	return ret;
+}
+
+/* Undo tad_pmu_probe() in reverse order: unregister the PMU first so that
+ * it is never left registered without its cpu hotplug instance, then drop
+ * the hotplug instance. Always returns 0.
+ */
+static int tad_pmu_remove(struct platform_device *pdev)
+{
+	struct tad_pmu *pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&pmu->pmu);
+	cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
+					    &pmu->node);
+
+	return 0;
+}
+
+/* Device tree match table; exported so a modular build can be autoloaded */
+static const struct of_device_id tad_pmu_of_match[] = {
+	{ .compatible = "marvell,cn10k-tad-pmu", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, tad_pmu_of_match);
+
+/* Platform driver glue. The match table is referenced directly rather than
+ * through of_match_ptr(), which would leave it unused when CONFIG_OF is
+ * disabled in a COMPILE_TEST build.
+ */
+static struct platform_driver tad_pmu_driver = {
+	.driver = {
+		.name = "cn10k_tad_pmu",
+		.of_match_table = tad_pmu_of_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe = tad_pmu_probe,
+	.remove = tad_pmu_remove,
+};
+
+static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+ struct tad_pmu *pmu = hlist_entry_safe(node, struct tad_pmu, node);
+ unsigned int target;
+
+ if (cpu != pmu->cpu)
+ return 0;
+
+ target = cpumask_any_but(cpu_online_mask, cpu);
+ if (target >= nr_cpu_ids)
+ return 0;
+
+ perf_pmu_migrate_context(&pmu->pmu, cpu, target);
+ pmu->cpu = target;
+
+ return 0;
+}
+
+/* Module init: allocate a dynamic multi-instance cpu hotplug state whose
+ * teardown callback is tad_pmu_offline_cpu(), then register the platform
+ * driver. The hotplug state is released again if driver registration
+ * fails so that it is not leaked.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init tad_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/cn10k/tadpmu:online",
+				      NULL,
+				      tad_pmu_offline_cpu);
+	if (ret < 0)
+		return ret;
+	tad_pmu_cpuhp_state = ret;
+
+	ret = platform_driver_register(&tad_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(tad_pmu_cpuhp_state);
+
+	return ret;
+}
+
+static void __exit tad_pmu_exit(void)
+{
+ platform_driver_unregister(&tad_pmu_driver);
+ cpuhp_remove_multi_state(tad_pmu_cpuhp_state);
+}
+
+module_init(tad_pmu_init);
+module_exit(tad_pmu_exit);
+
+MODULE_DESCRIPTION("Marvell CN10K LLC-TAD Perf driver");
+MODULE_AUTHOR("Bhaskara Budiredla <[email protected]>");
+MODULE_LICENSE("GPL v2");
--
2.17.1

2021-10-18 15:34:00

by Bhaskara Budiredla

[permalink] [raw]
Subject: [PATCH v6 2/2] dt-bindings: perf: Add YAML schemas for Marvell CN10K LLC-TAD pmu bindings

Add device tree bindings for Last-level-cache Tag-and-data
(LLC-TAD) unit PMU for Marvell CN10K SoCs.

Signed-off-by: Bhaskara Budiredla <[email protected]>
Reviewed-by: Rob Herring <[email protected]>
---
.../bindings/perf/marvell-cn10k-tad.yaml | 63 +++++++++++++++++++
1 file changed, 63 insertions(+)
create mode 100644 Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml

diff --git a/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml b/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
new file mode 100644
index 000000000000..362142252667
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/perf/marvell-cn10k-tad.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell CN10K LLC-TAD performance monitor
+
+maintainers:
+ - Bhaskara Budiredla <[email protected]>
+
+description: |
+ The Tag-and-Data units (TADs) maintain coherence and contain CN10K
+ shared on-chip last level cache (LLC). The tad pmu measures the
+ performance of last-level cache. Each tad pmu supports up to eight
+ counters.
+
+ The DT setup comprises the number of tad blocks, the sizes of pmu
+ regions, tad blocks and overall base address of the HW.
+
+properties:
+ compatible:
+ const: marvell,cn10k-tad-pmu
+
+ reg:
+ maxItems: 1
+
+ marvell,tad-cnt:
+ description: specifies the number of tads on the soc
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ marvell,tad-page-size:
+ description: specifies the size of each tad page
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ marvell,tad-pmu-page-size:
+ description: specifies the size of page that the pmu uses
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+required:
+ - compatible
+ - reg
+ - marvell,tad-cnt
+ - marvell,tad-page-size
+ - marvell,tad-pmu-page-size
+
+additionalProperties: false
+
+examples:
+ - |
+
+ tad {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ tad_pmu@80000000 {
+ compatible = "marvell,cn10k-tad-pmu";
+ reg = <0x87e2 0x80000000 0x0 0x1000>;
+ marvell,tad-cnt = <1>;
+ marvell,tad-page-size = <0x1000>;
+ marvell,tad-pmu-page-size = <0x1000>;
+ };
+ };
--
2.17.1

2021-10-19 10:28:20

by Bharat Bhushan

[permalink] [raw]
Subject: RE: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support



> -----Original Message-----
> From: Bhaskara Budiredla <[email protected]>
> Sent: Monday, October 18, 2021 9:01 PM
> To: [email protected]; [email protected]; [email protected]; Bharat
> Bhushan <[email protected]>; Sunil Kovvuri Goutham
> <[email protected]>
> Cc: [email protected]; [email protected]; linux-
> [email protected]; Bhaskara Budiredla <[email protected]>
> Subject: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support
>
> This driver adds support for Last-level cache tag-and-data unit
> (LLC-TAD) PMU that is featured in some of the Marvell's CN10K
> infrastructure silicons.
>
> The LLC is divided into 2N slices distributed across N Mesh tiles
> in a single-socket configuration. The driver always configures the
> same counter for all of the TADs. The user would end up effectively
> reserving one of eight counters in every TAD to look across all TADs.
> The occurrences of events are aggregated and presented to the user
> at the end of an application run. The driver does not provide a way
> for the user to partition TADs so that different TADs are used for
> different applications.
>
> The event counters are zeroed to start event counting to avoid any
> rollover issues. TAD perf counters are 64-bit, so it's not currently
> possible to overflow event counters at current mesh and core
> frequencies.
>
> To measure tad pmu events use perf tool stat command. For instance:
>
> perf stat -e tad_dat_msh_in_dss,tad_req_msh_out_any <application>
> perf stat -e tad_alloc_any,tad_hit_any,tad_tag_rd <application>
>
> Signed-off-by: Bhaskara Budiredla <[email protected]>

Reviewed-by: Bharat Bhushan <[email protected]>

Thanks
-Bharat

> ---
> drivers/perf/Kconfig | 7 +
> drivers/perf/Makefile | 1 +
> drivers/perf/marvell_cn10k_tad_pmu.c | 429 +++++++++++++++++++++++++++
> 3 files changed, 437 insertions(+)
> create mode 100644 drivers/perf/marvell_cn10k_tad_pmu.c
>
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 77522e5efe11..2ce46f7ad639 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -137,6 +137,13 @@ config ARM_DMC620_PMU
> Support for PMU events monitoring on the ARM DMC-620 memory
> controller.
>
> +config MARVELL_CN10K_TAD_PMU
> + tristate "Marvell CN10K LLC-TAD PMU"
> + depends on ARM64 || COMPILE_TEST
> + help
> + Provides support for Last-Level cache Tag-and-data Units (LLC-TAD)
> + performance monitors on CN10K family silicons.
> +
> source "drivers/perf/hisilicon/Kconfig"
>
> endmenu
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 5260b116c7da..2db5418d5b0a 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
> obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
> obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
> obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
> +obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
> diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c
> b/drivers/perf/marvell_cn10k_tad_pmu.c
> new file mode 100644
> index 000000000000..aebb1a0028dc
> --- /dev/null
> +++ b/drivers/perf/marvell_cn10k_tad_pmu.c
> @@ -0,0 +1,429 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Marvell CN10K LLC-TAD perf driver
> + *
> + * Copyright (C) 2021 Marvell
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#define pr_fmt(fmt) "tad_pmu: " fmt
> +
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/of_address.h>
> +#include <linux/of_device.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/perf_event.h>
> +#include <linux/platform_device.h>
> +
> +#define TAD_PFC_OFFSET 0x0
> +#define TAD_PFC(counter) (TAD_PFC_OFFSET | (counter << 3))
> +#define TAD_PRF_OFFSET 0x100
> +#define TAD_PRF(counter) (TAD_PRF_OFFSET | (counter << 3))
> +#define TAD_PRF_CNTSEL_MASK 0xFF
> +#define TAD_MAX_COUNTERS 8
> +
> +#define to_tad_pmu(p) (container_of(p, struct tad_pmu, pmu))
> +
> +struct tad_region {
> + void __iomem *base;
> +};
> +
> +struct tad_pmu {
> + struct pmu pmu;
> + struct tad_region *regions;
> + u32 region_cnt;
> + unsigned int cpu;
> + struct hlist_node node;
> + struct perf_event *events[TAD_MAX_COUNTERS];
> + DECLARE_BITMAP(counters_map, TAD_MAX_COUNTERS);
> +};
> +
> +static int tad_pmu_cpuhp_state;
> +
> +static void tad_pmu_event_counter_read(struct perf_event *event)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u32 counter_idx = hwc->idx;
> + u64 delta, prev, new;
> + int i;
> +
> + do {
> + prev = local64_read(&hwc->prev_count);
> + for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
> + new += readq(tad_pmu->regions[i].base +
> + TAD_PFC(counter_idx));
> + } while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
> +
> + delta = (new - prev) & GENMASK_ULL(63, 0);
> + local64_add(delta, &event->count);
> +}
> +
> +static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u32 counter_idx = hwc->idx;
> + int i;
> +
> + /* TAD()_PFC() stop counting on the write
> + * which sets TAD()_PRF()[CNTSEL] == 0
> + */
> + for (i = 0; i < tad_pmu->region_cnt; i++)
> + writeq_relaxed(0, tad_pmu->regions[i].base +
> + TAD_PRF(counter_idx));
> +
> + tad_pmu_event_counter_read(event);
> + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u32 event_idx = event->attr.config;
> + u32 counter_idx = hwc->idx;
> + u64 reg_val;
> + int i;
> +
> + hwc->state = 0;
> +
> + /* Typically TAD_PFC() are zeroed to start counting */
> + for (i = 0; i < tad_pmu->region_cnt; i++)
> + writeq_relaxed(0, tad_pmu->regions[i].base +
> + TAD_PFC(counter_idx));
> +
> + /* TAD()_PFC() start counting on the write
> + * which sets TAD()_PRF()[CNTSEL] != 0
> + */
> + for (i = 0; i < tad_pmu->region_cnt; i++) {
> + reg_val = readq_relaxed(tad_pmu->regions[i].base +
> + TAD_PRF(counter_idx));
> + reg_val |= (event_idx & 0xFF);
> + writeq_relaxed(reg_val, tad_pmu->regions[i].base +
> + TAD_PRF(counter_idx));
> + }
> +}
> +
> +static void tad_pmu_event_counter_del(struct perf_event *event, int flags)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + int idx = hwc->idx;
> +
> + tad_pmu_event_counter_stop(event, flags | PERF_EF_UPDATE);
> + tad_pmu->events[idx] = NULL;
> + clear_bit(idx, tad_pmu->counters_map);
> +}
> +
> +static int tad_pmu_event_counter_add(struct perf_event *event, int flags)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + int idx;
> +
> + /* Get a free counter for this event */
> + idx = find_first_zero_bit(tad_pmu->counters_map,
> TAD_MAX_COUNTERS);
> + if (idx == TAD_MAX_COUNTERS)
> + return -EAGAIN;
> +
> + set_bit(idx, tad_pmu->counters_map);
> +
> + hwc->idx = idx;
> + hwc->state = PERF_HES_STOPPED;
> + tad_pmu->events[idx] = event;
> +
> + if (flags & PERF_EF_START)
> + tad_pmu_event_counter_start(event, flags);
> +
> + return 0;
> +}
> +
> +static int tad_pmu_event_init(struct perf_event *event)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> +
> + if (!event->attr.disabled)
> + return -EINVAL;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + if (event->state != PERF_EVENT_STATE_OFF)
> + return -EINVAL;
> +
> + event->cpu = tad_pmu->cpu;
> + event->hw.idx = -1;
> + event->hw.config_base = event->attr.config;
> +
> + return 0;
> +}
> +
> +static ssize_t tad_pmu_event_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct perf_pmu_events_attr *pmu_attr;
> +
> + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> + return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
> +}
> +
> +#define TAD_PMU_EVENT_ATTR(name, config) \
> + PMU_EVENT_ATTR_ID(name, tad_pmu_event_show, config)
> +
> +static struct attribute *tad_pmu_event_attrs[] = {
> + TAD_PMU_EVENT_ATTR(tad_none, 0x0),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_in_any, 0x1),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_in_mn, 0x2),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_in_exlmn, 0x3),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_any, 0x4),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_mn, 0x5),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_exlmn, 0x6),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_dss, 0x7),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_retry_dss, 0x8),
> + TAD_PMU_EVENT_ATTR(tad_dat_msh_in_any, 0x9),
> + TAD_PMU_EVENT_ATTR(tad_dat_msh_in_dss, 0xa),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_out_any, 0xb),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_rd, 0xc),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_wr, 0xd),
> + TAD_PMU_EVENT_ATTR(tad_req_msh_out_evict, 0xe),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_any, 0xf),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_exlmn, 0x10),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_mn, 0x11),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_exlmn, 0x12),
> + TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_mn, 0x13),
> + TAD_PMU_EVENT_ATTR(tad_snp_msh_out_any, 0x14),
> + TAD_PMU_EVENT_ATTR(tad_snp_msh_out_mn, 0x15),
> + TAD_PMU_EVENT_ATTR(tad_snp_msh_out_exlmn, 0x16),
> + TAD_PMU_EVENT_ATTR(tad_dat_msh_out_any, 0x17),
> + TAD_PMU_EVENT_ATTR(tad_dat_msh_out_fill, 0x18),
> + TAD_PMU_EVENT_ATTR(tad_dat_msh_out_dss, 0x19),
> + TAD_PMU_EVENT_ATTR(tad_alloc_dtg, 0x1a),
> + TAD_PMU_EVENT_ATTR(tad_alloc_ltg, 0x1b),
> + TAD_PMU_EVENT_ATTR(tad_alloc_any, 0x1c),
> + TAD_PMU_EVENT_ATTR(tad_hit_dtg, 0x1d),
> + TAD_PMU_EVENT_ATTR(tad_hit_ltg, 0x1e),
> + TAD_PMU_EVENT_ATTR(tad_hit_any, 0x1f),
> + TAD_PMU_EVENT_ATTR(tad_tag_rd, 0x20),
> + TAD_PMU_EVENT_ATTR(tad_dat_rd, 0x21),
> + TAD_PMU_EVENT_ATTR(tad_dat_rd_byp, 0x22),
> + TAD_PMU_EVENT_ATTR(tad_ifb_occ, 0x23),
> + TAD_PMU_EVENT_ATTR(tad_req_occ, 0x24),
> + NULL
> +};
> +
> +static const struct attribute_group tad_pmu_events_attr_group = {
> + .name = "events",
> + .attrs = tad_pmu_event_attrs,
> +};
> +
> +PMU_FORMAT_ATTR(event, "config:0-7");
> +
> +static struct attribute *tad_pmu_format_attrs[] = {
> + &format_attr_event.attr,
> + NULL
> +};
> +
> +static struct attribute_group tad_pmu_format_attr_group = {
> + .name = "format",
> + .attrs = tad_pmu_format_attrs,
> +};
> +
> +static ssize_t tad_pmu_cpumask_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(dev_get_drvdata(dev));
> +
> + return cpumap_print_to_pagebuf(true, buf, cpumask_of(tad_pmu-
> >cpu));
> +}
> +
> +static DEVICE_ATTR(cpumask, 0444, tad_pmu_cpumask_show, NULL);
> +
> +static struct attribute *tad_pmu_cpumask_attrs[] = {
> + &dev_attr_cpumask.attr,
> + NULL
> +};
> +
> +static struct attribute_group tad_pmu_cpumask_attr_group = {
> + .attrs = tad_pmu_cpumask_attrs,
> +};
> +
> +static const struct attribute_group *tad_pmu_attr_groups[] = {
> + &tad_pmu_events_attr_group,
> + &tad_pmu_format_attr_group,
> + &tad_pmu_cpumask_attr_group,
> + NULL
> +};
> +
> +static int tad_pmu_probe(struct platform_device *pdev)
> +{
> + struct device_node *node = pdev->dev.of_node;
> + struct tad_region *regions;
> + struct tad_pmu *tad_pmu;
> + struct resource *res;
> + u32 tad_pmu_page_size;
> + u32 tad_page_size;
> + u32 tad_cnt;
> + int i, ret;
> + char *name;
> +
> + tad_pmu = devm_kzalloc(&pdev->dev, sizeof(*tad_pmu), GFP_KERNEL);
> + if (!tad_pmu)
> + return -ENOMEM;
> +
> + platform_set_drvdata(pdev, tad_pmu);
> +
> + res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> + if (!res) {
> + dev_err(&pdev->dev, "Mem resource not found\n");
> + return -ENODEV;
> + }
> +
> + ret = of_property_read_u32(node, "marvell,tad-page-size",
> + &tad_page_size);
> + if (ret) {
> + dev_err(&pdev->dev, "Can't find tad-page-size property\n");
> + return ret;
> + }
> +
> + ret = of_property_read_u32(node, "marvell,tad-pmu-page-size",
> + &tad_pmu_page_size);
> + if (ret) {
> + dev_err(&pdev->dev, "Can't find tad-pmu-page-size
> property\n");
> + return ret;
> + }
> +
> + ret = of_property_read_u32(node, "marvell,tad-cnt", &tad_cnt);
> + if (ret) {
> + dev_err(&pdev->dev, "Can't find tad-cnt property\n");
> + return ret;
> + }
> +
> + regions = devm_kcalloc(&pdev->dev, tad_cnt,
> + sizeof(*regions), GFP_KERNEL);
> + if (!regions)
> + return -ENOMEM;
> +
> + /* ioremap the distributed TAD pmu regions */
> + for (i = 0; i < tad_cnt && res->start < res->end; i++) {
> + regions[i].base = devm_ioremap(&pdev->dev,
> + res->start,
> + tad_pmu_page_size);
> + if (IS_ERR(regions[i].base)) {
> + dev_err(&pdev->dev, "TAD%d ioremap fail\n", i);
> + return -ENOMEM;
> + }
> + res->start += tad_page_size;
> + }
> +
> + tad_pmu->regions = regions;
> + tad_pmu->region_cnt = tad_cnt;
> +
> + tad_pmu->pmu = (struct pmu) {
> +
> + .module = THIS_MODULE,
> + .attr_groups = tad_pmu_attr_groups,
> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE |
> + PERF_PMU_CAP_NO_INTERRUPT,
> + .task_ctx_nr = perf_invalid_context,
> +
> + .event_init = tad_pmu_event_init,
> + .add = tad_pmu_event_counter_add,
> + .del = tad_pmu_event_counter_del,
> + .start = tad_pmu_event_counter_start,
> + .stop = tad_pmu_event_counter_stop,
> + .read = tad_pmu_event_counter_read,
> + };
> +
> + tad_pmu->cpu = raw_smp_processor_id();
> +
> + /* Register pmu instance for cpu hotplug */
> + ret = cpuhp_state_add_instance_nocalls(tad_pmu_cpuhp_state,
> + &tad_pmu->node);
> + if (ret) {
> + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
> + return ret;
> + }
> +
> + name = "tad";
> + ret = perf_pmu_register(&tad_pmu->pmu, name, -1);
> + if (ret)
> + cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
> + &tad_pmu->node);
> +
> + return ret;
> +}
> +
> +static int tad_pmu_remove(struct platform_device *pdev)
> +{
> + struct tad_pmu *pmu = platform_get_drvdata(pdev);
> +
> + cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
> + &pmu->node);
> + perf_pmu_unregister(&pmu->pmu);
> +
> + return 0;
> +}
> +
> +static const struct of_device_id tad_pmu_of_match[] = {
> + { .compatible = "marvell,cn10k-tad-pmu", },
> + {},
> +};
> +
> +static struct platform_driver tad_pmu_driver = {
> + .driver = {
> + .name = "cn10k_tad_pmu",
> + .of_match_table = of_match_ptr(tad_pmu_of_match),
> + .suppress_bind_attrs = true,
> + },
> + .probe = tad_pmu_probe,
> + .remove = tad_pmu_remove,
> +};
> +
> +static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> + struct tad_pmu *pmu = hlist_entry_safe(node, struct tad_pmu, node);
> + unsigned int target;
> +
> + if (cpu != pmu->cpu)
> + return 0;
> +
> + target = cpumask_any_but(cpu_online_mask, cpu);
> + if (target >= nr_cpu_ids)
> + return 0;
> +
> + perf_pmu_migrate_context(&pmu->pmu, cpu, target);
> + pmu->cpu = target;
> +
> + return 0;
> +}
> +
> +static int __init tad_pmu_init(void)
> +{
> + int ret;
> +
> + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> + "perf/cn10k/tadpmu:online",
> + NULL,
> + tad_pmu_offline_cpu);
> + if (ret < 0)
> + return ret;
> + tad_pmu_cpuhp_state = ret;
> + return platform_driver_register(&tad_pmu_driver);
> +}
> +
> +static void __exit tad_pmu_exit(void)
> +{
> + platform_driver_unregister(&tad_pmu_driver);
> + cpuhp_remove_multi_state(tad_pmu_cpuhp_state);
> +}
> +
> +module_init(tad_pmu_init);
> +module_exit(tad_pmu_exit);
> +
> +MODULE_DESCRIPTION("Marvell CN10K LLC-TAD Perf driver");
> +MODULE_AUTHOR("Bhaskara Budiredla <[email protected]>");
> +MODULE_LICENSE("GPL v2");
> --
> 2.17.1

2021-10-26 12:46:21

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support

On Mon, Oct 18, 2021 at 09:00:56PM +0530, Bhaskara Budiredla wrote:
> This driver adds support for Last-level cache tag-and-data unit
> (LLC-TAD) PMU that is featured in some of the Marvell's CN10K
> infrastructure silicons.
>
> The LLC is divided into 2N slices distributed across N Mesh tiles
> in a single-socket configuration. The driver always configures the
> same counter for all of the TADs. The user would end up effectively
> reserving one of eight counters in every TAD to look across all TADs.
> The occurrences of events are aggregated and presented to the user
> at the end of an application run. The driver does not provide a way
> for the user to partition TADs so that different TADs are used for
> different applications.

Is that something you will want to do in the future? If you go with your
current approach of exposing a single "tad" unit to userspace, then you
won't be able to change that.

For the L3 PMUs (including on TX2), we expose per-node PMUs so why shouldn't
we do something similar here and expose each TAD region separately? Even if
userspace drives them all together, it gives you more flexibility in the
future if you _do_ want to partition them up.

> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 5260b116c7da..2db5418d5b0a 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
> obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
> obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
> obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
> +obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
> diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
> new file mode 100644
> index 000000000000..aebb1a0028dc
> --- /dev/null
> +++ b/drivers/perf/marvell_cn10k_tad_pmu.c
> @@ -0,0 +1,429 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Marvell CN10K LLC-TAD perf driver
> + *
> + * Copyright (C) 2021 Marvell
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#define pr_fmt(fmt) "tad_pmu: " fmt
> +
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/of_address.h>
> +#include <linux/of_device.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/perf_event.h>
> +#include <linux/platform_device.h>
> +
> +#define TAD_PFC_OFFSET 0x0
> +#define TAD_PFC(counter) (TAD_PFC_OFFSET | (counter << 3))
> +#define TAD_PRF_OFFSET 0x100
> +#define TAD_PRF(counter) (TAD_PRF_OFFSET | (counter << 3))
> +#define TAD_PRF_CNTSEL_MASK 0xFF
> +#define TAD_MAX_COUNTERS 8
> +
> +#define to_tad_pmu(p) (container_of(p, struct tad_pmu, pmu))
> +
> +struct tad_region {
> + void __iomem *base;
> +};
> +
> +struct tad_pmu {
> + struct pmu pmu;
> + struct tad_region *regions;
> + u32 region_cnt;
> + unsigned int cpu;
> + struct hlist_node node;
> + struct perf_event *events[TAD_MAX_COUNTERS];
> + DECLARE_BITMAP(counters_map, TAD_MAX_COUNTERS);
> +};
> +
> +static int tad_pmu_cpuhp_state;
> +
> +static void tad_pmu_event_counter_read(struct perf_event *event)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u32 counter_idx = hwc->idx;
> + u64 delta, prev, new;
> + int i;
> +
> + do {
> + prev = local64_read(&hwc->prev_count);
> + for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
> + new += readq(tad_pmu->regions[i].base +
> + TAD_PFC(counter_idx));
> + } while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);

If we expose each TAD individually, then this won't matter, but I'd be
inclined to move the counter summation outside of the cmpxchg() loop
given that readq (why not _relaxed?) is probably quite slow.

> +
> + delta = (new - prev) & GENMASK_ULL(63, 0);

This mask doesn't do anything.

> + local64_add(delta, &event->count);
> +}
> +
> +static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
> +{
> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u32 counter_idx = hwc->idx;
> + int i;
> +
> + /* TAD()_PFC() stop counting on the write
> + * which sets TAD()_PRF()[CNTSEL] == 0
> + */
> + for (i = 0; i < tad_pmu->region_cnt; i++)
> + writeq_relaxed(0, tad_pmu->regions[i].base +
> + TAD_PRF(counter_idx));

Please use braces around a multi-line conditional statement.

Will

2021-10-26 16:13:51

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support

Hi Bhaskara,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on robh/for-next]
[also build test ERROR on linus/master v5.15-rc7 next-20211026]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Bhaskara-Budiredla/drivers-perf-Add-Marvell-CN10K-LLC-TAD-pmu-driver/20211018-233301
base: https://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git for-next
config: arm-randconfig-c002-20211025 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project a461fa64bb37cffd73f683c74f6b0780379fc2ca)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install arm cross compiling tool for clang build
# apt-get install binutils-arm-linux-gnueabi
# https://github.com/0day-ci/linux/commit/85d99064ace8ea1a9e64cbd905097e61c6bb395c
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Bhaskara-Budiredla/drivers-perf-Add-Marvell-CN10K-LLC-TAD-pmu-driver/20211018-233301
git checkout 85d99064ace8ea1a9e64cbd905097e61c6bb395c
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=arm

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> drivers/perf/marvell_cn10k_tad_pmu.c:57:11: error: implicit declaration of function 'readq' [-Werror,-Wimplicit-function-declaration]
new += readq(tad_pmu->regions[i].base +
^
>> drivers/perf/marvell_cn10k_tad_pmu.c:76:3: error: implicit declaration of function 'writeq_relaxed' [-Werror,-Wimplicit-function-declaration]
writeq_relaxed(0, tad_pmu->regions[i].base +
^
drivers/perf/marvell_cn10k_tad_pmu.c:96:3: error: implicit declaration of function 'writeq_relaxed' [-Werror,-Wimplicit-function-declaration]
writeq_relaxed(0, tad_pmu->regions[i].base +
^
>> drivers/perf/marvell_cn10k_tad_pmu.c:103:13: error: implicit declaration of function 'readq_relaxed' [-Werror,-Wimplicit-function-declaration]
reg_val = readq_relaxed(tad_pmu->regions[i].base +
^
drivers/perf/marvell_cn10k_tad_pmu.c:103:13: note: did you mean 'writeq_relaxed'?
drivers/perf/marvell_cn10k_tad_pmu.c:76:3: note: 'writeq_relaxed' declared here
writeq_relaxed(0, tad_pmu->regions[i].base +
^
4 errors generated.


vim +/readq +57 drivers/perf/marvell_cn10k_tad_pmu.c

45
46 static void tad_pmu_event_counter_read(struct perf_event *event)
47 {
48 struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
49 struct hw_perf_event *hwc = &event->hw;
50 u32 counter_idx = hwc->idx;
51 u64 delta, prev, new;
52 int i;
53
54 do {
55 prev = local64_read(&hwc->prev_count);
56 for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
> 57 new += readq(tad_pmu->regions[i].base +
58 TAD_PFC(counter_idx));
59 } while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
60
61 delta = (new - prev) & GENMASK_ULL(63, 0);
62 local64_add(delta, &event->count);
63 }
64
65 static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
66 {
67 struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
68 struct hw_perf_event *hwc = &event->hw;
69 u32 counter_idx = hwc->idx;
70 int i;
71
72 /* TAD()_PFC() stop counting on the write
73 * which sets TAD()_PRF()[CNTSEL] == 0
74 */
75 for (i = 0; i < tad_pmu->region_cnt; i++)
> 76 writeq_relaxed(0, tad_pmu->regions[i].base +
77 TAD_PRF(counter_idx));
78
79 tad_pmu_event_counter_read(event);
80 hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
81 }
82
83 static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
84 {
85 struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
86 struct hw_perf_event *hwc = &event->hw;
87 u32 event_idx = event->attr.config;
88 u32 counter_idx = hwc->idx;
89 u64 reg_val;
90 int i;
91
92 hwc->state = 0;
93
94 /* Typically TAD_PFC() are zeroed to start counting */
95 for (i = 0; i < tad_pmu->region_cnt; i++)
96 writeq_relaxed(0, tad_pmu->regions[i].base +
97 TAD_PFC(counter_idx));
98
99 /* TAD()_PFC() start counting on the write
100 * which sets TAD()_PRF()[CNTSEL] != 0
101 */
102 for (i = 0; i < tad_pmu->region_cnt; i++) {
> 103 reg_val = readq_relaxed(tad_pmu->regions[i].base +
104 TAD_PRF(counter_idx));
105 reg_val |= (event_idx & 0xFF);
106 writeq_relaxed(reg_val, tad_pmu->regions[i].base +
107 TAD_PRF(counter_idx));
108 }
109 }
110

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]


Attachments:
(No filename) (5.37 kB)
.config.gz (28.19 kB)
Download all attachments

2021-10-27 04:50:14

by Bhaskara Budiredla

[permalink] [raw]
Subject: RE: [EXT] Re: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support



>-----Original Message-----
>From: Will Deacon <[email protected]>
>Sent: Tuesday, October 26, 2021 3:14 PM
>To: Bhaskara Budiredla <[email protected]>
>Cc: [email protected]; [email protected]; Bharat Bhushan
><[email protected]>; Sunil Kovvuri Goutham
><[email protected]>; [email protected];
>[email protected]; [email protected]
>Subject: [EXT] Re: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter
>support
>
>External Email
>
>----------------------------------------------------------------------
>On Mon, Oct 18, 2021 at 09:00:56PM +0530, Bhaskara Budiredla wrote:
>> This driver adds support for Last-level cache tag-and-data unit
>> (LLC-TAD) PMU that is featured in some of the Marvell's CN10K
>> infrastructure silicons.
>>
>> The LLC is divided into 2N slices distributed across N Mesh tiles in a
>> single-socket configuration. The driver always configures the same
>> counter for all of the TADs. The user would end up effectively
>> reserving one of eight counters in every TAD to look across all TADs.
>> The occurrences of events are aggregated and presented to the user at
>> the end of an application run. The driver does not provide a way for
>> the user to partition TADs so that different TADs are used for
>> different applications.
>
>Is that something you will want to do in the future? If you go with your current
>approach of exposing a single "tad" unit to userspace, then you won't be able
>to change that.
>

There is no intention to partition the TADs. I highlighted it because it is a point worth stressing.


>For the L3 PMUs (including on TX2). we expose per-node PMUs so why
>shouldn't we do something similar here and expose each TAD region
>separately? Even if userspace drives them all together, it gives you more
>flexibility in the future if you _do_ want to be partition them up.
>

Marvell has no plans to provide per-node PMU statistics on CN10K platforms.


>> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index
>> 5260b116c7da..2db5418d5b0a 100644
>> --- a/drivers/perf/Makefile
>> +++ b/drivers/perf/Makefile
>> @@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) +=
>thunderx2_pmu.o
>> obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
>> obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
>> obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
>> +obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) +=
>marvell_cn10k_tad_pmu.o
>> diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c
>> b/drivers/perf/marvell_cn10k_tad_pmu.c
>> new file mode 100644
>> index 000000000000..aebb1a0028dc
>> --- /dev/null
>> +++ b/drivers/perf/marvell_cn10k_tad_pmu.c
>> @@ -0,0 +1,429 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Marvell CN10K LLC-TAD perf driver
>> + *
>> + * Copyright (C) 2021 Marvell
>> + *
>> + * This program is free software; you can redistribute it and/or
>> +modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#define pr_fmt(fmt) "tad_pmu: " fmt
>> +
>> +#include <linux/module.h>
>> +#include <linux/of.h>
>> +#include <linux/of_address.h>
>> +#include <linux/of_device.h>
>> +#include <linux/cpuhotplug.h>
>> +#include <linux/perf_event.h>
>> +#include <linux/platform_device.h>
>> +
>> +#define TAD_PFC_OFFSET 0x0
>> +#define TAD_PFC(counter) (TAD_PFC_OFFSET | (counter << 3))
>> +#define TAD_PRF_OFFSET 0x100
>> +#define TAD_PRF(counter) (TAD_PRF_OFFSET | (counter << 3))
>> +#define TAD_PRF_CNTSEL_MASK 0xFF
>> +#define TAD_MAX_COUNTERS 8
>> +
>> +#define to_tad_pmu(p) (container_of(p, struct tad_pmu, pmu))
>> +
>> +struct tad_region {
>> + void __iomem *base;
>> +};
>> +
>> +struct tad_pmu {
>> + struct pmu pmu;
>> + struct tad_region *regions;
>> + u32 region_cnt;
>> + unsigned int cpu;
>> + struct hlist_node node;
>> + struct perf_event *events[TAD_MAX_COUNTERS];
>> + DECLARE_BITMAP(counters_map, TAD_MAX_COUNTERS); };
>> +
>> +static int tad_pmu_cpuhp_state;
>> +
>> +static void tad_pmu_event_counter_read(struct perf_event *event) {
>> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
>> + struct hw_perf_event *hwc = &event->hw;
>> + u32 counter_idx = hwc->idx;
>> + u64 delta, prev, new;
>> + int i;
>> +
>> + do {
>> + prev = local64_read(&hwc->prev_count);
>> + for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
>> + new += readq(tad_pmu->regions[i].base +
>> + TAD_PFC(counter_idx));
>> + } while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
>
>If we expose each TAD individually, then this won't matter, but I'd be inclined
>to move the counter summation outside of the cmpxchg() loop given that
>readq (why not _relaxed?) is probably quite slow.
>

As clarified above, partitioning of TADs is ruled out, so exposing individual TADs is not appropriate.


>> +
>> + delta = (new - prev) & GENMASK_ULL(63, 0);
>
>This mask doesn't do anything.
>

Agreed, and will delete it.


>> + local64_add(delta, &event->count);
>> +}
>> +
>> +static void tad_pmu_event_counter_stop(struct perf_event *event, int
>> +flags) {
>> + struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
>> + struct hw_perf_event *hwc = &event->hw;
>> + u32 counter_idx = hwc->idx;
>> + int i;
>> +
>> + /* TAD()_PFC() stop counting on the write
>> + * which sets TAD()_PRF()[CNTSEL] == 0
>> + */
>> + for (i = 0; i < tad_pmu->region_cnt; i++)
>> + writeq_relaxed(0, tad_pmu->regions[i].base +
>> + TAD_PRF(counter_idx));
>
>Please use braces around a multi-line conditional statement.
>

Taken.

>Will


Thanks,
Bhaskara

2021-10-27 21:24:00

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v6 1/2] drivers: perf: Add LLC-TAD perf counter support

Hi Bhaskara,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on robh/for-next]
[also build test ERROR on linus/master v5.15-rc7 next-20211026]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Bhaskara-Budiredla/drivers-perf-Add-Marvell-CN10K-LLC-TAD-pmu-driver/20211018-233301
base: https://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git for-next
config: arc-randconfig-r002-20211027 (attached as .config)
compiler: arc-elf-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/85d99064ace8ea1a9e64cbd905097e61c6bb395c
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Bhaskara-Budiredla/drivers-perf-Add-Marvell-CN10K-LLC-TAD-pmu-driver/20211018-233301
git checkout 85d99064ace8ea1a9e64cbd905097e61c6bb395c
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=arc

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

drivers/perf/marvell_cn10k_tad_pmu.c: In function 'tad_pmu_event_counter_read':
>> drivers/perf/marvell_cn10k_tad_pmu.c:57:32: error: implicit declaration of function 'readq'; did you mean 'readl'? [-Werror=implicit-function-declaration]
57 | new += readq(tad_pmu->regions[i].base +
| ^~~~~
| readl
drivers/perf/marvell_cn10k_tad_pmu.c: In function 'tad_pmu_event_counter_stop':
>> drivers/perf/marvell_cn10k_tad_pmu.c:76:17: error: implicit declaration of function 'writeq_relaxed'; did you mean 'writel_relaxed'? [-Werror=implicit-function-declaration]
76 | writeq_relaxed(0, tad_pmu->regions[i].base +
| ^~~~~~~~~~~~~~
| writel_relaxed
drivers/perf/marvell_cn10k_tad_pmu.c: In function 'tad_pmu_event_counter_start':
>> drivers/perf/marvell_cn10k_tad_pmu.c:103:27: error: implicit declaration of function 'readq_relaxed'; did you mean 'readw_relaxed'? [-Werror=implicit-function-declaration]
103 | reg_val = readq_relaxed(tad_pmu->regions[i].base +
| ^~~~~~~~~~~~~
| readw_relaxed
In file included from include/linux/perf_event.h:25,
from drivers/perf/marvell_cn10k_tad_pmu.c:18:
At top level:
arch/arc/include/asm/perf_event.h:126:27: warning: 'arc_pmu_cache_map' defined but not used [-Wunused-const-variable=]
126 | static const unsigned int arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
| ^~~~~~~~~~~~~~~~~
arch/arc/include/asm/perf_event.h:91:27: warning: 'arc_pmu_ev_hw_map' defined but not used [-Wunused-const-variable=]
91 | static const char * const arc_pmu_ev_hw_map[] = {
| ^~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors


vim +57 drivers/perf/marvell_cn10k_tad_pmu.c

45
46 static void tad_pmu_event_counter_read(struct perf_event *event)
47 {
48 struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
49 struct hw_perf_event *hwc = &event->hw;
50 u32 counter_idx = hwc->idx;
51 u64 delta, prev, new;
52 int i;
53
54 do {
55 prev = local64_read(&hwc->prev_count);
56 for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
> 57 new += readq(tad_pmu->regions[i].base +
58 TAD_PFC(counter_idx));
59 } while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
60
61 delta = (new - prev) & GENMASK_ULL(63, 0);
62 local64_add(delta, &event->count);
63 }
64
65 static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
66 {
67 struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
68 struct hw_perf_event *hwc = &event->hw;
69 u32 counter_idx = hwc->idx;
70 int i;
71
72 /* TAD()_PFC() stop counting on the write
73 * which sets TAD()_PRF()[CNTSEL] == 0
74 */
75 for (i = 0; i < tad_pmu->region_cnt; i++)
> 76 writeq_relaxed(0, tad_pmu->regions[i].base +
77 TAD_PRF(counter_idx));
78
79 tad_pmu_event_counter_read(event);
80 hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
81 }
82
83 static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
84 {
85 struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
86 struct hw_perf_event *hwc = &event->hw;
87 u32 event_idx = event->attr.config;
88 u32 counter_idx = hwc->idx;
89 u64 reg_val;
90 int i;
91
92 hwc->state = 0;
93
94 /* Typically TAD_PFC() are zeroed to start counting */
95 for (i = 0; i < tad_pmu->region_cnt; i++)
96 writeq_relaxed(0, tad_pmu->regions[i].base +
97 TAD_PFC(counter_idx));
98
99 /* TAD()_PFC() start counting on the write
100 * which sets TAD()_PRF()[CNTSEL] != 0
101 */
102 for (i = 0; i < tad_pmu->region_cnt; i++) {
> 103 reg_val = readq_relaxed(tad_pmu->regions[i].base +
104 TAD_PRF(counter_idx));
105 reg_val |= (event_idx & 0xFF);
106 writeq_relaxed(reg_val, tad_pmu->regions[i].base +
107 TAD_PRF(counter_idx));
108 }
109 }
110

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]


Attachments:
(No filename) (5.90 kB)
.config.gz (37.37 kB)
Download all attachments